diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,236095 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 33723, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.9653352311478814e-05, + "grad_norm": 1.731080412864685, + "learning_rate": 0.0, + "loss": 12.1797, + "step": 1 + }, + { + "epoch": 5.930670462295763e-05, + "grad_norm": 1.891041874885559, + "learning_rate": 2.9585798816568047e-06, + "loss": 12.1759, + "step": 2 + }, + { + "epoch": 8.896005693443644e-05, + "grad_norm": 1.693009376525879, + "learning_rate": 5.917159763313609e-06, + "loss": 12.1769, + "step": 3 + }, + { + "epoch": 0.00011861340924591526, + "grad_norm": 1.7163257598876953, + "learning_rate": 8.875739644970414e-06, + "loss": 12.1569, + "step": 4 + }, + { + "epoch": 0.00014826676155739406, + "grad_norm": 1.8745286464691162, + "learning_rate": 1.1834319526627219e-05, + "loss": 12.0972, + "step": 5 + }, + { + "epoch": 0.00017792011386887289, + "grad_norm": 1.9830927848815918, + "learning_rate": 1.4792899408284024e-05, + "loss": 12.0626, + "step": 6 + }, + { + "epoch": 0.00020757346618035169, + "grad_norm": 3.186753988265991, + "learning_rate": 1.7751479289940828e-05, + "loss": 11.8506, + "step": 7 + }, + { + "epoch": 0.00023722681849183051, + "grad_norm": 5.254683494567871, + "learning_rate": 2.0710059171597635e-05, + "loss": 11.5069, + "step": 8 + }, + { + "epoch": 0.0002668801708033093, + "grad_norm": 4.223094463348389, + "learning_rate": 2.3668639053254438e-05, + "loss": 11.1305, + "step": 9 + }, + { + "epoch": 0.0002965335231147881, + "grad_norm": 4.439526081085205, + "learning_rate": 2.6627218934911244e-05, + "loss": 10.7522, + "step": 10 + }, + { + "epoch": 0.0003261868754262669, + "grad_norm": 5.672572612762451, + "learning_rate": 2.9585798816568047e-05, + "loss": 10.5545, + "step": 11 + }, + { + "epoch": 0.00035584022773774577, + "grad_norm": 4.278626918792725, + "learning_rate": 3.254437869822485e-05, + "loss": 10.3674, + "step": 12 + }, + { + "epoch": 0.00038549358004922457, + "grad_norm": 2.714853525161743, + "learning_rate": 3.5502958579881656e-05, + "loss": 10.1337, + "step": 13 + }, + { + "epoch": 0.00041514693236070337, + "grad_norm": 6.698078632354736, + "learning_rate": 3.846153846153846e-05, + "loss": 10.0615, + "step": 14 + }, + { + "epoch": 0.00044480028467218217, + "grad_norm": 2.4968085289001465, + "learning_rate": 4.142011834319527e-05, + "loss": 9.8415, + "step": 15 + }, + { + "epoch": 0.00047445363698366103, + "grad_norm": 2.9717440605163574, + "learning_rate": 4.437869822485207e-05, + "loss": 9.8007, + "step": 16 + }, + { + "epoch": 0.0005041069892951398, + "grad_norm": 2.477841377258301, + "learning_rate": 4.7337278106508875e-05, + "loss": 9.7118, + "step": 17 + }, + { + "epoch": 0.0005337603416066186, + "grad_norm": 1.951272964477539, + "learning_rate": 5.029585798816568e-05, + "loss": 9.5863, + "step": 18 + }, + { + "epoch": 0.0005634136939180975, + "grad_norm": 3.3958888053894043, + "learning_rate": 5.325443786982249e-05, + "loss": 9.5097, + "step": 19 + }, + { + "epoch": 0.0005930670462295762, + "grad_norm": 3.382140636444092, + "learning_rate": 5.6213017751479294e-05, + "loss": 9.3982, + "step": 20 + }, + { + "epoch": 0.0006227203985410551, + "grad_norm": 1.714876651763916, + "learning_rate": 5.9171597633136094e-05, + "loss": 9.3157, + "step": 21 + }, + { + "epoch": 0.0006523737508525338, + "grad_norm": 1.7194938659667969, + "learning_rate": 6.21301775147929e-05, + "loss": 9.1907, + "step": 22 + }, + { + "epoch": 0.0006820271031640127, + "grad_norm": 1.5182723999023438, + "learning_rate": 6.50887573964497e-05, + "loss": 9.1241, + "step": 23 + }, + { + "epoch": 0.0007116804554754915, + "grad_norm": 1.471593976020813, + "learning_rate": 6.80473372781065e-05, + "loss": 9.0192, + "step": 24 + }, + { + "epoch": 0.0007413338077869703, + "grad_norm": 1.7220451831817627, + "learning_rate": 7.100591715976331e-05, + "loss": 8.8786, + "step": 25 + }, + { + "epoch": 0.0007709871600984491, + "grad_norm": 1.3188936710357666, + "learning_rate": 7.396449704142012e-05, + "loss": 8.8354, + "step": 26 + }, + { + "epoch": 0.0008006405124099279, + "grad_norm": 1.4191982746124268, + "learning_rate": 7.692307692307693e-05, + "loss": 8.6844, + "step": 27 + }, + { + "epoch": 0.0008302938647214067, + "grad_norm": 1.1729546785354614, + "learning_rate": 7.988165680473373e-05, + "loss": 8.567, + "step": 28 + }, + { + "epoch": 0.0008599472170328856, + "grad_norm": 2.2601613998413086, + "learning_rate": 8.284023668639054e-05, + "loss": 8.4882, + "step": 29 + }, + { + "epoch": 0.0008896005693443643, + "grad_norm": 1.2205206155776978, + "learning_rate": 8.579881656804733e-05, + "loss": 8.3526, + "step": 30 + }, + { + "epoch": 0.0009192539216558432, + "grad_norm": 1.4343513250350952, + "learning_rate": 8.875739644970414e-05, + "loss": 8.2775, + "step": 31 + }, + { + "epoch": 0.0009489072739673221, + "grad_norm": 1.8261027336120605, + "learning_rate": 9.171597633136094e-05, + "loss": 8.2202, + "step": 32 + }, + { + "epoch": 0.0009785606262788008, + "grad_norm": 1.1463712453842163, + "learning_rate": 9.467455621301775e-05, + "loss": 8.1128, + "step": 33 + }, + { + "epoch": 0.0010082139785902795, + "grad_norm": 2.0714364051818848, + "learning_rate": 9.763313609467456e-05, + "loss": 8.0614, + "step": 34 + }, + { + "epoch": 0.0010378673309017585, + "grad_norm": 1.6231037378311157, + "learning_rate": 0.00010059171597633136, + "loss": 7.9978, + "step": 35 + }, + { + "epoch": 0.0010675206832132373, + "grad_norm": 0.9995529651641846, + "learning_rate": 0.00010355029585798817, + "loss": 7.9469, + "step": 36 + }, + { + "epoch": 0.001097174035524716, + "grad_norm": 1.389865756034851, + "learning_rate": 0.00010650887573964498, + "loss": 7.8943, + "step": 37 + }, + { + "epoch": 0.001126827387836195, + "grad_norm": 2.4548733234405518, + "learning_rate": 0.00010946745562130178, + "loss": 7.8763, + "step": 38 + }, + { + "epoch": 0.0011564807401476737, + "grad_norm": 1.4707447290420532, + "learning_rate": 0.00011242603550295859, + "loss": 7.8416, + "step": 39 + }, + { + "epoch": 0.0011861340924591525, + "grad_norm": 3.2467200756073, + "learning_rate": 0.0001153846153846154, + "loss": 7.846, + "step": 40 + }, + { + "epoch": 0.0012157874447706314, + "grad_norm": 1.4680674076080322, + "learning_rate": 0.00011834319526627219, + "loss": 7.7698, + "step": 41 + }, + { + "epoch": 0.0012454407970821102, + "grad_norm": 2.3748340606689453, + "learning_rate": 0.000121301775147929, + "loss": 7.8035, + "step": 42 + }, + { + "epoch": 0.001275094149393589, + "grad_norm": 2.0236849784851074, + "learning_rate": 0.0001242603550295858, + "loss": 7.8038, + "step": 43 + }, + { + "epoch": 0.0013047475017050677, + "grad_norm": 1.3136340379714966, + "learning_rate": 0.00012721893491124262, + "loss": 7.7479, + "step": 44 + }, + { + "epoch": 0.0013344008540165466, + "grad_norm": 3.245168924331665, + "learning_rate": 0.0001301775147928994, + "loss": 7.7013, + "step": 45 + }, + { + "epoch": 0.0013640542063280254, + "grad_norm": 3.715984582901001, + "learning_rate": 0.00013313609467455623, + "loss": 7.7521, + "step": 46 + }, + { + "epoch": 0.0013937075586395041, + "grad_norm": 1.2695914506912231, + "learning_rate": 0.000136094674556213, + "loss": 7.6437, + "step": 47 + }, + { + "epoch": 0.001423360910950983, + "grad_norm": 1.6805428266525269, + "learning_rate": 0.00013905325443786982, + "loss": 7.6568, + "step": 48 + }, + { + "epoch": 0.0014530142632624618, + "grad_norm": 1.5953612327575684, + "learning_rate": 0.00014201183431952663, + "loss": 7.6183, + "step": 49 + }, + { + "epoch": 0.0014826676155739406, + "grad_norm": 1.770997166633606, + "learning_rate": 0.00014497041420118343, + "loss": 7.6175, + "step": 50 + }, + { + "epoch": 0.0015123209678854195, + "grad_norm": 1.8852550983428955, + "learning_rate": 0.00014792899408284024, + "loss": 7.5909, + "step": 51 + }, + { + "epoch": 0.0015419743201968983, + "grad_norm": 2.0069291591644287, + "learning_rate": 0.00015088757396449705, + "loss": 7.5818, + "step": 52 + }, + { + "epoch": 0.001571627672508377, + "grad_norm": 1.0519505739212036, + "learning_rate": 0.00015384615384615385, + "loss": 7.4534, + "step": 53 + }, + { + "epoch": 0.0016012810248198558, + "grad_norm": 2.382587194442749, + "learning_rate": 0.00015680473372781066, + "loss": 7.5037, + "step": 54 + }, + { + "epoch": 0.0016309343771313347, + "grad_norm": 1.447076439857483, + "learning_rate": 0.00015976331360946746, + "loss": 7.4616, + "step": 55 + }, + { + "epoch": 0.0016605877294428135, + "grad_norm": 1.6240559816360474, + "learning_rate": 0.00016272189349112427, + "loss": 7.4328, + "step": 56 + }, + { + "epoch": 0.0016902410817542922, + "grad_norm": 1.9599469900131226, + "learning_rate": 0.00016568047337278108, + "loss": 7.4062, + "step": 57 + }, + { + "epoch": 0.0017198944340657712, + "grad_norm": 1.3152766227722168, + "learning_rate": 0.00016863905325443788, + "loss": 7.347, + "step": 58 + }, + { + "epoch": 0.00174954778637725, + "grad_norm": 1.3017386198043823, + "learning_rate": 0.00017159763313609466, + "loss": 7.2959, + "step": 59 + }, + { + "epoch": 0.0017792011386887287, + "grad_norm": 1.1246821880340576, + "learning_rate": 0.0001745562130177515, + "loss": 7.2731, + "step": 60 + }, + { + "epoch": 0.0018088544910002077, + "grad_norm": 0.8387345671653748, + "learning_rate": 0.00017751479289940828, + "loss": 7.2404, + "step": 61 + }, + { + "epoch": 0.0018385078433116864, + "grad_norm": 1.1287413835525513, + "learning_rate": 0.0001804733727810651, + "loss": 7.2416, + "step": 62 + }, + { + "epoch": 0.0018681611956231651, + "grad_norm": 0.7006109356880188, + "learning_rate": 0.0001834319526627219, + "loss": 7.1399, + "step": 63 + }, + { + "epoch": 0.0018978145479346441, + "grad_norm": 1.062333583831787, + "learning_rate": 0.00018639053254437872, + "loss": 7.1538, + "step": 64 + }, + { + "epoch": 0.0019274679002461229, + "grad_norm": 1.2713754177093506, + "learning_rate": 0.0001893491124260355, + "loss": 7.1009, + "step": 65 + }, + { + "epoch": 0.0019571212525576016, + "grad_norm": 1.3022782802581787, + "learning_rate": 0.00019230769230769233, + "loss": 7.0978, + "step": 66 + }, + { + "epoch": 0.0019867746048690803, + "grad_norm": 0.8478756546974182, + "learning_rate": 0.00019526627218934911, + "loss": 7.082, + "step": 67 + }, + { + "epoch": 0.002016427957180559, + "grad_norm": 1.909644603729248, + "learning_rate": 0.00019822485207100595, + "loss": 7.0585, + "step": 68 + }, + { + "epoch": 0.0020460813094920383, + "grad_norm": 1.0752676725387573, + "learning_rate": 0.00020118343195266273, + "loss": 7.0496, + "step": 69 + }, + { + "epoch": 0.002075734661803517, + "grad_norm": 0.9119078516960144, + "learning_rate": 0.0002041420118343195, + "loss": 6.9965, + "step": 70 + }, + { + "epoch": 0.0021053880141149958, + "grad_norm": 0.990043580532074, + "learning_rate": 0.00020710059171597634, + "loss": 6.9538, + "step": 71 + }, + { + "epoch": 0.0021350413664264745, + "grad_norm": 1.2268059253692627, + "learning_rate": 0.00021005917159763312, + "loss": 6.9835, + "step": 72 + }, + { + "epoch": 0.0021646947187379533, + "grad_norm": 0.8128297328948975, + "learning_rate": 0.00021301775147928995, + "loss": 6.9153, + "step": 73 + }, + { + "epoch": 0.002194348071049432, + "grad_norm": 1.0701929330825806, + "learning_rate": 0.00021597633136094673, + "loss": 6.8824, + "step": 74 + }, + { + "epoch": 0.0022240014233609108, + "grad_norm": 0.9690162539482117, + "learning_rate": 0.00021893491124260357, + "loss": 6.8713, + "step": 75 + }, + { + "epoch": 0.00225365477567239, + "grad_norm": 0.76236492395401, + "learning_rate": 0.00022189349112426034, + "loss": 6.8187, + "step": 76 + }, + { + "epoch": 0.0022833081279838687, + "grad_norm": 1.0184530019760132, + "learning_rate": 0.00022485207100591718, + "loss": 6.7754, + "step": 77 + }, + { + "epoch": 0.0023129614802953474, + "grad_norm": 1.4012643098831177, + "learning_rate": 0.00022781065088757396, + "loss": 6.8305, + "step": 78 + }, + { + "epoch": 0.002342614832606826, + "grad_norm": 1.168962836265564, + "learning_rate": 0.0002307692307692308, + "loss": 6.8046, + "step": 79 + }, + { + "epoch": 0.002372268184918305, + "grad_norm": 2.4798812866210938, + "learning_rate": 0.00023372781065088757, + "loss": 6.7989, + "step": 80 + }, + { + "epoch": 0.0024019215372297837, + "grad_norm": 1.4856984615325928, + "learning_rate": 0.00023668639053254438, + "loss": 6.7433, + "step": 81 + }, + { + "epoch": 0.002431574889541263, + "grad_norm": 1.6413969993591309, + "learning_rate": 0.00023964497041420118, + "loss": 6.7609, + "step": 82 + }, + { + "epoch": 0.0024612282418527416, + "grad_norm": 1.3671187162399292, + "learning_rate": 0.000242603550295858, + "loss": 6.7042, + "step": 83 + }, + { + "epoch": 0.0024908815941642203, + "grad_norm": 1.3406575918197632, + "learning_rate": 0.0002455621301775148, + "loss": 6.689, + "step": 84 + }, + { + "epoch": 0.002520534946475699, + "grad_norm": 1.7815296649932861, + "learning_rate": 0.0002485207100591716, + "loss": 6.729, + "step": 85 + }, + { + "epoch": 0.002550188298787178, + "grad_norm": 1.0133517980575562, + "learning_rate": 0.00025147928994082844, + "loss": 6.6729, + "step": 86 + }, + { + "epoch": 0.0025798416510986566, + "grad_norm": 1.2696471214294434, + "learning_rate": 0.00025443786982248524, + "loss": 6.6682, + "step": 87 + }, + { + "epoch": 0.0026094950034101353, + "grad_norm": 1.106111764907837, + "learning_rate": 0.000257396449704142, + "loss": 6.6053, + "step": 88 + }, + { + "epoch": 0.0026391483557216145, + "grad_norm": 1.1070778369903564, + "learning_rate": 0.0002603550295857988, + "loss": 6.6124, + "step": 89 + }, + { + "epoch": 0.0026688017080330933, + "grad_norm": 1.0859613418579102, + "learning_rate": 0.00026331360946745566, + "loss": 6.5908, + "step": 90 + }, + { + "epoch": 0.002698455060344572, + "grad_norm": 1.4647579193115234, + "learning_rate": 0.00026627218934911247, + "loss": 6.6068, + "step": 91 + }, + { + "epoch": 0.0027281084126560507, + "grad_norm": 1.4465129375457764, + "learning_rate": 0.0002692307692307692, + "loss": 6.591, + "step": 92 + }, + { + "epoch": 0.0027577617649675295, + "grad_norm": 0.8910762071609497, + "learning_rate": 0.000272189349112426, + "loss": 6.5595, + "step": 93 + }, + { + "epoch": 0.0027874151172790082, + "grad_norm": 1.7705402374267578, + "learning_rate": 0.0002751479289940829, + "loss": 6.5373, + "step": 94 + }, + { + "epoch": 0.0028170684695904874, + "grad_norm": 0.9697396159172058, + "learning_rate": 0.00027810650887573964, + "loss": 6.5262, + "step": 95 + }, + { + "epoch": 0.002846721821901966, + "grad_norm": 1.3457783460617065, + "learning_rate": 0.00028106508875739645, + "loss": 6.5104, + "step": 96 + }, + { + "epoch": 0.002876375174213445, + "grad_norm": 1.277547836303711, + "learning_rate": 0.00028402366863905325, + "loss": 6.5197, + "step": 97 + }, + { + "epoch": 0.0029060285265249237, + "grad_norm": 1.1149176359176636, + "learning_rate": 0.0002869822485207101, + "loss": 6.4776, + "step": 98 + }, + { + "epoch": 0.0029356818788364024, + "grad_norm": 1.4933223724365234, + "learning_rate": 0.00028994082840236686, + "loss": 6.4535, + "step": 99 + }, + { + "epoch": 0.002965335231147881, + "grad_norm": 1.10006582736969, + "learning_rate": 0.00029289940828402367, + "loss": 6.4308, + "step": 100 + }, + { + "epoch": 0.00299498858345936, + "grad_norm": 0.910181999206543, + "learning_rate": 0.0002958579881656805, + "loss": 6.4486, + "step": 101 + }, + { + "epoch": 0.003024641935770839, + "grad_norm": 1.1875908374786377, + "learning_rate": 0.00029881656804733734, + "loss": 6.4343, + "step": 102 + }, + { + "epoch": 0.003054295288082318, + "grad_norm": 2.3989593982696533, + "learning_rate": 0.0003017751479289941, + "loss": 6.5035, + "step": 103 + }, + { + "epoch": 0.0030839486403937966, + "grad_norm": 1.2236170768737793, + "learning_rate": 0.0003047337278106509, + "loss": 6.4016, + "step": 104 + }, + { + "epoch": 0.0031136019927052753, + "grad_norm": 1.2667793035507202, + "learning_rate": 0.0003076923076923077, + "loss": 6.3905, + "step": 105 + }, + { + "epoch": 0.003143255345016754, + "grad_norm": 0.9771948456764221, + "learning_rate": 0.00031065088757396446, + "loss": 6.4001, + "step": 106 + }, + { + "epoch": 0.003172908697328233, + "grad_norm": 1.133126139640808, + "learning_rate": 0.0003136094674556213, + "loss": 6.3811, + "step": 107 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 1.5597015619277954, + "learning_rate": 0.0003165680473372781, + "loss": 6.3246, + "step": 108 + }, + { + "epoch": 0.0032322154019511907, + "grad_norm": 1.1121643781661987, + "learning_rate": 0.00031952662721893493, + "loss": 6.2908, + "step": 109 + }, + { + "epoch": 0.0032618687542626695, + "grad_norm": 1.2440558671951294, + "learning_rate": 0.0003224852071005917, + "loss": 6.2791, + "step": 110 + }, + { + "epoch": 0.0032915221065741482, + "grad_norm": 1.452272891998291, + "learning_rate": 0.00032544378698224854, + "loss": 6.3228, + "step": 111 + }, + { + "epoch": 0.003321175458885627, + "grad_norm": 1.074069619178772, + "learning_rate": 0.00032840236686390535, + "loss": 6.2782, + "step": 112 + }, + { + "epoch": 0.0033508288111971057, + "grad_norm": 0.939005970954895, + "learning_rate": 0.00033136094674556215, + "loss": 6.2193, + "step": 113 + }, + { + "epoch": 0.0033804821635085845, + "grad_norm": 1.4545989036560059, + "learning_rate": 0.0003343195266272189, + "loss": 6.2689, + "step": 114 + }, + { + "epoch": 0.0034101355158200636, + "grad_norm": 1.5885658264160156, + "learning_rate": 0.00033727810650887577, + "loss": 6.2526, + "step": 115 + }, + { + "epoch": 0.0034397888681315424, + "grad_norm": 0.881787121295929, + "learning_rate": 0.0003402366863905326, + "loss": 6.217, + "step": 116 + }, + { + "epoch": 0.003469442220443021, + "grad_norm": 1.9513285160064697, + "learning_rate": 0.0003431952662721893, + "loss": 6.2454, + "step": 117 + }, + { + "epoch": 0.0034990955727545, + "grad_norm": 1.1169172525405884, + "learning_rate": 0.00034615384615384613, + "loss": 6.2037, + "step": 118 + }, + { + "epoch": 0.0035287489250659786, + "grad_norm": 1.168959379196167, + "learning_rate": 0.000349112426035503, + "loss": 6.1867, + "step": 119 + }, + { + "epoch": 0.0035584022773774574, + "grad_norm": 1.3510783910751343, + "learning_rate": 0.0003520710059171598, + "loss": 6.1521, + "step": 120 + }, + { + "epoch": 0.003588055629688936, + "grad_norm": 1.3713606595993042, + "learning_rate": 0.00035502958579881655, + "loss": 6.1908, + "step": 121 + }, + { + "epoch": 0.0036177089820004153, + "grad_norm": 0.8885394334793091, + "learning_rate": 0.00035798816568047336, + "loss": 6.1332, + "step": 122 + }, + { + "epoch": 0.003647362334311894, + "grad_norm": 1.3533185720443726, + "learning_rate": 0.0003609467455621302, + "loss": 6.1626, + "step": 123 + }, + { + "epoch": 0.003677015686623373, + "grad_norm": 1.1273229122161865, + "learning_rate": 0.000363905325443787, + "loss": 6.0853, + "step": 124 + }, + { + "epoch": 0.0037066690389348515, + "grad_norm": 1.184908151626587, + "learning_rate": 0.0003668639053254438, + "loss": 6.1026, + "step": 125 + }, + { + "epoch": 0.0037363223912463303, + "grad_norm": 1.179960012435913, + "learning_rate": 0.0003698224852071006, + "loss": 6.0816, + "step": 126 + }, + { + "epoch": 0.003765975743557809, + "grad_norm": 1.0096495151519775, + "learning_rate": 0.00037278106508875744, + "loss": 6.0621, + "step": 127 + }, + { + "epoch": 0.0037956290958692882, + "grad_norm": 1.3877495527267456, + "learning_rate": 0.0003757396449704142, + "loss": 6.089, + "step": 128 + }, + { + "epoch": 0.003825282448180767, + "grad_norm": 1.0200660228729248, + "learning_rate": 0.000378698224852071, + "loss": 6.0756, + "step": 129 + }, + { + "epoch": 0.0038549358004922457, + "grad_norm": 1.930251121520996, + "learning_rate": 0.0003816568047337278, + "loss": 6.0388, + "step": 130 + }, + { + "epoch": 0.0038845891528037245, + "grad_norm": 1.1161574125289917, + "learning_rate": 0.00038461538461538467, + "loss": 6.012, + "step": 131 + }, + { + "epoch": 0.003914242505115203, + "grad_norm": 1.0914400815963745, + "learning_rate": 0.0003875739644970414, + "loss": 6.0314, + "step": 132 + }, + { + "epoch": 0.003943895857426682, + "grad_norm": 1.153190016746521, + "learning_rate": 0.00039053254437869823, + "loss": 5.9904, + "step": 133 + }, + { + "epoch": 0.003973549209738161, + "grad_norm": 1.2420721054077148, + "learning_rate": 0.00039349112426035503, + "loss": 5.9587, + "step": 134 + }, + { + "epoch": 0.0040032025620496394, + "grad_norm": 1.1376316547393799, + "learning_rate": 0.0003964497041420119, + "loss": 5.9648, + "step": 135 + }, + { + "epoch": 0.004032855914361118, + "grad_norm": 1.0860857963562012, + "learning_rate": 0.00039940828402366865, + "loss": 5.9681, + "step": 136 + }, + { + "epoch": 0.004062509266672597, + "grad_norm": 1.5807281732559204, + "learning_rate": 0.00040236686390532545, + "loss": 5.9911, + "step": 137 + }, + { + "epoch": 0.0040921626189840766, + "grad_norm": 1.0833799839019775, + "learning_rate": 0.00040532544378698226, + "loss": 5.9872, + "step": 138 + }, + { + "epoch": 0.004121815971295555, + "grad_norm": 1.602088212966919, + "learning_rate": 0.000408284023668639, + "loss": 5.9598, + "step": 139 + }, + { + "epoch": 0.004151469323607034, + "grad_norm": 1.1079938411712646, + "learning_rate": 0.00041124260355029587, + "loss": 5.9192, + "step": 140 + }, + { + "epoch": 0.004181122675918513, + "grad_norm": 1.171912431716919, + "learning_rate": 0.0004142011834319527, + "loss": 5.9208, + "step": 141 + }, + { + "epoch": 0.0042107760282299915, + "grad_norm": 1.1064499616622925, + "learning_rate": 0.0004171597633136095, + "loss": 5.9369, + "step": 142 + }, + { + "epoch": 0.00424042938054147, + "grad_norm": 1.9310368299484253, + "learning_rate": 0.00042011834319526624, + "loss": 5.9389, + "step": 143 + }, + { + "epoch": 0.004270082732852949, + "grad_norm": 0.970365583896637, + "learning_rate": 0.0004230769230769231, + "loss": 5.8655, + "step": 144 + }, + { + "epoch": 0.004299736085164428, + "grad_norm": 0.9289197325706482, + "learning_rate": 0.0004260355029585799, + "loss": 5.866, + "step": 145 + }, + { + "epoch": 0.0043293894374759065, + "grad_norm": 1.031434416770935, + "learning_rate": 0.0004289940828402367, + "loss": 5.8457, + "step": 146 + }, + { + "epoch": 0.004359042789787385, + "grad_norm": 1.1034375429153442, + "learning_rate": 0.00043195266272189346, + "loss": 5.8467, + "step": 147 + }, + { + "epoch": 0.004388696142098864, + "grad_norm": 1.2099765539169312, + "learning_rate": 0.0004349112426035503, + "loss": 5.8001, + "step": 148 + }, + { + "epoch": 0.004418349494410343, + "grad_norm": 0.8857686519622803, + "learning_rate": 0.00043786982248520713, + "loss": 5.7723, + "step": 149 + }, + { + "epoch": 0.0044480028467218215, + "grad_norm": 0.7234377861022949, + "learning_rate": 0.0004408284023668639, + "loss": 5.7888, + "step": 150 + }, + { + "epoch": 0.004477656199033301, + "grad_norm": 0.9848999977111816, + "learning_rate": 0.0004437869822485207, + "loss": 5.7869, + "step": 151 + }, + { + "epoch": 0.00450730955134478, + "grad_norm": 1.297167420387268, + "learning_rate": 0.00044674556213017755, + "loss": 5.7976, + "step": 152 + }, + { + "epoch": 0.004536962903656259, + "grad_norm": 0.7784046530723572, + "learning_rate": 0.00044970414201183436, + "loss": 5.7635, + "step": 153 + }, + { + "epoch": 0.004566616255967737, + "grad_norm": 1.1374931335449219, + "learning_rate": 0.0004526627218934911, + "loss": 5.7512, + "step": 154 + }, + { + "epoch": 0.004596269608279216, + "grad_norm": 1.190300464630127, + "learning_rate": 0.0004556213017751479, + "loss": 5.7591, + "step": 155 + }, + { + "epoch": 0.004625922960590695, + "grad_norm": 0.9689821600914001, + "learning_rate": 0.0004585798816568048, + "loss": 5.7226, + "step": 156 + }, + { + "epoch": 0.004655576312902174, + "grad_norm": 1.0825285911560059, + "learning_rate": 0.0004615384615384616, + "loss": 5.7193, + "step": 157 + }, + { + "epoch": 0.004685229665213652, + "grad_norm": 1.283270001411438, + "learning_rate": 0.00046449704142011833, + "loss": 5.7443, + "step": 158 + }, + { + "epoch": 0.004714883017525131, + "grad_norm": 0.9803303480148315, + "learning_rate": 0.00046745562130177514, + "loss": 5.6744, + "step": 159 + }, + { + "epoch": 0.00474453636983661, + "grad_norm": 1.6158283948898315, + "learning_rate": 0.000470414201183432, + "loss": 5.7198, + "step": 160 + }, + { + "epoch": 0.004774189722148089, + "grad_norm": 0.7083865404129028, + "learning_rate": 0.00047337278106508875, + "loss": 5.6451, + "step": 161 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 1.2546383142471313, + "learning_rate": 0.00047633136094674556, + "loss": 5.6684, + "step": 162 + }, + { + "epoch": 0.004833496426771046, + "grad_norm": 1.1475765705108643, + "learning_rate": 0.00047928994082840237, + "loss": 5.6687, + "step": 163 + }, + { + "epoch": 0.004863149779082526, + "grad_norm": 0.6843975186347961, + "learning_rate": 0.0004822485207100592, + "loss": 5.6372, + "step": 164 + }, + { + "epoch": 0.0048928031313940044, + "grad_norm": 0.9668450951576233, + "learning_rate": 0.000485207100591716, + "loss": 5.6273, + "step": 165 + }, + { + "epoch": 0.004922456483705483, + "grad_norm": 1.3319953680038452, + "learning_rate": 0.0004881656804733728, + "loss": 5.6494, + "step": 166 + }, + { + "epoch": 0.004952109836016962, + "grad_norm": 1.0780526399612427, + "learning_rate": 0.0004911242603550296, + "loss": 5.6057, + "step": 167 + }, + { + "epoch": 0.004981763188328441, + "grad_norm": 1.3214702606201172, + "learning_rate": 0.0004940828402366864, + "loss": 5.5894, + "step": 168 + }, + { + "epoch": 0.005011416540639919, + "grad_norm": 0.8913089036941528, + "learning_rate": 0.0004970414201183431, + "loss": 5.6008, + "step": 169 + }, + { + "epoch": 0.005041069892951398, + "grad_norm": 0.9552988409996033, + "learning_rate": 0.0005, + "loss": 5.5714, + "step": 170 + }, + { + "epoch": 0.005070723245262877, + "grad_norm": 1.4064273834228516, + "learning_rate": 0.0005029585798816569, + "loss": 5.6152, + "step": 171 + }, + { + "epoch": 0.005100376597574356, + "grad_norm": 0.8386754989624023, + "learning_rate": 0.0005059171597633136, + "loss": 5.5835, + "step": 172 + }, + { + "epoch": 0.005130029949885834, + "grad_norm": 1.234330177307129, + "learning_rate": 0.0005088757396449705, + "loss": 5.5756, + "step": 173 + }, + { + "epoch": 0.005159683302197313, + "grad_norm": 0.9318386912345886, + "learning_rate": 0.0005118343195266271, + "loss": 5.5108, + "step": 174 + }, + { + "epoch": 0.005189336654508792, + "grad_norm": 0.9288777112960815, + "learning_rate": 0.000514792899408284, + "loss": 5.5329, + "step": 175 + }, + { + "epoch": 0.005218990006820271, + "grad_norm": 1.0611422061920166, + "learning_rate": 0.0005177514792899408, + "loss": 5.548, + "step": 176 + }, + { + "epoch": 0.00524864335913175, + "grad_norm": 0.8884417414665222, + "learning_rate": 0.0005207100591715976, + "loss": 5.5406, + "step": 177 + }, + { + "epoch": 0.005278296711443229, + "grad_norm": 0.8717062473297119, + "learning_rate": 0.0005236686390532545, + "loss": 5.5033, + "step": 178 + }, + { + "epoch": 0.005307950063754708, + "grad_norm": 0.7516209483146667, + "learning_rate": 0.0005266272189349113, + "loss": 5.4639, + "step": 179 + }, + { + "epoch": 0.0053376034160661865, + "grad_norm": 0.6829626560211182, + "learning_rate": 0.0005295857988165681, + "loss": 5.4556, + "step": 180 + }, + { + "epoch": 0.005367256768377665, + "grad_norm": 1.1825026273727417, + "learning_rate": 0.0005325443786982249, + "loss": 5.4715, + "step": 181 + }, + { + "epoch": 0.005396910120689144, + "grad_norm": 0.8295742273330688, + "learning_rate": 0.0005355029585798816, + "loss": 5.4532, + "step": 182 + }, + { + "epoch": 0.005426563473000623, + "grad_norm": 0.6820582151412964, + "learning_rate": 0.0005384615384615384, + "loss": 5.4307, + "step": 183 + }, + { + "epoch": 0.0054562168253121015, + "grad_norm": 0.6469408869743347, + "learning_rate": 0.0005414201183431953, + "loss": 5.3933, + "step": 184 + }, + { + "epoch": 0.00548587017762358, + "grad_norm": 0.5377639532089233, + "learning_rate": 0.000544378698224852, + "loss": 5.4152, + "step": 185 + }, + { + "epoch": 0.005515523529935059, + "grad_norm": 0.6100967526435852, + "learning_rate": 0.0005473372781065089, + "loss": 5.3852, + "step": 186 + }, + { + "epoch": 0.005545176882246538, + "grad_norm": 0.5679780840873718, + "learning_rate": 0.0005502958579881658, + "loss": 5.3476, + "step": 187 + }, + { + "epoch": 0.0055748302345580165, + "grad_norm": 0.7106412053108215, + "learning_rate": 0.0005532544378698225, + "loss": 5.3455, + "step": 188 + }, + { + "epoch": 0.005604483586869495, + "grad_norm": 0.6459296941757202, + "learning_rate": 0.0005562130177514793, + "loss": 5.3929, + "step": 189 + }, + { + "epoch": 0.005634136939180975, + "grad_norm": 0.700768232345581, + "learning_rate": 0.000559171597633136, + "loss": 5.3198, + "step": 190 + }, + { + "epoch": 0.005663790291492454, + "grad_norm": 0.7849311828613281, + "learning_rate": 0.0005621301775147929, + "loss": 5.3554, + "step": 191 + }, + { + "epoch": 0.005693443643803932, + "grad_norm": 0.9524291157722473, + "learning_rate": 0.0005650887573964498, + "loss": 5.3704, + "step": 192 + }, + { + "epoch": 0.005723096996115411, + "grad_norm": 1.1036955118179321, + "learning_rate": 0.0005680473372781065, + "loss": 5.3372, + "step": 193 + }, + { + "epoch": 0.00575275034842689, + "grad_norm": 0.987999677658081, + "learning_rate": 0.0005710059171597634, + "loss": 5.3489, + "step": 194 + }, + { + "epoch": 0.005782403700738369, + "grad_norm": 0.9958258271217346, + "learning_rate": 0.0005739644970414202, + "loss": 5.362, + "step": 195 + }, + { + "epoch": 0.005812057053049847, + "grad_norm": 1.1681301593780518, + "learning_rate": 0.0005769230769230769, + "loss": 5.3368, + "step": 196 + }, + { + "epoch": 0.005841710405361326, + "grad_norm": 1.0020482540130615, + "learning_rate": 0.0005798816568047337, + "loss": 5.3652, + "step": 197 + }, + { + "epoch": 0.005871363757672805, + "grad_norm": 0.8731601238250732, + "learning_rate": 0.0005828402366863905, + "loss": 5.3286, + "step": 198 + }, + { + "epoch": 0.0059010171099842836, + "grad_norm": 0.8524949550628662, + "learning_rate": 0.0005857988165680473, + "loss": 5.3082, + "step": 199 + }, + { + "epoch": 0.005930670462295762, + "grad_norm": 1.0957072973251343, + "learning_rate": 0.0005887573964497042, + "loss": 5.2935, + "step": 200 + }, + { + "epoch": 0.005960323814607241, + "grad_norm": 0.7477771639823914, + "learning_rate": 0.000591715976331361, + "loss": 5.3024, + "step": 201 + }, + { + "epoch": 0.00598997716691872, + "grad_norm": 0.809942901134491, + "learning_rate": 0.0005946745562130178, + "loss": 5.3008, + "step": 202 + }, + { + "epoch": 0.006019630519230199, + "grad_norm": 0.8745566010475159, + "learning_rate": 0.0005976331360946747, + "loss": 5.3072, + "step": 203 + }, + { + "epoch": 0.006049283871541678, + "grad_norm": 0.9016023874282837, + "learning_rate": 0.0006005917159763313, + "loss": 5.2895, + "step": 204 + }, + { + "epoch": 0.006078937223853157, + "grad_norm": 0.8439170718193054, + "learning_rate": 0.0006035502958579882, + "loss": 5.2527, + "step": 205 + }, + { + "epoch": 0.006108590576164636, + "grad_norm": 0.8199399709701538, + "learning_rate": 0.0006065088757396449, + "loss": 5.2701, + "step": 206 + }, + { + "epoch": 0.006138243928476114, + "grad_norm": 0.7986448407173157, + "learning_rate": 0.0006094674556213018, + "loss": 5.2411, + "step": 207 + }, + { + "epoch": 0.006167897280787593, + "grad_norm": 0.8755974173545837, + "learning_rate": 0.0006124260355029587, + "loss": 5.298, + "step": 208 + }, + { + "epoch": 0.006197550633099072, + "grad_norm": 0.7966985106468201, + "learning_rate": 0.0006153846153846154, + "loss": 5.2324, + "step": 209 + }, + { + "epoch": 0.006227203985410551, + "grad_norm": 0.6922147274017334, + "learning_rate": 0.0006183431952662723, + "loss": 5.2105, + "step": 210 + }, + { + "epoch": 0.006256857337722029, + "grad_norm": 0.6949384808540344, + "learning_rate": 0.0006213017751479289, + "loss": 5.2214, + "step": 211 + }, + { + "epoch": 0.006286510690033508, + "grad_norm": 0.7601715922355652, + "learning_rate": 0.0006242603550295858, + "loss": 5.1859, + "step": 212 + }, + { + "epoch": 0.006316164042344987, + "grad_norm": 0.7295635342597961, + "learning_rate": 0.0006272189349112426, + "loss": 5.2145, + "step": 213 + }, + { + "epoch": 0.006345817394656466, + "grad_norm": 0.6093840599060059, + "learning_rate": 0.0006301775147928994, + "loss": 5.1631, + "step": 214 + }, + { + "epoch": 0.006375470746967944, + "grad_norm": 0.7016854882240295, + "learning_rate": 0.0006331360946745562, + "loss": 5.1494, + "step": 215 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 0.6053462028503418, + "learning_rate": 0.0006360946745562131, + "loss": 5.1388, + "step": 216 + }, + { + "epoch": 0.006434777451590903, + "grad_norm": 0.45105603337287903, + "learning_rate": 0.0006390532544378699, + "loss": 5.1286, + "step": 217 + }, + { + "epoch": 0.0064644308039023815, + "grad_norm": 0.5825572609901428, + "learning_rate": 0.0006420118343195266, + "loss": 5.1271, + "step": 218 + }, + { + "epoch": 0.00649408415621386, + "grad_norm": 0.7899414896965027, + "learning_rate": 0.0006449704142011834, + "loss": 5.0902, + "step": 219 + }, + { + "epoch": 0.006523737508525339, + "grad_norm": 0.6510829925537109, + "learning_rate": 0.0006479289940828402, + "loss": 5.1118, + "step": 220 + }, + { + "epoch": 0.006553390860836818, + "grad_norm": 0.46504396200180054, + "learning_rate": 0.0006508875739644971, + "loss": 5.0965, + "step": 221 + }, + { + "epoch": 0.0065830442131482965, + "grad_norm": 0.6572167873382568, + "learning_rate": 0.0006538461538461538, + "loss": 5.045, + "step": 222 + }, + { + "epoch": 0.006612697565459775, + "grad_norm": 0.6746143102645874, + "learning_rate": 0.0006568047337278107, + "loss": 5.061, + "step": 223 + }, + { + "epoch": 0.006642350917771254, + "grad_norm": 0.6579557061195374, + "learning_rate": 0.0006597633136094676, + "loss": 5.062, + "step": 224 + }, + { + "epoch": 0.006672004270082733, + "grad_norm": 0.7761130332946777, + "learning_rate": 0.0006627218934911243, + "loss": 5.0528, + "step": 225 + }, + { + "epoch": 0.0067016576223942114, + "grad_norm": 0.7239773869514465, + "learning_rate": 0.0006656804733727811, + "loss": 5.0533, + "step": 226 + }, + { + "epoch": 0.00673131097470569, + "grad_norm": 0.7554423213005066, + "learning_rate": 0.0006686390532544378, + "loss": 5.0785, + "step": 227 + }, + { + "epoch": 0.006760964327017169, + "grad_norm": 0.6864815950393677, + "learning_rate": 0.0006715976331360947, + "loss": 5.049, + "step": 228 + }, + { + "epoch": 0.006790617679328648, + "grad_norm": 0.8713997006416321, + "learning_rate": 0.0006745562130177515, + "loss": 5.0288, + "step": 229 + }, + { + "epoch": 0.006820271031640127, + "grad_norm": 0.776359498500824, + "learning_rate": 0.0006775147928994083, + "loss": 5.0397, + "step": 230 + }, + { + "epoch": 0.006849924383951606, + "grad_norm": 0.6823018789291382, + "learning_rate": 0.0006804733727810651, + "loss": 5.0502, + "step": 231 + }, + { + "epoch": 0.006879577736263085, + "grad_norm": 0.6579205989837646, + "learning_rate": 0.000683431952662722, + "loss": 5.0341, + "step": 232 + }, + { + "epoch": 0.0069092310885745635, + "grad_norm": 0.8384518027305603, + "learning_rate": 0.0006863905325443787, + "loss": 5.047, + "step": 233 + }, + { + "epoch": 0.006938884440886042, + "grad_norm": 0.8099024891853333, + "learning_rate": 0.0006893491124260355, + "loss": 5.0555, + "step": 234 + }, + { + "epoch": 0.006968537793197521, + "grad_norm": 0.6159314513206482, + "learning_rate": 0.0006923076923076923, + "loss": 4.963, + "step": 235 + }, + { + "epoch": 0.006998191145509, + "grad_norm": 0.6734345555305481, + "learning_rate": 0.0006952662721893491, + "loss": 5.0304, + "step": 236 + }, + { + "epoch": 0.0070278444978204785, + "grad_norm": 0.5373747944831848, + "learning_rate": 0.000698224852071006, + "loss": 5.0167, + "step": 237 + }, + { + "epoch": 0.007057497850131957, + "grad_norm": 0.5293257832527161, + "learning_rate": 0.0007011834319526627, + "loss": 4.9788, + "step": 238 + }, + { + "epoch": 0.007087151202443436, + "grad_norm": 0.6157745718955994, + "learning_rate": 0.0007041420118343196, + "loss": 4.9954, + "step": 239 + }, + { + "epoch": 0.007116804554754915, + "grad_norm": 0.6140831112861633, + "learning_rate": 0.0007071005917159762, + "loss": 4.9509, + "step": 240 + }, + { + "epoch": 0.0071464579070663935, + "grad_norm": 0.6167766451835632, + "learning_rate": 0.0007100591715976331, + "loss": 4.955, + "step": 241 + }, + { + "epoch": 0.007176111259377872, + "grad_norm": 0.5631921887397766, + "learning_rate": 0.00071301775147929, + "loss": 4.9805, + "step": 242 + }, + { + "epoch": 0.007205764611689352, + "grad_norm": 0.7146539688110352, + "learning_rate": 0.0007159763313609467, + "loss": 4.9242, + "step": 243 + }, + { + "epoch": 0.007235417964000831, + "grad_norm": 0.6975768208503723, + "learning_rate": 0.0007189349112426036, + "loss": 4.9662, + "step": 244 + }, + { + "epoch": 0.007265071316312309, + "grad_norm": 0.6886921525001526, + "learning_rate": 0.0007218934911242604, + "loss": 4.9168, + "step": 245 + }, + { + "epoch": 0.007294724668623788, + "grad_norm": 0.7414640784263611, + "learning_rate": 0.0007248520710059172, + "loss": 4.9579, + "step": 246 + }, + { + "epoch": 0.007324378020935267, + "grad_norm": 0.7999524474143982, + "learning_rate": 0.000727810650887574, + "loss": 4.9495, + "step": 247 + }, + { + "epoch": 0.007354031373246746, + "grad_norm": 0.6891040802001953, + "learning_rate": 0.0007307692307692307, + "loss": 4.9606, + "step": 248 + }, + { + "epoch": 0.007383684725558224, + "grad_norm": 0.8466208577156067, + "learning_rate": 0.0007337278106508876, + "loss": 4.9405, + "step": 249 + }, + { + "epoch": 0.007413338077869703, + "grad_norm": 0.7528066039085388, + "learning_rate": 0.0007366863905325444, + "loss": 4.9518, + "step": 250 + }, + { + "epoch": 0.007442991430181182, + "grad_norm": 0.6537126302719116, + "learning_rate": 0.0007396449704142012, + "loss": 4.941, + "step": 251 + }, + { + "epoch": 0.007472644782492661, + "grad_norm": 0.6405721306800842, + "learning_rate": 0.000742603550295858, + "loss": 4.9211, + "step": 252 + }, + { + "epoch": 0.007502298134804139, + "grad_norm": 0.6863747239112854, + "learning_rate": 0.0007455621301775149, + "loss": 4.9334, + "step": 253 + }, + { + "epoch": 0.007531951487115618, + "grad_norm": 0.5825493335723877, + "learning_rate": 0.0007485207100591716, + "loss": 4.889, + "step": 254 + }, + { + "epoch": 0.007561604839427097, + "grad_norm": 0.5723262429237366, + "learning_rate": 0.0007514792899408284, + "loss": 4.8917, + "step": 255 + }, + { + "epoch": 0.0075912581917385764, + "grad_norm": 0.5144699215888977, + "learning_rate": 0.0007544378698224851, + "loss": 4.8596, + "step": 256 + }, + { + "epoch": 0.007620911544050055, + "grad_norm": 0.5243825316429138, + "learning_rate": 0.000757396449704142, + "loss": 4.8601, + "step": 257 + }, + { + "epoch": 0.007650564896361534, + "grad_norm": 0.4869922995567322, + "learning_rate": 0.0007603550295857989, + "loss": 4.8511, + "step": 258 + }, + { + "epoch": 0.007680218248673013, + "grad_norm": 0.49217259883880615, + "learning_rate": 0.0007633136094674556, + "loss": 4.8592, + "step": 259 + }, + { + "epoch": 0.007709871600984491, + "grad_norm": 0.49852797389030457, + "learning_rate": 0.0007662721893491125, + "loss": 4.8324, + "step": 260 + }, + { + "epoch": 0.00773952495329597, + "grad_norm": 0.427554190158844, + "learning_rate": 0.0007692307692307693, + "loss": 4.8092, + "step": 261 + }, + { + "epoch": 0.007769178305607449, + "grad_norm": 0.4734914004802704, + "learning_rate": 0.000772189349112426, + "loss": 4.7863, + "step": 262 + }, + { + "epoch": 0.007798831657918928, + "grad_norm": 0.640404999256134, + "learning_rate": 0.0007751479289940828, + "loss": 4.7932, + "step": 263 + }, + { + "epoch": 0.007828485010230406, + "grad_norm": 0.5473031401634216, + "learning_rate": 0.0007781065088757396, + "loss": 4.7706, + "step": 264 + }, + { + "epoch": 0.007858138362541886, + "grad_norm": 0.8196998238563538, + "learning_rate": 0.0007810650887573965, + "loss": 4.7984, + "step": 265 + }, + { + "epoch": 0.007887791714853364, + "grad_norm": 0.6458893418312073, + "learning_rate": 0.0007840236686390533, + "loss": 4.807, + "step": 266 + }, + { + "epoch": 0.007917445067164844, + "grad_norm": 0.707221508026123, + "learning_rate": 0.0007869822485207101, + "loss": 4.7864, + "step": 267 + }, + { + "epoch": 0.007947098419476321, + "grad_norm": 0.4559791386127472, + "learning_rate": 0.0007899408284023669, + "loss": 4.8261, + "step": 268 + }, + { + "epoch": 0.007976751771787801, + "grad_norm": 0.43594732880592346, + "learning_rate": 0.0007928994082840238, + "loss": 4.7556, + "step": 269 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 0.4484982490539551, + "learning_rate": 0.0007958579881656804, + "loss": 4.734, + "step": 270 + }, + { + "epoch": 0.008036058476410759, + "grad_norm": 0.6343010663986206, + "learning_rate": 0.0007988165680473373, + "loss": 4.7717, + "step": 271 + }, + { + "epoch": 0.008065711828722236, + "grad_norm": 0.5612204074859619, + "learning_rate": 0.000801775147928994, + "loss": 4.7545, + "step": 272 + }, + { + "epoch": 0.008095365181033716, + "grad_norm": 0.6459882855415344, + "learning_rate": 0.0008047337278106509, + "loss": 4.7293, + "step": 273 + }, + { + "epoch": 0.008125018533345194, + "grad_norm": 0.6619464755058289, + "learning_rate": 0.0008076923076923078, + "loss": 4.7413, + "step": 274 + }, + { + "epoch": 0.008154671885656673, + "grad_norm": 0.6315581798553467, + "learning_rate": 0.0008106508875739645, + "loss": 4.7555, + "step": 275 + }, + { + "epoch": 0.008184325237968153, + "grad_norm": 0.7580191493034363, + "learning_rate": 0.0008136094674556214, + "loss": 4.7426, + "step": 276 + }, + { + "epoch": 0.008213978590279631, + "grad_norm": 0.534270703792572, + "learning_rate": 0.000816568047337278, + "loss": 4.7321, + "step": 277 + }, + { + "epoch": 0.00824363194259111, + "grad_norm": 0.44729259610176086, + "learning_rate": 0.0008195266272189349, + "loss": 4.6956, + "step": 278 + }, + { + "epoch": 0.008273285294902588, + "grad_norm": 0.5441956520080566, + "learning_rate": 0.0008224852071005917, + "loss": 4.6856, + "step": 279 + }, + { + "epoch": 0.008302938647214068, + "grad_norm": 0.5066124796867371, + "learning_rate": 0.0008254437869822485, + "loss": 4.6766, + "step": 280 + }, + { + "epoch": 0.008332591999525546, + "grad_norm": 0.4550777077674866, + "learning_rate": 0.0008284023668639054, + "loss": 4.6558, + "step": 281 + }, + { + "epoch": 0.008362245351837026, + "grad_norm": 0.48840489983558655, + "learning_rate": 0.0008313609467455622, + "loss": 4.6776, + "step": 282 + }, + { + "epoch": 0.008391898704148503, + "grad_norm": 0.4946458041667938, + "learning_rate": 0.000834319526627219, + "loss": 4.6567, + "step": 283 + }, + { + "epoch": 0.008421552056459983, + "grad_norm": 0.45885196328163147, + "learning_rate": 0.0008372781065088757, + "loss": 4.6509, + "step": 284 + }, + { + "epoch": 0.008451205408771461, + "grad_norm": 0.5378486514091492, + "learning_rate": 0.0008402366863905325, + "loss": 4.6373, + "step": 285 + }, + { + "epoch": 0.00848085876108294, + "grad_norm": 0.5715571641921997, + "learning_rate": 0.0008431952662721893, + "loss": 4.616, + "step": 286 + }, + { + "epoch": 0.008510512113394418, + "grad_norm": 0.6133559346199036, + "learning_rate": 0.0008461538461538462, + "loss": 4.664, + "step": 287 + }, + { + "epoch": 0.008540165465705898, + "grad_norm": 0.5868294835090637, + "learning_rate": 0.000849112426035503, + "loss": 4.6162, + "step": 288 + }, + { + "epoch": 0.008569818818017378, + "grad_norm": 0.5616887807846069, + "learning_rate": 0.0008520710059171598, + "loss": 4.6238, + "step": 289 + }, + { + "epoch": 0.008599472170328856, + "grad_norm": 0.6396180391311646, + "learning_rate": 0.0008550295857988167, + "loss": 4.6219, + "step": 290 + }, + { + "epoch": 0.008629125522640335, + "grad_norm": 0.7806535959243774, + "learning_rate": 0.0008579881656804734, + "loss": 4.6619, + "step": 291 + }, + { + "epoch": 0.008658778874951813, + "grad_norm": 0.8013942241668701, + "learning_rate": 0.0008609467455621302, + "loss": 4.6386, + "step": 292 + }, + { + "epoch": 0.008688432227263293, + "grad_norm": 0.750120222568512, + "learning_rate": 0.0008639053254437869, + "loss": 4.6272, + "step": 293 + }, + { + "epoch": 0.00871808557957477, + "grad_norm": 0.7249065637588501, + "learning_rate": 0.0008668639053254438, + "loss": 4.6258, + "step": 294 + }, + { + "epoch": 0.00874773893188625, + "grad_norm": 0.7301360368728638, + "learning_rate": 0.0008698224852071006, + "loss": 4.636, + "step": 295 + }, + { + "epoch": 0.008777392284197728, + "grad_norm": 0.5972232818603516, + "learning_rate": 0.0008727810650887574, + "loss": 4.6312, + "step": 296 + }, + { + "epoch": 0.008807045636509208, + "grad_norm": 0.5974375009536743, + "learning_rate": 0.0008757396449704143, + "loss": 4.577, + "step": 297 + }, + { + "epoch": 0.008836698988820686, + "grad_norm": 0.545864999294281, + "learning_rate": 0.0008786982248520711, + "loss": 4.5689, + "step": 298 + }, + { + "epoch": 0.008866352341132165, + "grad_norm": 0.6520168781280518, + "learning_rate": 0.0008816568047337278, + "loss": 4.5529, + "step": 299 + }, + { + "epoch": 0.008896005693443643, + "grad_norm": 0.539908766746521, + "learning_rate": 0.0008846153846153846, + "loss": 4.5798, + "step": 300 + }, + { + "epoch": 0.008925659045755123, + "grad_norm": 0.4805278182029724, + "learning_rate": 0.0008875739644970414, + "loss": 4.5817, + "step": 301 + }, + { + "epoch": 0.008955312398066602, + "grad_norm": 0.4713899791240692, + "learning_rate": 0.0008905325443786982, + "loss": 4.5202, + "step": 302 + }, + { + "epoch": 0.00898496575037808, + "grad_norm": 0.45630621910095215, + "learning_rate": 0.0008934911242603551, + "loss": 4.5193, + "step": 303 + }, + { + "epoch": 0.00901461910268956, + "grad_norm": 0.34017252922058105, + "learning_rate": 0.0008964497041420119, + "loss": 4.4951, + "step": 304 + }, + { + "epoch": 0.009044272455001038, + "grad_norm": 0.34548866748809814, + "learning_rate": 0.0008994082840236687, + "loss": 4.4885, + "step": 305 + }, + { + "epoch": 0.009073925807312517, + "grad_norm": 0.3547917604446411, + "learning_rate": 0.0009023668639053254, + "loss": 4.4341, + "step": 306 + }, + { + "epoch": 0.009103579159623995, + "grad_norm": 0.388673335313797, + "learning_rate": 0.0009053254437869822, + "loss": 4.4796, + "step": 307 + }, + { + "epoch": 0.009133232511935475, + "grad_norm": 0.3788732886314392, + "learning_rate": 0.0009082840236686391, + "loss": 4.4058, + "step": 308 + }, + { + "epoch": 0.009162885864246953, + "grad_norm": 0.3419938087463379, + "learning_rate": 0.0009112426035502958, + "loss": 4.4378, + "step": 309 + }, + { + "epoch": 0.009192539216558432, + "grad_norm": 0.3302023112773895, + "learning_rate": 0.0009142011834319527, + "loss": 4.4298, + "step": 310 + }, + { + "epoch": 0.00922219256886991, + "grad_norm": 0.4236707389354706, + "learning_rate": 0.0009171597633136096, + "loss": 4.4458, + "step": 311 + }, + { + "epoch": 0.00925184592118139, + "grad_norm": 0.5189725160598755, + "learning_rate": 0.0009201183431952663, + "loss": 4.4099, + "step": 312 + }, + { + "epoch": 0.009281499273492868, + "grad_norm": 0.5668390989303589, + "learning_rate": 0.0009230769230769232, + "loss": 4.4394, + "step": 313 + }, + { + "epoch": 0.009311152625804347, + "grad_norm": 0.5991911292076111, + "learning_rate": 0.0009260355029585798, + "loss": 4.4025, + "step": 314 + }, + { + "epoch": 0.009340805978115827, + "grad_norm": 0.5497250556945801, + "learning_rate": 0.0009289940828402367, + "loss": 4.4051, + "step": 315 + }, + { + "epoch": 0.009370459330427305, + "grad_norm": 0.6297529935836792, + "learning_rate": 0.0009319526627218935, + "loss": 4.4283, + "step": 316 + }, + { + "epoch": 0.009400112682738784, + "grad_norm": 0.7859067320823669, + "learning_rate": 0.0009349112426035503, + "loss": 4.4628, + "step": 317 + }, + { + "epoch": 0.009429766035050262, + "grad_norm": 0.5061233043670654, + "learning_rate": 0.0009378698224852071, + "loss": 4.361, + "step": 318 + }, + { + "epoch": 0.009459419387361742, + "grad_norm": 0.437327116727829, + "learning_rate": 0.000940828402366864, + "loss": 4.4164, + "step": 319 + }, + { + "epoch": 0.00948907273967322, + "grad_norm": 0.7876100540161133, + "learning_rate": 0.0009437869822485208, + "loss": 4.4288, + "step": 320 + }, + { + "epoch": 0.0095187260919847, + "grad_norm": 0.7652246356010437, + "learning_rate": 0.0009467455621301775, + "loss": 4.4012, + "step": 321 + }, + { + "epoch": 0.009548379444296177, + "grad_norm": 0.4528758227825165, + "learning_rate": 0.0009497041420118343, + "loss": 4.4148, + "step": 322 + }, + { + "epoch": 0.009578032796607657, + "grad_norm": 0.4749153256416321, + "learning_rate": 0.0009526627218934911, + "loss": 4.4077, + "step": 323 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 0.5407995581626892, + "learning_rate": 0.000955621301775148, + "loss": 4.418, + "step": 324 + }, + { + "epoch": 0.009637339501230614, + "grad_norm": 0.6529463529586792, + "learning_rate": 0.0009585798816568047, + "loss": 4.3621, + "step": 325 + }, + { + "epoch": 0.009666992853542092, + "grad_norm": 0.5218048095703125, + "learning_rate": 0.0009615384615384616, + "loss": 4.3741, + "step": 326 + }, + { + "epoch": 0.009696646205853572, + "grad_norm": 0.3879440426826477, + "learning_rate": 0.0009644970414201185, + "loss": 4.3926, + "step": 327 + }, + { + "epoch": 0.009726299558165051, + "grad_norm": 0.39946484565734863, + "learning_rate": 0.0009674556213017751, + "loss": 4.335, + "step": 328 + }, + { + "epoch": 0.00975595291047653, + "grad_norm": 0.4860920310020447, + "learning_rate": 0.000970414201183432, + "loss": 4.3057, + "step": 329 + }, + { + "epoch": 0.009785606262788009, + "grad_norm": 0.4595347046852112, + "learning_rate": 0.0009733727810650887, + "loss": 4.3327, + "step": 330 + }, + { + "epoch": 0.009815259615099487, + "grad_norm": 0.388714462518692, + "learning_rate": 0.0009763313609467456, + "loss": 4.3247, + "step": 331 + }, + { + "epoch": 0.009844912967410966, + "grad_norm": 0.38758039474487305, + "learning_rate": 0.0009792899408284023, + "loss": 4.2897, + "step": 332 + }, + { + "epoch": 0.009874566319722444, + "grad_norm": 0.4103866219520569, + "learning_rate": 0.0009822485207100593, + "loss": 4.33, + "step": 333 + }, + { + "epoch": 0.009904219672033924, + "grad_norm": 0.40301570296287537, + "learning_rate": 0.000985207100591716, + "loss": 4.3023, + "step": 334 + }, + { + "epoch": 0.009933873024345402, + "grad_norm": 0.36728739738464355, + "learning_rate": 0.0009881656804733728, + "loss": 4.3093, + "step": 335 + }, + { + "epoch": 0.009963526376656881, + "grad_norm": 0.4135468006134033, + "learning_rate": 0.0009911242603550295, + "loss": 4.3077, + "step": 336 + }, + { + "epoch": 0.00999317972896836, + "grad_norm": 0.48888441920280457, + "learning_rate": 0.0009940828402366863, + "loss": 4.3097, + "step": 337 + }, + { + "epoch": 0.010022833081279839, + "grad_norm": 0.47122764587402344, + "learning_rate": 0.0009970414201183433, + "loss": 4.3081, + "step": 338 + }, + { + "epoch": 0.010052486433591317, + "grad_norm": 0.3885596692562103, + "learning_rate": 0.001, + "loss": 4.2322, + "step": 339 + }, + { + "epoch": 0.010082139785902796, + "grad_norm": 0.3630632162094116, + "learning_rate": 0.000999999997786207, + "loss": 4.3233, + "step": 340 + }, + { + "epoch": 0.010111793138214276, + "grad_norm": 0.44410473108291626, + "learning_rate": 0.0009999999911448282, + "loss": 4.309, + "step": 341 + }, + { + "epoch": 0.010141446490525754, + "grad_norm": 0.4832232892513275, + "learning_rate": 0.000999999980075864, + "loss": 4.2814, + "step": 342 + }, + { + "epoch": 0.010171099842837233, + "grad_norm": 0.45927876234054565, + "learning_rate": 0.0009999999645793139, + "loss": 4.2859, + "step": 343 + }, + { + "epoch": 0.010200753195148711, + "grad_norm": 0.5312002897262573, + "learning_rate": 0.0009999999446551782, + "loss": 4.3059, + "step": 344 + }, + { + "epoch": 0.010230406547460191, + "grad_norm": 0.5583577752113342, + "learning_rate": 0.0009999999203034573, + "loss": 4.3169, + "step": 345 + }, + { + "epoch": 0.010260059899771669, + "grad_norm": 0.4482254087924957, + "learning_rate": 0.000999999891524151, + "loss": 4.3291, + "step": 346 + }, + { + "epoch": 0.010289713252083148, + "grad_norm": 0.543820321559906, + "learning_rate": 0.0009999998583172603, + "loss": 4.2843, + "step": 347 + }, + { + "epoch": 0.010319366604394626, + "grad_norm": 0.46985533833503723, + "learning_rate": 0.0009999998206827846, + "loss": 4.2753, + "step": 348 + }, + { + "epoch": 0.010349019956706106, + "grad_norm": 0.4635527431964874, + "learning_rate": 0.000999999778620725, + "loss": 4.2788, + "step": 349 + }, + { + "epoch": 0.010378673309017584, + "grad_norm": 0.4735933840274811, + "learning_rate": 0.0009999997321310814, + "loss": 4.2535, + "step": 350 + }, + { + "epoch": 0.010408326661329063, + "grad_norm": 0.43246567249298096, + "learning_rate": 0.0009999996812138543, + "loss": 4.2144, + "step": 351 + }, + { + "epoch": 0.010437980013640541, + "grad_norm": 0.33642444014549255, + "learning_rate": 0.0009999996258690442, + "loss": 4.2077, + "step": 352 + }, + { + "epoch": 0.010467633365952021, + "grad_norm": 0.3229968845844269, + "learning_rate": 0.0009999995660966517, + "loss": 4.2289, + "step": 353 + }, + { + "epoch": 0.0104972867182635, + "grad_norm": 0.3496500253677368, + "learning_rate": 0.0009999995018966771, + "loss": 4.2228, + "step": 354 + }, + { + "epoch": 0.010526940070574978, + "grad_norm": 0.3779259920120239, + "learning_rate": 0.0009999994332691212, + "loss": 4.2408, + "step": 355 + }, + { + "epoch": 0.010556593422886458, + "grad_norm": 0.43291381001472473, + "learning_rate": 0.0009999993602139846, + "loss": 4.2204, + "step": 356 + }, + { + "epoch": 0.010586246775197936, + "grad_norm": 0.3747459650039673, + "learning_rate": 0.0009999992827312675, + "loss": 4.2021, + "step": 357 + }, + { + "epoch": 0.010615900127509416, + "grad_norm": 0.37806546688079834, + "learning_rate": 0.000999999200820971, + "loss": 4.1751, + "step": 358 + }, + { + "epoch": 0.010645553479820893, + "grad_norm": 0.34713518619537354, + "learning_rate": 0.000999999114483096, + "loss": 4.1711, + "step": 359 + }, + { + "epoch": 0.010675206832132373, + "grad_norm": 0.3445892333984375, + "learning_rate": 0.0009999990237176428, + "loss": 4.1802, + "step": 360 + }, + { + "epoch": 0.010704860184443851, + "grad_norm": 0.3411167860031128, + "learning_rate": 0.0009999989285246124, + "loss": 4.1815, + "step": 361 + }, + { + "epoch": 0.01073451353675533, + "grad_norm": 0.29926395416259766, + "learning_rate": 0.0009999988289040058, + "loss": 4.1541, + "step": 362 + }, + { + "epoch": 0.010764166889066808, + "grad_norm": 0.2519354820251465, + "learning_rate": 0.0009999987248558238, + "loss": 4.1446, + "step": 363 + }, + { + "epoch": 0.010793820241378288, + "grad_norm": 0.335952490568161, + "learning_rate": 0.0009999986163800672, + "loss": 4.1206, + "step": 364 + }, + { + "epoch": 0.010823473593689766, + "grad_norm": 0.3274226486682892, + "learning_rate": 0.0009999985034767369, + "loss": 4.1414, + "step": 365 + }, + { + "epoch": 0.010853126946001245, + "grad_norm": 0.3223492503166199, + "learning_rate": 0.0009999983861458343, + "loss": 4.1183, + "step": 366 + }, + { + "epoch": 0.010882780298312725, + "grad_norm": 0.34734249114990234, + "learning_rate": 0.0009999982643873599, + "loss": 4.1349, + "step": 367 + }, + { + "epoch": 0.010912433650624203, + "grad_norm": 0.35300305485725403, + "learning_rate": 0.0009999981382013152, + "loss": 4.1216, + "step": 368 + }, + { + "epoch": 0.010942087002935683, + "grad_norm": 0.3388669490814209, + "learning_rate": 0.000999998007587701, + "loss": 4.1228, + "step": 369 + }, + { + "epoch": 0.01097174035524716, + "grad_norm": 0.36457380652427673, + "learning_rate": 0.0009999978725465188, + "loss": 4.0818, + "step": 370 + }, + { + "epoch": 0.01100139370755864, + "grad_norm": 0.34127798676490784, + "learning_rate": 0.0009999977330777694, + "loss": 4.1504, + "step": 371 + }, + { + "epoch": 0.011031047059870118, + "grad_norm": 0.4581395089626312, + "learning_rate": 0.0009999975891814546, + "loss": 4.1242, + "step": 372 + }, + { + "epoch": 0.011060700412181598, + "grad_norm": 0.37697634100914, + "learning_rate": 0.000999997440857575, + "loss": 4.1027, + "step": 373 + }, + { + "epoch": 0.011090353764493075, + "grad_norm": 0.4089246988296509, + "learning_rate": 0.0009999972881061323, + "loss": 4.1107, + "step": 374 + }, + { + "epoch": 0.011120007116804555, + "grad_norm": 0.39859819412231445, + "learning_rate": 0.000999997130927128, + "loss": 4.1048, + "step": 375 + }, + { + "epoch": 0.011149660469116033, + "grad_norm": 0.31920474767684937, + "learning_rate": 0.000999996969320563, + "loss": 4.1068, + "step": 376 + }, + { + "epoch": 0.011179313821427513, + "grad_norm": 0.39359673857688904, + "learning_rate": 0.000999996803286439, + "loss": 4.1212, + "step": 377 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 0.3296069800853729, + "learning_rate": 0.0009999966328247578, + "loss": 4.0498, + "step": 378 + }, + { + "epoch": 0.01123862052605047, + "grad_norm": 0.3492998778820038, + "learning_rate": 0.0009999964579355202, + "loss": 4.0865, + "step": 379 + }, + { + "epoch": 0.01126827387836195, + "grad_norm": 0.39139166474342346, + "learning_rate": 0.0009999962786187285, + "loss": 4.0543, + "step": 380 + }, + { + "epoch": 0.011297927230673428, + "grad_norm": 0.3117806613445282, + "learning_rate": 0.000999996094874384, + "loss": 4.0549, + "step": 381 + }, + { + "epoch": 0.011327580582984907, + "grad_norm": 0.3036748468875885, + "learning_rate": 0.0009999959067024879, + "loss": 4.0766, + "step": 382 + }, + { + "epoch": 0.011357233935296385, + "grad_norm": 0.31403645873069763, + "learning_rate": 0.0009999957141030422, + "loss": 4.102, + "step": 383 + }, + { + "epoch": 0.011386887287607865, + "grad_norm": 0.30810773372650146, + "learning_rate": 0.000999995517076049, + "loss": 4.0618, + "step": 384 + }, + { + "epoch": 0.011416540639919343, + "grad_norm": 0.4018454849720001, + "learning_rate": 0.0009999953156215094, + "loss": 4.056, + "step": 385 + }, + { + "epoch": 0.011446193992230822, + "grad_norm": 0.3815452754497528, + "learning_rate": 0.0009999951097394255, + "loss": 4.0614, + "step": 386 + }, + { + "epoch": 0.0114758473445423, + "grad_norm": 0.3863217532634735, + "learning_rate": 0.0009999948994297992, + "loss": 4.0916, + "step": 387 + }, + { + "epoch": 0.01150550069685378, + "grad_norm": 0.3632625341415405, + "learning_rate": 0.000999994684692632, + "loss": 4.0909, + "step": 388 + }, + { + "epoch": 0.011535154049165258, + "grad_norm": 0.507134735584259, + "learning_rate": 0.0009999944655279262, + "loss": 4.0761, + "step": 389 + }, + { + "epoch": 0.011564807401476737, + "grad_norm": 0.5266185998916626, + "learning_rate": 0.0009999942419356837, + "loss": 4.0945, + "step": 390 + }, + { + "epoch": 0.011594460753788215, + "grad_norm": 0.41423657536506653, + "learning_rate": 0.0009999940139159061, + "loss": 4.0592, + "step": 391 + }, + { + "epoch": 0.011624114106099695, + "grad_norm": 0.413772851228714, + "learning_rate": 0.0009999937814685958, + "loss": 4.0574, + "step": 392 + }, + { + "epoch": 0.011653767458411174, + "grad_norm": 0.3069066107273102, + "learning_rate": 0.0009999935445937549, + "loss": 4.0402, + "step": 393 + }, + { + "epoch": 0.011683420810722652, + "grad_norm": 0.37199148535728455, + "learning_rate": 0.000999993303291385, + "loss": 4.0708, + "step": 394 + }, + { + "epoch": 0.011713074163034132, + "grad_norm": 0.33394595980644226, + "learning_rate": 0.000999993057561489, + "loss": 4.0186, + "step": 395 + }, + { + "epoch": 0.01174272751534561, + "grad_norm": 0.3209940493106842, + "learning_rate": 0.0009999928074040682, + "loss": 4.0334, + "step": 396 + }, + { + "epoch": 0.01177238086765709, + "grad_norm": 0.34881770610809326, + "learning_rate": 0.0009999925528191257, + "loss": 4.0102, + "step": 397 + }, + { + "epoch": 0.011802034219968567, + "grad_norm": 0.26982295513153076, + "learning_rate": 0.000999992293806663, + "loss": 4.0241, + "step": 398 + }, + { + "epoch": 0.011831687572280047, + "grad_norm": 0.2530822455883026, + "learning_rate": 0.000999992030366683, + "loss": 4.0153, + "step": 399 + }, + { + "epoch": 0.011861340924591525, + "grad_norm": 0.24095529317855835, + "learning_rate": 0.0009999917624991875, + "loss": 3.9947, + "step": 400 + }, + { + "epoch": 0.011890994276903004, + "grad_norm": 0.244659885764122, + "learning_rate": 0.0009999914902041793, + "loss": 3.974, + "step": 401 + }, + { + "epoch": 0.011920647629214482, + "grad_norm": 0.22100523114204407, + "learning_rate": 0.0009999912134816605, + "loss": 3.9644, + "step": 402 + }, + { + "epoch": 0.011950300981525962, + "grad_norm": 0.266331285238266, + "learning_rate": 0.0009999909323316336, + "loss": 4.0225, + "step": 403 + }, + { + "epoch": 0.01197995433383744, + "grad_norm": 0.3116631507873535, + "learning_rate": 0.0009999906467541013, + "loss": 4.0115, + "step": 404 + }, + { + "epoch": 0.01200960768614892, + "grad_norm": 0.34060949087142944, + "learning_rate": 0.000999990356749066, + "loss": 4.0152, + "step": 405 + }, + { + "epoch": 0.012039261038460399, + "grad_norm": 0.32798731327056885, + "learning_rate": 0.0009999900623165303, + "loss": 4.0054, + "step": 406 + }, + { + "epoch": 0.012068914390771877, + "grad_norm": 0.3266982436180115, + "learning_rate": 0.0009999897634564968, + "loss": 4.0033, + "step": 407 + }, + { + "epoch": 0.012098567743083356, + "grad_norm": 0.36435413360595703, + "learning_rate": 0.000999989460168968, + "loss": 3.9775, + "step": 408 + }, + { + "epoch": 0.012128221095394834, + "grad_norm": 0.3328779637813568, + "learning_rate": 0.0009999891524539468, + "loss": 3.9872, + "step": 409 + }, + { + "epoch": 0.012157874447706314, + "grad_norm": 0.29516205191612244, + "learning_rate": 0.000999988840311436, + "loss": 4.0072, + "step": 410 + }, + { + "epoch": 0.012187527800017792, + "grad_norm": 0.27762463688850403, + "learning_rate": 0.0009999885237414379, + "loss": 3.992, + "step": 411 + }, + { + "epoch": 0.012217181152329271, + "grad_norm": 0.315625935792923, + "learning_rate": 0.0009999882027439556, + "loss": 3.9895, + "step": 412 + }, + { + "epoch": 0.01224683450464075, + "grad_norm": 0.24397866427898407, + "learning_rate": 0.0009999878773189921, + "loss": 3.9249, + "step": 413 + }, + { + "epoch": 0.012276487856952229, + "grad_norm": 0.2418304681777954, + "learning_rate": 0.00099998754746655, + "loss": 3.9672, + "step": 414 + }, + { + "epoch": 0.012306141209263707, + "grad_norm": 0.26081135869026184, + "learning_rate": 0.0009999872131866323, + "loss": 3.9677, + "step": 415 + }, + { + "epoch": 0.012335794561575186, + "grad_norm": 0.31213095784187317, + "learning_rate": 0.0009999868744792423, + "loss": 4.0018, + "step": 416 + }, + { + "epoch": 0.012365447913886664, + "grad_norm": 0.31175461411476135, + "learning_rate": 0.0009999865313443826, + "loss": 3.9538, + "step": 417 + }, + { + "epoch": 0.012395101266198144, + "grad_norm": 0.30256664752960205, + "learning_rate": 0.000999986183782056, + "loss": 3.9514, + "step": 418 + }, + { + "epoch": 0.012424754618509622, + "grad_norm": 0.26889145374298096, + "learning_rate": 0.0009999858317922663, + "loss": 3.9564, + "step": 419 + }, + { + "epoch": 0.012454407970821101, + "grad_norm": 0.37286466360092163, + "learning_rate": 0.0009999854753750162, + "loss": 3.9739, + "step": 420 + }, + { + "epoch": 0.012484061323132581, + "grad_norm": 0.3899266719818115, + "learning_rate": 0.0009999851145303087, + "loss": 3.9552, + "step": 421 + }, + { + "epoch": 0.012513714675444059, + "grad_norm": 0.32250159978866577, + "learning_rate": 0.0009999847492581475, + "loss": 3.9493, + "step": 422 + }, + { + "epoch": 0.012543368027755538, + "grad_norm": 0.33498579263687134, + "learning_rate": 0.0009999843795585352, + "loss": 3.9636, + "step": 423 + }, + { + "epoch": 0.012573021380067016, + "grad_norm": 0.3080291748046875, + "learning_rate": 0.0009999840054314756, + "loss": 3.9452, + "step": 424 + }, + { + "epoch": 0.012602674732378496, + "grad_norm": 0.34795457124710083, + "learning_rate": 0.0009999836268769719, + "loss": 3.9556, + "step": 425 + }, + { + "epoch": 0.012632328084689974, + "grad_norm": 0.3188399076461792, + "learning_rate": 0.000999983243895027, + "loss": 3.9813, + "step": 426 + }, + { + "epoch": 0.012661981437001453, + "grad_norm": 0.32311245799064636, + "learning_rate": 0.000999982856485645, + "loss": 3.9527, + "step": 427 + }, + { + "epoch": 0.012691634789312931, + "grad_norm": 0.3047342002391815, + "learning_rate": 0.0009999824646488287, + "loss": 3.9593, + "step": 428 + }, + { + "epoch": 0.01272128814162441, + "grad_norm": 0.32210269570350647, + "learning_rate": 0.000999982068384582, + "loss": 3.932, + "step": 429 + }, + { + "epoch": 0.012750941493935889, + "grad_norm": 0.3688957095146179, + "learning_rate": 0.0009999816676929084, + "loss": 3.9573, + "step": 430 + }, + { + "epoch": 0.012780594846247368, + "grad_norm": 0.32929930090904236, + "learning_rate": 0.000999981262573811, + "loss": 3.9548, + "step": 431 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 0.2795000374317169, + "learning_rate": 0.0009999808530272941, + "loss": 3.983, + "step": 432 + }, + { + "epoch": 0.012839901550870326, + "grad_norm": 0.2821560800075531, + "learning_rate": 0.0009999804390533606, + "loss": 3.9289, + "step": 433 + }, + { + "epoch": 0.012869554903181805, + "grad_norm": 0.27201077342033386, + "learning_rate": 0.0009999800206520145, + "loss": 3.975, + "step": 434 + }, + { + "epoch": 0.012899208255493283, + "grad_norm": 0.23615828156471252, + "learning_rate": 0.0009999795978232597, + "loss": 3.9341, + "step": 435 + }, + { + "epoch": 0.012928861607804763, + "grad_norm": 0.2428130805492401, + "learning_rate": 0.0009999791705670995, + "loss": 3.9168, + "step": 436 + }, + { + "epoch": 0.01295851496011624, + "grad_norm": 0.2524501085281372, + "learning_rate": 0.0009999787388835382, + "loss": 3.9121, + "step": 437 + }, + { + "epoch": 0.01298816831242772, + "grad_norm": 0.269569456577301, + "learning_rate": 0.000999978302772579, + "loss": 3.9229, + "step": 438 + }, + { + "epoch": 0.013017821664739198, + "grad_norm": 0.24865923821926117, + "learning_rate": 0.0009999778622342263, + "loss": 3.9011, + "step": 439 + }, + { + "epoch": 0.013047475017050678, + "grad_norm": 0.213860884308815, + "learning_rate": 0.0009999774172684839, + "loss": 3.9078, + "step": 440 + }, + { + "epoch": 0.013077128369362156, + "grad_norm": 0.2657599151134491, + "learning_rate": 0.0009999769678753557, + "loss": 3.9111, + "step": 441 + }, + { + "epoch": 0.013106781721673635, + "grad_norm": 0.2759740650653839, + "learning_rate": 0.0009999765140548454, + "loss": 3.8839, + "step": 442 + }, + { + "epoch": 0.013136435073985113, + "grad_norm": 0.23813004791736603, + "learning_rate": 0.0009999760558069572, + "loss": 3.89, + "step": 443 + }, + { + "epoch": 0.013166088426296593, + "grad_norm": 0.32032710313796997, + "learning_rate": 0.0009999755931316954, + "loss": 3.8861, + "step": 444 + }, + { + "epoch": 0.01319574177860807, + "grad_norm": 0.33505979180336, + "learning_rate": 0.0009999751260290639, + "loss": 3.9123, + "step": 445 + }, + { + "epoch": 0.01322539513091955, + "grad_norm": 0.30502399802207947, + "learning_rate": 0.000999974654499067, + "loss": 3.8946, + "step": 446 + }, + { + "epoch": 0.01325504848323103, + "grad_norm": 0.27008387446403503, + "learning_rate": 0.0009999741785417084, + "loss": 3.8976, + "step": 447 + }, + { + "epoch": 0.013284701835542508, + "grad_norm": 0.34254416823387146, + "learning_rate": 0.0009999736981569926, + "loss": 3.8921, + "step": 448 + }, + { + "epoch": 0.013314355187853988, + "grad_norm": 0.36756524443626404, + "learning_rate": 0.0009999732133449241, + "loss": 3.9058, + "step": 449 + }, + { + "epoch": 0.013344008540165465, + "grad_norm": 0.283363938331604, + "learning_rate": 0.000999972724105507, + "loss": 3.8904, + "step": 450 + }, + { + "epoch": 0.013373661892476945, + "grad_norm": 0.3547140657901764, + "learning_rate": 0.0009999722304387456, + "loss": 3.8796, + "step": 451 + }, + { + "epoch": 0.013403315244788423, + "grad_norm": 0.3190770149230957, + "learning_rate": 0.000999971732344644, + "loss": 3.8896, + "step": 452 + }, + { + "epoch": 0.013432968597099902, + "grad_norm": 0.24599382281303406, + "learning_rate": 0.000999971229823207, + "loss": 3.8804, + "step": 453 + }, + { + "epoch": 0.01346262194941138, + "grad_norm": 0.22928452491760254, + "learning_rate": 0.0009999707228744391, + "loss": 3.8445, + "step": 454 + }, + { + "epoch": 0.01349227530172286, + "grad_norm": 0.22883613407611847, + "learning_rate": 0.0009999702114983446, + "loss": 3.8747, + "step": 455 + }, + { + "epoch": 0.013521928654034338, + "grad_norm": 0.24374333024024963, + "learning_rate": 0.000999969695694928, + "loss": 3.82, + "step": 456 + }, + { + "epoch": 0.013551582006345817, + "grad_norm": 0.23452125489711761, + "learning_rate": 0.000999969175464194, + "loss": 3.8603, + "step": 457 + }, + { + "epoch": 0.013581235358657295, + "grad_norm": 0.21761329472064972, + "learning_rate": 0.000999968650806147, + "loss": 3.8146, + "step": 458 + }, + { + "epoch": 0.013610888710968775, + "grad_norm": 0.19563403725624084, + "learning_rate": 0.0009999681217207918, + "loss": 3.8562, + "step": 459 + }, + { + "epoch": 0.013640542063280255, + "grad_norm": 0.23846015334129333, + "learning_rate": 0.000999967588208133, + "loss": 3.8801, + "step": 460 + }, + { + "epoch": 0.013670195415591732, + "grad_norm": 0.2601345479488373, + "learning_rate": 0.0009999670502681757, + "loss": 3.8125, + "step": 461 + }, + { + "epoch": 0.013699848767903212, + "grad_norm": 0.22394421696662903, + "learning_rate": 0.0009999665079009241, + "loss": 3.8476, + "step": 462 + }, + { + "epoch": 0.01372950212021469, + "grad_norm": 0.2297196239233017, + "learning_rate": 0.0009999659611063833, + "loss": 3.8563, + "step": 463 + }, + { + "epoch": 0.01375915547252617, + "grad_norm": 0.2883220911026001, + "learning_rate": 0.0009999654098845582, + "loss": 3.853, + "step": 464 + }, + { + "epoch": 0.013788808824837647, + "grad_norm": 0.3504149317741394, + "learning_rate": 0.0009999648542354533, + "loss": 3.8481, + "step": 465 + }, + { + "epoch": 0.013818462177149127, + "grad_norm": 0.399930477142334, + "learning_rate": 0.0009999642941590742, + "loss": 3.8648, + "step": 466 + }, + { + "epoch": 0.013848115529460605, + "grad_norm": 0.4531044363975525, + "learning_rate": 0.0009999637296554253, + "loss": 3.8823, + "step": 467 + }, + { + "epoch": 0.013877768881772085, + "grad_norm": 0.40932008624076843, + "learning_rate": 0.0009999631607245114, + "loss": 3.8876, + "step": 468 + }, + { + "epoch": 0.013907422234083562, + "grad_norm": 0.25553250312805176, + "learning_rate": 0.0009999625873663384, + "loss": 3.8683, + "step": 469 + }, + { + "epoch": 0.013937075586395042, + "grad_norm": 0.27867501974105835, + "learning_rate": 0.0009999620095809106, + "loss": 3.8781, + "step": 470 + }, + { + "epoch": 0.01396672893870652, + "grad_norm": 0.274774968624115, + "learning_rate": 0.0009999614273682334, + "loss": 3.8049, + "step": 471 + }, + { + "epoch": 0.013996382291018, + "grad_norm": 0.3039705455303192, + "learning_rate": 0.000999960840728312, + "loss": 3.8154, + "step": 472 + }, + { + "epoch": 0.01402603564332948, + "grad_norm": 0.22611534595489502, + "learning_rate": 0.0009999602496611516, + "loss": 3.8243, + "step": 473 + }, + { + "epoch": 0.014055688995640957, + "grad_norm": 0.23899929225444794, + "learning_rate": 0.0009999596541667574, + "loss": 3.8002, + "step": 474 + }, + { + "epoch": 0.014085342347952437, + "grad_norm": 0.2625550925731659, + "learning_rate": 0.0009999590542451343, + "loss": 3.8363, + "step": 475 + }, + { + "epoch": 0.014114995700263915, + "grad_norm": 0.2763436734676361, + "learning_rate": 0.0009999584498962882, + "loss": 3.8236, + "step": 476 + }, + { + "epoch": 0.014144649052575394, + "grad_norm": 0.25259020924568176, + "learning_rate": 0.0009999578411202244, + "loss": 3.7888, + "step": 477 + }, + { + "epoch": 0.014174302404886872, + "grad_norm": 0.2547082006931305, + "learning_rate": 0.0009999572279169478, + "loss": 3.7972, + "step": 478 + }, + { + "epoch": 0.014203955757198352, + "grad_norm": 0.22574612498283386, + "learning_rate": 0.0009999566102864641, + "loss": 3.7911, + "step": 479 + }, + { + "epoch": 0.01423360910950983, + "grad_norm": 0.19488640129566193, + "learning_rate": 0.000999955988228779, + "loss": 3.8333, + "step": 480 + }, + { + "epoch": 0.01426326246182131, + "grad_norm": 0.18093116581439972, + "learning_rate": 0.0009999553617438977, + "loss": 3.7644, + "step": 481 + }, + { + "epoch": 0.014292915814132787, + "grad_norm": 0.19148290157318115, + "learning_rate": 0.000999954730831826, + "loss": 3.7942, + "step": 482 + }, + { + "epoch": 0.014322569166444267, + "grad_norm": 0.20050373673439026, + "learning_rate": 0.0009999540954925693, + "loss": 3.7887, + "step": 483 + }, + { + "epoch": 0.014352222518755745, + "grad_norm": 0.2101365029811859, + "learning_rate": 0.000999953455726133, + "loss": 3.7702, + "step": 484 + }, + { + "epoch": 0.014381875871067224, + "grad_norm": 0.19698575139045715, + "learning_rate": 0.0009999528115325234, + "loss": 3.773, + "step": 485 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 0.1883394867181778, + "learning_rate": 0.0009999521629117456, + "loss": 3.7755, + "step": 486 + }, + { + "epoch": 0.014441182575690182, + "grad_norm": 0.20838291943073273, + "learning_rate": 0.0009999515098638057, + "loss": 3.7819, + "step": 487 + }, + { + "epoch": 0.014470835928001661, + "grad_norm": 0.2590768039226532, + "learning_rate": 0.0009999508523887094, + "loss": 3.7846, + "step": 488 + }, + { + "epoch": 0.014500489280313139, + "grad_norm": 0.2749667167663574, + "learning_rate": 0.0009999501904864624, + "loss": 3.7913, + "step": 489 + }, + { + "epoch": 0.014530142632624619, + "grad_norm": 0.25822436809539795, + "learning_rate": 0.0009999495241570706, + "loss": 3.7874, + "step": 490 + }, + { + "epoch": 0.014559795984936097, + "grad_norm": 0.2715834975242615, + "learning_rate": 0.0009999488534005402, + "loss": 3.7825, + "step": 491 + }, + { + "epoch": 0.014589449337247576, + "grad_norm": 0.27302712202072144, + "learning_rate": 0.0009999481782168767, + "loss": 3.779, + "step": 492 + }, + { + "epoch": 0.014619102689559054, + "grad_norm": 0.26999396085739136, + "learning_rate": 0.000999947498606086, + "loss": 3.7754, + "step": 493 + }, + { + "epoch": 0.014648756041870534, + "grad_norm": 0.2672169506549835, + "learning_rate": 0.0009999468145681749, + "loss": 3.7731, + "step": 494 + }, + { + "epoch": 0.014678409394182012, + "grad_norm": 0.2521427571773529, + "learning_rate": 0.0009999461261031486, + "loss": 3.8004, + "step": 495 + }, + { + "epoch": 0.014708062746493491, + "grad_norm": 0.2441594898700714, + "learning_rate": 0.0009999454332110136, + "loss": 3.7887, + "step": 496 + }, + { + "epoch": 0.014737716098804969, + "grad_norm": 0.27144956588745117, + "learning_rate": 0.0009999447358917761, + "loss": 3.7374, + "step": 497 + }, + { + "epoch": 0.014767369451116449, + "grad_norm": 0.21453015506267548, + "learning_rate": 0.000999944034145442, + "loss": 3.7395, + "step": 498 + }, + { + "epoch": 0.014797022803427928, + "grad_norm": 0.19485695660114288, + "learning_rate": 0.000999943327972018, + "loss": 3.7782, + "step": 499 + }, + { + "epoch": 0.014826676155739406, + "grad_norm": 0.27165359258651733, + "learning_rate": 0.0009999426173715096, + "loss": 3.7952, + "step": 500 + }, + { + "epoch": 0.014856329508050886, + "grad_norm": 0.2844657301902771, + "learning_rate": 0.0009999419023439236, + "loss": 3.7794, + "step": 501 + }, + { + "epoch": 0.014885982860362364, + "grad_norm": 0.25756537914276123, + "learning_rate": 0.000999941182889266, + "loss": 3.7769, + "step": 502 + }, + { + "epoch": 0.014915636212673843, + "grad_norm": 0.26872381567955017, + "learning_rate": 0.000999940459007544, + "loss": 3.7709, + "step": 503 + }, + { + "epoch": 0.014945289564985321, + "grad_norm": 0.25481945276260376, + "learning_rate": 0.000999939730698763, + "loss": 3.762, + "step": 504 + }, + { + "epoch": 0.0149749429172968, + "grad_norm": 0.26440247893333435, + "learning_rate": 0.00099993899796293, + "loss": 3.7467, + "step": 505 + }, + { + "epoch": 0.015004596269608279, + "grad_norm": 0.30881616473197937, + "learning_rate": 0.0009999382608000514, + "loss": 3.7398, + "step": 506 + }, + { + "epoch": 0.015034249621919758, + "grad_norm": 0.35646599531173706, + "learning_rate": 0.0009999375192101337, + "loss": 3.7906, + "step": 507 + }, + { + "epoch": 0.015063902974231236, + "grad_norm": 0.3305540978908539, + "learning_rate": 0.0009999367731931834, + "loss": 3.7778, + "step": 508 + }, + { + "epoch": 0.015093556326542716, + "grad_norm": 0.26614290475845337, + "learning_rate": 0.0009999360227492071, + "loss": 3.7616, + "step": 509 + }, + { + "epoch": 0.015123209678854194, + "grad_norm": 0.2304258793592453, + "learning_rate": 0.0009999352678782116, + "loss": 3.7877, + "step": 510 + }, + { + "epoch": 0.015152863031165673, + "grad_norm": 0.20387645065784454, + "learning_rate": 0.0009999345085802034, + "loss": 3.7215, + "step": 511 + }, + { + "epoch": 0.015182516383477153, + "grad_norm": 0.21086911857128143, + "learning_rate": 0.0009999337448551894, + "loss": 3.7539, + "step": 512 + }, + { + "epoch": 0.01521216973578863, + "grad_norm": 0.17748184502124786, + "learning_rate": 0.0009999329767031763, + "loss": 3.7637, + "step": 513 + }, + { + "epoch": 0.01524182308810011, + "grad_norm": 0.16422182321548462, + "learning_rate": 0.0009999322041241707, + "loss": 3.7196, + "step": 514 + }, + { + "epoch": 0.015271476440411588, + "grad_norm": 0.1480073481798172, + "learning_rate": 0.00099993142711818, + "loss": 3.7067, + "step": 515 + }, + { + "epoch": 0.015301129792723068, + "grad_norm": 0.15351277589797974, + "learning_rate": 0.0009999306456852102, + "loss": 3.7252, + "step": 516 + }, + { + "epoch": 0.015330783145034546, + "grad_norm": 0.1764651983976364, + "learning_rate": 0.000999929859825269, + "loss": 3.6703, + "step": 517 + }, + { + "epoch": 0.015360436497346025, + "grad_norm": 0.15416808426380157, + "learning_rate": 0.000999929069538363, + "loss": 3.7226, + "step": 518 + }, + { + "epoch": 0.015390089849657503, + "grad_norm": 0.14916902780532837, + "learning_rate": 0.0009999282748244993, + "loss": 3.6918, + "step": 519 + }, + { + "epoch": 0.015419743201968983, + "grad_norm": 0.1603958010673523, + "learning_rate": 0.000999927475683685, + "loss": 3.7068, + "step": 520 + }, + { + "epoch": 0.01544939655428046, + "grad_norm": 0.20115415751934052, + "learning_rate": 0.000999926672115927, + "loss": 3.7101, + "step": 521 + }, + { + "epoch": 0.01547904990659194, + "grad_norm": 0.24559257924556732, + "learning_rate": 0.0009999258641212325, + "loss": 3.7406, + "step": 522 + }, + { + "epoch": 0.015508703258903418, + "grad_norm": 0.26948732137680054, + "learning_rate": 0.0009999250516996088, + "loss": 3.7469, + "step": 523 + }, + { + "epoch": 0.015538356611214898, + "grad_norm": 0.2319377362728119, + "learning_rate": 0.0009999242348510628, + "loss": 3.7138, + "step": 524 + }, + { + "epoch": 0.015568009963526377, + "grad_norm": 0.24915127456188202, + "learning_rate": 0.000999923413575602, + "loss": 3.7168, + "step": 525 + }, + { + "epoch": 0.015597663315837855, + "grad_norm": 0.256888210773468, + "learning_rate": 0.0009999225878732335, + "loss": 3.7555, + "step": 526 + }, + { + "epoch": 0.015627316668149333, + "grad_norm": 0.21180899441242218, + "learning_rate": 0.0009999217577439645, + "loss": 3.7156, + "step": 527 + }, + { + "epoch": 0.015656970020460813, + "grad_norm": 0.2068791687488556, + "learning_rate": 0.0009999209231878027, + "loss": 3.725, + "step": 528 + }, + { + "epoch": 0.015686623372772292, + "grad_norm": 0.24547241628170013, + "learning_rate": 0.0009999200842047554, + "loss": 3.7321, + "step": 529 + }, + { + "epoch": 0.015716276725083772, + "grad_norm": 0.2785526216030121, + "learning_rate": 0.00099991924079483, + "loss": 3.6742, + "step": 530 + }, + { + "epoch": 0.015745930077395248, + "grad_norm": 0.23566105961799622, + "learning_rate": 0.0009999183929580338, + "loss": 3.7034, + "step": 531 + }, + { + "epoch": 0.015775583429706728, + "grad_norm": 0.2190236747264862, + "learning_rate": 0.0009999175406943744, + "loss": 3.7314, + "step": 532 + }, + { + "epoch": 0.015805236782018207, + "grad_norm": 0.23579105734825134, + "learning_rate": 0.0009999166840038592, + "loss": 3.7291, + "step": 533 + }, + { + "epoch": 0.015834890134329687, + "grad_norm": 0.24992312490940094, + "learning_rate": 0.0009999158228864962, + "loss": 3.7086, + "step": 534 + }, + { + "epoch": 0.015864543486641163, + "grad_norm": 0.24038289487361908, + "learning_rate": 0.0009999149573422926, + "loss": 3.6846, + "step": 535 + }, + { + "epoch": 0.015894196838952643, + "grad_norm": 0.2125399112701416, + "learning_rate": 0.0009999140873712565, + "loss": 3.7104, + "step": 536 + }, + { + "epoch": 0.015923850191264122, + "grad_norm": 0.22265447676181793, + "learning_rate": 0.000999913212973395, + "loss": 3.7082, + "step": 537 + }, + { + "epoch": 0.015953503543575602, + "grad_norm": 0.25653406977653503, + "learning_rate": 0.0009999123341487164, + "loss": 3.7356, + "step": 538 + }, + { + "epoch": 0.01598315689588708, + "grad_norm": 0.31217753887176514, + "learning_rate": 0.0009999114508972282, + "loss": 3.7196, + "step": 539 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 0.2728928029537201, + "learning_rate": 0.0009999105632189384, + "loss": 3.7033, + "step": 540 + }, + { + "epoch": 0.016042463600510037, + "grad_norm": 0.24674677848815918, + "learning_rate": 0.0009999096711138546, + "loss": 3.7237, + "step": 541 + }, + { + "epoch": 0.016072116952821517, + "grad_norm": 0.28642773628234863, + "learning_rate": 0.000999908774581985, + "loss": 3.6748, + "step": 542 + }, + { + "epoch": 0.016101770305132997, + "grad_norm": 0.24537667632102966, + "learning_rate": 0.0009999078736233373, + "loss": 3.707, + "step": 543 + }, + { + "epoch": 0.016131423657444473, + "grad_norm": 0.21018287539482117, + "learning_rate": 0.0009999069682379198, + "loss": 3.688, + "step": 544 + }, + { + "epoch": 0.016161077009755952, + "grad_norm": 0.18476136028766632, + "learning_rate": 0.00099990605842574, + "loss": 3.653, + "step": 545 + }, + { + "epoch": 0.016190730362067432, + "grad_norm": 0.18661506474018097, + "learning_rate": 0.0009999051441868063, + "loss": 3.6882, + "step": 546 + }, + { + "epoch": 0.01622038371437891, + "grad_norm": 0.17967715859413147, + "learning_rate": 0.0009999042255211268, + "loss": 3.6939, + "step": 547 + }, + { + "epoch": 0.016250037066690388, + "grad_norm": 0.1502300351858139, + "learning_rate": 0.0009999033024287096, + "loss": 3.677, + "step": 548 + }, + { + "epoch": 0.016279690419001867, + "grad_norm": 0.12555305659770966, + "learning_rate": 0.0009999023749095628, + "loss": 3.6694, + "step": 549 + }, + { + "epoch": 0.016309343771313347, + "grad_norm": 0.14278331398963928, + "learning_rate": 0.000999901442963695, + "loss": 3.6511, + "step": 550 + }, + { + "epoch": 0.016338997123624827, + "grad_norm": 0.14297400414943695, + "learning_rate": 0.0009999005065911136, + "loss": 3.6568, + "step": 551 + }, + { + "epoch": 0.016368650475936306, + "grad_norm": 0.1496790200471878, + "learning_rate": 0.0009998995657918275, + "loss": 3.6876, + "step": 552 + }, + { + "epoch": 0.016398303828247782, + "grad_norm": 0.1896149218082428, + "learning_rate": 0.0009998986205658452, + "loss": 3.6522, + "step": 553 + }, + { + "epoch": 0.016427957180559262, + "grad_norm": 0.17618972063064575, + "learning_rate": 0.0009998976709131744, + "loss": 3.6692, + "step": 554 + }, + { + "epoch": 0.01645761053287074, + "grad_norm": 0.16191557049751282, + "learning_rate": 0.0009998967168338242, + "loss": 3.6577, + "step": 555 + }, + { + "epoch": 0.01648726388518222, + "grad_norm": 0.209746852517128, + "learning_rate": 0.0009998957583278027, + "loss": 3.675, + "step": 556 + }, + { + "epoch": 0.016516917237493697, + "grad_norm": 0.22835804522037506, + "learning_rate": 0.0009998947953951186, + "loss": 3.6805, + "step": 557 + }, + { + "epoch": 0.016546570589805177, + "grad_norm": 0.2651835083961487, + "learning_rate": 0.00099989382803578, + "loss": 3.6681, + "step": 558 + }, + { + "epoch": 0.016576223942116657, + "grad_norm": 0.28937003016471863, + "learning_rate": 0.000999892856249796, + "loss": 3.6798, + "step": 559 + }, + { + "epoch": 0.016605877294428136, + "grad_norm": 0.28462883830070496, + "learning_rate": 0.000999891880037175, + "loss": 3.6678, + "step": 560 + }, + { + "epoch": 0.016635530646739612, + "grad_norm": 0.21436423063278198, + "learning_rate": 0.0009998908993979255, + "loss": 3.6647, + "step": 561 + }, + { + "epoch": 0.016665183999051092, + "grad_norm": 0.21654073894023895, + "learning_rate": 0.0009998899143320561, + "loss": 3.6275, + "step": 562 + }, + { + "epoch": 0.01669483735136257, + "grad_norm": 0.26759687066078186, + "learning_rate": 0.0009998889248395758, + "loss": 3.678, + "step": 563 + }, + { + "epoch": 0.01672449070367405, + "grad_norm": 0.220538929104805, + "learning_rate": 0.0009998879309204933, + "loss": 3.6871, + "step": 564 + }, + { + "epoch": 0.01675414405598553, + "grad_norm": 0.21067017316818237, + "learning_rate": 0.0009998869325748175, + "loss": 3.6774, + "step": 565 + }, + { + "epoch": 0.016783797408297007, + "grad_norm": 0.22634175419807434, + "learning_rate": 0.0009998859298025568, + "loss": 3.645, + "step": 566 + }, + { + "epoch": 0.016813450760608487, + "grad_norm": 0.24742668867111206, + "learning_rate": 0.0009998849226037208, + "loss": 3.6461, + "step": 567 + }, + { + "epoch": 0.016843104112919966, + "grad_norm": 0.21713098883628845, + "learning_rate": 0.0009998839109783178, + "loss": 3.677, + "step": 568 + }, + { + "epoch": 0.016872757465231446, + "grad_norm": 0.19655868411064148, + "learning_rate": 0.0009998828949263567, + "loss": 3.6269, + "step": 569 + }, + { + "epoch": 0.016902410817542922, + "grad_norm": 0.20222264528274536, + "learning_rate": 0.000999881874447847, + "loss": 3.6681, + "step": 570 + }, + { + "epoch": 0.0169320641698544, + "grad_norm": 0.2676234543323517, + "learning_rate": 0.0009998808495427975, + "loss": 3.6509, + "step": 571 + }, + { + "epoch": 0.01696171752216588, + "grad_norm": 0.3072691857814789, + "learning_rate": 0.0009998798202112175, + "loss": 3.6502, + "step": 572 + }, + { + "epoch": 0.01699137087447736, + "grad_norm": 0.2396077811717987, + "learning_rate": 0.0009998787864531156, + "loss": 3.6874, + "step": 573 + }, + { + "epoch": 0.017021024226788837, + "grad_norm": 0.21995270252227783, + "learning_rate": 0.0009998777482685013, + "loss": 3.677, + "step": 574 + }, + { + "epoch": 0.017050677579100317, + "grad_norm": 0.1866428554058075, + "learning_rate": 0.0009998767056573837, + "loss": 3.6566, + "step": 575 + }, + { + "epoch": 0.017080330931411796, + "grad_norm": 0.1674709916114807, + "learning_rate": 0.0009998756586197721, + "loss": 3.6764, + "step": 576 + }, + { + "epoch": 0.017109984283723276, + "grad_norm": 0.1396779716014862, + "learning_rate": 0.0009998746071556758, + "loss": 3.629, + "step": 577 + }, + { + "epoch": 0.017139637636034755, + "grad_norm": 0.15784776210784912, + "learning_rate": 0.000999873551265104, + "loss": 3.6351, + "step": 578 + }, + { + "epoch": 0.01716929098834623, + "grad_norm": 0.1414615511894226, + "learning_rate": 0.000999872490948066, + "loss": 3.6419, + "step": 579 + }, + { + "epoch": 0.01719894434065771, + "grad_norm": 0.16104407608509064, + "learning_rate": 0.0009998714262045715, + "loss": 3.6483, + "step": 580 + }, + { + "epoch": 0.01722859769296919, + "grad_norm": 0.17578288912773132, + "learning_rate": 0.0009998703570346297, + "loss": 3.5858, + "step": 581 + }, + { + "epoch": 0.01725825104528067, + "grad_norm": 0.21444176137447357, + "learning_rate": 0.0009998692834382499, + "loss": 3.629, + "step": 582 + }, + { + "epoch": 0.017287904397592146, + "grad_norm": 0.22883960604667664, + "learning_rate": 0.000999868205415442, + "loss": 3.6343, + "step": 583 + }, + { + "epoch": 0.017317557749903626, + "grad_norm": 0.21811693906784058, + "learning_rate": 0.000999867122966215, + "loss": 3.6224, + "step": 584 + }, + { + "epoch": 0.017347211102215106, + "grad_norm": 0.23218509554862976, + "learning_rate": 0.0009998660360905792, + "loss": 3.6134, + "step": 585 + }, + { + "epoch": 0.017376864454526585, + "grad_norm": 0.22580401599407196, + "learning_rate": 0.0009998649447885437, + "loss": 3.6392, + "step": 586 + }, + { + "epoch": 0.01740651780683806, + "grad_norm": 0.16846540570259094, + "learning_rate": 0.0009998638490601184, + "loss": 3.6056, + "step": 587 + }, + { + "epoch": 0.01743617115914954, + "grad_norm": 0.19588874280452728, + "learning_rate": 0.0009998627489053128, + "loss": 3.6131, + "step": 588 + }, + { + "epoch": 0.01746582451146102, + "grad_norm": 0.1920393407344818, + "learning_rate": 0.000999861644324137, + "loss": 3.5955, + "step": 589 + }, + { + "epoch": 0.0174954778637725, + "grad_norm": 0.17135845124721527, + "learning_rate": 0.0009998605353166, + "loss": 3.6417, + "step": 590 + }, + { + "epoch": 0.01752513121608398, + "grad_norm": 0.1642409861087799, + "learning_rate": 0.0009998594218827127, + "loss": 3.6189, + "step": 591 + }, + { + "epoch": 0.017554784568395456, + "grad_norm": 0.16791921854019165, + "learning_rate": 0.0009998583040224842, + "loss": 3.5735, + "step": 592 + }, + { + "epoch": 0.017584437920706936, + "grad_norm": 0.18813557922840118, + "learning_rate": 0.0009998571817359245, + "loss": 3.6085, + "step": 593 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 0.20202553272247314, + "learning_rate": 0.0009998560550230438, + "loss": 3.5934, + "step": 594 + }, + { + "epoch": 0.017643744625329895, + "grad_norm": 0.24121753871440887, + "learning_rate": 0.000999854923883852, + "loss": 3.5784, + "step": 595 + }, + { + "epoch": 0.01767339797764137, + "grad_norm": 0.24753950536251068, + "learning_rate": 0.000999853788318359, + "loss": 3.6388, + "step": 596 + }, + { + "epoch": 0.01770305132995285, + "grad_norm": 0.27859964966773987, + "learning_rate": 0.0009998526483265748, + "loss": 3.607, + "step": 597 + }, + { + "epoch": 0.01773270468226433, + "grad_norm": 0.2315208911895752, + "learning_rate": 0.0009998515039085097, + "loss": 3.6395, + "step": 598 + }, + { + "epoch": 0.01776235803457581, + "grad_norm": 0.26925763487815857, + "learning_rate": 0.0009998503550641739, + "loss": 3.6231, + "step": 599 + }, + { + "epoch": 0.017792011386887286, + "grad_norm": 0.28140243887901306, + "learning_rate": 0.000999849201793577, + "loss": 3.6301, + "step": 600 + }, + { + "epoch": 0.017821664739198766, + "grad_norm": 0.26391223073005676, + "learning_rate": 0.00099984804409673, + "loss": 3.5883, + "step": 601 + }, + { + "epoch": 0.017851318091510245, + "grad_norm": 0.28541770577430725, + "learning_rate": 0.0009998468819736425, + "loss": 3.6162, + "step": 602 + }, + { + "epoch": 0.017880971443821725, + "grad_norm": 0.2249811440706253, + "learning_rate": 0.0009998457154243253, + "loss": 3.6439, + "step": 603 + }, + { + "epoch": 0.017910624796133204, + "grad_norm": 0.20441606640815735, + "learning_rate": 0.0009998445444487883, + "loss": 3.6297, + "step": 604 + }, + { + "epoch": 0.01794027814844468, + "grad_norm": 0.20837262272834778, + "learning_rate": 0.000999843369047042, + "loss": 3.6112, + "step": 605 + }, + { + "epoch": 0.01796993150075616, + "grad_norm": 0.17206339538097382, + "learning_rate": 0.0009998421892190971, + "loss": 3.5961, + "step": 606 + }, + { + "epoch": 0.01799958485306764, + "grad_norm": 0.2021981179714203, + "learning_rate": 0.0009998410049649638, + "loss": 3.635, + "step": 607 + }, + { + "epoch": 0.01802923820537912, + "grad_norm": 0.20047864317893982, + "learning_rate": 0.0009998398162846525, + "loss": 3.6308, + "step": 608 + }, + { + "epoch": 0.018058891557690596, + "grad_norm": 0.17749369144439697, + "learning_rate": 0.0009998386231781738, + "loss": 3.5911, + "step": 609 + }, + { + "epoch": 0.018088544910002075, + "grad_norm": 0.20091204345226288, + "learning_rate": 0.0009998374256455383, + "loss": 3.6231, + "step": 610 + }, + { + "epoch": 0.018118198262313555, + "grad_norm": 0.19018132984638214, + "learning_rate": 0.0009998362236867567, + "loss": 3.5746, + "step": 611 + }, + { + "epoch": 0.018147851614625034, + "grad_norm": 0.15882952511310577, + "learning_rate": 0.0009998350173018393, + "loss": 3.5704, + "step": 612 + }, + { + "epoch": 0.01817750496693651, + "grad_norm": 0.17732000350952148, + "learning_rate": 0.0009998338064907974, + "loss": 3.5789, + "step": 613 + }, + { + "epoch": 0.01820715831924799, + "grad_norm": 0.1315821409225464, + "learning_rate": 0.0009998325912536413, + "loss": 3.5617, + "step": 614 + }, + { + "epoch": 0.01823681167155947, + "grad_norm": 0.13330046832561493, + "learning_rate": 0.0009998313715903816, + "loss": 3.6001, + "step": 615 + }, + { + "epoch": 0.01826646502387095, + "grad_norm": 0.16035111248493195, + "learning_rate": 0.0009998301475010293, + "loss": 3.5884, + "step": 616 + }, + { + "epoch": 0.01829611837618243, + "grad_norm": 0.18085958063602448, + "learning_rate": 0.0009998289189855954, + "loss": 3.5643, + "step": 617 + }, + { + "epoch": 0.018325771728493905, + "grad_norm": 0.20188534259796143, + "learning_rate": 0.0009998276860440905, + "loss": 3.6052, + "step": 618 + }, + { + "epoch": 0.018355425080805385, + "grad_norm": 0.25831368565559387, + "learning_rate": 0.0009998264486765257, + "loss": 3.6183, + "step": 619 + }, + { + "epoch": 0.018385078433116864, + "grad_norm": 0.31355008482933044, + "learning_rate": 0.0009998252068829118, + "loss": 3.6215, + "step": 620 + }, + { + "epoch": 0.018414731785428344, + "grad_norm": 0.28366050124168396, + "learning_rate": 0.0009998239606632603, + "loss": 3.5966, + "step": 621 + }, + { + "epoch": 0.01844438513773982, + "grad_norm": 0.2699154317378998, + "learning_rate": 0.0009998227100175814, + "loss": 3.6088, + "step": 622 + }, + { + "epoch": 0.0184740384900513, + "grad_norm": 0.32982444763183594, + "learning_rate": 0.0009998214549458869, + "loss": 3.5913, + "step": 623 + }, + { + "epoch": 0.01850369184236278, + "grad_norm": 0.22785630822181702, + "learning_rate": 0.0009998201954481874, + "loss": 3.5851, + "step": 624 + }, + { + "epoch": 0.01853334519467426, + "grad_norm": 0.22945886850357056, + "learning_rate": 0.0009998189315244942, + "loss": 3.6177, + "step": 625 + }, + { + "epoch": 0.018562998546985735, + "grad_norm": 0.23039376735687256, + "learning_rate": 0.0009998176631748187, + "loss": 3.6149, + "step": 626 + }, + { + "epoch": 0.018592651899297215, + "grad_norm": 0.21240602433681488, + "learning_rate": 0.0009998163903991721, + "loss": 3.5941, + "step": 627 + }, + { + "epoch": 0.018622305251608694, + "grad_norm": 0.201915442943573, + "learning_rate": 0.0009998151131975655, + "loss": 3.5827, + "step": 628 + }, + { + "epoch": 0.018651958603920174, + "grad_norm": 0.20647123456001282, + "learning_rate": 0.0009998138315700103, + "loss": 3.583, + "step": 629 + }, + { + "epoch": 0.018681611956231654, + "grad_norm": 0.15970855951309204, + "learning_rate": 0.000999812545516518, + "loss": 3.5575, + "step": 630 + }, + { + "epoch": 0.01871126530854313, + "grad_norm": 0.19063372910022736, + "learning_rate": 0.0009998112550370995, + "loss": 3.5705, + "step": 631 + }, + { + "epoch": 0.01874091866085461, + "grad_norm": 0.1856985241174698, + "learning_rate": 0.0009998099601317666, + "loss": 3.5845, + "step": 632 + }, + { + "epoch": 0.01877057201316609, + "grad_norm": 0.13558456301689148, + "learning_rate": 0.0009998086608005309, + "loss": 3.5705, + "step": 633 + }, + { + "epoch": 0.01880022536547757, + "grad_norm": 0.14240139722824097, + "learning_rate": 0.0009998073570434034, + "loss": 3.5858, + "step": 634 + }, + { + "epoch": 0.018829878717789045, + "grad_norm": 0.12790147960186005, + "learning_rate": 0.0009998060488603962, + "loss": 3.5679, + "step": 635 + }, + { + "epoch": 0.018859532070100524, + "grad_norm": 0.12416842579841614, + "learning_rate": 0.0009998047362515207, + "loss": 3.541, + "step": 636 + }, + { + "epoch": 0.018889185422412004, + "grad_norm": 0.13286440074443817, + "learning_rate": 0.0009998034192167883, + "loss": 3.5765, + "step": 637 + }, + { + "epoch": 0.018918838774723484, + "grad_norm": 0.14172318577766418, + "learning_rate": 0.000999802097756211, + "loss": 3.5809, + "step": 638 + }, + { + "epoch": 0.01894849212703496, + "grad_norm": 0.15360772609710693, + "learning_rate": 0.0009998007718698002, + "loss": 3.5632, + "step": 639 + }, + { + "epoch": 0.01897814547934644, + "grad_norm": 0.1504494994878769, + "learning_rate": 0.0009997994415575679, + "loss": 3.5631, + "step": 640 + }, + { + "epoch": 0.01900779883165792, + "grad_norm": 0.1843711882829666, + "learning_rate": 0.0009997981068195255, + "loss": 3.5389, + "step": 641 + }, + { + "epoch": 0.0190374521839694, + "grad_norm": 0.1917615383863449, + "learning_rate": 0.0009997967676556854, + "loss": 3.5618, + "step": 642 + }, + { + "epoch": 0.019067105536280878, + "grad_norm": 0.19148293137550354, + "learning_rate": 0.000999795424066059, + "loss": 3.573, + "step": 643 + }, + { + "epoch": 0.019096758888592354, + "grad_norm": 0.16595876216888428, + "learning_rate": 0.0009997940760506582, + "loss": 3.564, + "step": 644 + }, + { + "epoch": 0.019126412240903834, + "grad_norm": 0.15597964823246002, + "learning_rate": 0.0009997927236094952, + "loss": 3.5649, + "step": 645 + }, + { + "epoch": 0.019156065593215314, + "grad_norm": 0.1683032363653183, + "learning_rate": 0.0009997913667425817, + "loss": 3.564, + "step": 646 + }, + { + "epoch": 0.019185718945526793, + "grad_norm": 0.19681821763515472, + "learning_rate": 0.00099979000544993, + "loss": 3.5776, + "step": 647 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 0.20844727754592896, + "learning_rate": 0.0009997886397315522, + "loss": 3.5896, + "step": 648 + }, + { + "epoch": 0.01924502565014975, + "grad_norm": 0.2096170336008072, + "learning_rate": 0.0009997872695874598, + "loss": 3.5891, + "step": 649 + }, + { + "epoch": 0.01927467900246123, + "grad_norm": 0.18280433118343353, + "learning_rate": 0.0009997858950176657, + "loss": 3.5426, + "step": 650 + }, + { + "epoch": 0.019304332354772708, + "grad_norm": 0.2101995050907135, + "learning_rate": 0.0009997845160221815, + "loss": 3.5671, + "step": 651 + }, + { + "epoch": 0.019333985707084184, + "grad_norm": 0.2113957405090332, + "learning_rate": 0.0009997831326010198, + "loss": 3.5625, + "step": 652 + }, + { + "epoch": 0.019363639059395664, + "grad_norm": 0.23184071481227875, + "learning_rate": 0.0009997817447541925, + "loss": 3.5687, + "step": 653 + }, + { + "epoch": 0.019393292411707144, + "grad_norm": 0.22375734150409698, + "learning_rate": 0.000999780352481712, + "loss": 3.5244, + "step": 654 + }, + { + "epoch": 0.019422945764018623, + "grad_norm": 0.19863691926002502, + "learning_rate": 0.000999778955783591, + "loss": 3.5758, + "step": 655 + }, + { + "epoch": 0.019452599116330103, + "grad_norm": 0.1912478506565094, + "learning_rate": 0.0009997775546598414, + "loss": 3.5626, + "step": 656 + }, + { + "epoch": 0.01948225246864158, + "grad_norm": 0.1894378364086151, + "learning_rate": 0.0009997761491104754, + "loss": 3.5663, + "step": 657 + }, + { + "epoch": 0.01951190582095306, + "grad_norm": 0.18619553744792938, + "learning_rate": 0.0009997747391355064, + "loss": 3.5418, + "step": 658 + }, + { + "epoch": 0.019541559173264538, + "grad_norm": 0.1810201108455658, + "learning_rate": 0.000999773324734946, + "loss": 3.5702, + "step": 659 + }, + { + "epoch": 0.019571212525576018, + "grad_norm": 0.16557563841342926, + "learning_rate": 0.0009997719059088072, + "loss": 3.5038, + "step": 660 + }, + { + "epoch": 0.019600865877887494, + "grad_norm": 0.17795330286026, + "learning_rate": 0.000999770482657102, + "loss": 3.5159, + "step": 661 + }, + { + "epoch": 0.019630519230198974, + "grad_norm": 0.20166301727294922, + "learning_rate": 0.0009997690549798438, + "loss": 3.5266, + "step": 662 + }, + { + "epoch": 0.019660172582510453, + "grad_norm": 0.1998298317193985, + "learning_rate": 0.0009997676228770448, + "loss": 3.514, + "step": 663 + }, + { + "epoch": 0.019689825934821933, + "grad_norm": 0.20171386003494263, + "learning_rate": 0.0009997661863487175, + "loss": 3.525, + "step": 664 + }, + { + "epoch": 0.01971947928713341, + "grad_norm": 0.21362389624118805, + "learning_rate": 0.0009997647453948752, + "loss": 3.5741, + "step": 665 + }, + { + "epoch": 0.01974913263944489, + "grad_norm": 0.24747276306152344, + "learning_rate": 0.0009997633000155299, + "loss": 3.5347, + "step": 666 + }, + { + "epoch": 0.019778785991756368, + "grad_norm": 0.2183244228363037, + "learning_rate": 0.0009997618502106949, + "loss": 3.4803, + "step": 667 + }, + { + "epoch": 0.019808439344067848, + "grad_norm": 0.18957844376564026, + "learning_rate": 0.000999760395980383, + "loss": 3.5365, + "step": 668 + }, + { + "epoch": 0.019838092696379327, + "grad_norm": 0.2051105499267578, + "learning_rate": 0.000999758937324607, + "loss": 3.5428, + "step": 669 + }, + { + "epoch": 0.019867746048690803, + "grad_norm": 0.248274564743042, + "learning_rate": 0.0009997574742433798, + "loss": 3.5304, + "step": 670 + }, + { + "epoch": 0.019897399401002283, + "grad_norm": 0.1942138969898224, + "learning_rate": 0.0009997560067367144, + "loss": 3.5647, + "step": 671 + }, + { + "epoch": 0.019927052753313763, + "grad_norm": 0.18118128180503845, + "learning_rate": 0.0009997545348046238, + "loss": 3.505, + "step": 672 + }, + { + "epoch": 0.019956706105625242, + "grad_norm": 0.25846704840660095, + "learning_rate": 0.0009997530584471208, + "loss": 3.4981, + "step": 673 + }, + { + "epoch": 0.01998635945793672, + "grad_norm": 0.28788480162620544, + "learning_rate": 0.000999751577664219, + "loss": 3.5282, + "step": 674 + }, + { + "epoch": 0.020016012810248198, + "grad_norm": 0.18160849809646606, + "learning_rate": 0.0009997500924559311, + "loss": 3.5585, + "step": 675 + }, + { + "epoch": 0.020045666162559678, + "grad_norm": 0.13350877165794373, + "learning_rate": 0.0009997486028222701, + "loss": 3.5163, + "step": 676 + }, + { + "epoch": 0.020075319514871157, + "grad_norm": 0.1539018750190735, + "learning_rate": 0.0009997471087632498, + "loss": 3.5225, + "step": 677 + }, + { + "epoch": 0.020104972867182633, + "grad_norm": 0.15275222063064575, + "learning_rate": 0.0009997456102788828, + "loss": 3.5033, + "step": 678 + }, + { + "epoch": 0.020134626219494113, + "grad_norm": 0.17380183935165405, + "learning_rate": 0.0009997441073691829, + "loss": 3.5127, + "step": 679 + }, + { + "epoch": 0.020164279571805593, + "grad_norm": 0.15897400677204132, + "learning_rate": 0.000999742600034163, + "loss": 3.5255, + "step": 680 + }, + { + "epoch": 0.020193932924117072, + "grad_norm": 0.17390529811382294, + "learning_rate": 0.0009997410882738368, + "loss": 3.4681, + "step": 681 + }, + { + "epoch": 0.020223586276428552, + "grad_norm": 0.1709989309310913, + "learning_rate": 0.0009997395720882172, + "loss": 3.4916, + "step": 682 + }, + { + "epoch": 0.020253239628740028, + "grad_norm": 0.1868009716272354, + "learning_rate": 0.000999738051477318, + "loss": 3.4871, + "step": 683 + }, + { + "epoch": 0.020282892981051508, + "grad_norm": 0.221540629863739, + "learning_rate": 0.0009997365264411526, + "loss": 3.5504, + "step": 684 + }, + { + "epoch": 0.020312546333362987, + "grad_norm": 0.18686117231845856, + "learning_rate": 0.0009997349969797344, + "loss": 3.5219, + "step": 685 + }, + { + "epoch": 0.020342199685674467, + "grad_norm": 0.1712210774421692, + "learning_rate": 0.000999733463093077, + "loss": 3.5091, + "step": 686 + }, + { + "epoch": 0.020371853037985943, + "grad_norm": 0.21114236116409302, + "learning_rate": 0.0009997319247811941, + "loss": 3.5052, + "step": 687 + }, + { + "epoch": 0.020401506390297423, + "grad_norm": 0.17655454576015472, + "learning_rate": 0.0009997303820440994, + "loss": 3.5244, + "step": 688 + }, + { + "epoch": 0.020431159742608902, + "grad_norm": 0.2247869074344635, + "learning_rate": 0.0009997288348818061, + "loss": 3.4982, + "step": 689 + }, + { + "epoch": 0.020460813094920382, + "grad_norm": 0.24118071794509888, + "learning_rate": 0.0009997272832943283, + "loss": 3.5253, + "step": 690 + }, + { + "epoch": 0.020490466447231858, + "grad_norm": 0.24795694649219513, + "learning_rate": 0.0009997257272816797, + "loss": 3.5225, + "step": 691 + }, + { + "epoch": 0.020520119799543338, + "grad_norm": 0.21171393990516663, + "learning_rate": 0.0009997241668438738, + "loss": 3.489, + "step": 692 + }, + { + "epoch": 0.020549773151854817, + "grad_norm": 0.270896315574646, + "learning_rate": 0.000999722601980925, + "loss": 3.4997, + "step": 693 + }, + { + "epoch": 0.020579426504166297, + "grad_norm": 0.16336333751678467, + "learning_rate": 0.0009997210326928463, + "loss": 3.5415, + "step": 694 + }, + { + "epoch": 0.020609079856477776, + "grad_norm": 0.20581135153770447, + "learning_rate": 0.0009997194589796525, + "loss": 3.5233, + "step": 695 + }, + { + "epoch": 0.020638733208789253, + "grad_norm": 0.19206440448760986, + "learning_rate": 0.000999717880841357, + "loss": 3.546, + "step": 696 + }, + { + "epoch": 0.020668386561100732, + "grad_norm": 0.1851632297039032, + "learning_rate": 0.0009997162982779738, + "loss": 3.5406, + "step": 697 + }, + { + "epoch": 0.020698039913412212, + "grad_norm": 0.16235943138599396, + "learning_rate": 0.000999714711289517, + "loss": 3.4811, + "step": 698 + }, + { + "epoch": 0.02072769326572369, + "grad_norm": 0.15019893646240234, + "learning_rate": 0.0009997131198760006, + "loss": 3.5123, + "step": 699 + }, + { + "epoch": 0.020757346618035168, + "grad_norm": 0.12195251882076263, + "learning_rate": 0.0009997115240374388, + "loss": 3.492, + "step": 700 + }, + { + "epoch": 0.020786999970346647, + "grad_norm": 0.14790160953998566, + "learning_rate": 0.000999709923773846, + "loss": 3.5115, + "step": 701 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 0.14258690178394318, + "learning_rate": 0.0009997083190852356, + "loss": 3.4954, + "step": 702 + }, + { + "epoch": 0.020846306674969606, + "grad_norm": 0.16055651009082794, + "learning_rate": 0.0009997067099716225, + "loss": 3.5018, + "step": 703 + }, + { + "epoch": 0.020875960027281083, + "grad_norm": 0.16620714962482452, + "learning_rate": 0.0009997050964330205, + "loss": 3.4878, + "step": 704 + }, + { + "epoch": 0.020905613379592562, + "grad_norm": 0.15494239330291748, + "learning_rate": 0.0009997034784694444, + "loss": 3.4825, + "step": 705 + }, + { + "epoch": 0.020935266731904042, + "grad_norm": 0.19108891487121582, + "learning_rate": 0.000999701856080908, + "loss": 3.5394, + "step": 706 + }, + { + "epoch": 0.02096492008421552, + "grad_norm": 0.1837247759103775, + "learning_rate": 0.000999700229267426, + "loss": 3.4933, + "step": 707 + }, + { + "epoch": 0.020994573436527, + "grad_norm": 0.19952768087387085, + "learning_rate": 0.0009996985980290126, + "loss": 3.479, + "step": 708 + }, + { + "epoch": 0.021024226788838477, + "grad_norm": 0.24850492179393768, + "learning_rate": 0.0009996969623656824, + "loss": 3.5336, + "step": 709 + }, + { + "epoch": 0.021053880141149957, + "grad_norm": 0.24734123051166534, + "learning_rate": 0.0009996953222774498, + "loss": 3.5007, + "step": 710 + }, + { + "epoch": 0.021083533493461436, + "grad_norm": 0.1799120008945465, + "learning_rate": 0.0009996936777643293, + "loss": 3.5203, + "step": 711 + }, + { + "epoch": 0.021113186845772916, + "grad_norm": 0.2137790024280548, + "learning_rate": 0.0009996920288263358, + "loss": 3.5095, + "step": 712 + }, + { + "epoch": 0.021142840198084392, + "grad_norm": 0.224616140127182, + "learning_rate": 0.0009996903754634833, + "loss": 3.511, + "step": 713 + }, + { + "epoch": 0.021172493550395872, + "grad_norm": 0.21174031496047974, + "learning_rate": 0.0009996887176757867, + "loss": 3.5104, + "step": 714 + }, + { + "epoch": 0.02120214690270735, + "grad_norm": 0.18496596813201904, + "learning_rate": 0.000999687055463261, + "loss": 3.5307, + "step": 715 + }, + { + "epoch": 0.02123180025501883, + "grad_norm": 0.17323561012744904, + "learning_rate": 0.0009996853888259206, + "loss": 3.4896, + "step": 716 + }, + { + "epoch": 0.021261453607330307, + "grad_norm": 0.19192920625209808, + "learning_rate": 0.0009996837177637802, + "loss": 3.465, + "step": 717 + }, + { + "epoch": 0.021291106959641787, + "grad_norm": 0.19874538481235504, + "learning_rate": 0.0009996820422768548, + "loss": 3.4758, + "step": 718 + }, + { + "epoch": 0.021320760311953266, + "grad_norm": 0.21198134124279022, + "learning_rate": 0.0009996803623651591, + "loss": 3.4775, + "step": 719 + }, + { + "epoch": 0.021350413664264746, + "grad_norm": 0.2148400843143463, + "learning_rate": 0.000999678678028708, + "loss": 3.4814, + "step": 720 + }, + { + "epoch": 0.021380067016576226, + "grad_norm": 0.145036980509758, + "learning_rate": 0.0009996769892675166, + "loss": 3.4851, + "step": 721 + }, + { + "epoch": 0.021409720368887702, + "grad_norm": 0.13336052000522614, + "learning_rate": 0.0009996752960815996, + "loss": 3.4493, + "step": 722 + }, + { + "epoch": 0.02143937372119918, + "grad_norm": 0.13389700651168823, + "learning_rate": 0.000999673598470972, + "loss": 3.4957, + "step": 723 + }, + { + "epoch": 0.02146902707351066, + "grad_norm": 0.15874464809894562, + "learning_rate": 0.0009996718964356487, + "loss": 3.4766, + "step": 724 + }, + { + "epoch": 0.02149868042582214, + "grad_norm": 0.18984894454479218, + "learning_rate": 0.0009996701899756455, + "loss": 3.4757, + "step": 725 + }, + { + "epoch": 0.021528333778133617, + "grad_norm": 0.18420682847499847, + "learning_rate": 0.0009996684790909767, + "loss": 3.483, + "step": 726 + }, + { + "epoch": 0.021557987130445096, + "grad_norm": 0.1695212870836258, + "learning_rate": 0.0009996667637816577, + "loss": 3.4704, + "step": 727 + }, + { + "epoch": 0.021587640482756576, + "grad_norm": 0.18476273119449615, + "learning_rate": 0.000999665044047704, + "loss": 3.4316, + "step": 728 + }, + { + "epoch": 0.021617293835068056, + "grad_norm": 0.16327275335788727, + "learning_rate": 0.0009996633198891302, + "loss": 3.5047, + "step": 729 + }, + { + "epoch": 0.021646947187379532, + "grad_norm": 0.14073610305786133, + "learning_rate": 0.0009996615913059523, + "loss": 3.4783, + "step": 730 + }, + { + "epoch": 0.02167660053969101, + "grad_norm": 0.16342665255069733, + "learning_rate": 0.000999659858298185, + "loss": 3.4817, + "step": 731 + }, + { + "epoch": 0.02170625389200249, + "grad_norm": 0.15185026824474335, + "learning_rate": 0.000999658120865844, + "loss": 3.4754, + "step": 732 + }, + { + "epoch": 0.02173590724431397, + "grad_norm": 0.16442058980464935, + "learning_rate": 0.0009996563790089445, + "loss": 3.4469, + "step": 733 + }, + { + "epoch": 0.02176556059662545, + "grad_norm": 0.15739329159259796, + "learning_rate": 0.0009996546327275023, + "loss": 3.4679, + "step": 734 + }, + { + "epoch": 0.021795213948936926, + "grad_norm": 0.1396990269422531, + "learning_rate": 0.0009996528820215322, + "loss": 3.4402, + "step": 735 + }, + { + "epoch": 0.021824867301248406, + "grad_norm": 0.15506285429000854, + "learning_rate": 0.0009996511268910502, + "loss": 3.4431, + "step": 736 + }, + { + "epoch": 0.021854520653559886, + "grad_norm": 0.1593862771987915, + "learning_rate": 0.0009996493673360717, + "loss": 3.4612, + "step": 737 + }, + { + "epoch": 0.021884174005871365, + "grad_norm": 0.18480294942855835, + "learning_rate": 0.0009996476033566123, + "loss": 3.4601, + "step": 738 + }, + { + "epoch": 0.02191382735818284, + "grad_norm": 0.207567498087883, + "learning_rate": 0.0009996458349526877, + "loss": 3.4583, + "step": 739 + }, + { + "epoch": 0.02194348071049432, + "grad_norm": 0.21225185692310333, + "learning_rate": 0.0009996440621243133, + "loss": 3.4841, + "step": 740 + }, + { + "epoch": 0.0219731340628058, + "grad_norm": 0.21751569211483002, + "learning_rate": 0.000999642284871505, + "loss": 3.4745, + "step": 741 + }, + { + "epoch": 0.02200278741511728, + "grad_norm": 0.26541051268577576, + "learning_rate": 0.0009996405031942786, + "loss": 3.4905, + "step": 742 + }, + { + "epoch": 0.022032440767428756, + "grad_norm": 0.2245505005121231, + "learning_rate": 0.0009996387170926495, + "loss": 3.4796, + "step": 743 + }, + { + "epoch": 0.022062094119740236, + "grad_norm": 0.22873814404010773, + "learning_rate": 0.0009996369265666341, + "loss": 3.4469, + "step": 744 + }, + { + "epoch": 0.022091747472051716, + "grad_norm": 0.22130578756332397, + "learning_rate": 0.0009996351316162479, + "loss": 3.4694, + "step": 745 + }, + { + "epoch": 0.022121400824363195, + "grad_norm": 0.19865268468856812, + "learning_rate": 0.0009996333322415069, + "loss": 3.4613, + "step": 746 + }, + { + "epoch": 0.022151054176674675, + "grad_norm": 0.2127135843038559, + "learning_rate": 0.0009996315284424267, + "loss": 3.4743, + "step": 747 + }, + { + "epoch": 0.02218070752898615, + "grad_norm": 0.23930570483207703, + "learning_rate": 0.0009996297202190239, + "loss": 3.4937, + "step": 748 + }, + { + "epoch": 0.02221036088129763, + "grad_norm": 0.20230509340763092, + "learning_rate": 0.000999627907571314, + "loss": 3.452, + "step": 749 + }, + { + "epoch": 0.02224001423360911, + "grad_norm": 0.1909109652042389, + "learning_rate": 0.000999626090499313, + "loss": 3.4681, + "step": 750 + }, + { + "epoch": 0.02226966758592059, + "grad_norm": 0.16601143777370453, + "learning_rate": 0.0009996242690030377, + "loss": 3.471, + "step": 751 + }, + { + "epoch": 0.022299320938232066, + "grad_norm": 0.16384558379650116, + "learning_rate": 0.0009996224430825033, + "loss": 3.4217, + "step": 752 + }, + { + "epoch": 0.022328974290543546, + "grad_norm": 0.15749339759349823, + "learning_rate": 0.0009996206127377268, + "loss": 3.442, + "step": 753 + }, + { + "epoch": 0.022358627642855025, + "grad_norm": 0.1381659358739853, + "learning_rate": 0.0009996187779687236, + "loss": 3.4678, + "step": 754 + }, + { + "epoch": 0.022388280995166505, + "grad_norm": 0.1489262878894806, + "learning_rate": 0.0009996169387755107, + "loss": 3.4802, + "step": 755 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 0.16620208323001862, + "learning_rate": 0.000999615095158104, + "loss": 3.4701, + "step": 756 + }, + { + "epoch": 0.02244758769978946, + "grad_norm": 0.16394197940826416, + "learning_rate": 0.0009996132471165196, + "loss": 3.4686, + "step": 757 + }, + { + "epoch": 0.02247724105210094, + "grad_norm": 0.16207954287528992, + "learning_rate": 0.0009996113946507744, + "loss": 3.4071, + "step": 758 + }, + { + "epoch": 0.02250689440441242, + "grad_norm": 0.14939740300178528, + "learning_rate": 0.0009996095377608845, + "loss": 3.4509, + "step": 759 + }, + { + "epoch": 0.0225365477567239, + "grad_norm": 0.16026842594146729, + "learning_rate": 0.0009996076764468664, + "loss": 3.449, + "step": 760 + }, + { + "epoch": 0.022566201109035375, + "grad_norm": 0.1693795770406723, + "learning_rate": 0.0009996058107087365, + "loss": 3.4441, + "step": 761 + }, + { + "epoch": 0.022595854461346855, + "grad_norm": 0.19681382179260254, + "learning_rate": 0.0009996039405465113, + "loss": 3.4421, + "step": 762 + }, + { + "epoch": 0.022625507813658335, + "grad_norm": 0.17430131137371063, + "learning_rate": 0.0009996020659602076, + "loss": 3.4605, + "step": 763 + }, + { + "epoch": 0.022655161165969814, + "grad_norm": 0.18339446187019348, + "learning_rate": 0.000999600186949842, + "loss": 3.4417, + "step": 764 + }, + { + "epoch": 0.02268481451828129, + "grad_norm": 0.2030780017375946, + "learning_rate": 0.0009995983035154307, + "loss": 3.4273, + "step": 765 + }, + { + "epoch": 0.02271446787059277, + "grad_norm": 0.20858772099018097, + "learning_rate": 0.0009995964156569908, + "loss": 3.483, + "step": 766 + }, + { + "epoch": 0.02274412122290425, + "grad_norm": 0.2208828181028366, + "learning_rate": 0.0009995945233745387, + "loss": 3.4409, + "step": 767 + }, + { + "epoch": 0.02277377457521573, + "grad_norm": 0.3491773307323456, + "learning_rate": 0.0009995926266680917, + "loss": 3.4311, + "step": 768 + }, + { + "epoch": 0.022803427927527205, + "grad_norm": 0.2330491989850998, + "learning_rate": 0.000999590725537666, + "loss": 3.4835, + "step": 769 + }, + { + "epoch": 0.022833081279838685, + "grad_norm": 0.2288559377193451, + "learning_rate": 0.0009995888199832786, + "loss": 3.4238, + "step": 770 + }, + { + "epoch": 0.022862734632150165, + "grad_norm": 0.15762853622436523, + "learning_rate": 0.0009995869100049466, + "loss": 3.4097, + "step": 771 + }, + { + "epoch": 0.022892387984461644, + "grad_norm": 0.23895014822483063, + "learning_rate": 0.0009995849956026869, + "loss": 3.4668, + "step": 772 + }, + { + "epoch": 0.022922041336773124, + "grad_norm": 0.2092246562242508, + "learning_rate": 0.000999583076776516, + "loss": 3.4759, + "step": 773 + }, + { + "epoch": 0.0229516946890846, + "grad_norm": 0.18719984591007233, + "learning_rate": 0.0009995811535264514, + "loss": 3.4133, + "step": 774 + }, + { + "epoch": 0.02298134804139608, + "grad_norm": 0.15572476387023926, + "learning_rate": 0.0009995792258525099, + "loss": 3.438, + "step": 775 + }, + { + "epoch": 0.02301100139370756, + "grad_norm": 0.17106445133686066, + "learning_rate": 0.0009995772937547085, + "loss": 3.4565, + "step": 776 + }, + { + "epoch": 0.02304065474601904, + "grad_norm": 0.16437411308288574, + "learning_rate": 0.0009995753572330645, + "loss": 3.4188, + "step": 777 + }, + { + "epoch": 0.023070308098330515, + "grad_norm": 0.14558857679367065, + "learning_rate": 0.0009995734162875948, + "loss": 3.4649, + "step": 778 + }, + { + "epoch": 0.023099961450641995, + "grad_norm": 0.13309094309806824, + "learning_rate": 0.000999571470918317, + "loss": 3.4551, + "step": 779 + }, + { + "epoch": 0.023129614802953474, + "grad_norm": 0.14547760784626007, + "learning_rate": 0.0009995695211252478, + "loss": 3.4742, + "step": 780 + }, + { + "epoch": 0.023159268155264954, + "grad_norm": 0.1524268388748169, + "learning_rate": 0.000999567566908405, + "loss": 3.4379, + "step": 781 + }, + { + "epoch": 0.02318892150757643, + "grad_norm": 0.13270069658756256, + "learning_rate": 0.0009995656082678055, + "loss": 3.4346, + "step": 782 + }, + { + "epoch": 0.02321857485988791, + "grad_norm": 0.1611565202474594, + "learning_rate": 0.0009995636452034668, + "loss": 3.4376, + "step": 783 + }, + { + "epoch": 0.02324822821219939, + "grad_norm": 0.19691284000873566, + "learning_rate": 0.0009995616777154063, + "loss": 3.4236, + "step": 784 + }, + { + "epoch": 0.02327788156451087, + "grad_norm": 0.19284991919994354, + "learning_rate": 0.0009995597058036416, + "loss": 3.4327, + "step": 785 + }, + { + "epoch": 0.02330753491682235, + "grad_norm": 0.17360784113407135, + "learning_rate": 0.0009995577294681897, + "loss": 3.4332, + "step": 786 + }, + { + "epoch": 0.023337188269133825, + "grad_norm": 0.17076325416564941, + "learning_rate": 0.0009995557487090683, + "loss": 3.4307, + "step": 787 + }, + { + "epoch": 0.023366841621445304, + "grad_norm": 0.18371398746967316, + "learning_rate": 0.0009995537635262952, + "loss": 3.4271, + "step": 788 + }, + { + "epoch": 0.023396494973756784, + "grad_norm": 0.19628868997097015, + "learning_rate": 0.0009995517739198878, + "loss": 3.4412, + "step": 789 + }, + { + "epoch": 0.023426148326068263, + "grad_norm": 0.19892914593219757, + "learning_rate": 0.0009995497798898636, + "loss": 3.451, + "step": 790 + }, + { + "epoch": 0.02345580167837974, + "grad_norm": 0.187413290143013, + "learning_rate": 0.0009995477814362403, + "loss": 3.4518, + "step": 791 + }, + { + "epoch": 0.02348545503069122, + "grad_norm": 0.18775136768817902, + "learning_rate": 0.0009995457785590355, + "loss": 3.3947, + "step": 792 + }, + { + "epoch": 0.0235151083830027, + "grad_norm": 0.1424231082201004, + "learning_rate": 0.0009995437712582674, + "loss": 3.391, + "step": 793 + }, + { + "epoch": 0.02354476173531418, + "grad_norm": 0.1808721125125885, + "learning_rate": 0.0009995417595339534, + "loss": 3.4544, + "step": 794 + }, + { + "epoch": 0.023574415087625655, + "grad_norm": 0.17443716526031494, + "learning_rate": 0.0009995397433861114, + "loss": 3.4381, + "step": 795 + }, + { + "epoch": 0.023604068439937134, + "grad_norm": 0.15335297584533691, + "learning_rate": 0.000999537722814759, + "loss": 3.4282, + "step": 796 + }, + { + "epoch": 0.023633721792248614, + "grad_norm": 0.15551985800266266, + "learning_rate": 0.0009995356978199144, + "loss": 3.4094, + "step": 797 + }, + { + "epoch": 0.023663375144560093, + "grad_norm": 0.1442258059978485, + "learning_rate": 0.0009995336684015957, + "loss": 3.4209, + "step": 798 + }, + { + "epoch": 0.023693028496871573, + "grad_norm": 0.1520729511976242, + "learning_rate": 0.0009995316345598204, + "loss": 3.4383, + "step": 799 + }, + { + "epoch": 0.02372268184918305, + "grad_norm": 0.1593625545501709, + "learning_rate": 0.0009995295962946067, + "loss": 3.4087, + "step": 800 + }, + { + "epoch": 0.02375233520149453, + "grad_norm": 0.15282046794891357, + "learning_rate": 0.0009995275536059728, + "loss": 3.4207, + "step": 801 + }, + { + "epoch": 0.02378198855380601, + "grad_norm": 0.15480409562587738, + "learning_rate": 0.0009995255064939367, + "loss": 3.453, + "step": 802 + }, + { + "epoch": 0.023811641906117488, + "grad_norm": 0.17301300168037415, + "learning_rate": 0.0009995234549585162, + "loss": 3.4429, + "step": 803 + }, + { + "epoch": 0.023841295258428964, + "grad_norm": 0.21361133456230164, + "learning_rate": 0.0009995213989997301, + "loss": 3.4488, + "step": 804 + }, + { + "epoch": 0.023870948610740444, + "grad_norm": 0.23648963868618011, + "learning_rate": 0.0009995193386175961, + "loss": 3.4128, + "step": 805 + }, + { + "epoch": 0.023900601963051923, + "grad_norm": 0.20075513422489166, + "learning_rate": 0.0009995172738121326, + "loss": 3.407, + "step": 806 + }, + { + "epoch": 0.023930255315363403, + "grad_norm": 0.13921864330768585, + "learning_rate": 0.000999515204583358, + "loss": 3.3913, + "step": 807 + }, + { + "epoch": 0.02395990866767488, + "grad_norm": 0.15081259608268738, + "learning_rate": 0.0009995131309312904, + "loss": 3.4142, + "step": 808 + }, + { + "epoch": 0.02398956201998636, + "grad_norm": 0.1297830492258072, + "learning_rate": 0.0009995110528559484, + "loss": 3.4072, + "step": 809 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 0.13055463135242462, + "learning_rate": 0.0009995089703573503, + "loss": 3.4298, + "step": 810 + }, + { + "epoch": 0.024048868724609318, + "grad_norm": 0.13386860489845276, + "learning_rate": 0.0009995068834355145, + "loss": 3.4579, + "step": 811 + }, + { + "epoch": 0.024078522076920798, + "grad_norm": 0.16564136743545532, + "learning_rate": 0.0009995047920904594, + "loss": 3.3615, + "step": 812 + }, + { + "epoch": 0.024108175429232274, + "grad_norm": 0.1473875641822815, + "learning_rate": 0.0009995026963222039, + "loss": 3.4241, + "step": 813 + }, + { + "epoch": 0.024137828781543753, + "grad_norm": 0.13391266763210297, + "learning_rate": 0.000999500596130766, + "loss": 3.3961, + "step": 814 + }, + { + "epoch": 0.024167482133855233, + "grad_norm": 0.13953393697738647, + "learning_rate": 0.0009994984915161647, + "loss": 3.4166, + "step": 815 + }, + { + "epoch": 0.024197135486166713, + "grad_norm": 0.16097524762153625, + "learning_rate": 0.0009994963824784184, + "loss": 3.3875, + "step": 816 + }, + { + "epoch": 0.02422678883847819, + "grad_norm": 0.35845667123794556, + "learning_rate": 0.0009994942690175462, + "loss": 3.3942, + "step": 817 + }, + { + "epoch": 0.02425644219078967, + "grad_norm": 0.14779715240001678, + "learning_rate": 0.0009994921511335662, + "loss": 3.4123, + "step": 818 + }, + { + "epoch": 0.024286095543101148, + "grad_norm": 0.18696458637714386, + "learning_rate": 0.0009994900288264976, + "loss": 3.4207, + "step": 819 + }, + { + "epoch": 0.024315748895412628, + "grad_norm": 0.18580394983291626, + "learning_rate": 0.000999487902096359, + "loss": 3.393, + "step": 820 + }, + { + "epoch": 0.024345402247724104, + "grad_norm": 0.1483900249004364, + "learning_rate": 0.0009994857709431694, + "loss": 3.3895, + "step": 821 + }, + { + "epoch": 0.024375055600035583, + "grad_norm": 0.12887074053287506, + "learning_rate": 0.0009994836353669474, + "loss": 3.3836, + "step": 822 + }, + { + "epoch": 0.024404708952347063, + "grad_norm": 0.13438552618026733, + "learning_rate": 0.0009994814953677123, + "loss": 3.4025, + "step": 823 + }, + { + "epoch": 0.024434362304658543, + "grad_norm": 0.12926900386810303, + "learning_rate": 0.0009994793509454827, + "loss": 3.4467, + "step": 824 + }, + { + "epoch": 0.02446401565697002, + "grad_norm": 0.1397494077682495, + "learning_rate": 0.0009994772021002776, + "loss": 3.3699, + "step": 825 + }, + { + "epoch": 0.0244936690092815, + "grad_norm": 0.16738121211528778, + "learning_rate": 0.0009994750488321162, + "loss": 3.4162, + "step": 826 + }, + { + "epoch": 0.024523322361592978, + "grad_norm": 0.19621779024600983, + "learning_rate": 0.0009994728911410175, + "loss": 3.3917, + "step": 827 + }, + { + "epoch": 0.024552975713904458, + "grad_norm": 0.16599002480506897, + "learning_rate": 0.0009994707290270008, + "loss": 3.4168, + "step": 828 + }, + { + "epoch": 0.024582629066215937, + "grad_norm": 0.1383376270532608, + "learning_rate": 0.000999468562490085, + "loss": 3.378, + "step": 829 + }, + { + "epoch": 0.024612282418527413, + "grad_norm": 0.16226370632648468, + "learning_rate": 0.0009994663915302894, + "loss": 3.4252, + "step": 830 + }, + { + "epoch": 0.024641935770838893, + "grad_norm": 0.20832183957099915, + "learning_rate": 0.0009994642161476328, + "loss": 3.3897, + "step": 831 + }, + { + "epoch": 0.024671589123150373, + "grad_norm": 0.2381325513124466, + "learning_rate": 0.0009994620363421353, + "loss": 3.4234, + "step": 832 + }, + { + "epoch": 0.024701242475461852, + "grad_norm": 0.2199164628982544, + "learning_rate": 0.0009994598521138153, + "loss": 3.3949, + "step": 833 + }, + { + "epoch": 0.02473089582777333, + "grad_norm": 0.20215734839439392, + "learning_rate": 0.0009994576634626928, + "loss": 3.4293, + "step": 834 + }, + { + "epoch": 0.024760549180084808, + "grad_norm": 0.2430366426706314, + "learning_rate": 0.000999455470388787, + "loss": 3.4238, + "step": 835 + }, + { + "epoch": 0.024790202532396288, + "grad_norm": 0.2168632596731186, + "learning_rate": 0.0009994532728921173, + "loss": 3.413, + "step": 836 + }, + { + "epoch": 0.024819855884707767, + "grad_norm": 0.1815548539161682, + "learning_rate": 0.000999451070972703, + "loss": 3.3557, + "step": 837 + }, + { + "epoch": 0.024849509237019243, + "grad_norm": 0.1786431074142456, + "learning_rate": 0.0009994488646305638, + "loss": 3.4355, + "step": 838 + }, + { + "epoch": 0.024879162589330723, + "grad_norm": 0.2010490596294403, + "learning_rate": 0.0009994466538657191, + "loss": 3.3891, + "step": 839 + }, + { + "epoch": 0.024908815941642203, + "grad_norm": 0.21092146635055542, + "learning_rate": 0.0009994444386781888, + "loss": 3.3964, + "step": 840 + }, + { + "epoch": 0.024938469293953682, + "grad_norm": 0.19602283835411072, + "learning_rate": 0.000999442219067992, + "loss": 3.4276, + "step": 841 + }, + { + "epoch": 0.024968122646265162, + "grad_norm": 0.16456758975982666, + "learning_rate": 0.0009994399950351486, + "loss": 3.4229, + "step": 842 + }, + { + "epoch": 0.024997775998576638, + "grad_norm": 0.15643073618412018, + "learning_rate": 0.0009994377665796786, + "loss": 3.3788, + "step": 843 + }, + { + "epoch": 0.025027429350888118, + "grad_norm": 0.17997191846370697, + "learning_rate": 0.0009994355337016013, + "loss": 3.411, + "step": 844 + }, + { + "epoch": 0.025057082703199597, + "grad_norm": 0.1776786595582962, + "learning_rate": 0.0009994332964009367, + "loss": 3.3867, + "step": 845 + }, + { + "epoch": 0.025086736055511077, + "grad_norm": 0.1606934666633606, + "learning_rate": 0.0009994310546777043, + "loss": 3.4122, + "step": 846 + }, + { + "epoch": 0.025116389407822553, + "grad_norm": 0.1564399152994156, + "learning_rate": 0.0009994288085319243, + "loss": 3.3864, + "step": 847 + }, + { + "epoch": 0.025146042760134033, + "grad_norm": 0.16131435334682465, + "learning_rate": 0.0009994265579636166, + "loss": 3.4098, + "step": 848 + }, + { + "epoch": 0.025175696112445512, + "grad_norm": 0.14577792584896088, + "learning_rate": 0.000999424302972801, + "loss": 3.4051, + "step": 849 + }, + { + "epoch": 0.02520534946475699, + "grad_norm": 0.1478842794895172, + "learning_rate": 0.0009994220435594972, + "loss": 3.3439, + "step": 850 + }, + { + "epoch": 0.025235002817068468, + "grad_norm": 0.15820688009262085, + "learning_rate": 0.0009994197797237256, + "loss": 3.3623, + "step": 851 + }, + { + "epoch": 0.025264656169379947, + "grad_norm": 0.17569600045681, + "learning_rate": 0.0009994175114655065, + "loss": 3.4115, + "step": 852 + }, + { + "epoch": 0.025294309521691427, + "grad_norm": 0.18813592195510864, + "learning_rate": 0.000999415238784859, + "loss": 3.3955, + "step": 853 + }, + { + "epoch": 0.025323962874002907, + "grad_norm": 0.1951814889907837, + "learning_rate": 0.0009994129616818044, + "loss": 3.3657, + "step": 854 + }, + { + "epoch": 0.025353616226314386, + "grad_norm": 0.1594688594341278, + "learning_rate": 0.0009994106801563618, + "loss": 3.3674, + "step": 855 + }, + { + "epoch": 0.025383269578625862, + "grad_norm": 0.1757277250289917, + "learning_rate": 0.0009994083942085523, + "loss": 3.4262, + "step": 856 + }, + { + "epoch": 0.025412922930937342, + "grad_norm": 0.164302796125412, + "learning_rate": 0.0009994061038383956, + "loss": 3.389, + "step": 857 + }, + { + "epoch": 0.02544257628324882, + "grad_norm": 0.16077657043933868, + "learning_rate": 0.0009994038090459121, + "loss": 3.4052, + "step": 858 + }, + { + "epoch": 0.0254722296355603, + "grad_norm": 0.17733272910118103, + "learning_rate": 0.000999401509831122, + "loss": 3.3768, + "step": 859 + }, + { + "epoch": 0.025501882987871777, + "grad_norm": 0.18790769577026367, + "learning_rate": 0.0009993992061940462, + "loss": 3.3841, + "step": 860 + }, + { + "epoch": 0.025531536340183257, + "grad_norm": 0.17591433227062225, + "learning_rate": 0.0009993968981347045, + "loss": 3.4087, + "step": 861 + }, + { + "epoch": 0.025561189692494737, + "grad_norm": 0.1520904153585434, + "learning_rate": 0.0009993945856531174, + "loss": 3.3745, + "step": 862 + }, + { + "epoch": 0.025590843044806216, + "grad_norm": 0.16517092287540436, + "learning_rate": 0.0009993922687493056, + "loss": 3.4057, + "step": 863 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 0.15396223962306976, + "learning_rate": 0.0009993899474232896, + "loss": 3.4012, + "step": 864 + }, + { + "epoch": 0.025650149749429172, + "grad_norm": 0.14259999990463257, + "learning_rate": 0.00099938762167509, + "loss": 3.4193, + "step": 865 + }, + { + "epoch": 0.02567980310174065, + "grad_norm": 0.15258629620075226, + "learning_rate": 0.000999385291504727, + "loss": 3.4263, + "step": 866 + }, + { + "epoch": 0.02570945645405213, + "grad_norm": 0.1379566639661789, + "learning_rate": 0.0009993829569122218, + "loss": 3.3637, + "step": 867 + }, + { + "epoch": 0.02573910980636361, + "grad_norm": 0.15416304767131805, + "learning_rate": 0.0009993806178975949, + "loss": 3.402, + "step": 868 + }, + { + "epoch": 0.025768763158675087, + "grad_norm": 0.1611844003200531, + "learning_rate": 0.0009993782744608666, + "loss": 3.3904, + "step": 869 + }, + { + "epoch": 0.025798416510986567, + "grad_norm": 0.15511925518512726, + "learning_rate": 0.0009993759266020581, + "loss": 3.3932, + "step": 870 + }, + { + "epoch": 0.025828069863298046, + "grad_norm": 0.15829038619995117, + "learning_rate": 0.00099937357432119, + "loss": 3.3694, + "step": 871 + }, + { + "epoch": 0.025857723215609526, + "grad_norm": 0.17011280357837677, + "learning_rate": 0.0009993712176182832, + "loss": 3.3821, + "step": 872 + }, + { + "epoch": 0.025887376567921002, + "grad_norm": 0.1587747037410736, + "learning_rate": 0.0009993688564933585, + "loss": 3.3649, + "step": 873 + }, + { + "epoch": 0.02591702992023248, + "grad_norm": 0.1499738097190857, + "learning_rate": 0.0009993664909464372, + "loss": 3.3845, + "step": 874 + }, + { + "epoch": 0.02594668327254396, + "grad_norm": 0.1445324420928955, + "learning_rate": 0.0009993641209775394, + "loss": 3.3498, + "step": 875 + }, + { + "epoch": 0.02597633662485544, + "grad_norm": 0.14958184957504272, + "learning_rate": 0.0009993617465866868, + "loss": 3.3917, + "step": 876 + }, + { + "epoch": 0.026005989977166917, + "grad_norm": 0.1709052324295044, + "learning_rate": 0.0009993593677739003, + "loss": 3.3614, + "step": 877 + }, + { + "epoch": 0.026035643329478397, + "grad_norm": 0.1901400238275528, + "learning_rate": 0.0009993569845392009, + "loss": 3.3767, + "step": 878 + }, + { + "epoch": 0.026065296681789876, + "grad_norm": 0.1605062186717987, + "learning_rate": 0.0009993545968826096, + "loss": 3.3212, + "step": 879 + }, + { + "epoch": 0.026094950034101356, + "grad_norm": 0.18832464516162872, + "learning_rate": 0.0009993522048041476, + "loss": 3.3449, + "step": 880 + }, + { + "epoch": 0.026124603386412835, + "grad_norm": 0.2115894854068756, + "learning_rate": 0.000999349808303836, + "loss": 3.3777, + "step": 881 + }, + { + "epoch": 0.02615425673872431, + "grad_norm": 0.17436277866363525, + "learning_rate": 0.0009993474073816966, + "loss": 3.3964, + "step": 882 + }, + { + "epoch": 0.02618391009103579, + "grad_norm": 0.16178691387176514, + "learning_rate": 0.0009993450020377498, + "loss": 3.391, + "step": 883 + }, + { + "epoch": 0.02621356344334727, + "grad_norm": 0.1760091334581375, + "learning_rate": 0.0009993425922720173, + "loss": 3.3914, + "step": 884 + }, + { + "epoch": 0.02624321679565875, + "grad_norm": 0.15463748574256897, + "learning_rate": 0.0009993401780845207, + "loss": 3.3789, + "step": 885 + }, + { + "epoch": 0.026272870147970227, + "grad_norm": 0.1631287783384323, + "learning_rate": 0.0009993377594752807, + "loss": 3.403, + "step": 886 + }, + { + "epoch": 0.026302523500281706, + "grad_norm": 0.16649262607097626, + "learning_rate": 0.0009993353364443195, + "loss": 3.3725, + "step": 887 + }, + { + "epoch": 0.026332176852593186, + "grad_norm": 0.2051418423652649, + "learning_rate": 0.0009993329089916581, + "loss": 3.396, + "step": 888 + }, + { + "epoch": 0.026361830204904665, + "grad_norm": 0.22030192613601685, + "learning_rate": 0.000999330477117318, + "loss": 3.3644, + "step": 889 + }, + { + "epoch": 0.02639148355721614, + "grad_norm": 0.16649951040744781, + "learning_rate": 0.000999328040821321, + "loss": 3.3712, + "step": 890 + }, + { + "epoch": 0.02642113690952762, + "grad_norm": 0.16061033308506012, + "learning_rate": 0.0009993256001036882, + "loss": 3.3968, + "step": 891 + }, + { + "epoch": 0.0264507902618391, + "grad_norm": 0.17938260734081268, + "learning_rate": 0.0009993231549644418, + "loss": 3.3596, + "step": 892 + }, + { + "epoch": 0.02648044361415058, + "grad_norm": 0.16044721007347107, + "learning_rate": 0.0009993207054036029, + "loss": 3.3918, + "step": 893 + }, + { + "epoch": 0.02651009696646206, + "grad_norm": 0.18708553910255432, + "learning_rate": 0.0009993182514211937, + "loss": 3.3932, + "step": 894 + }, + { + "epoch": 0.026539750318773536, + "grad_norm": 0.22289691865444183, + "learning_rate": 0.0009993157930172354, + "loss": 3.3723, + "step": 895 + }, + { + "epoch": 0.026569403671085016, + "grad_norm": 0.1765885204076767, + "learning_rate": 0.0009993133301917502, + "loss": 3.3795, + "step": 896 + }, + { + "epoch": 0.026599057023396495, + "grad_norm": 0.14205920696258545, + "learning_rate": 0.0009993108629447595, + "loss": 3.3818, + "step": 897 + }, + { + "epoch": 0.026628710375707975, + "grad_norm": 0.17798148095607758, + "learning_rate": 0.0009993083912762859, + "loss": 3.3059, + "step": 898 + }, + { + "epoch": 0.02665836372801945, + "grad_norm": 0.19069120287895203, + "learning_rate": 0.0009993059151863503, + "loss": 3.3965, + "step": 899 + }, + { + "epoch": 0.02668801708033093, + "grad_norm": 0.18270206451416016, + "learning_rate": 0.0009993034346749755, + "loss": 3.382, + "step": 900 + }, + { + "epoch": 0.02671767043264241, + "grad_norm": 0.1748824119567871, + "learning_rate": 0.000999300949742183, + "loss": 3.4016, + "step": 901 + }, + { + "epoch": 0.02674732378495389, + "grad_norm": 0.1668251007795334, + "learning_rate": 0.0009992984603879947, + "loss": 3.3459, + "step": 902 + }, + { + "epoch": 0.026776977137265366, + "grad_norm": 0.135682612657547, + "learning_rate": 0.0009992959666124328, + "loss": 3.4029, + "step": 903 + }, + { + "epoch": 0.026806630489576846, + "grad_norm": 0.16326545178890228, + "learning_rate": 0.0009992934684155198, + "loss": 3.3666, + "step": 904 + }, + { + "epoch": 0.026836283841888325, + "grad_norm": 0.20121295750141144, + "learning_rate": 0.0009992909657972771, + "loss": 3.3535, + "step": 905 + }, + { + "epoch": 0.026865937194199805, + "grad_norm": 0.14417268335819244, + "learning_rate": 0.0009992884587577272, + "loss": 3.3417, + "step": 906 + }, + { + "epoch": 0.026895590546511285, + "grad_norm": 0.13815388083457947, + "learning_rate": 0.0009992859472968923, + "loss": 3.3329, + "step": 907 + }, + { + "epoch": 0.02692524389882276, + "grad_norm": 0.16861768066883087, + "learning_rate": 0.0009992834314147946, + "loss": 3.3812, + "step": 908 + }, + { + "epoch": 0.02695489725113424, + "grad_norm": 0.14126773178577423, + "learning_rate": 0.0009992809111114566, + "loss": 3.3338, + "step": 909 + }, + { + "epoch": 0.02698455060344572, + "grad_norm": 0.15209916234016418, + "learning_rate": 0.0009992783863869005, + "loss": 3.3849, + "step": 910 + }, + { + "epoch": 0.0270142039557572, + "grad_norm": 0.16932103037834167, + "learning_rate": 0.0009992758572411485, + "loss": 3.3506, + "step": 911 + }, + { + "epoch": 0.027043857308068676, + "grad_norm": 0.19320331513881683, + "learning_rate": 0.000999273323674223, + "loss": 3.4006, + "step": 912 + }, + { + "epoch": 0.027073510660380155, + "grad_norm": 0.17579573392868042, + "learning_rate": 0.0009992707856861466, + "loss": 3.3734, + "step": 913 + }, + { + "epoch": 0.027103164012691635, + "grad_norm": 0.13978657126426697, + "learning_rate": 0.0009992682432769415, + "loss": 3.3428, + "step": 914 + }, + { + "epoch": 0.027132817365003115, + "grad_norm": 0.16083240509033203, + "learning_rate": 0.0009992656964466307, + "loss": 3.3742, + "step": 915 + }, + { + "epoch": 0.02716247071731459, + "grad_norm": 0.15342693030834198, + "learning_rate": 0.0009992631451952363, + "loss": 3.3569, + "step": 916 + }, + { + "epoch": 0.02719212406962607, + "grad_norm": 0.13643799722194672, + "learning_rate": 0.000999260589522781, + "loss": 3.3554, + "step": 917 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 0.1443132907152176, + "learning_rate": 0.0009992580294292874, + "loss": 3.3557, + "step": 918 + }, + { + "epoch": 0.02725143077424903, + "grad_norm": 0.16157899796962738, + "learning_rate": 0.0009992554649147784, + "loss": 3.3644, + "step": 919 + }, + { + "epoch": 0.02728108412656051, + "grad_norm": 0.16932596266269684, + "learning_rate": 0.0009992528959792766, + "loss": 3.3931, + "step": 920 + }, + { + "epoch": 0.027310737478871985, + "grad_norm": 0.16516689956188202, + "learning_rate": 0.0009992503226228047, + "loss": 3.3416, + "step": 921 + }, + { + "epoch": 0.027340390831183465, + "grad_norm": 0.20238132774829865, + "learning_rate": 0.0009992477448453854, + "loss": 3.3231, + "step": 922 + }, + { + "epoch": 0.027370044183494945, + "grad_norm": 0.1964876353740692, + "learning_rate": 0.0009992451626470418, + "loss": 3.361, + "step": 923 + }, + { + "epoch": 0.027399697535806424, + "grad_norm": 0.18181852996349335, + "learning_rate": 0.0009992425760277964, + "loss": 3.3735, + "step": 924 + }, + { + "epoch": 0.0274293508881179, + "grad_norm": 0.1799389123916626, + "learning_rate": 0.0009992399849876724, + "loss": 3.3803, + "step": 925 + }, + { + "epoch": 0.02745900424042938, + "grad_norm": 0.17520570755004883, + "learning_rate": 0.0009992373895266926, + "loss": 3.3343, + "step": 926 + }, + { + "epoch": 0.02748865759274086, + "grad_norm": 0.18397241830825806, + "learning_rate": 0.0009992347896448802, + "loss": 3.3617, + "step": 927 + }, + { + "epoch": 0.02751831094505234, + "grad_norm": 0.17877130210399628, + "learning_rate": 0.0009992321853422579, + "loss": 3.3536, + "step": 928 + }, + { + "epoch": 0.027547964297363815, + "grad_norm": 0.2018546760082245, + "learning_rate": 0.000999229576618849, + "loss": 3.3458, + "step": 929 + }, + { + "epoch": 0.027577617649675295, + "grad_norm": 0.19510887563228607, + "learning_rate": 0.0009992269634746763, + "loss": 3.3703, + "step": 930 + }, + { + "epoch": 0.027607271001986775, + "grad_norm": 0.21111738681793213, + "learning_rate": 0.0009992243459097632, + "loss": 3.3287, + "step": 931 + }, + { + "epoch": 0.027636924354298254, + "grad_norm": 0.17444829642772675, + "learning_rate": 0.0009992217239241329, + "loss": 3.3846, + "step": 932 + }, + { + "epoch": 0.027666577706609734, + "grad_norm": 0.1421005129814148, + "learning_rate": 0.0009992190975178085, + "loss": 3.3399, + "step": 933 + }, + { + "epoch": 0.02769623105892121, + "grad_norm": 0.16592632234096527, + "learning_rate": 0.0009992164666908132, + "loss": 3.3621, + "step": 934 + }, + { + "epoch": 0.02772588441123269, + "grad_norm": 0.16074825823307037, + "learning_rate": 0.0009992138314431705, + "loss": 3.3519, + "step": 935 + }, + { + "epoch": 0.02775553776354417, + "grad_norm": 0.14772823452949524, + "learning_rate": 0.0009992111917749037, + "loss": 3.378, + "step": 936 + }, + { + "epoch": 0.02778519111585565, + "grad_norm": 0.18205130100250244, + "learning_rate": 0.000999208547686036, + "loss": 3.3175, + "step": 937 + }, + { + "epoch": 0.027814844468167125, + "grad_norm": 0.19178275763988495, + "learning_rate": 0.000999205899176591, + "loss": 3.3747, + "step": 938 + }, + { + "epoch": 0.027844497820478604, + "grad_norm": 0.16416914761066437, + "learning_rate": 0.000999203246246592, + "loss": 3.3166, + "step": 939 + }, + { + "epoch": 0.027874151172790084, + "grad_norm": 0.1698065847158432, + "learning_rate": 0.0009992005888960628, + "loss": 3.3711, + "step": 940 + }, + { + "epoch": 0.027903804525101564, + "grad_norm": 0.16538341343402863, + "learning_rate": 0.0009991979271250263, + "loss": 3.3679, + "step": 941 + }, + { + "epoch": 0.02793345787741304, + "grad_norm": 0.18359026312828064, + "learning_rate": 0.0009991952609335068, + "loss": 3.3537, + "step": 942 + }, + { + "epoch": 0.02796311122972452, + "grad_norm": 0.198978990316391, + "learning_rate": 0.0009991925903215276, + "loss": 3.3433, + "step": 943 + }, + { + "epoch": 0.027992764582036, + "grad_norm": 0.19789327681064606, + "learning_rate": 0.000999189915289112, + "loss": 3.3638, + "step": 944 + }, + { + "epoch": 0.02802241793434748, + "grad_norm": 0.21381688117980957, + "learning_rate": 0.0009991872358362844, + "loss": 3.3493, + "step": 945 + }, + { + "epoch": 0.02805207128665896, + "grad_norm": 0.20071791112422943, + "learning_rate": 0.0009991845519630679, + "loss": 3.3224, + "step": 946 + }, + { + "epoch": 0.028081724638970434, + "grad_norm": 0.18675117194652557, + "learning_rate": 0.0009991818636694864, + "loss": 3.335, + "step": 947 + }, + { + "epoch": 0.028111377991281914, + "grad_norm": 0.22189271450042725, + "learning_rate": 0.0009991791709555642, + "loss": 3.3202, + "step": 948 + }, + { + "epoch": 0.028141031343593394, + "grad_norm": 0.17139242589473724, + "learning_rate": 0.0009991764738213245, + "loss": 3.3251, + "step": 949 + }, + { + "epoch": 0.028170684695904873, + "grad_norm": 0.1490715742111206, + "learning_rate": 0.0009991737722667914, + "loss": 3.3595, + "step": 950 + }, + { + "epoch": 0.02820033804821635, + "grad_norm": 0.1890162080526352, + "learning_rate": 0.000999171066291989, + "loss": 3.3684, + "step": 951 + }, + { + "epoch": 0.02822999140052783, + "grad_norm": 0.14452694356441498, + "learning_rate": 0.000999168355896941, + "loss": 3.3821, + "step": 952 + }, + { + "epoch": 0.02825964475283931, + "grad_norm": 0.15880770981311798, + "learning_rate": 0.0009991656410816717, + "loss": 3.3302, + "step": 953 + }, + { + "epoch": 0.02828929810515079, + "grad_norm": 0.16685546934604645, + "learning_rate": 0.000999162921846205, + "loss": 3.3336, + "step": 954 + }, + { + "epoch": 0.028318951457462264, + "grad_norm": 0.1621856540441513, + "learning_rate": 0.0009991601981905647, + "loss": 3.329, + "step": 955 + }, + { + "epoch": 0.028348604809773744, + "grad_norm": 0.15774758160114288, + "learning_rate": 0.000999157470114775, + "loss": 3.3575, + "step": 956 + }, + { + "epoch": 0.028378258162085224, + "grad_norm": 0.18632176518440247, + "learning_rate": 0.0009991547376188607, + "loss": 3.3708, + "step": 957 + }, + { + "epoch": 0.028407911514396703, + "grad_norm": 0.22040091454982758, + "learning_rate": 0.0009991520007028452, + "loss": 3.2987, + "step": 958 + }, + { + "epoch": 0.028437564866708183, + "grad_norm": 0.19909052550792694, + "learning_rate": 0.0009991492593667533, + "loss": 3.3709, + "step": 959 + }, + { + "epoch": 0.02846721821901966, + "grad_norm": 0.16232089698314667, + "learning_rate": 0.000999146513610609, + "loss": 3.3292, + "step": 960 + }, + { + "epoch": 0.02849687157133114, + "grad_norm": 0.15386366844177246, + "learning_rate": 0.0009991437634344364, + "loss": 3.3431, + "step": 961 + }, + { + "epoch": 0.02852652492364262, + "grad_norm": 0.13735127449035645, + "learning_rate": 0.0009991410088382603, + "loss": 3.3428, + "step": 962 + }, + { + "epoch": 0.028556178275954098, + "grad_norm": 0.1475311666727066, + "learning_rate": 0.0009991382498221047, + "loss": 3.3334, + "step": 963 + }, + { + "epoch": 0.028585831628265574, + "grad_norm": 0.1473138928413391, + "learning_rate": 0.0009991354863859946, + "loss": 3.3099, + "step": 964 + }, + { + "epoch": 0.028615484980577054, + "grad_norm": 0.13235396146774292, + "learning_rate": 0.0009991327185299536, + "loss": 3.3281, + "step": 965 + }, + { + "epoch": 0.028645138332888533, + "grad_norm": 0.13398316502571106, + "learning_rate": 0.000999129946254007, + "loss": 3.3237, + "step": 966 + }, + { + "epoch": 0.028674791685200013, + "grad_norm": 0.1383059322834015, + "learning_rate": 0.000999127169558179, + "loss": 3.3188, + "step": 967 + }, + { + "epoch": 0.02870444503751149, + "grad_norm": 0.13007953763008118, + "learning_rate": 0.0009991243884424944, + "loss": 3.3145, + "step": 968 + }, + { + "epoch": 0.02873409838982297, + "grad_norm": 0.15393991768360138, + "learning_rate": 0.0009991216029069773, + "loss": 3.3717, + "step": 969 + }, + { + "epoch": 0.028763751742134448, + "grad_norm": 0.14912573993206024, + "learning_rate": 0.000999118812951653, + "loss": 3.3206, + "step": 970 + }, + { + "epoch": 0.028793405094445928, + "grad_norm": 0.15008065104484558, + "learning_rate": 0.000999116018576546, + "loss": 3.3702, + "step": 971 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 0.16470246016979218, + "learning_rate": 0.0009991132197816807, + "loss": 3.3553, + "step": 972 + }, + { + "epoch": 0.028852711799068884, + "grad_norm": 0.17517021298408508, + "learning_rate": 0.0009991104165670824, + "loss": 3.3105, + "step": 973 + }, + { + "epoch": 0.028882365151380363, + "grad_norm": 0.19603405892848969, + "learning_rate": 0.0009991076089327757, + "loss": 3.3189, + "step": 974 + }, + { + "epoch": 0.028912018503691843, + "grad_norm": 0.18710175156593323, + "learning_rate": 0.0009991047968787854, + "loss": 3.3015, + "step": 975 + }, + { + "epoch": 0.028941671856003322, + "grad_norm": 0.16662989556789398, + "learning_rate": 0.0009991019804051363, + "loss": 3.3186, + "step": 976 + }, + { + "epoch": 0.0289713252083148, + "grad_norm": 0.1510525792837143, + "learning_rate": 0.0009990991595118536, + "loss": 3.3251, + "step": 977 + }, + { + "epoch": 0.029000978560626278, + "grad_norm": 0.14021937549114227, + "learning_rate": 0.0009990963341989622, + "loss": 3.3442, + "step": 978 + }, + { + "epoch": 0.029030631912937758, + "grad_norm": 0.1478705108165741, + "learning_rate": 0.0009990935044664872, + "loss": 3.3077, + "step": 979 + }, + { + "epoch": 0.029060285265249237, + "grad_norm": 0.14707988500595093, + "learning_rate": 0.0009990906703144533, + "loss": 3.3266, + "step": 980 + }, + { + "epoch": 0.029089938617560714, + "grad_norm": 0.1531607061624527, + "learning_rate": 0.000999087831742886, + "loss": 3.3321, + "step": 981 + }, + { + "epoch": 0.029119591969872193, + "grad_norm": 0.14246581494808197, + "learning_rate": 0.0009990849887518104, + "loss": 3.3281, + "step": 982 + }, + { + "epoch": 0.029149245322183673, + "grad_norm": 0.15023761987686157, + "learning_rate": 0.0009990821413412515, + "loss": 3.2743, + "step": 983 + }, + { + "epoch": 0.029178898674495152, + "grad_norm": 0.13944029808044434, + "learning_rate": 0.0009990792895112344, + "loss": 3.2976, + "step": 984 + }, + { + "epoch": 0.029208552026806632, + "grad_norm": 0.1620928794145584, + "learning_rate": 0.0009990764332617845, + "loss": 3.3101, + "step": 985 + }, + { + "epoch": 0.029238205379118108, + "grad_norm": 0.17247651517391205, + "learning_rate": 0.0009990735725929273, + "loss": 3.268, + "step": 986 + }, + { + "epoch": 0.029267858731429588, + "grad_norm": 0.13643120229244232, + "learning_rate": 0.0009990707075046878, + "loss": 3.3295, + "step": 987 + }, + { + "epoch": 0.029297512083741067, + "grad_norm": 0.16809198260307312, + "learning_rate": 0.0009990678379970916, + "loss": 3.3415, + "step": 988 + }, + { + "epoch": 0.029327165436052547, + "grad_norm": 0.1833949089050293, + "learning_rate": 0.0009990649640701642, + "loss": 3.3337, + "step": 989 + }, + { + "epoch": 0.029356818788364023, + "grad_norm": 0.16453784704208374, + "learning_rate": 0.0009990620857239308, + "loss": 3.3322, + "step": 990 + }, + { + "epoch": 0.029386472140675503, + "grad_norm": 0.16691984236240387, + "learning_rate": 0.0009990592029584168, + "loss": 3.3126, + "step": 991 + }, + { + "epoch": 0.029416125492986982, + "grad_norm": 0.17328670620918274, + "learning_rate": 0.000999056315773648, + "loss": 3.3107, + "step": 992 + }, + { + "epoch": 0.029445778845298462, + "grad_norm": 0.18959002196788788, + "learning_rate": 0.0009990534241696499, + "loss": 3.3207, + "step": 993 + }, + { + "epoch": 0.029475432197609938, + "grad_norm": 0.1924528032541275, + "learning_rate": 0.0009990505281464478, + "loss": 3.3094, + "step": 994 + }, + { + "epoch": 0.029505085549921418, + "grad_norm": 0.195547953248024, + "learning_rate": 0.0009990476277040678, + "loss": 3.3139, + "step": 995 + }, + { + "epoch": 0.029534738902232897, + "grad_norm": 0.19128061830997467, + "learning_rate": 0.0009990447228425355, + "loss": 3.3204, + "step": 996 + }, + { + "epoch": 0.029564392254544377, + "grad_norm": 0.17986580729484558, + "learning_rate": 0.0009990418135618765, + "loss": 3.3097, + "step": 997 + }, + { + "epoch": 0.029594045606855857, + "grad_norm": 0.173731729388237, + "learning_rate": 0.0009990388998621165, + "loss": 3.3124, + "step": 998 + }, + { + "epoch": 0.029623698959167333, + "grad_norm": 0.2196926474571228, + "learning_rate": 0.0009990359817432814, + "loss": 3.2529, + "step": 999 + }, + { + "epoch": 0.029653352311478812, + "grad_norm": 0.21107357740402222, + "learning_rate": 0.0009990330592053972, + "loss": 3.3061, + "step": 1000 + }, + { + "epoch": 0.029683005663790292, + "grad_norm": 0.2056809812784195, + "learning_rate": 0.0009990301322484894, + "loss": 3.3313, + "step": 1001 + }, + { + "epoch": 0.02971265901610177, + "grad_norm": 0.1795404851436615, + "learning_rate": 0.000999027200872584, + "loss": 3.3122, + "step": 1002 + }, + { + "epoch": 0.029742312368413248, + "grad_norm": 0.1940668821334839, + "learning_rate": 0.0009990242650777072, + "loss": 3.3229, + "step": 1003 + }, + { + "epoch": 0.029771965720724727, + "grad_norm": 0.2560441493988037, + "learning_rate": 0.000999021324863885, + "loss": 3.3093, + "step": 1004 + }, + { + "epoch": 0.029801619073036207, + "grad_norm": 0.21737496554851532, + "learning_rate": 0.0009990183802311432, + "loss": 3.3492, + "step": 1005 + }, + { + "epoch": 0.029831272425347687, + "grad_norm": 0.19872844219207764, + "learning_rate": 0.0009990154311795081, + "loss": 3.3254, + "step": 1006 + }, + { + "epoch": 0.029860925777659163, + "grad_norm": 0.18111220002174377, + "learning_rate": 0.0009990124777090055, + "loss": 3.3062, + "step": 1007 + }, + { + "epoch": 0.029890579129970642, + "grad_norm": 0.20647108554840088, + "learning_rate": 0.0009990095198196618, + "loss": 3.3686, + "step": 1008 + }, + { + "epoch": 0.029920232482282122, + "grad_norm": 0.21840985119342804, + "learning_rate": 0.0009990065575115033, + "loss": 3.2857, + "step": 1009 + }, + { + "epoch": 0.0299498858345936, + "grad_norm": 0.17398293316364288, + "learning_rate": 0.000999003590784556, + "loss": 3.3178, + "step": 1010 + }, + { + "epoch": 0.02997953918690508, + "grad_norm": 0.16055263578891754, + "learning_rate": 0.0009990006196388462, + "loss": 3.3096, + "step": 1011 + }, + { + "epoch": 0.030009192539216557, + "grad_norm": 0.15691447257995605, + "learning_rate": 0.0009989976440744003, + "loss": 3.2577, + "step": 1012 + }, + { + "epoch": 0.030038845891528037, + "grad_norm": 0.13105595111846924, + "learning_rate": 0.0009989946640912447, + "loss": 3.2958, + "step": 1013 + }, + { + "epoch": 0.030068499243839517, + "grad_norm": 0.12209729105234146, + "learning_rate": 0.0009989916796894055, + "loss": 3.3156, + "step": 1014 + }, + { + "epoch": 0.030098152596150996, + "grad_norm": 0.1258728951215744, + "learning_rate": 0.0009989886908689095, + "loss": 3.3292, + "step": 1015 + }, + { + "epoch": 0.030127805948462472, + "grad_norm": 0.1182577908039093, + "learning_rate": 0.000998985697629783, + "loss": 3.287, + "step": 1016 + }, + { + "epoch": 0.030157459300773952, + "grad_norm": 0.1362868845462799, + "learning_rate": 0.0009989826999720524, + "loss": 3.3371, + "step": 1017 + }, + { + "epoch": 0.03018711265308543, + "grad_norm": 0.14311102032661438, + "learning_rate": 0.0009989796978957443, + "loss": 3.2454, + "step": 1018 + }, + { + "epoch": 0.03021676600539691, + "grad_norm": 0.14869216084480286, + "learning_rate": 0.0009989766914008855, + "loss": 3.2833, + "step": 1019 + }, + { + "epoch": 0.030246419357708387, + "grad_norm": 0.163857102394104, + "learning_rate": 0.0009989736804875023, + "loss": 3.2972, + "step": 1020 + }, + { + "epoch": 0.030276072710019867, + "grad_norm": 0.15155421197414398, + "learning_rate": 0.0009989706651556216, + "loss": 3.2652, + "step": 1021 + }, + { + "epoch": 0.030305726062331347, + "grad_norm": 0.13023793697357178, + "learning_rate": 0.00099896764540527, + "loss": 3.2799, + "step": 1022 + }, + { + "epoch": 0.030335379414642826, + "grad_norm": 0.12600980699062347, + "learning_rate": 0.0009989646212364743, + "loss": 3.3503, + "step": 1023 + }, + { + "epoch": 0.030365032766954306, + "grad_norm": 0.12982924282550812, + "learning_rate": 0.000998961592649261, + "loss": 3.2853, + "step": 1024 + }, + { + "epoch": 0.030394686119265782, + "grad_norm": 0.13902488350868225, + "learning_rate": 0.0009989585596436572, + "loss": 3.2948, + "step": 1025 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 0.14894331991672516, + "learning_rate": 0.00099895552221969, + "loss": 3.3102, + "step": 1026 + }, + { + "epoch": 0.03045399282388874, + "grad_norm": 0.1719938963651657, + "learning_rate": 0.0009989524803773857, + "loss": 3.2993, + "step": 1027 + }, + { + "epoch": 0.03048364617620022, + "grad_norm": 0.18437594175338745, + "learning_rate": 0.0009989494341167717, + "loss": 3.3457, + "step": 1028 + }, + { + "epoch": 0.030513299528511697, + "grad_norm": 0.19169045984745026, + "learning_rate": 0.0009989463834378749, + "loss": 3.3078, + "step": 1029 + }, + { + "epoch": 0.030542952880823176, + "grad_norm": 0.23796004056930542, + "learning_rate": 0.0009989433283407222, + "loss": 3.2932, + "step": 1030 + }, + { + "epoch": 0.030572606233134656, + "grad_norm": 0.26200374960899353, + "learning_rate": 0.0009989402688253405, + "loss": 3.2933, + "step": 1031 + }, + { + "epoch": 0.030602259585446136, + "grad_norm": 0.19204600155353546, + "learning_rate": 0.000998937204891757, + "loss": 3.3029, + "step": 1032 + }, + { + "epoch": 0.030631912937757612, + "grad_norm": 0.18435980379581451, + "learning_rate": 0.0009989341365399993, + "loss": 3.2732, + "step": 1033 + }, + { + "epoch": 0.03066156629006909, + "grad_norm": 0.1630273014307022, + "learning_rate": 0.0009989310637700938, + "loss": 3.2995, + "step": 1034 + }, + { + "epoch": 0.03069121964238057, + "grad_norm": 0.1851792335510254, + "learning_rate": 0.0009989279865820684, + "loss": 3.2709, + "step": 1035 + }, + { + "epoch": 0.03072087299469205, + "grad_norm": 0.1571241319179535, + "learning_rate": 0.0009989249049759499, + "loss": 3.3194, + "step": 1036 + }, + { + "epoch": 0.03075052634700353, + "grad_norm": 0.1520608365535736, + "learning_rate": 0.0009989218189517656, + "loss": 3.2994, + "step": 1037 + }, + { + "epoch": 0.030780179699315006, + "grad_norm": 0.16943219304084778, + "learning_rate": 0.0009989187285095432, + "loss": 3.2772, + "step": 1038 + }, + { + "epoch": 0.030809833051626486, + "grad_norm": 0.15617938339710236, + "learning_rate": 0.0009989156336493096, + "loss": 3.2498, + "step": 1039 + }, + { + "epoch": 0.030839486403937966, + "grad_norm": 0.14182314276695251, + "learning_rate": 0.0009989125343710925, + "loss": 3.2936, + "step": 1040 + }, + { + "epoch": 0.030869139756249445, + "grad_norm": 0.13897305727005005, + "learning_rate": 0.0009989094306749194, + "loss": 3.3135, + "step": 1041 + }, + { + "epoch": 0.03089879310856092, + "grad_norm": 0.1480395793914795, + "learning_rate": 0.0009989063225608174, + "loss": 3.2828, + "step": 1042 + }, + { + "epoch": 0.0309284464608724, + "grad_norm": 0.18869706988334656, + "learning_rate": 0.0009989032100288146, + "loss": 3.3149, + "step": 1043 + }, + { + "epoch": 0.03095809981318388, + "grad_norm": 0.23003579676151276, + "learning_rate": 0.000998900093078938, + "loss": 3.2836, + "step": 1044 + }, + { + "epoch": 0.03098775316549536, + "grad_norm": 0.23080618679523468, + "learning_rate": 0.0009988969717112156, + "loss": 3.2884, + "step": 1045 + }, + { + "epoch": 0.031017406517806836, + "grad_norm": 0.24311235547065735, + "learning_rate": 0.0009988938459256746, + "loss": 3.3201, + "step": 1046 + }, + { + "epoch": 0.031047059870118316, + "grad_norm": 0.2487393319606781, + "learning_rate": 0.0009988907157223433, + "loss": 3.2988, + "step": 1047 + }, + { + "epoch": 0.031076713222429796, + "grad_norm": 0.19402426481246948, + "learning_rate": 0.0009988875811012489, + "loss": 3.3182, + "step": 1048 + }, + { + "epoch": 0.031106366574741275, + "grad_norm": 0.19939158856868744, + "learning_rate": 0.0009988844420624195, + "loss": 3.3381, + "step": 1049 + }, + { + "epoch": 0.031136019927052755, + "grad_norm": 0.19758954644203186, + "learning_rate": 0.0009988812986058825, + "loss": 3.3204, + "step": 1050 + }, + { + "epoch": 0.03116567327936423, + "grad_norm": 0.2112305909395218, + "learning_rate": 0.000998878150731666, + "loss": 3.2961, + "step": 1051 + }, + { + "epoch": 0.03119532663167571, + "grad_norm": 0.17039300501346588, + "learning_rate": 0.000998874998439798, + "loss": 3.2544, + "step": 1052 + }, + { + "epoch": 0.03122497998398719, + "grad_norm": 0.14222951233386993, + "learning_rate": 0.000998871841730306, + "loss": 3.2927, + "step": 1053 + }, + { + "epoch": 0.031254633336298666, + "grad_norm": 0.15199501812458038, + "learning_rate": 0.0009988686806032185, + "loss": 3.2839, + "step": 1054 + }, + { + "epoch": 0.03128428668861015, + "grad_norm": 0.17120301723480225, + "learning_rate": 0.0009988655150585631, + "loss": 3.2758, + "step": 1055 + }, + { + "epoch": 0.031313940040921626, + "grad_norm": 0.14499114453792572, + "learning_rate": 0.0009988623450963678, + "loss": 3.3061, + "step": 1056 + }, + { + "epoch": 0.0313435933932331, + "grad_norm": 0.13424161076545715, + "learning_rate": 0.000998859170716661, + "loss": 3.3138, + "step": 1057 + }, + { + "epoch": 0.031373246745544585, + "grad_norm": 0.12494222074747086, + "learning_rate": 0.0009988559919194707, + "loss": 3.299, + "step": 1058 + }, + { + "epoch": 0.03140290009785606, + "grad_norm": 0.12927845120429993, + "learning_rate": 0.0009988528087048248, + "loss": 3.2832, + "step": 1059 + }, + { + "epoch": 0.031432553450167544, + "grad_norm": 0.12762780487537384, + "learning_rate": 0.0009988496210727516, + "loss": 3.2099, + "step": 1060 + }, + { + "epoch": 0.03146220680247902, + "grad_norm": 0.13430887460708618, + "learning_rate": 0.0009988464290232794, + "loss": 3.2825, + "step": 1061 + }, + { + "epoch": 0.031491860154790496, + "grad_norm": 0.14576424658298492, + "learning_rate": 0.0009988432325564365, + "loss": 3.2822, + "step": 1062 + }, + { + "epoch": 0.03152151350710198, + "grad_norm": 0.15760478377342224, + "learning_rate": 0.000998840031672251, + "loss": 3.3228, + "step": 1063 + }, + { + "epoch": 0.031551166859413456, + "grad_norm": 0.1773822009563446, + "learning_rate": 0.0009988368263707517, + "loss": 3.2712, + "step": 1064 + }, + { + "epoch": 0.03158082021172494, + "grad_norm": 0.20679695904254913, + "learning_rate": 0.0009988336166519664, + "loss": 3.2827, + "step": 1065 + }, + { + "epoch": 0.031610473564036415, + "grad_norm": 0.17482241988182068, + "learning_rate": 0.0009988304025159238, + "loss": 3.267, + "step": 1066 + }, + { + "epoch": 0.03164012691634789, + "grad_norm": 0.1479623168706894, + "learning_rate": 0.0009988271839626525, + "loss": 3.3051, + "step": 1067 + }, + { + "epoch": 0.031669780268659374, + "grad_norm": 0.18049685657024384, + "learning_rate": 0.000998823960992181, + "loss": 3.2872, + "step": 1068 + }, + { + "epoch": 0.03169943362097085, + "grad_norm": 0.17890304327011108, + "learning_rate": 0.0009988207336045375, + "loss": 3.2659, + "step": 1069 + }, + { + "epoch": 0.031729086973282326, + "grad_norm": 0.19008959829807281, + "learning_rate": 0.0009988175017997508, + "loss": 3.2659, + "step": 1070 + }, + { + "epoch": 0.03175874032559381, + "grad_norm": 0.17142337560653687, + "learning_rate": 0.0009988142655778494, + "loss": 3.2882, + "step": 1071 + }, + { + "epoch": 0.031788393677905286, + "grad_norm": 0.17029771208763123, + "learning_rate": 0.0009988110249388622, + "loss": 3.2789, + "step": 1072 + }, + { + "epoch": 0.03181804703021677, + "grad_norm": 0.1746337115764618, + "learning_rate": 0.0009988077798828178, + "loss": 3.2978, + "step": 1073 + }, + { + "epoch": 0.031847700382528245, + "grad_norm": 0.15618453919887543, + "learning_rate": 0.0009988045304097448, + "loss": 3.2536, + "step": 1074 + }, + { + "epoch": 0.03187735373483972, + "grad_norm": 0.1505044847726822, + "learning_rate": 0.000998801276519672, + "loss": 3.2725, + "step": 1075 + }, + { + "epoch": 0.031907007087151204, + "grad_norm": 0.15439356863498688, + "learning_rate": 0.0009987980182126284, + "loss": 3.3168, + "step": 1076 + }, + { + "epoch": 0.03193666043946268, + "grad_norm": 0.17249749600887299, + "learning_rate": 0.0009987947554886427, + "loss": 3.2814, + "step": 1077 + }, + { + "epoch": 0.03196631379177416, + "grad_norm": 0.2087135761976242, + "learning_rate": 0.0009987914883477437, + "loss": 3.2732, + "step": 1078 + }, + { + "epoch": 0.03199596714408564, + "grad_norm": 0.17093351483345032, + "learning_rate": 0.0009987882167899608, + "loss": 3.288, + "step": 1079 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 0.13476668298244476, + "learning_rate": 0.0009987849408153223, + "loss": 3.3032, + "step": 1080 + }, + { + "epoch": 0.0320552738487086, + "grad_norm": 0.11429506540298462, + "learning_rate": 0.0009987816604238575, + "loss": 3.2924, + "step": 1081 + }, + { + "epoch": 0.032084927201020075, + "grad_norm": 0.11740116775035858, + "learning_rate": 0.0009987783756155958, + "loss": 3.3085, + "step": 1082 + }, + { + "epoch": 0.03211458055333155, + "grad_norm": 0.1323305070400238, + "learning_rate": 0.0009987750863905658, + "loss": 3.2539, + "step": 1083 + }, + { + "epoch": 0.032144233905643034, + "grad_norm": 0.15130504965782166, + "learning_rate": 0.0009987717927487968, + "loss": 3.2301, + "step": 1084 + }, + { + "epoch": 0.03217388725795451, + "grad_norm": 0.16686268150806427, + "learning_rate": 0.000998768494690318, + "loss": 3.2859, + "step": 1085 + }, + { + "epoch": 0.03220354061026599, + "grad_norm": 0.17415161430835724, + "learning_rate": 0.0009987651922151585, + "loss": 3.2658, + "step": 1086 + }, + { + "epoch": 0.03223319396257747, + "grad_norm": 0.16873973608016968, + "learning_rate": 0.0009987618853233475, + "loss": 3.2755, + "step": 1087 + }, + { + "epoch": 0.032262847314888946, + "grad_norm": 0.17810101807117462, + "learning_rate": 0.0009987585740149146, + "loss": 3.2685, + "step": 1088 + }, + { + "epoch": 0.03229250066720043, + "grad_norm": 0.199286550283432, + "learning_rate": 0.0009987552582898887, + "loss": 3.2889, + "step": 1089 + }, + { + "epoch": 0.032322154019511905, + "grad_norm": 0.20449542999267578, + "learning_rate": 0.0009987519381482995, + "loss": 3.2808, + "step": 1090 + }, + { + "epoch": 0.03235180737182339, + "grad_norm": 0.1743985414505005, + "learning_rate": 0.0009987486135901763, + "loss": 3.2833, + "step": 1091 + }, + { + "epoch": 0.032381460724134864, + "grad_norm": 0.18822696805000305, + "learning_rate": 0.0009987452846155485, + "loss": 3.2893, + "step": 1092 + }, + { + "epoch": 0.03241111407644634, + "grad_norm": 0.1489480435848236, + "learning_rate": 0.0009987419512244456, + "loss": 3.2538, + "step": 1093 + }, + { + "epoch": 0.03244076742875782, + "grad_norm": 0.16566063463687897, + "learning_rate": 0.0009987386134168972, + "loss": 3.2736, + "step": 1094 + }, + { + "epoch": 0.0324704207810693, + "grad_norm": 0.14044764637947083, + "learning_rate": 0.0009987352711929326, + "loss": 3.3166, + "step": 1095 + }, + { + "epoch": 0.032500074133380775, + "grad_norm": 0.1663317233324051, + "learning_rate": 0.0009987319245525817, + "loss": 3.2515, + "step": 1096 + }, + { + "epoch": 0.03252972748569226, + "grad_norm": 0.17000503838062286, + "learning_rate": 0.000998728573495874, + "loss": 3.3134, + "step": 1097 + }, + { + "epoch": 0.032559380838003735, + "grad_norm": 0.19783899188041687, + "learning_rate": 0.000998725218022839, + "loss": 3.2744, + "step": 1098 + }, + { + "epoch": 0.03258903419031522, + "grad_norm": 0.19925615191459656, + "learning_rate": 0.0009987218581335067, + "loss": 3.2695, + "step": 1099 + }, + { + "epoch": 0.032618687542626694, + "grad_norm": 0.16256730258464813, + "learning_rate": 0.0009987184938279067, + "loss": 3.2783, + "step": 1100 + }, + { + "epoch": 0.03264834089493817, + "grad_norm": 0.1784701645374298, + "learning_rate": 0.000998715125106069, + "loss": 3.2708, + "step": 1101 + }, + { + "epoch": 0.03267799424724965, + "grad_norm": 0.21611616015434265, + "learning_rate": 0.000998711751968023, + "loss": 3.2563, + "step": 1102 + }, + { + "epoch": 0.03270764759956113, + "grad_norm": 0.23970991373062134, + "learning_rate": 0.000998708374413799, + "loss": 3.2795, + "step": 1103 + }, + { + "epoch": 0.03273730095187261, + "grad_norm": 0.19925890862941742, + "learning_rate": 0.000998704992443427, + "loss": 3.278, + "step": 1104 + }, + { + "epoch": 0.03276695430418409, + "grad_norm": 0.22479945421218872, + "learning_rate": 0.0009987016060569362, + "loss": 3.311, + "step": 1105 + }, + { + "epoch": 0.032796607656495565, + "grad_norm": 0.2145710438489914, + "learning_rate": 0.0009986982152543574, + "loss": 3.2379, + "step": 1106 + }, + { + "epoch": 0.03282626100880705, + "grad_norm": 0.16441671550273895, + "learning_rate": 0.0009986948200357202, + "loss": 3.3043, + "step": 1107 + }, + { + "epoch": 0.032855914361118524, + "grad_norm": 0.1590532511472702, + "learning_rate": 0.0009986914204010548, + "loss": 3.2833, + "step": 1108 + }, + { + "epoch": 0.03288556771343, + "grad_norm": 0.14208590984344482, + "learning_rate": 0.0009986880163503913, + "loss": 3.2507, + "step": 1109 + }, + { + "epoch": 0.03291522106574148, + "grad_norm": 0.1373337358236313, + "learning_rate": 0.0009986846078837597, + "loss": 3.2782, + "step": 1110 + }, + { + "epoch": 0.03294487441805296, + "grad_norm": 0.1394660919904709, + "learning_rate": 0.0009986811950011903, + "loss": 3.3053, + "step": 1111 + }, + { + "epoch": 0.03297452777036444, + "grad_norm": 0.14525774121284485, + "learning_rate": 0.0009986777777027133, + "loss": 3.2426, + "step": 1112 + }, + { + "epoch": 0.03300418112267592, + "grad_norm": 0.12924417853355408, + "learning_rate": 0.0009986743559883592, + "loss": 3.2386, + "step": 1113 + }, + { + "epoch": 0.033033834474987395, + "grad_norm": 0.12544408440589905, + "learning_rate": 0.0009986709298581578, + "loss": 3.2604, + "step": 1114 + }, + { + "epoch": 0.03306348782729888, + "grad_norm": 0.1155332624912262, + "learning_rate": 0.00099866749931214, + "loss": 3.2629, + "step": 1115 + }, + { + "epoch": 0.033093141179610354, + "grad_norm": 0.11932841688394547, + "learning_rate": 0.0009986640643503358, + "loss": 3.2366, + "step": 1116 + }, + { + "epoch": 0.03312279453192184, + "grad_norm": 0.13045167922973633, + "learning_rate": 0.0009986606249727757, + "loss": 3.317, + "step": 1117 + }, + { + "epoch": 0.03315244788423331, + "grad_norm": 0.1574888676404953, + "learning_rate": 0.00099865718117949, + "loss": 3.2726, + "step": 1118 + }, + { + "epoch": 0.03318210123654479, + "grad_norm": 0.17953895032405853, + "learning_rate": 0.0009986537329705098, + "loss": 3.2451, + "step": 1119 + }, + { + "epoch": 0.03321175458885627, + "grad_norm": 0.17237243056297302, + "learning_rate": 0.0009986502803458646, + "loss": 3.2524, + "step": 1120 + }, + { + "epoch": 0.03324140794116775, + "grad_norm": 0.1761806607246399, + "learning_rate": 0.000998646823305586, + "loss": 3.2913, + "step": 1121 + }, + { + "epoch": 0.033271061293479225, + "grad_norm": 0.18849362432956696, + "learning_rate": 0.000998643361849704, + "loss": 3.2536, + "step": 1122 + }, + { + "epoch": 0.03330071464579071, + "grad_norm": 0.19382761418819427, + "learning_rate": 0.0009986398959782497, + "loss": 3.2435, + "step": 1123 + }, + { + "epoch": 0.033330367998102184, + "grad_norm": 0.1906290203332901, + "learning_rate": 0.0009986364256912533, + "loss": 3.2745, + "step": 1124 + }, + { + "epoch": 0.03336002135041367, + "grad_norm": 0.16135770082473755, + "learning_rate": 0.0009986329509887458, + "loss": 3.2564, + "step": 1125 + }, + { + "epoch": 0.03338967470272514, + "grad_norm": 0.18531398475170135, + "learning_rate": 0.000998629471870758, + "loss": 3.2701, + "step": 1126 + }, + { + "epoch": 0.03341932805503662, + "grad_norm": 0.16666510701179504, + "learning_rate": 0.0009986259883373206, + "loss": 3.2687, + "step": 1127 + }, + { + "epoch": 0.0334489814073481, + "grad_norm": 0.17248329520225525, + "learning_rate": 0.0009986225003884644, + "loss": 3.2648, + "step": 1128 + }, + { + "epoch": 0.03347863475965958, + "grad_norm": 0.17705562710762024, + "learning_rate": 0.0009986190080242202, + "loss": 3.212, + "step": 1129 + }, + { + "epoch": 0.03350828811197106, + "grad_norm": 0.17011605203151703, + "learning_rate": 0.0009986155112446196, + "loss": 3.2872, + "step": 1130 + }, + { + "epoch": 0.03353794146428254, + "grad_norm": 0.17867696285247803, + "learning_rate": 0.0009986120100496927, + "loss": 3.2935, + "step": 1131 + }, + { + "epoch": 0.033567594816594014, + "grad_norm": 0.16761235892772675, + "learning_rate": 0.000998608504439471, + "loss": 3.2453, + "step": 1132 + }, + { + "epoch": 0.0335972481689055, + "grad_norm": 0.15541785955429077, + "learning_rate": 0.0009986049944139853, + "loss": 3.2472, + "step": 1133 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 0.14217492938041687, + "learning_rate": 0.0009986014799732669, + "loss": 3.2506, + "step": 1134 + }, + { + "epoch": 0.03365655487352845, + "grad_norm": 0.19832982122898102, + "learning_rate": 0.0009985979611173469, + "loss": 3.2751, + "step": 1135 + }, + { + "epoch": 0.03368620822583993, + "grad_norm": 0.22609058022499084, + "learning_rate": 0.0009985944378462562, + "loss": 3.2545, + "step": 1136 + }, + { + "epoch": 0.03371586157815141, + "grad_norm": 0.2133471518754959, + "learning_rate": 0.000998590910160026, + "loss": 3.2466, + "step": 1137 + }, + { + "epoch": 0.03374551493046289, + "grad_norm": 0.1827542781829834, + "learning_rate": 0.000998587378058688, + "loss": 3.25, + "step": 1138 + }, + { + "epoch": 0.03377516828277437, + "grad_norm": 0.18850910663604736, + "learning_rate": 0.0009985838415422733, + "loss": 3.241, + "step": 1139 + }, + { + "epoch": 0.033804821635085844, + "grad_norm": 0.15460167825222015, + "learning_rate": 0.0009985803006108127, + "loss": 3.2498, + "step": 1140 + }, + { + "epoch": 0.03383447498739733, + "grad_norm": 0.19210222363471985, + "learning_rate": 0.0009985767552643382, + "loss": 3.2673, + "step": 1141 + }, + { + "epoch": 0.0338641283397088, + "grad_norm": 0.1573067605495453, + "learning_rate": 0.000998573205502881, + "loss": 3.2779, + "step": 1142 + }, + { + "epoch": 0.033893781692020286, + "grad_norm": 0.167486310005188, + "learning_rate": 0.0009985696513264723, + "loss": 3.2401, + "step": 1143 + }, + { + "epoch": 0.03392343504433176, + "grad_norm": 0.1993885189294815, + "learning_rate": 0.0009985660927351438, + "loss": 3.2739, + "step": 1144 + }, + { + "epoch": 0.03395308839664324, + "grad_norm": 0.12891526520252228, + "learning_rate": 0.000998562529728927, + "loss": 3.2382, + "step": 1145 + }, + { + "epoch": 0.03398274174895472, + "grad_norm": 0.14229075610637665, + "learning_rate": 0.0009985589623078535, + "loss": 3.2476, + "step": 1146 + }, + { + "epoch": 0.0340123951012662, + "grad_norm": 0.1536453813314438, + "learning_rate": 0.0009985553904719548, + "loss": 3.2567, + "step": 1147 + }, + { + "epoch": 0.034042048453577674, + "grad_norm": 0.16532956063747406, + "learning_rate": 0.0009985518142212625, + "loss": 3.2626, + "step": 1148 + }, + { + "epoch": 0.03407170180588916, + "grad_norm": 0.15326525270938873, + "learning_rate": 0.0009985482335558085, + "loss": 3.2177, + "step": 1149 + }, + { + "epoch": 0.03410135515820063, + "grad_norm": 0.1421831250190735, + "learning_rate": 0.000998544648475624, + "loss": 3.2457, + "step": 1150 + }, + { + "epoch": 0.034131008510512116, + "grad_norm": 0.1338009238243103, + "learning_rate": 0.0009985410589807412, + "loss": 3.2844, + "step": 1151 + }, + { + "epoch": 0.03416066186282359, + "grad_norm": 0.16380952298641205, + "learning_rate": 0.0009985374650711917, + "loss": 3.2248, + "step": 1152 + }, + { + "epoch": 0.03419031521513507, + "grad_norm": 0.15506426990032196, + "learning_rate": 0.0009985338667470075, + "loss": 3.2311, + "step": 1153 + }, + { + "epoch": 0.03421996856744655, + "grad_norm": 0.1403365284204483, + "learning_rate": 0.0009985302640082203, + "loss": 3.2688, + "step": 1154 + }, + { + "epoch": 0.03424962191975803, + "grad_norm": 0.14415393769741058, + "learning_rate": 0.000998526656854862, + "loss": 3.265, + "step": 1155 + }, + { + "epoch": 0.03427927527206951, + "grad_norm": 0.14032484591007233, + "learning_rate": 0.0009985230452869646, + "loss": 3.2743, + "step": 1156 + }, + { + "epoch": 0.03430892862438099, + "grad_norm": 0.16519302129745483, + "learning_rate": 0.00099851942930456, + "loss": 3.2091, + "step": 1157 + }, + { + "epoch": 0.03433858197669246, + "grad_norm": 0.21388553082942963, + "learning_rate": 0.0009985158089076804, + "loss": 3.2656, + "step": 1158 + }, + { + "epoch": 0.034368235329003946, + "grad_norm": 0.2566899061203003, + "learning_rate": 0.0009985121840963575, + "loss": 3.2801, + "step": 1159 + }, + { + "epoch": 0.03439788868131542, + "grad_norm": 0.2198057472705841, + "learning_rate": 0.000998508554870624, + "loss": 3.2654, + "step": 1160 + }, + { + "epoch": 0.0344275420336269, + "grad_norm": 0.21678443253040314, + "learning_rate": 0.0009985049212305115, + "loss": 3.289, + "step": 1161 + }, + { + "epoch": 0.03445719538593838, + "grad_norm": 0.2150394469499588, + "learning_rate": 0.0009985012831760522, + "loss": 3.2557, + "step": 1162 + }, + { + "epoch": 0.03448684873824986, + "grad_norm": 0.1968410164117813, + "learning_rate": 0.0009984976407072788, + "loss": 3.2406, + "step": 1163 + }, + { + "epoch": 0.03451650209056134, + "grad_norm": 0.14158646762371063, + "learning_rate": 0.000998493993824223, + "loss": 3.2292, + "step": 1164 + }, + { + "epoch": 0.03454615544287282, + "grad_norm": 0.15247036516666412, + "learning_rate": 0.0009984903425269173, + "loss": 3.2397, + "step": 1165 + }, + { + "epoch": 0.03457580879518429, + "grad_norm": 0.12534278631210327, + "learning_rate": 0.000998486686815394, + "loss": 3.2576, + "step": 1166 + }, + { + "epoch": 0.034605462147495776, + "grad_norm": 0.12443055212497711, + "learning_rate": 0.000998483026689686, + "loss": 3.2529, + "step": 1167 + }, + { + "epoch": 0.03463511549980725, + "grad_norm": 0.114616759121418, + "learning_rate": 0.0009984793621498247, + "loss": 3.2282, + "step": 1168 + }, + { + "epoch": 0.034664768852118735, + "grad_norm": 0.10736098885536194, + "learning_rate": 0.0009984756931958431, + "loss": 3.2475, + "step": 1169 + }, + { + "epoch": 0.03469442220443021, + "grad_norm": 0.11595934629440308, + "learning_rate": 0.000998472019827774, + "loss": 3.2231, + "step": 1170 + }, + { + "epoch": 0.03472407555674169, + "grad_norm": 0.12171924859285355, + "learning_rate": 0.0009984683420456496, + "loss": 3.2369, + "step": 1171 + }, + { + "epoch": 0.03475372890905317, + "grad_norm": 0.13299250602722168, + "learning_rate": 0.0009984646598495022, + "loss": 3.1977, + "step": 1172 + }, + { + "epoch": 0.03478338226136465, + "grad_norm": 0.13204465806484222, + "learning_rate": 0.0009984609732393648, + "loss": 3.2438, + "step": 1173 + }, + { + "epoch": 0.03481303561367612, + "grad_norm": 0.1902400106191635, + "learning_rate": 0.00099845728221527, + "loss": 3.2258, + "step": 1174 + }, + { + "epoch": 0.034842688965987606, + "grad_norm": 0.22467510402202606, + "learning_rate": 0.0009984535867772501, + "loss": 3.248, + "step": 1175 + }, + { + "epoch": 0.03487234231829908, + "grad_norm": 0.24400009214878082, + "learning_rate": 0.0009984498869253385, + "loss": 3.2416, + "step": 1176 + }, + { + "epoch": 0.034901995670610565, + "grad_norm": 0.24961134791374207, + "learning_rate": 0.0009984461826595674, + "loss": 3.2722, + "step": 1177 + }, + { + "epoch": 0.03493164902292204, + "grad_norm": 0.1809719055891037, + "learning_rate": 0.0009984424739799698, + "loss": 3.2243, + "step": 1178 + }, + { + "epoch": 0.03496130237523352, + "grad_norm": 0.22271260619163513, + "learning_rate": 0.0009984387608865785, + "loss": 3.2367, + "step": 1179 + }, + { + "epoch": 0.034990955727545, + "grad_norm": 0.2194758951663971, + "learning_rate": 0.0009984350433794266, + "loss": 3.2762, + "step": 1180 + }, + { + "epoch": 0.03502060907985648, + "grad_norm": 0.19131506979465485, + "learning_rate": 0.0009984313214585468, + "loss": 3.2022, + "step": 1181 + }, + { + "epoch": 0.03505026243216796, + "grad_norm": 0.17346175014972687, + "learning_rate": 0.0009984275951239719, + "loss": 3.2118, + "step": 1182 + }, + { + "epoch": 0.035079915784479436, + "grad_norm": 0.1554916948080063, + "learning_rate": 0.0009984238643757353, + "loss": 3.2487, + "step": 1183 + }, + { + "epoch": 0.03510956913679091, + "grad_norm": 0.15458586812019348, + "learning_rate": 0.0009984201292138697, + "loss": 3.2525, + "step": 1184 + }, + { + "epoch": 0.035139222489102395, + "grad_norm": 0.14187116920948029, + "learning_rate": 0.0009984163896384084, + "loss": 3.262, + "step": 1185 + }, + { + "epoch": 0.03516887584141387, + "grad_norm": 0.13253174722194672, + "learning_rate": 0.0009984126456493842, + "loss": 3.218, + "step": 1186 + }, + { + "epoch": 0.03519852919372535, + "grad_norm": 0.12962158024311066, + "learning_rate": 0.0009984088972468308, + "loss": 3.2245, + "step": 1187 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 0.12114562839269638, + "learning_rate": 0.0009984051444307809, + "loss": 3.2561, + "step": 1188 + }, + { + "epoch": 0.03525783589834831, + "grad_norm": 0.11406029760837555, + "learning_rate": 0.000998401387201268, + "loss": 3.214, + "step": 1189 + }, + { + "epoch": 0.03528748925065979, + "grad_norm": 0.12096118181943893, + "learning_rate": 0.000998397625558325, + "loss": 3.2307, + "step": 1190 + }, + { + "epoch": 0.035317142602971266, + "grad_norm": 0.11474756896495819, + "learning_rate": 0.0009983938595019856, + "loss": 3.1865, + "step": 1191 + }, + { + "epoch": 0.03534679595528274, + "grad_norm": 0.12605535984039307, + "learning_rate": 0.000998390089032283, + "loss": 3.2397, + "step": 1192 + }, + { + "epoch": 0.035376449307594225, + "grad_norm": 0.1529882401227951, + "learning_rate": 0.0009983863141492506, + "loss": 3.2751, + "step": 1193 + }, + { + "epoch": 0.0354061026599057, + "grad_norm": 0.17080813646316528, + "learning_rate": 0.000998382534852922, + "loss": 3.2491, + "step": 1194 + }, + { + "epoch": 0.035435756012217184, + "grad_norm": 0.1640121340751648, + "learning_rate": 0.0009983787511433303, + "loss": 3.1984, + "step": 1195 + }, + { + "epoch": 0.03546540936452866, + "grad_norm": 0.17055892944335938, + "learning_rate": 0.0009983749630205095, + "loss": 3.2167, + "step": 1196 + }, + { + "epoch": 0.03549506271684014, + "grad_norm": 0.1810920685529709, + "learning_rate": 0.0009983711704844927, + "loss": 3.2347, + "step": 1197 + }, + { + "epoch": 0.03552471606915162, + "grad_norm": 0.1762663722038269, + "learning_rate": 0.0009983673735353136, + "loss": 3.2212, + "step": 1198 + }, + { + "epoch": 0.035554369421463096, + "grad_norm": 0.19624602794647217, + "learning_rate": 0.000998363572173006, + "loss": 3.2582, + "step": 1199 + }, + { + "epoch": 0.03558402277377457, + "grad_norm": 0.21188883483409882, + "learning_rate": 0.0009983597663976032, + "loss": 3.2392, + "step": 1200 + }, + { + "epoch": 0.035613676126086055, + "grad_norm": 0.19308912754058838, + "learning_rate": 0.0009983559562091392, + "loss": 3.265, + "step": 1201 + }, + { + "epoch": 0.03564332947839753, + "grad_norm": 0.1888842135667801, + "learning_rate": 0.0009983521416076478, + "loss": 3.2408, + "step": 1202 + }, + { + "epoch": 0.035672982830709014, + "grad_norm": 0.1982620358467102, + "learning_rate": 0.0009983483225931625, + "loss": 3.2221, + "step": 1203 + }, + { + "epoch": 0.03570263618302049, + "grad_norm": 0.1684495061635971, + "learning_rate": 0.0009983444991657174, + "loss": 3.1963, + "step": 1204 + }, + { + "epoch": 0.03573228953533197, + "grad_norm": 0.15452300012111664, + "learning_rate": 0.0009983406713253461, + "loss": 3.2154, + "step": 1205 + }, + { + "epoch": 0.03576194288764345, + "grad_norm": 0.1488846242427826, + "learning_rate": 0.0009983368390720827, + "loss": 3.2194, + "step": 1206 + }, + { + "epoch": 0.035791596239954926, + "grad_norm": 0.1734844446182251, + "learning_rate": 0.000998333002405961, + "loss": 3.2022, + "step": 1207 + }, + { + "epoch": 0.03582124959226641, + "grad_norm": 0.18206503987312317, + "learning_rate": 0.000998329161327015, + "loss": 3.2323, + "step": 1208 + }, + { + "epoch": 0.035850902944577885, + "grad_norm": 0.15486006438732147, + "learning_rate": 0.0009983253158352787, + "loss": 3.2273, + "step": 1209 + }, + { + "epoch": 0.03588055629688936, + "grad_norm": 0.14085319638252258, + "learning_rate": 0.0009983214659307865, + "loss": 3.2156, + "step": 1210 + }, + { + "epoch": 0.035910209649200844, + "grad_norm": 0.18508878350257874, + "learning_rate": 0.0009983176116135717, + "loss": 3.2398, + "step": 1211 + }, + { + "epoch": 0.03593986300151232, + "grad_norm": 0.18990084528923035, + "learning_rate": 0.0009983137528836693, + "loss": 3.2333, + "step": 1212 + }, + { + "epoch": 0.0359695163538238, + "grad_norm": 0.19087140262126923, + "learning_rate": 0.000998309889741113, + "loss": 3.2203, + "step": 1213 + }, + { + "epoch": 0.03599916970613528, + "grad_norm": 0.18774713575839996, + "learning_rate": 0.000998306022185937, + "loss": 3.209, + "step": 1214 + }, + { + "epoch": 0.036028823058446756, + "grad_norm": 0.17399896681308746, + "learning_rate": 0.0009983021502181757, + "loss": 3.2303, + "step": 1215 + }, + { + "epoch": 0.03605847641075824, + "grad_norm": 0.18839344382286072, + "learning_rate": 0.0009982982738378633, + "loss": 3.2559, + "step": 1216 + }, + { + "epoch": 0.036088129763069715, + "grad_norm": 0.1903618723154068, + "learning_rate": 0.000998294393045034, + "loss": 3.2394, + "step": 1217 + }, + { + "epoch": 0.03611778311538119, + "grad_norm": 0.19201189279556274, + "learning_rate": 0.0009982905078397227, + "loss": 3.2225, + "step": 1218 + }, + { + "epoch": 0.036147436467692674, + "grad_norm": 0.26439645886421204, + "learning_rate": 0.0009982866182219631, + "loss": 3.2099, + "step": 1219 + }, + { + "epoch": 0.03617708982000415, + "grad_norm": 0.2304188311100006, + "learning_rate": 0.0009982827241917902, + "loss": 3.2362, + "step": 1220 + }, + { + "epoch": 0.036206743172315634, + "grad_norm": 0.16708220541477203, + "learning_rate": 0.000998278825749238, + "loss": 3.2647, + "step": 1221 + }, + { + "epoch": 0.03623639652462711, + "grad_norm": 0.17368189990520477, + "learning_rate": 0.0009982749228943414, + "loss": 3.2113, + "step": 1222 + }, + { + "epoch": 0.036266049876938586, + "grad_norm": 0.14602354168891907, + "learning_rate": 0.0009982710156271348, + "loss": 3.2327, + "step": 1223 + }, + { + "epoch": 0.03629570322925007, + "grad_norm": 0.2078131139278412, + "learning_rate": 0.000998267103947653, + "loss": 3.2535, + "step": 1224 + }, + { + "epoch": 0.036325356581561545, + "grad_norm": 0.2077110856771469, + "learning_rate": 0.0009982631878559303, + "loss": 3.2439, + "step": 1225 + }, + { + "epoch": 0.03635500993387302, + "grad_norm": 0.19267266988754272, + "learning_rate": 0.0009982592673520015, + "loss": 3.2298, + "step": 1226 + }, + { + "epoch": 0.036384663286184504, + "grad_norm": 0.13700351119041443, + "learning_rate": 0.0009982553424359012, + "loss": 3.2079, + "step": 1227 + }, + { + "epoch": 0.03641431663849598, + "grad_norm": 0.15352246165275574, + "learning_rate": 0.0009982514131076647, + "loss": 3.2642, + "step": 1228 + }, + { + "epoch": 0.036443969990807464, + "grad_norm": 0.13545458018779755, + "learning_rate": 0.0009982474793673263, + "loss": 3.2013, + "step": 1229 + }, + { + "epoch": 0.03647362334311894, + "grad_norm": 0.12698356807231903, + "learning_rate": 0.000998243541214921, + "loss": 3.2103, + "step": 1230 + }, + { + "epoch": 0.036503276695430416, + "grad_norm": 0.15996713936328888, + "learning_rate": 0.0009982395986504835, + "loss": 3.2197, + "step": 1231 + }, + { + "epoch": 0.0365329300477419, + "grad_norm": 0.14082378149032593, + "learning_rate": 0.0009982356516740488, + "loss": 3.2441, + "step": 1232 + }, + { + "epoch": 0.036562583400053375, + "grad_norm": 0.1641141027212143, + "learning_rate": 0.000998231700285652, + "loss": 3.2597, + "step": 1233 + }, + { + "epoch": 0.03659223675236486, + "grad_norm": 0.16699223220348358, + "learning_rate": 0.0009982277444853277, + "loss": 3.208, + "step": 1234 + }, + { + "epoch": 0.036621890104676334, + "grad_norm": 0.16922034323215485, + "learning_rate": 0.0009982237842731116, + "loss": 3.2196, + "step": 1235 + }, + { + "epoch": 0.03665154345698781, + "grad_norm": 0.17295895516872406, + "learning_rate": 0.0009982198196490382, + "loss": 3.2284, + "step": 1236 + }, + { + "epoch": 0.036681196809299293, + "grad_norm": 0.1882040947675705, + "learning_rate": 0.0009982158506131426, + "loss": 3.2168, + "step": 1237 + }, + { + "epoch": 0.03671085016161077, + "grad_norm": 0.19428668916225433, + "learning_rate": 0.0009982118771654604, + "loss": 3.2281, + "step": 1238 + }, + { + "epoch": 0.036740503513922246, + "grad_norm": 0.21623916923999786, + "learning_rate": 0.0009982078993060264, + "loss": 3.2421, + "step": 1239 + }, + { + "epoch": 0.03677015686623373, + "grad_norm": 0.19380563497543335, + "learning_rate": 0.000998203917034876, + "loss": 3.2197, + "step": 1240 + }, + { + "epoch": 0.036799810218545205, + "grad_norm": 0.16353373229503632, + "learning_rate": 0.0009981999303520443, + "loss": 3.2228, + "step": 1241 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 0.17942692339420319, + "learning_rate": 0.0009981959392575666, + "loss": 3.2003, + "step": 1242 + }, + { + "epoch": 0.036859116923168164, + "grad_norm": 0.13742214441299438, + "learning_rate": 0.0009981919437514785, + "loss": 3.2286, + "step": 1243 + }, + { + "epoch": 0.03688877027547964, + "grad_norm": 0.15455053746700287, + "learning_rate": 0.0009981879438338153, + "loss": 3.1998, + "step": 1244 + }, + { + "epoch": 0.03691842362779112, + "grad_norm": 0.16737015545368195, + "learning_rate": 0.0009981839395046123, + "loss": 3.1987, + "step": 1245 + }, + { + "epoch": 0.0369480769801026, + "grad_norm": 0.16418565809726715, + "learning_rate": 0.0009981799307639048, + "loss": 3.2335, + "step": 1246 + }, + { + "epoch": 0.03697773033241408, + "grad_norm": 0.16169896721839905, + "learning_rate": 0.0009981759176117288, + "loss": 3.1977, + "step": 1247 + }, + { + "epoch": 0.03700738368472556, + "grad_norm": 0.16793888807296753, + "learning_rate": 0.0009981719000481193, + "loss": 3.2391, + "step": 1248 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.17617149651050568, + "learning_rate": 0.0009981678780731124, + "loss": 3.1882, + "step": 1249 + }, + { + "epoch": 0.03706669038934852, + "grad_norm": 0.16194908320903778, + "learning_rate": 0.000998163851686743, + "loss": 3.2078, + "step": 1250 + }, + { + "epoch": 0.037096343741659994, + "grad_norm": 0.160409078001976, + "learning_rate": 0.0009981598208890475, + "loss": 3.1883, + "step": 1251 + }, + { + "epoch": 0.03712599709397147, + "grad_norm": 0.15081234276294708, + "learning_rate": 0.000998155785680061, + "loss": 3.2354, + "step": 1252 + }, + { + "epoch": 0.03715565044628295, + "grad_norm": 0.15068042278289795, + "learning_rate": 0.00099815174605982, + "loss": 3.1671, + "step": 1253 + }, + { + "epoch": 0.03718530379859443, + "grad_norm": 0.15301941335201263, + "learning_rate": 0.0009981477020283593, + "loss": 3.2204, + "step": 1254 + }, + { + "epoch": 0.03721495715090591, + "grad_norm": 0.13935591280460358, + "learning_rate": 0.0009981436535857157, + "loss": 3.2132, + "step": 1255 + }, + { + "epoch": 0.03724461050321739, + "grad_norm": 0.1303388923406601, + "learning_rate": 0.0009981396007319242, + "loss": 3.171, + "step": 1256 + }, + { + "epoch": 0.037274263855528865, + "grad_norm": 0.14256690442562103, + "learning_rate": 0.000998135543467021, + "loss": 3.2138, + "step": 1257 + }, + { + "epoch": 0.03730391720784035, + "grad_norm": 0.20862452685832977, + "learning_rate": 0.0009981314817910421, + "loss": 3.225, + "step": 1258 + }, + { + "epoch": 0.037333570560151824, + "grad_norm": 0.2138284295797348, + "learning_rate": 0.0009981274157040234, + "loss": 3.1982, + "step": 1259 + }, + { + "epoch": 0.03736322391246331, + "grad_norm": 0.208694726228714, + "learning_rate": 0.000998123345206001, + "loss": 3.2095, + "step": 1260 + }, + { + "epoch": 0.03739287726477478, + "grad_norm": 0.18312330543994904, + "learning_rate": 0.0009981192702970107, + "loss": 3.2209, + "step": 1261 + }, + { + "epoch": 0.03742253061708626, + "grad_norm": 0.18347838521003723, + "learning_rate": 0.0009981151909770891, + "loss": 3.2277, + "step": 1262 + }, + { + "epoch": 0.03745218396939774, + "grad_norm": 0.21524298191070557, + "learning_rate": 0.0009981111072462716, + "loss": 3.2083, + "step": 1263 + }, + { + "epoch": 0.03748183732170922, + "grad_norm": 0.19329720735549927, + "learning_rate": 0.000998107019104595, + "loss": 3.2025, + "step": 1264 + }, + { + "epoch": 0.037511490674020695, + "grad_norm": 0.17075343430042267, + "learning_rate": 0.0009981029265520953, + "loss": 3.245, + "step": 1265 + }, + { + "epoch": 0.03754114402633218, + "grad_norm": 0.2064361721277237, + "learning_rate": 0.0009980988295888085, + "loss": 3.2291, + "step": 1266 + }, + { + "epoch": 0.037570797378643654, + "grad_norm": 0.19560424983501434, + "learning_rate": 0.0009980947282147712, + "loss": 3.2281, + "step": 1267 + }, + { + "epoch": 0.03760045073095514, + "grad_norm": 0.18146027624607086, + "learning_rate": 0.0009980906224300195, + "loss": 3.2129, + "step": 1268 + }, + { + "epoch": 0.03763010408326661, + "grad_norm": 0.16939321160316467, + "learning_rate": 0.0009980865122345898, + "loss": 3.2107, + "step": 1269 + }, + { + "epoch": 0.03765975743557809, + "grad_norm": 0.15371835231781006, + "learning_rate": 0.0009980823976285186, + "loss": 3.1931, + "step": 1270 + }, + { + "epoch": 0.03768941078788957, + "grad_norm": 0.14709092676639557, + "learning_rate": 0.0009980782786118423, + "loss": 3.2209, + "step": 1271 + }, + { + "epoch": 0.03771906414020105, + "grad_norm": 0.15892308950424194, + "learning_rate": 0.0009980741551845972, + "loss": 3.2025, + "step": 1272 + }, + { + "epoch": 0.03774871749251253, + "grad_norm": 0.14471815526485443, + "learning_rate": 0.0009980700273468203, + "loss": 3.1967, + "step": 1273 + }, + { + "epoch": 0.03777837084482401, + "grad_norm": 0.14434406161308289, + "learning_rate": 0.0009980658950985476, + "loss": 3.195, + "step": 1274 + }, + { + "epoch": 0.037808024197135484, + "grad_norm": 0.14599944651126862, + "learning_rate": 0.000998061758439816, + "loss": 3.2083, + "step": 1275 + }, + { + "epoch": 0.03783767754944697, + "grad_norm": 0.15304110944271088, + "learning_rate": 0.0009980576173706619, + "loss": 3.1658, + "step": 1276 + }, + { + "epoch": 0.03786733090175844, + "grad_norm": 0.17874294519424438, + "learning_rate": 0.0009980534718911221, + "loss": 3.2025, + "step": 1277 + }, + { + "epoch": 0.03789698425406992, + "grad_norm": 0.1803397238254547, + "learning_rate": 0.0009980493220012334, + "loss": 3.2082, + "step": 1278 + }, + { + "epoch": 0.0379266376063814, + "grad_norm": 0.18127968907356262, + "learning_rate": 0.0009980451677010325, + "loss": 3.2185, + "step": 1279 + }, + { + "epoch": 0.03795629095869288, + "grad_norm": 0.15553595125675201, + "learning_rate": 0.000998041008990556, + "loss": 3.2639, + "step": 1280 + }, + { + "epoch": 0.03798594431100436, + "grad_norm": 0.17311063408851624, + "learning_rate": 0.000998036845869841, + "loss": 3.1969, + "step": 1281 + }, + { + "epoch": 0.03801559766331584, + "grad_norm": 0.15750445425510406, + "learning_rate": 0.0009980326783389241, + "loss": 3.2113, + "step": 1282 + }, + { + "epoch": 0.038045251015627314, + "grad_norm": 0.16014835238456726, + "learning_rate": 0.0009980285063978427, + "loss": 3.2207, + "step": 1283 + }, + { + "epoch": 0.0380749043679388, + "grad_norm": 0.16378885507583618, + "learning_rate": 0.0009980243300466332, + "loss": 3.2227, + "step": 1284 + }, + { + "epoch": 0.03810455772025027, + "grad_norm": 0.17302599549293518, + "learning_rate": 0.0009980201492853326, + "loss": 3.1774, + "step": 1285 + }, + { + "epoch": 0.038134211072561756, + "grad_norm": 0.20878642797470093, + "learning_rate": 0.0009980159641139782, + "loss": 3.1779, + "step": 1286 + }, + { + "epoch": 0.03816386442487323, + "grad_norm": 0.20012499392032623, + "learning_rate": 0.000998011774532607, + "loss": 3.2073, + "step": 1287 + }, + { + "epoch": 0.03819351777718471, + "grad_norm": 0.1890026181936264, + "learning_rate": 0.000998007580541256, + "loss": 3.197, + "step": 1288 + }, + { + "epoch": 0.03822317112949619, + "grad_norm": 0.18516522645950317, + "learning_rate": 0.0009980033821399624, + "loss": 3.2272, + "step": 1289 + }, + { + "epoch": 0.03825282448180767, + "grad_norm": 0.16088955104351044, + "learning_rate": 0.0009979991793287635, + "loss": 3.238, + "step": 1290 + }, + { + "epoch": 0.038282477834119144, + "grad_norm": 0.14147691428661346, + "learning_rate": 0.000997994972107696, + "loss": 3.1458, + "step": 1291 + }, + { + "epoch": 0.03831213118643063, + "grad_norm": 0.15164637565612793, + "learning_rate": 0.000997990760476798, + "loss": 3.158, + "step": 1292 + }, + { + "epoch": 0.0383417845387421, + "grad_norm": 0.16033966839313507, + "learning_rate": 0.0009979865444361062, + "loss": 3.1958, + "step": 1293 + }, + { + "epoch": 0.038371437891053586, + "grad_norm": 0.1426258534193039, + "learning_rate": 0.000997982323985658, + "loss": 3.21, + "step": 1294 + }, + { + "epoch": 0.03840109124336506, + "grad_norm": 0.17001821100711823, + "learning_rate": 0.0009979780991254909, + "loss": 3.1989, + "step": 1295 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 0.14838510751724243, + "learning_rate": 0.0009979738698556422, + "loss": 3.1932, + "step": 1296 + }, + { + "epoch": 0.03846039794798802, + "grad_norm": 0.1672007292509079, + "learning_rate": 0.0009979696361761495, + "loss": 3.2275, + "step": 1297 + }, + { + "epoch": 0.0384900513002995, + "grad_norm": 0.16424605250358582, + "learning_rate": 0.00099796539808705, + "loss": 3.1911, + "step": 1298 + }, + { + "epoch": 0.03851970465261098, + "grad_norm": 0.16809652745723724, + "learning_rate": 0.0009979611555883817, + "loss": 3.214, + "step": 1299 + }, + { + "epoch": 0.03854935800492246, + "grad_norm": 0.14035409688949585, + "learning_rate": 0.0009979569086801816, + "loss": 3.2121, + "step": 1300 + }, + { + "epoch": 0.03857901135723393, + "grad_norm": 0.17105762660503387, + "learning_rate": 0.0009979526573624877, + "loss": 3.2154, + "step": 1301 + }, + { + "epoch": 0.038608664709545416, + "grad_norm": 0.21020177006721497, + "learning_rate": 0.0009979484016353376, + "loss": 3.1904, + "step": 1302 + }, + { + "epoch": 0.03863831806185689, + "grad_norm": 0.17254851758480072, + "learning_rate": 0.000997944141498769, + "loss": 3.2245, + "step": 1303 + }, + { + "epoch": 0.03866797141416837, + "grad_norm": 0.1633349061012268, + "learning_rate": 0.0009979398769528196, + "loss": 3.1878, + "step": 1304 + }, + { + "epoch": 0.03869762476647985, + "grad_norm": 0.1814800500869751, + "learning_rate": 0.0009979356079975268, + "loss": 3.2085, + "step": 1305 + }, + { + "epoch": 0.03872727811879133, + "grad_norm": 0.15399646759033203, + "learning_rate": 0.000997931334632929, + "loss": 3.2101, + "step": 1306 + }, + { + "epoch": 0.03875693147110281, + "grad_norm": 0.1539715677499771, + "learning_rate": 0.0009979270568590637, + "loss": 3.2084, + "step": 1307 + }, + { + "epoch": 0.03878658482341429, + "grad_norm": 0.18894395232200623, + "learning_rate": 0.0009979227746759688, + "loss": 3.2089, + "step": 1308 + }, + { + "epoch": 0.03881623817572576, + "grad_norm": 0.13932478427886963, + "learning_rate": 0.0009979184880836824, + "loss": 3.1784, + "step": 1309 + }, + { + "epoch": 0.038845891528037246, + "grad_norm": 0.13928118348121643, + "learning_rate": 0.0009979141970822422, + "loss": 3.2132, + "step": 1310 + }, + { + "epoch": 0.03887554488034872, + "grad_norm": 0.17707954347133636, + "learning_rate": 0.0009979099016716865, + "loss": 3.2403, + "step": 1311 + }, + { + "epoch": 0.038905198232660206, + "grad_norm": 0.19362780451774597, + "learning_rate": 0.0009979056018520529, + "loss": 3.2088, + "step": 1312 + }, + { + "epoch": 0.03893485158497168, + "grad_norm": 0.20114479959011078, + "learning_rate": 0.0009979012976233798, + "loss": 3.2164, + "step": 1313 + }, + { + "epoch": 0.03896450493728316, + "grad_norm": 0.21897482872009277, + "learning_rate": 0.0009978969889857052, + "loss": 3.2301, + "step": 1314 + }, + { + "epoch": 0.03899415828959464, + "grad_norm": 0.19022226333618164, + "learning_rate": 0.0009978926759390673, + "loss": 3.2012, + "step": 1315 + }, + { + "epoch": 0.03902381164190612, + "grad_norm": 0.18571625649929047, + "learning_rate": 0.0009978883584835043, + "loss": 3.2056, + "step": 1316 + }, + { + "epoch": 0.03905346499421759, + "grad_norm": 0.21147745847702026, + "learning_rate": 0.0009978840366190547, + "loss": 3.2043, + "step": 1317 + }, + { + "epoch": 0.039083118346529076, + "grad_norm": 0.20009052753448486, + "learning_rate": 0.0009978797103457563, + "loss": 3.2456, + "step": 1318 + }, + { + "epoch": 0.03911277169884055, + "grad_norm": 0.17048576474189758, + "learning_rate": 0.0009978753796636476, + "loss": 3.1955, + "step": 1319 + }, + { + "epoch": 0.039142425051152036, + "grad_norm": 0.17999278008937836, + "learning_rate": 0.0009978710445727667, + "loss": 3.2048, + "step": 1320 + }, + { + "epoch": 0.03917207840346351, + "grad_norm": 0.17568698525428772, + "learning_rate": 0.0009978667050731527, + "loss": 3.1895, + "step": 1321 + }, + { + "epoch": 0.03920173175577499, + "grad_norm": 0.16833928227424622, + "learning_rate": 0.0009978623611648432, + "loss": 3.1408, + "step": 1322 + }, + { + "epoch": 0.03923138510808647, + "grad_norm": 0.14038695394992828, + "learning_rate": 0.0009978580128478772, + "loss": 3.1798, + "step": 1323 + }, + { + "epoch": 0.03926103846039795, + "grad_norm": 0.14842215180397034, + "learning_rate": 0.000997853660122293, + "loss": 3.1988, + "step": 1324 + }, + { + "epoch": 0.03929069181270943, + "grad_norm": 0.15844114124774933, + "learning_rate": 0.0009978493029881292, + "loss": 3.1957, + "step": 1325 + }, + { + "epoch": 0.039320345165020906, + "grad_norm": 0.1590440720319748, + "learning_rate": 0.0009978449414454243, + "loss": 3.1916, + "step": 1326 + }, + { + "epoch": 0.03934999851733238, + "grad_norm": 0.1316007822751999, + "learning_rate": 0.0009978405754942172, + "loss": 3.1723, + "step": 1327 + }, + { + "epoch": 0.039379651869643865, + "grad_norm": 0.12914744019508362, + "learning_rate": 0.0009978362051345463, + "loss": 3.1642, + "step": 1328 + }, + { + "epoch": 0.03940930522195534, + "grad_norm": 0.15079699456691742, + "learning_rate": 0.00099783183036645, + "loss": 3.1945, + "step": 1329 + }, + { + "epoch": 0.03943895857426682, + "grad_norm": 0.20505647361278534, + "learning_rate": 0.0009978274511899677, + "loss": 3.1873, + "step": 1330 + }, + { + "epoch": 0.0394686119265783, + "grad_norm": 0.2643428146839142, + "learning_rate": 0.000997823067605138, + "loss": 3.1933, + "step": 1331 + }, + { + "epoch": 0.03949826527888978, + "grad_norm": 0.23085500299930573, + "learning_rate": 0.0009978186796119992, + "loss": 3.2373, + "step": 1332 + }, + { + "epoch": 0.03952791863120126, + "grad_norm": 0.1853373944759369, + "learning_rate": 0.000997814287210591, + "loss": 3.1808, + "step": 1333 + }, + { + "epoch": 0.039557571983512736, + "grad_norm": 0.1861761510372162, + "learning_rate": 0.0009978098904009514, + "loss": 3.1934, + "step": 1334 + }, + { + "epoch": 0.03958722533582421, + "grad_norm": 0.19808581471443176, + "learning_rate": 0.00099780548918312, + "loss": 3.2187, + "step": 1335 + }, + { + "epoch": 0.039616878688135695, + "grad_norm": 0.21973660588264465, + "learning_rate": 0.0009978010835571356, + "loss": 3.214, + "step": 1336 + }, + { + "epoch": 0.03964653204044717, + "grad_norm": 0.21368670463562012, + "learning_rate": 0.000997796673523037, + "loss": 3.2189, + "step": 1337 + }, + { + "epoch": 0.039676185392758655, + "grad_norm": 0.16227519512176514, + "learning_rate": 0.0009977922590808635, + "loss": 3.2077, + "step": 1338 + }, + { + "epoch": 0.03970583874507013, + "grad_norm": 0.16469910740852356, + "learning_rate": 0.0009977878402306541, + "loss": 3.2404, + "step": 1339 + }, + { + "epoch": 0.03973549209738161, + "grad_norm": 0.17271986603736877, + "learning_rate": 0.0009977834169724478, + "loss": 3.1836, + "step": 1340 + }, + { + "epoch": 0.03976514544969309, + "grad_norm": 0.14675404131412506, + "learning_rate": 0.000997778989306284, + "loss": 3.1741, + "step": 1341 + }, + { + "epoch": 0.039794798802004566, + "grad_norm": 0.13299110531806946, + "learning_rate": 0.0009977745572322019, + "loss": 3.1746, + "step": 1342 + }, + { + "epoch": 0.03982445215431604, + "grad_norm": 0.1391681283712387, + "learning_rate": 0.0009977701207502406, + "loss": 3.1813, + "step": 1343 + }, + { + "epoch": 0.039854105506627525, + "grad_norm": 0.15135416388511658, + "learning_rate": 0.0009977656798604393, + "loss": 3.1621, + "step": 1344 + }, + { + "epoch": 0.039883758858939, + "grad_norm": 0.15207302570343018, + "learning_rate": 0.0009977612345628377, + "loss": 3.184, + "step": 1345 + }, + { + "epoch": 0.039913412211250485, + "grad_norm": 0.15978655219078064, + "learning_rate": 0.0009977567848574746, + "loss": 3.1456, + "step": 1346 + }, + { + "epoch": 0.03994306556356196, + "grad_norm": 0.16412030160427094, + "learning_rate": 0.0009977523307443902, + "loss": 3.1856, + "step": 1347 + }, + { + "epoch": 0.03997271891587344, + "grad_norm": 0.13921216130256653, + "learning_rate": 0.000997747872223623, + "loss": 3.1763, + "step": 1348 + }, + { + "epoch": 0.04000237226818492, + "grad_norm": 0.12433695793151855, + "learning_rate": 0.0009977434092952133, + "loss": 3.1749, + "step": 1349 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 0.13688960671424866, + "learning_rate": 0.0009977389419592, + "loss": 3.1531, + "step": 1350 + }, + { + "epoch": 0.04006167897280788, + "grad_norm": 0.1720155030488968, + "learning_rate": 0.000997734470215623, + "loss": 3.1884, + "step": 1351 + }, + { + "epoch": 0.040091332325119355, + "grad_norm": 0.17411306500434875, + "learning_rate": 0.000997729994064522, + "loss": 3.1863, + "step": 1352 + }, + { + "epoch": 0.04012098567743083, + "grad_norm": 0.1636018604040146, + "learning_rate": 0.0009977255135059364, + "loss": 3.1927, + "step": 1353 + }, + { + "epoch": 0.040150639029742315, + "grad_norm": 0.1500132530927658, + "learning_rate": 0.000997721028539906, + "loss": 3.1813, + "step": 1354 + }, + { + "epoch": 0.04018029238205379, + "grad_norm": 0.19320917129516602, + "learning_rate": 0.0009977165391664704, + "loss": 3.1618, + "step": 1355 + }, + { + "epoch": 0.04020994573436527, + "grad_norm": 0.23258432745933533, + "learning_rate": 0.0009977120453856694, + "loss": 3.2114, + "step": 1356 + }, + { + "epoch": 0.04023959908667675, + "grad_norm": 0.19531312584877014, + "learning_rate": 0.0009977075471975427, + "loss": 3.1745, + "step": 1357 + }, + { + "epoch": 0.040269252438988226, + "grad_norm": 0.19104690849781036, + "learning_rate": 0.0009977030446021303, + "loss": 3.1751, + "step": 1358 + }, + { + "epoch": 0.04029890579129971, + "grad_norm": 0.18784993886947632, + "learning_rate": 0.000997698537599472, + "loss": 3.1711, + "step": 1359 + }, + { + "epoch": 0.040328559143611185, + "grad_norm": 0.19466811418533325, + "learning_rate": 0.0009976940261896077, + "loss": 3.1623, + "step": 1360 + }, + { + "epoch": 0.04035821249592266, + "grad_norm": 0.19003115594387054, + "learning_rate": 0.0009976895103725777, + "loss": 3.1715, + "step": 1361 + }, + { + "epoch": 0.040387865848234145, + "grad_norm": 0.16769547760486603, + "learning_rate": 0.0009976849901484214, + "loss": 3.1954, + "step": 1362 + }, + { + "epoch": 0.04041751920054562, + "grad_norm": 0.19299717247486115, + "learning_rate": 0.000997680465517179, + "loss": 3.1282, + "step": 1363 + }, + { + "epoch": 0.040447172552857104, + "grad_norm": 0.17934386432170868, + "learning_rate": 0.0009976759364788907, + "loss": 3.1847, + "step": 1364 + }, + { + "epoch": 0.04047682590516858, + "grad_norm": 0.1591903567314148, + "learning_rate": 0.0009976714030335964, + "loss": 3.1804, + "step": 1365 + }, + { + "epoch": 0.040506479257480056, + "grad_norm": 0.14769595861434937, + "learning_rate": 0.0009976668651813369, + "loss": 3.1912, + "step": 1366 + }, + { + "epoch": 0.04053613260979154, + "grad_norm": 0.1762290745973587, + "learning_rate": 0.0009976623229221513, + "loss": 3.197, + "step": 1367 + }, + { + "epoch": 0.040565785962103015, + "grad_norm": 0.1684182733297348, + "learning_rate": 0.0009976577762560808, + "loss": 3.1615, + "step": 1368 + }, + { + "epoch": 0.04059543931441449, + "grad_norm": 0.15395715832710266, + "learning_rate": 0.0009976532251831651, + "loss": 3.1994, + "step": 1369 + }, + { + "epoch": 0.040625092666725975, + "grad_norm": 0.15101025998592377, + "learning_rate": 0.000997648669703445, + "loss": 3.1831, + "step": 1370 + }, + { + "epoch": 0.04065474601903745, + "grad_norm": 0.1869780570268631, + "learning_rate": 0.00099764410981696, + "loss": 3.1899, + "step": 1371 + }, + { + "epoch": 0.040684399371348934, + "grad_norm": 0.19450637698173523, + "learning_rate": 0.0009976395455237512, + "loss": 3.1969, + "step": 1372 + }, + { + "epoch": 0.04071405272366041, + "grad_norm": 0.18480366468429565, + "learning_rate": 0.0009976349768238588, + "loss": 3.1649, + "step": 1373 + }, + { + "epoch": 0.040743706075971886, + "grad_norm": 0.19650660455226898, + "learning_rate": 0.0009976304037173232, + "loss": 3.1632, + "step": 1374 + }, + { + "epoch": 0.04077335942828337, + "grad_norm": 0.20163533091545105, + "learning_rate": 0.0009976258262041852, + "loss": 3.1259, + "step": 1375 + }, + { + "epoch": 0.040803012780594845, + "grad_norm": 0.1758987009525299, + "learning_rate": 0.000997621244284485, + "loss": 3.185, + "step": 1376 + }, + { + "epoch": 0.04083266613290633, + "grad_norm": 0.17514407634735107, + "learning_rate": 0.000997616657958263, + "loss": 3.1413, + "step": 1377 + }, + { + "epoch": 0.040862319485217805, + "grad_norm": 0.14874982833862305, + "learning_rate": 0.0009976120672255603, + "loss": 3.1667, + "step": 1378 + }, + { + "epoch": 0.04089197283752928, + "grad_norm": 0.14826208353042603, + "learning_rate": 0.0009976074720864174, + "loss": 3.1983, + "step": 1379 + }, + { + "epoch": 0.040921626189840764, + "grad_norm": 0.15504498779773712, + "learning_rate": 0.0009976028725408748, + "loss": 3.183, + "step": 1380 + }, + { + "epoch": 0.04095127954215224, + "grad_norm": 0.13346289098262787, + "learning_rate": 0.0009975982685889735, + "loss": 3.1468, + "step": 1381 + }, + { + "epoch": 0.040980932894463716, + "grad_norm": 0.1272670477628708, + "learning_rate": 0.000997593660230754, + "loss": 3.1635, + "step": 1382 + }, + { + "epoch": 0.0410105862467752, + "grad_norm": 0.1368921399116516, + "learning_rate": 0.0009975890474662572, + "loss": 3.1674, + "step": 1383 + }, + { + "epoch": 0.041040239599086675, + "grad_norm": 0.15321974456310272, + "learning_rate": 0.000997584430295524, + "loss": 3.1562, + "step": 1384 + }, + { + "epoch": 0.04106989295139816, + "grad_norm": 0.1626204401254654, + "learning_rate": 0.0009975798087185953, + "loss": 3.1667, + "step": 1385 + }, + { + "epoch": 0.041099546303709635, + "grad_norm": 0.1545903980731964, + "learning_rate": 0.000997575182735512, + "loss": 3.1606, + "step": 1386 + }, + { + "epoch": 0.04112919965602111, + "grad_norm": 0.1453159749507904, + "learning_rate": 0.0009975705523463149, + "loss": 3.1781, + "step": 1387 + }, + { + "epoch": 0.041158853008332594, + "grad_norm": 0.1134999543428421, + "learning_rate": 0.0009975659175510453, + "loss": 3.1723, + "step": 1388 + }, + { + "epoch": 0.04118850636064407, + "grad_norm": 0.1320718377828598, + "learning_rate": 0.000997561278349744, + "loss": 3.1426, + "step": 1389 + }, + { + "epoch": 0.04121815971295555, + "grad_norm": 0.14067606627941132, + "learning_rate": 0.0009975566347424523, + "loss": 3.1711, + "step": 1390 + }, + { + "epoch": 0.04124781306526703, + "grad_norm": 0.15078486502170563, + "learning_rate": 0.000997551986729211, + "loss": 3.1822, + "step": 1391 + }, + { + "epoch": 0.041277466417578505, + "grad_norm": 0.19459526240825653, + "learning_rate": 0.0009975473343100615, + "loss": 3.1718, + "step": 1392 + }, + { + "epoch": 0.04130711976988999, + "grad_norm": 0.17477820813655853, + "learning_rate": 0.0009975426774850452, + "loss": 3.1303, + "step": 1393 + }, + { + "epoch": 0.041336773122201464, + "grad_norm": 0.19932787120342255, + "learning_rate": 0.000997538016254203, + "loss": 3.2026, + "step": 1394 + }, + { + "epoch": 0.04136642647451294, + "grad_norm": 0.2391396462917328, + "learning_rate": 0.000997533350617576, + "loss": 3.2166, + "step": 1395 + }, + { + "epoch": 0.041396079826824424, + "grad_norm": 0.23746244609355927, + "learning_rate": 0.000997528680575206, + "loss": 3.2159, + "step": 1396 + }, + { + "epoch": 0.0414257331791359, + "grad_norm": 0.23290139436721802, + "learning_rate": 0.000997524006127134, + "loss": 3.1911, + "step": 1397 + }, + { + "epoch": 0.04145538653144738, + "grad_norm": 0.2141612023115158, + "learning_rate": 0.0009975193272734016, + "loss": 3.2059, + "step": 1398 + }, + { + "epoch": 0.04148503988375886, + "grad_norm": 0.2549446225166321, + "learning_rate": 0.0009975146440140503, + "loss": 3.1762, + "step": 1399 + }, + { + "epoch": 0.041514693236070335, + "grad_norm": 0.19886812567710876, + "learning_rate": 0.0009975099563491211, + "loss": 3.1799, + "step": 1400 + }, + { + "epoch": 0.04154434658838182, + "grad_norm": 0.1785162091255188, + "learning_rate": 0.0009975052642786561, + "loss": 3.2153, + "step": 1401 + }, + { + "epoch": 0.041573999940693294, + "grad_norm": 0.1946118026971817, + "learning_rate": 0.0009975005678026967, + "loss": 3.1638, + "step": 1402 + }, + { + "epoch": 0.04160365329300478, + "grad_norm": 0.1887834519147873, + "learning_rate": 0.000997495866921284, + "loss": 3.1951, + "step": 1403 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 0.15584057569503784, + "learning_rate": 0.0009974911616344605, + "loss": 3.1873, + "step": 1404 + }, + { + "epoch": 0.04166295999762773, + "grad_norm": 0.1800968050956726, + "learning_rate": 0.000997486451942267, + "loss": 3.162, + "step": 1405 + }, + { + "epoch": 0.04169261334993921, + "grad_norm": 0.18952174484729767, + "learning_rate": 0.0009974817378447455, + "loss": 3.1868, + "step": 1406 + }, + { + "epoch": 0.04172226670225069, + "grad_norm": 0.175430566072464, + "learning_rate": 0.000997477019341938, + "loss": 3.2034, + "step": 1407 + }, + { + "epoch": 0.041751920054562165, + "grad_norm": 0.18093560636043549, + "learning_rate": 0.0009974722964338862, + "loss": 3.1724, + "step": 1408 + }, + { + "epoch": 0.04178157340687365, + "grad_norm": 0.20069880783557892, + "learning_rate": 0.0009974675691206318, + "loss": 3.1867, + "step": 1409 + }, + { + "epoch": 0.041811226759185124, + "grad_norm": 0.1833132654428482, + "learning_rate": 0.0009974628374022165, + "loss": 3.2232, + "step": 1410 + }, + { + "epoch": 0.04184088011149661, + "grad_norm": 0.1727752387523651, + "learning_rate": 0.0009974581012786826, + "loss": 3.182, + "step": 1411 + }, + { + "epoch": 0.041870533463808084, + "grad_norm": 0.1726718246936798, + "learning_rate": 0.0009974533607500715, + "loss": 3.1538, + "step": 1412 + }, + { + "epoch": 0.04190018681611956, + "grad_norm": 0.15845315158367157, + "learning_rate": 0.0009974486158164258, + "loss": 3.1449, + "step": 1413 + }, + { + "epoch": 0.04192984016843104, + "grad_norm": 0.16103285551071167, + "learning_rate": 0.000997443866477787, + "loss": 3.1522, + "step": 1414 + }, + { + "epoch": 0.04195949352074252, + "grad_norm": 0.1538880318403244, + "learning_rate": 0.0009974391127341978, + "loss": 3.1713, + "step": 1415 + }, + { + "epoch": 0.041989146873054, + "grad_norm": 0.14857889711856842, + "learning_rate": 0.0009974343545856995, + "loss": 3.2077, + "step": 1416 + }, + { + "epoch": 0.04201880022536548, + "grad_norm": 0.16097429394721985, + "learning_rate": 0.0009974295920323346, + "loss": 3.1329, + "step": 1417 + }, + { + "epoch": 0.042048453577676954, + "grad_norm": 0.14301100373268127, + "learning_rate": 0.0009974248250741455, + "loss": 3.1466, + "step": 1418 + }, + { + "epoch": 0.04207810692998844, + "grad_norm": 0.14690819382667542, + "learning_rate": 0.000997420053711174, + "loss": 3.1438, + "step": 1419 + }, + { + "epoch": 0.042107760282299914, + "grad_norm": 0.1457168310880661, + "learning_rate": 0.0009974152779434627, + "loss": 3.1667, + "step": 1420 + }, + { + "epoch": 0.04213741363461139, + "grad_norm": 0.14394427835941315, + "learning_rate": 0.0009974104977710535, + "loss": 3.1824, + "step": 1421 + }, + { + "epoch": 0.04216706698692287, + "grad_norm": 0.13268695771694183, + "learning_rate": 0.0009974057131939891, + "loss": 3.2013, + "step": 1422 + }, + { + "epoch": 0.04219672033923435, + "grad_norm": 0.15505629777908325, + "learning_rate": 0.0009974009242123118, + "loss": 3.1876, + "step": 1423 + }, + { + "epoch": 0.04222637369154583, + "grad_norm": 0.17314930260181427, + "learning_rate": 0.0009973961308260637, + "loss": 3.15, + "step": 1424 + }, + { + "epoch": 0.04225602704385731, + "grad_norm": 0.17086604237556458, + "learning_rate": 0.0009973913330352877, + "loss": 3.1296, + "step": 1425 + }, + { + "epoch": 0.042285680396168784, + "grad_norm": 0.21781811118125916, + "learning_rate": 0.000997386530840026, + "loss": 3.1598, + "step": 1426 + }, + { + "epoch": 0.04231533374848027, + "grad_norm": 0.22940967977046967, + "learning_rate": 0.0009973817242403215, + "loss": 3.1543, + "step": 1427 + }, + { + "epoch": 0.042344987100791744, + "grad_norm": 0.20206034183502197, + "learning_rate": 0.000997376913236216, + "loss": 3.1381, + "step": 1428 + }, + { + "epoch": 0.04237464045310323, + "grad_norm": 0.18272003531455994, + "learning_rate": 0.0009973720978277527, + "loss": 3.1678, + "step": 1429 + }, + { + "epoch": 0.0424042938054147, + "grad_norm": 0.20317570865154266, + "learning_rate": 0.0009973672780149742, + "loss": 3.1665, + "step": 1430 + }, + { + "epoch": 0.04243394715772618, + "grad_norm": 0.191745325922966, + "learning_rate": 0.000997362453797923, + "loss": 3.1712, + "step": 1431 + }, + { + "epoch": 0.04246360051003766, + "grad_norm": 0.16674186289310455, + "learning_rate": 0.000997357625176642, + "loss": 3.1814, + "step": 1432 + }, + { + "epoch": 0.04249325386234914, + "grad_norm": 0.1772426962852478, + "learning_rate": 0.0009973527921511738, + "loss": 3.1646, + "step": 1433 + }, + { + "epoch": 0.042522907214660614, + "grad_norm": 0.1604829877614975, + "learning_rate": 0.0009973479547215611, + "loss": 3.1488, + "step": 1434 + }, + { + "epoch": 0.0425525605669721, + "grad_norm": 0.13843147456645966, + "learning_rate": 0.000997343112887847, + "loss": 3.213, + "step": 1435 + }, + { + "epoch": 0.042582213919283574, + "grad_norm": 0.15388314425945282, + "learning_rate": 0.0009973382666500744, + "loss": 3.1993, + "step": 1436 + }, + { + "epoch": 0.04261186727159506, + "grad_norm": 0.13701830804347992, + "learning_rate": 0.000997333416008286, + "loss": 3.187, + "step": 1437 + }, + { + "epoch": 0.04264152062390653, + "grad_norm": 0.14419814944267273, + "learning_rate": 0.0009973285609625247, + "loss": 3.1781, + "step": 1438 + }, + { + "epoch": 0.04267117397621801, + "grad_norm": 0.17768247425556183, + "learning_rate": 0.0009973237015128338, + "loss": 3.1664, + "step": 1439 + }, + { + "epoch": 0.04270082732852949, + "grad_norm": 0.17183168232440948, + "learning_rate": 0.000997318837659256, + "loss": 3.1503, + "step": 1440 + }, + { + "epoch": 0.04273048068084097, + "grad_norm": 0.17283552885055542, + "learning_rate": 0.0009973139694018347, + "loss": 3.1756, + "step": 1441 + }, + { + "epoch": 0.04276013403315245, + "grad_norm": 0.17178694903850555, + "learning_rate": 0.000997309096740613, + "loss": 3.1557, + "step": 1442 + }, + { + "epoch": 0.04278978738546393, + "grad_norm": 0.17234095931053162, + "learning_rate": 0.0009973042196756334, + "loss": 3.1683, + "step": 1443 + }, + { + "epoch": 0.042819440737775404, + "grad_norm": 0.19166772067546844, + "learning_rate": 0.00099729933820694, + "loss": 3.154, + "step": 1444 + }, + { + "epoch": 0.04284909409008689, + "grad_norm": 0.21404066681861877, + "learning_rate": 0.0009972944523345753, + "loss": 3.1599, + "step": 1445 + }, + { + "epoch": 0.04287874744239836, + "grad_norm": 0.196225106716156, + "learning_rate": 0.000997289562058583, + "loss": 3.216, + "step": 1446 + }, + { + "epoch": 0.04290840079470984, + "grad_norm": 0.1849777102470398, + "learning_rate": 0.0009972846673790062, + "loss": 3.1941, + "step": 1447 + }, + { + "epoch": 0.04293805414702132, + "grad_norm": 0.19378624856472015, + "learning_rate": 0.0009972797682958885, + "loss": 3.1625, + "step": 1448 + }, + { + "epoch": 0.0429677074993328, + "grad_norm": 0.22892548143863678, + "learning_rate": 0.0009972748648092728, + "loss": 3.1664, + "step": 1449 + }, + { + "epoch": 0.04299736085164428, + "grad_norm": 0.2322714477777481, + "learning_rate": 0.000997269956919203, + "loss": 3.1958, + "step": 1450 + }, + { + "epoch": 0.04302701420395576, + "grad_norm": 0.19566719233989716, + "learning_rate": 0.0009972650446257224, + "loss": 3.1785, + "step": 1451 + }, + { + "epoch": 0.043056667556267234, + "grad_norm": 0.1984502673149109, + "learning_rate": 0.0009972601279288743, + "loss": 3.1692, + "step": 1452 + }, + { + "epoch": 0.04308632090857872, + "grad_norm": 0.1637876182794571, + "learning_rate": 0.0009972552068287027, + "loss": 3.1597, + "step": 1453 + }, + { + "epoch": 0.04311597426089019, + "grad_norm": 0.15846039354801178, + "learning_rate": 0.0009972502813252507, + "loss": 3.1997, + "step": 1454 + }, + { + "epoch": 0.043145627613201676, + "grad_norm": 0.16533057391643524, + "learning_rate": 0.0009972453514185621, + "loss": 3.1346, + "step": 1455 + }, + { + "epoch": 0.04317528096551315, + "grad_norm": 0.15732938051223755, + "learning_rate": 0.0009972404171086806, + "loss": 3.1632, + "step": 1456 + }, + { + "epoch": 0.04320493431782463, + "grad_norm": 0.1582668572664261, + "learning_rate": 0.0009972354783956499, + "loss": 3.1451, + "step": 1457 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 0.1498292237520218, + "learning_rate": 0.0009972305352795136, + "loss": 3.1432, + "step": 1458 + }, + { + "epoch": 0.04326424102244759, + "grad_norm": 0.16233065724372864, + "learning_rate": 0.0009972255877603157, + "loss": 3.1356, + "step": 1459 + }, + { + "epoch": 0.043293894374759063, + "grad_norm": 0.14255116879940033, + "learning_rate": 0.0009972206358380999, + "loss": 3.1421, + "step": 1460 + }, + { + "epoch": 0.04332354772707055, + "grad_norm": 0.1621396392583847, + "learning_rate": 0.00099721567951291, + "loss": 3.1872, + "step": 1461 + }, + { + "epoch": 0.04335320107938202, + "grad_norm": 0.15604902803897858, + "learning_rate": 0.0009972107187847896, + "loss": 3.129, + "step": 1462 + }, + { + "epoch": 0.043382854431693506, + "grad_norm": 0.1664065569639206, + "learning_rate": 0.0009972057536537834, + "loss": 3.1835, + "step": 1463 + }, + { + "epoch": 0.04341250778400498, + "grad_norm": 0.16522136330604553, + "learning_rate": 0.0009972007841199345, + "loss": 3.1215, + "step": 1464 + }, + { + "epoch": 0.04344216113631646, + "grad_norm": 0.1591879427433014, + "learning_rate": 0.0009971958101832874, + "loss": 3.1574, + "step": 1465 + }, + { + "epoch": 0.04347181448862794, + "grad_norm": 0.1941690891981125, + "learning_rate": 0.0009971908318438863, + "loss": 3.196, + "step": 1466 + }, + { + "epoch": 0.04350146784093942, + "grad_norm": 0.21706028282642365, + "learning_rate": 0.0009971858491017748, + "loss": 3.143, + "step": 1467 + }, + { + "epoch": 0.0435311211932509, + "grad_norm": 0.2141958326101303, + "learning_rate": 0.0009971808619569974, + "loss": 3.1393, + "step": 1468 + }, + { + "epoch": 0.04356077454556238, + "grad_norm": 0.22767607867717743, + "learning_rate": 0.000997175870409598, + "loss": 3.1509, + "step": 1469 + }, + { + "epoch": 0.04359042789787385, + "grad_norm": 0.22993257641792297, + "learning_rate": 0.0009971708744596212, + "loss": 3.2006, + "step": 1470 + }, + { + "epoch": 0.043620081250185336, + "grad_norm": 0.16939038038253784, + "learning_rate": 0.0009971658741071106, + "loss": 3.1137, + "step": 1471 + }, + { + "epoch": 0.04364973460249681, + "grad_norm": 0.17313817143440247, + "learning_rate": 0.000997160869352111, + "loss": 3.1532, + "step": 1472 + }, + { + "epoch": 0.04367938795480829, + "grad_norm": 0.1624296009540558, + "learning_rate": 0.0009971558601946666, + "loss": 3.1524, + "step": 1473 + }, + { + "epoch": 0.04370904130711977, + "grad_norm": 0.15098980069160461, + "learning_rate": 0.0009971508466348217, + "loss": 3.1162, + "step": 1474 + }, + { + "epoch": 0.04373869465943125, + "grad_norm": 0.1552538275718689, + "learning_rate": 0.0009971458286726208, + "loss": 3.1855, + "step": 1475 + }, + { + "epoch": 0.04376834801174273, + "grad_norm": 0.15561558306217194, + "learning_rate": 0.0009971408063081083, + "loss": 3.1456, + "step": 1476 + }, + { + "epoch": 0.043798001364054207, + "grad_norm": 0.14748160541057587, + "learning_rate": 0.0009971357795413283, + "loss": 3.1156, + "step": 1477 + }, + { + "epoch": 0.04382765471636568, + "grad_norm": 0.12087760865688324, + "learning_rate": 0.0009971307483723258, + "loss": 3.1566, + "step": 1478 + }, + { + "epoch": 0.043857308068677166, + "grad_norm": 0.12604349851608276, + "learning_rate": 0.0009971257128011453, + "loss": 3.1467, + "step": 1479 + }, + { + "epoch": 0.04388696142098864, + "grad_norm": 0.13662078976631165, + "learning_rate": 0.0009971206728278312, + "loss": 3.1309, + "step": 1480 + }, + { + "epoch": 0.043916614773300125, + "grad_norm": 0.15561465919017792, + "learning_rate": 0.0009971156284524284, + "loss": 3.1641, + "step": 1481 + }, + { + "epoch": 0.0439462681256116, + "grad_norm": 0.15790610015392303, + "learning_rate": 0.000997110579674981, + "loss": 3.1362, + "step": 1482 + }, + { + "epoch": 0.04397592147792308, + "grad_norm": 0.14577066898345947, + "learning_rate": 0.0009971055264955345, + "loss": 3.1187, + "step": 1483 + }, + { + "epoch": 0.04400557483023456, + "grad_norm": 0.1370580792427063, + "learning_rate": 0.000997100468914133, + "loss": 3.0934, + "step": 1484 + }, + { + "epoch": 0.044035228182546036, + "grad_norm": 0.1711411327123642, + "learning_rate": 0.0009970954069308216, + "loss": 3.1555, + "step": 1485 + }, + { + "epoch": 0.04406488153485751, + "grad_norm": 0.2066470980644226, + "learning_rate": 0.0009970903405456448, + "loss": 3.1544, + "step": 1486 + }, + { + "epoch": 0.044094534887168996, + "grad_norm": 0.24918904900550842, + "learning_rate": 0.0009970852697586481, + "loss": 3.1684, + "step": 1487 + }, + { + "epoch": 0.04412418823948047, + "grad_norm": 0.2496899515390396, + "learning_rate": 0.0009970801945698759, + "loss": 3.1488, + "step": 1488 + }, + { + "epoch": 0.044153841591791955, + "grad_norm": 0.19436126947402954, + "learning_rate": 0.000997075114979373, + "loss": 3.1734, + "step": 1489 + }, + { + "epoch": 0.04418349494410343, + "grad_norm": 0.20232580602169037, + "learning_rate": 0.000997070030987185, + "loss": 3.1627, + "step": 1490 + }, + { + "epoch": 0.04421314829641491, + "grad_norm": 0.1989710032939911, + "learning_rate": 0.0009970649425933562, + "loss": 3.1849, + "step": 1491 + }, + { + "epoch": 0.04424280164872639, + "grad_norm": 0.19092707335948944, + "learning_rate": 0.0009970598497979321, + "loss": 3.1655, + "step": 1492 + }, + { + "epoch": 0.044272455001037866, + "grad_norm": 0.2033083140850067, + "learning_rate": 0.000997054752600958, + "loss": 3.1811, + "step": 1493 + }, + { + "epoch": 0.04430210835334935, + "grad_norm": 0.1860143542289734, + "learning_rate": 0.0009970496510024786, + "loss": 3.1264, + "step": 1494 + }, + { + "epoch": 0.044331761705660826, + "grad_norm": 0.16416044533252716, + "learning_rate": 0.000997044545002539, + "loss": 3.1348, + "step": 1495 + }, + { + "epoch": 0.0443614150579723, + "grad_norm": 0.17923004925251007, + "learning_rate": 0.0009970394346011848, + "loss": 3.1379, + "step": 1496 + }, + { + "epoch": 0.044391068410283785, + "grad_norm": 0.1510276049375534, + "learning_rate": 0.000997034319798461, + "loss": 3.136, + "step": 1497 + }, + { + "epoch": 0.04442072176259526, + "grad_norm": 0.1454465538263321, + "learning_rate": 0.0009970292005944132, + "loss": 3.1611, + "step": 1498 + }, + { + "epoch": 0.04445037511490674, + "grad_norm": 0.14327214658260345, + "learning_rate": 0.0009970240769890863, + "loss": 3.1459, + "step": 1499 + }, + { + "epoch": 0.04448002846721822, + "grad_norm": 0.13976392149925232, + "learning_rate": 0.0009970189489825261, + "loss": 3.14, + "step": 1500 + }, + { + "epoch": 0.044509681819529696, + "grad_norm": 0.2255636751651764, + "learning_rate": 0.0009970138165747778, + "loss": 3.1414, + "step": 1501 + }, + { + "epoch": 0.04453933517184118, + "grad_norm": 0.142348051071167, + "learning_rate": 0.0009970086797658866, + "loss": 3.184, + "step": 1502 + }, + { + "epoch": 0.044568988524152656, + "grad_norm": 0.14146175980567932, + "learning_rate": 0.0009970035385558982, + "loss": 3.1166, + "step": 1503 + }, + { + "epoch": 0.04459864187646413, + "grad_norm": 0.11314002424478531, + "learning_rate": 0.0009969983929448585, + "loss": 3.0952, + "step": 1504 + }, + { + "epoch": 0.044628295228775615, + "grad_norm": 0.1472439467906952, + "learning_rate": 0.0009969932429328124, + "loss": 3.1921, + "step": 1505 + }, + { + "epoch": 0.04465794858108709, + "grad_norm": 0.16989026963710785, + "learning_rate": 0.0009969880885198062, + "loss": 3.1562, + "step": 1506 + }, + { + "epoch": 0.044687601933398574, + "grad_norm": 0.15537303686141968, + "learning_rate": 0.0009969829297058848, + "loss": 3.1514, + "step": 1507 + }, + { + "epoch": 0.04471725528571005, + "grad_norm": 0.1647614687681198, + "learning_rate": 0.0009969777664910944, + "loss": 3.1461, + "step": 1508 + }, + { + "epoch": 0.044746908638021526, + "grad_norm": 0.17941266298294067, + "learning_rate": 0.0009969725988754805, + "loss": 3.1464, + "step": 1509 + }, + { + "epoch": 0.04477656199033301, + "grad_norm": 0.1982162743806839, + "learning_rate": 0.000996967426859089, + "loss": 3.1288, + "step": 1510 + }, + { + "epoch": 0.044806215342644486, + "grad_norm": 0.23538212478160858, + "learning_rate": 0.0009969622504419655, + "loss": 3.1474, + "step": 1511 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 0.21888920664787292, + "learning_rate": 0.0009969570696241562, + "loss": 3.1394, + "step": 1512 + }, + { + "epoch": 0.044865522047267445, + "grad_norm": 0.17266583442687988, + "learning_rate": 0.0009969518844057067, + "loss": 3.1771, + "step": 1513 + }, + { + "epoch": 0.04489517539957892, + "grad_norm": 0.24074441194534302, + "learning_rate": 0.0009969466947866627, + "loss": 3.1358, + "step": 1514 + }, + { + "epoch": 0.044924828751890404, + "grad_norm": 0.23520807921886444, + "learning_rate": 0.0009969415007670707, + "loss": 3.1545, + "step": 1515 + }, + { + "epoch": 0.04495448210420188, + "grad_norm": 0.24756231904029846, + "learning_rate": 0.0009969363023469764, + "loss": 3.1506, + "step": 1516 + }, + { + "epoch": 0.044984135456513356, + "grad_norm": 0.26947495341300964, + "learning_rate": 0.0009969310995264257, + "loss": 3.1465, + "step": 1517 + }, + { + "epoch": 0.04501378880882484, + "grad_norm": 0.27846235036849976, + "learning_rate": 0.0009969258923054648, + "loss": 3.136, + "step": 1518 + }, + { + "epoch": 0.045043442161136316, + "grad_norm": 0.2871236205101013, + "learning_rate": 0.00099692068068414, + "loss": 3.1889, + "step": 1519 + }, + { + "epoch": 0.0450730955134478, + "grad_norm": 0.23365433514118195, + "learning_rate": 0.0009969154646624972, + "loss": 3.2157, + "step": 1520 + }, + { + "epoch": 0.045102748865759275, + "grad_norm": 0.19819538295269012, + "learning_rate": 0.0009969102442405826, + "loss": 3.1659, + "step": 1521 + }, + { + "epoch": 0.04513240221807075, + "grad_norm": 0.17190136015415192, + "learning_rate": 0.0009969050194184425, + "loss": 3.1511, + "step": 1522 + }, + { + "epoch": 0.045162055570382234, + "grad_norm": 0.16605910658836365, + "learning_rate": 0.0009968997901961233, + "loss": 3.1051, + "step": 1523 + }, + { + "epoch": 0.04519170892269371, + "grad_norm": 0.16000422835350037, + "learning_rate": 0.000996894556573671, + "loss": 3.125, + "step": 1524 + }, + { + "epoch": 0.045221362275005186, + "grad_norm": 0.13862937688827515, + "learning_rate": 0.0009968893185511322, + "loss": 3.152, + "step": 1525 + }, + { + "epoch": 0.04525101562731667, + "grad_norm": 0.12229993939399719, + "learning_rate": 0.000996884076128553, + "loss": 3.1399, + "step": 1526 + }, + { + "epoch": 0.045280668979628146, + "grad_norm": 0.12168464064598083, + "learning_rate": 0.0009968788293059803, + "loss": 3.1103, + "step": 1527 + }, + { + "epoch": 0.04531032233193963, + "grad_norm": 0.13368918001651764, + "learning_rate": 0.0009968735780834603, + "loss": 3.1326, + "step": 1528 + }, + { + "epoch": 0.045339975684251105, + "grad_norm": 0.12107954174280167, + "learning_rate": 0.0009968683224610394, + "loss": 3.1319, + "step": 1529 + }, + { + "epoch": 0.04536962903656258, + "grad_norm": 0.13092151284217834, + "learning_rate": 0.000996863062438764, + "loss": 3.1134, + "step": 1530 + }, + { + "epoch": 0.045399282388874064, + "grad_norm": 0.12221729010343552, + "learning_rate": 0.000996857798016681, + "loss": 3.0819, + "step": 1531 + }, + { + "epoch": 0.04542893574118554, + "grad_norm": 0.11650129407644272, + "learning_rate": 0.0009968525291948372, + "loss": 3.179, + "step": 1532 + }, + { + "epoch": 0.04545858909349702, + "grad_norm": 0.1295059770345688, + "learning_rate": 0.0009968472559732787, + "loss": 3.1324, + "step": 1533 + }, + { + "epoch": 0.0454882424458085, + "grad_norm": 0.1440446376800537, + "learning_rate": 0.0009968419783520524, + "loss": 3.158, + "step": 1534 + }, + { + "epoch": 0.045517895798119976, + "grad_norm": 0.16215123236179352, + "learning_rate": 0.0009968366963312052, + "loss": 3.1114, + "step": 1535 + }, + { + "epoch": 0.04554754915043146, + "grad_norm": 0.15695405006408691, + "learning_rate": 0.0009968314099107838, + "loss": 3.1233, + "step": 1536 + }, + { + "epoch": 0.045577202502742935, + "grad_norm": 0.14062070846557617, + "learning_rate": 0.000996826119090835, + "loss": 3.1438, + "step": 1537 + }, + { + "epoch": 0.04560685585505441, + "grad_norm": 0.15049804747104645, + "learning_rate": 0.0009968208238714056, + "loss": 3.1267, + "step": 1538 + }, + { + "epoch": 0.045636509207365894, + "grad_norm": 0.15869706869125366, + "learning_rate": 0.0009968155242525425, + "loss": 3.1776, + "step": 1539 + }, + { + "epoch": 0.04566616255967737, + "grad_norm": 0.17372441291809082, + "learning_rate": 0.0009968102202342927, + "loss": 3.1302, + "step": 1540 + }, + { + "epoch": 0.04569581591198885, + "grad_norm": 0.17565791308879852, + "learning_rate": 0.000996804911816703, + "loss": 3.1155, + "step": 1541 + }, + { + "epoch": 0.04572546926430033, + "grad_norm": 0.1932016760110855, + "learning_rate": 0.000996799598999821, + "loss": 3.1382, + "step": 1542 + }, + { + "epoch": 0.045755122616611806, + "grad_norm": 0.21955536305904388, + "learning_rate": 0.0009967942817836928, + "loss": 3.1214, + "step": 1543 + }, + { + "epoch": 0.04578477596892329, + "grad_norm": 0.22861556708812714, + "learning_rate": 0.000996788960168366, + "loss": 3.1381, + "step": 1544 + }, + { + "epoch": 0.045814429321234765, + "grad_norm": 0.19819606840610504, + "learning_rate": 0.0009967836341538878, + "loss": 3.1525, + "step": 1545 + }, + { + "epoch": 0.04584408267354625, + "grad_norm": 0.1577143371105194, + "learning_rate": 0.0009967783037403053, + "loss": 3.1526, + "step": 1546 + }, + { + "epoch": 0.045873736025857724, + "grad_norm": 0.1707703322172165, + "learning_rate": 0.0009967729689276655, + "loss": 3.1628, + "step": 1547 + }, + { + "epoch": 0.0459033893781692, + "grad_norm": 0.17189505696296692, + "learning_rate": 0.0009967676297160158, + "loss": 3.1004, + "step": 1548 + }, + { + "epoch": 0.04593304273048068, + "grad_norm": 0.17508497834205627, + "learning_rate": 0.0009967622861054035, + "loss": 3.1248, + "step": 1549 + }, + { + "epoch": 0.04596269608279216, + "grad_norm": 0.19042792916297913, + "learning_rate": 0.000996756938095876, + "loss": 3.1305, + "step": 1550 + }, + { + "epoch": 0.045992349435103635, + "grad_norm": 0.1747506856918335, + "learning_rate": 0.0009967515856874804, + "loss": 3.1105, + "step": 1551 + }, + { + "epoch": 0.04602200278741512, + "grad_norm": 0.1663828045129776, + "learning_rate": 0.0009967462288802643, + "loss": 3.1513, + "step": 1552 + }, + { + "epoch": 0.046051656139726595, + "grad_norm": 0.16053098440170288, + "learning_rate": 0.0009967408676742752, + "loss": 3.1516, + "step": 1553 + }, + { + "epoch": 0.04608130949203808, + "grad_norm": 0.200715571641922, + "learning_rate": 0.0009967355020695603, + "loss": 3.1277, + "step": 1554 + }, + { + "epoch": 0.046110962844349554, + "grad_norm": 0.20398209989070892, + "learning_rate": 0.0009967301320661672, + "loss": 3.0931, + "step": 1555 + }, + { + "epoch": 0.04614061619666103, + "grad_norm": 0.17990347743034363, + "learning_rate": 0.0009967247576641437, + "loss": 3.1353, + "step": 1556 + }, + { + "epoch": 0.04617026954897251, + "grad_norm": 0.16095615923404694, + "learning_rate": 0.0009967193788635372, + "loss": 3.1111, + "step": 1557 + }, + { + "epoch": 0.04619992290128399, + "grad_norm": 0.1929856538772583, + "learning_rate": 0.000996713995664395, + "loss": 3.1706, + "step": 1558 + }, + { + "epoch": 0.04622957625359547, + "grad_norm": 0.17606747150421143, + "learning_rate": 0.0009967086080667656, + "loss": 3.1168, + "step": 1559 + }, + { + "epoch": 0.04625922960590695, + "grad_norm": 0.1573939025402069, + "learning_rate": 0.0009967032160706959, + "loss": 3.1429, + "step": 1560 + }, + { + "epoch": 0.046288882958218425, + "grad_norm": 0.1743171066045761, + "learning_rate": 0.0009966978196762342, + "loss": 3.1473, + "step": 1561 + }, + { + "epoch": 0.04631853631052991, + "grad_norm": 0.1634756177663803, + "learning_rate": 0.0009966924188834277, + "loss": 3.1647, + "step": 1562 + }, + { + "epoch": 0.046348189662841384, + "grad_norm": 0.166097491979599, + "learning_rate": 0.0009966870136923248, + "loss": 3.1452, + "step": 1563 + }, + { + "epoch": 0.04637784301515286, + "grad_norm": 0.1798708587884903, + "learning_rate": 0.000996681604102973, + "loss": 3.1318, + "step": 1564 + }, + { + "epoch": 0.04640749636746434, + "grad_norm": 0.18537937104701996, + "learning_rate": 0.0009966761901154207, + "loss": 3.1645, + "step": 1565 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 0.1724862903356552, + "learning_rate": 0.0009966707717297151, + "loss": 3.1518, + "step": 1566 + }, + { + "epoch": 0.0464668030720873, + "grad_norm": 0.1714521199464798, + "learning_rate": 0.0009966653489459048, + "loss": 3.1717, + "step": 1567 + }, + { + "epoch": 0.04649645642439878, + "grad_norm": 0.14380691945552826, + "learning_rate": 0.0009966599217640375, + "loss": 3.1238, + "step": 1568 + }, + { + "epoch": 0.046526109776710255, + "grad_norm": 0.15002880990505219, + "learning_rate": 0.0009966544901841613, + "loss": 3.1019, + "step": 1569 + }, + { + "epoch": 0.04655576312902174, + "grad_norm": 0.14462661743164062, + "learning_rate": 0.0009966490542063244, + "loss": 3.1376, + "step": 1570 + }, + { + "epoch": 0.046585416481333214, + "grad_norm": 0.13823319971561432, + "learning_rate": 0.0009966436138305749, + "loss": 3.1402, + "step": 1571 + }, + { + "epoch": 0.0466150698336447, + "grad_norm": 0.1806962490081787, + "learning_rate": 0.000996638169056961, + "loss": 3.1754, + "step": 1572 + }, + { + "epoch": 0.04664472318595617, + "grad_norm": 0.2087087631225586, + "learning_rate": 0.0009966327198855304, + "loss": 3.1497, + "step": 1573 + }, + { + "epoch": 0.04667437653826765, + "grad_norm": 0.21647971868515015, + "learning_rate": 0.0009966272663163324, + "loss": 3.165, + "step": 1574 + }, + { + "epoch": 0.04670402989057913, + "grad_norm": 0.2559834122657776, + "learning_rate": 0.0009966218083494146, + "loss": 3.1148, + "step": 1575 + }, + { + "epoch": 0.04673368324289061, + "grad_norm": 0.2466445118188858, + "learning_rate": 0.0009966163459848253, + "loss": 3.1349, + "step": 1576 + }, + { + "epoch": 0.046763336595202085, + "grad_norm": 0.2453184723854065, + "learning_rate": 0.0009966108792226129, + "loss": 3.1254, + "step": 1577 + }, + { + "epoch": 0.04679298994751357, + "grad_norm": 0.20696814358234406, + "learning_rate": 0.0009966054080628262, + "loss": 3.1219, + "step": 1578 + }, + { + "epoch": 0.046822643299825044, + "grad_norm": 0.19545380771160126, + "learning_rate": 0.0009965999325055133, + "loss": 3.1422, + "step": 1579 + }, + { + "epoch": 0.04685229665213653, + "grad_norm": 0.17389340698719025, + "learning_rate": 0.0009965944525507225, + "loss": 3.1523, + "step": 1580 + }, + { + "epoch": 0.046881950004448, + "grad_norm": 0.15615461766719818, + "learning_rate": 0.0009965889681985028, + "loss": 3.1399, + "step": 1581 + }, + { + "epoch": 0.04691160335675948, + "grad_norm": 0.1542995572090149, + "learning_rate": 0.0009965834794489026, + "loss": 3.1282, + "step": 1582 + }, + { + "epoch": 0.04694125670907096, + "grad_norm": 0.16459213197231293, + "learning_rate": 0.0009965779863019704, + "loss": 3.1195, + "step": 1583 + }, + { + "epoch": 0.04697091006138244, + "grad_norm": 0.16613422334194183, + "learning_rate": 0.0009965724887577548, + "loss": 3.1489, + "step": 1584 + }, + { + "epoch": 0.04700056341369392, + "grad_norm": 0.14986436069011688, + "learning_rate": 0.0009965669868163048, + "loss": 3.1526, + "step": 1585 + }, + { + "epoch": 0.0470302167660054, + "grad_norm": 0.14300774037837982, + "learning_rate": 0.0009965614804776687, + "loss": 3.1308, + "step": 1586 + }, + { + "epoch": 0.047059870118316874, + "grad_norm": 0.155240997672081, + "learning_rate": 0.0009965559697418956, + "loss": 3.1414, + "step": 1587 + }, + { + "epoch": 0.04708952347062836, + "grad_norm": 0.15050947666168213, + "learning_rate": 0.000996550454609034, + "loss": 3.114, + "step": 1588 + }, + { + "epoch": 0.04711917682293983, + "grad_norm": 0.1474442183971405, + "learning_rate": 0.0009965449350791329, + "loss": 3.1278, + "step": 1589 + }, + { + "epoch": 0.04714883017525131, + "grad_norm": 0.1502431482076645, + "learning_rate": 0.0009965394111522412, + "loss": 3.1324, + "step": 1590 + }, + { + "epoch": 0.04717848352756279, + "grad_norm": 0.1527746319770813, + "learning_rate": 0.0009965338828284078, + "loss": 3.1165, + "step": 1591 + }, + { + "epoch": 0.04720813687987427, + "grad_norm": 0.15561190247535706, + "learning_rate": 0.0009965283501076818, + "loss": 3.0668, + "step": 1592 + }, + { + "epoch": 0.04723779023218575, + "grad_norm": 0.19489581882953644, + "learning_rate": 0.0009965228129901118, + "loss": 3.1065, + "step": 1593 + }, + { + "epoch": 0.04726744358449723, + "grad_norm": 0.14627884328365326, + "learning_rate": 0.0009965172714757472, + "loss": 3.1263, + "step": 1594 + }, + { + "epoch": 0.047297096936808704, + "grad_norm": 0.16048525273799896, + "learning_rate": 0.0009965117255646369, + "loss": 3.1249, + "step": 1595 + }, + { + "epoch": 0.04732675028912019, + "grad_norm": 0.16009153425693512, + "learning_rate": 0.00099650617525683, + "loss": 3.1178, + "step": 1596 + }, + { + "epoch": 0.04735640364143166, + "grad_norm": 0.16331857442855835, + "learning_rate": 0.0009965006205523758, + "loss": 3.1117, + "step": 1597 + }, + { + "epoch": 0.047386056993743146, + "grad_norm": 0.17917253077030182, + "learning_rate": 0.0009964950614513232, + "loss": 3.0957, + "step": 1598 + }, + { + "epoch": 0.04741571034605462, + "grad_norm": 0.18453560769557953, + "learning_rate": 0.0009964894979537216, + "loss": 3.1404, + "step": 1599 + }, + { + "epoch": 0.0474453636983661, + "grad_norm": 0.19447658956050873, + "learning_rate": 0.0009964839300596205, + "loss": 3.1019, + "step": 1600 + }, + { + "epoch": 0.04747501705067758, + "grad_norm": 0.1961759477853775, + "learning_rate": 0.0009964783577690688, + "loss": 3.1203, + "step": 1601 + }, + { + "epoch": 0.04750467040298906, + "grad_norm": 0.1857331544160843, + "learning_rate": 0.000996472781082116, + "loss": 3.1521, + "step": 1602 + }, + { + "epoch": 0.047534323755300534, + "grad_norm": 0.17932288348674774, + "learning_rate": 0.0009964671999988118, + "loss": 3.1074, + "step": 1603 + }, + { + "epoch": 0.04756397710761202, + "grad_norm": 0.15755368769168854, + "learning_rate": 0.000996461614519205, + "loss": 3.1324, + "step": 1604 + }, + { + "epoch": 0.04759363045992349, + "grad_norm": 0.17925459146499634, + "learning_rate": 0.0009964560246433457, + "loss": 3.1123, + "step": 1605 + }, + { + "epoch": 0.047623283812234976, + "grad_norm": 0.17222686111927032, + "learning_rate": 0.0009964504303712827, + "loss": 3.1467, + "step": 1606 + }, + { + "epoch": 0.04765293716454645, + "grad_norm": 0.1789839118719101, + "learning_rate": 0.0009964448317030662, + "loss": 3.1279, + "step": 1607 + }, + { + "epoch": 0.04768259051685793, + "grad_norm": 0.174265518784523, + "learning_rate": 0.0009964392286387453, + "loss": 3.1335, + "step": 1608 + }, + { + "epoch": 0.04771224386916941, + "grad_norm": 0.17445887625217438, + "learning_rate": 0.00099643362117837, + "loss": 3.0867, + "step": 1609 + }, + { + "epoch": 0.04774189722148089, + "grad_norm": 0.13114801049232483, + "learning_rate": 0.0009964280093219897, + "loss": 3.1098, + "step": 1610 + }, + { + "epoch": 0.04777155057379237, + "grad_norm": 0.15031877160072327, + "learning_rate": 0.0009964223930696541, + "loss": 3.1492, + "step": 1611 + }, + { + "epoch": 0.04780120392610385, + "grad_norm": 0.17331084609031677, + "learning_rate": 0.0009964167724214133, + "loss": 3.1024, + "step": 1612 + }, + { + "epoch": 0.04783085727841532, + "grad_norm": 0.181935653090477, + "learning_rate": 0.0009964111473773165, + "loss": 3.1168, + "step": 1613 + }, + { + "epoch": 0.047860510630726806, + "grad_norm": 0.1966409534215927, + "learning_rate": 0.0009964055179374138, + "loss": 3.1045, + "step": 1614 + }, + { + "epoch": 0.04789016398303828, + "grad_norm": 0.19644126296043396, + "learning_rate": 0.000996399884101755, + "loss": 3.0939, + "step": 1615 + }, + { + "epoch": 0.04791981733534976, + "grad_norm": 0.16715896129608154, + "learning_rate": 0.00099639424587039, + "loss": 3.1148, + "step": 1616 + }, + { + "epoch": 0.04794947068766124, + "grad_norm": 0.19233150780200958, + "learning_rate": 0.0009963886032433688, + "loss": 3.1263, + "step": 1617 + }, + { + "epoch": 0.04797912403997272, + "grad_norm": 0.2113412469625473, + "learning_rate": 0.0009963829562207413, + "loss": 3.128, + "step": 1618 + }, + { + "epoch": 0.0480087773922842, + "grad_norm": 0.19927610456943512, + "learning_rate": 0.0009963773048025577, + "loss": 3.1272, + "step": 1619 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 0.19835221767425537, + "learning_rate": 0.0009963716489888677, + "loss": 3.1358, + "step": 1620 + }, + { + "epoch": 0.04806808409690715, + "grad_norm": 0.1824713796377182, + "learning_rate": 0.0009963659887797217, + "loss": 3.15, + "step": 1621 + }, + { + "epoch": 0.048097737449218636, + "grad_norm": 0.23277097940444946, + "learning_rate": 0.0009963603241751695, + "loss": 3.1452, + "step": 1622 + }, + { + "epoch": 0.04812739080153011, + "grad_norm": 0.17351894080638885, + "learning_rate": 0.0009963546551752613, + "loss": 3.126, + "step": 1623 + }, + { + "epoch": 0.048157044153841595, + "grad_norm": 0.1873546987771988, + "learning_rate": 0.0009963489817800476, + "loss": 3.1548, + "step": 1624 + }, + { + "epoch": 0.04818669750615307, + "grad_norm": 0.20405083894729614, + "learning_rate": 0.0009963433039895785, + "loss": 3.1502, + "step": 1625 + }, + { + "epoch": 0.04821635085846455, + "grad_norm": 0.1606907844543457, + "learning_rate": 0.0009963376218039043, + "loss": 3.0729, + "step": 1626 + }, + { + "epoch": 0.04824600421077603, + "grad_norm": 0.1642637401819229, + "learning_rate": 0.0009963319352230752, + "loss": 3.1198, + "step": 1627 + }, + { + "epoch": 0.04827565756308751, + "grad_norm": 0.1780068576335907, + "learning_rate": 0.0009963262442471417, + "loss": 3.1413, + "step": 1628 + }, + { + "epoch": 0.04830531091539898, + "grad_norm": 0.17360851168632507, + "learning_rate": 0.000996320548876154, + "loss": 3.0958, + "step": 1629 + }, + { + "epoch": 0.048334964267710466, + "grad_norm": 0.16764216125011444, + "learning_rate": 0.0009963148491101625, + "loss": 3.0976, + "step": 1630 + }, + { + "epoch": 0.04836461762002194, + "grad_norm": 0.14442948997020721, + "learning_rate": 0.0009963091449492181, + "loss": 3.1298, + "step": 1631 + }, + { + "epoch": 0.048394270972333425, + "grad_norm": 0.1586787849664688, + "learning_rate": 0.000996303436393371, + "loss": 3.1169, + "step": 1632 + }, + { + "epoch": 0.0484239243246449, + "grad_norm": 0.15348488092422485, + "learning_rate": 0.0009962977234426716, + "loss": 3.147, + "step": 1633 + }, + { + "epoch": 0.04845357767695638, + "grad_norm": 0.14052627980709076, + "learning_rate": 0.0009962920060971707, + "loss": 3.0937, + "step": 1634 + }, + { + "epoch": 0.04848323102926786, + "grad_norm": 0.14276456832885742, + "learning_rate": 0.000996286284356919, + "loss": 3.1342, + "step": 1635 + }, + { + "epoch": 0.04851288438157934, + "grad_norm": 0.15509670972824097, + "learning_rate": 0.000996280558221967, + "loss": 3.1155, + "step": 1636 + }, + { + "epoch": 0.04854253773389081, + "grad_norm": 0.18570375442504883, + "learning_rate": 0.0009962748276923655, + "loss": 3.1131, + "step": 1637 + }, + { + "epoch": 0.048572191086202296, + "grad_norm": 0.18344052135944366, + "learning_rate": 0.0009962690927681653, + "loss": 3.1173, + "step": 1638 + }, + { + "epoch": 0.04860184443851377, + "grad_norm": 0.1913170963525772, + "learning_rate": 0.000996263353449417, + "loss": 3.048, + "step": 1639 + }, + { + "epoch": 0.048631497790825255, + "grad_norm": 0.18305933475494385, + "learning_rate": 0.0009962576097361715, + "loss": 3.1001, + "step": 1640 + }, + { + "epoch": 0.04866115114313673, + "grad_norm": 0.19334611296653748, + "learning_rate": 0.0009962518616284798, + "loss": 3.1062, + "step": 1641 + }, + { + "epoch": 0.04869080449544821, + "grad_norm": 0.18682865798473358, + "learning_rate": 0.0009962461091263925, + "loss": 3.1126, + "step": 1642 + }, + { + "epoch": 0.04872045784775969, + "grad_norm": 0.1853751689195633, + "learning_rate": 0.0009962403522299607, + "loss": 3.1146, + "step": 1643 + }, + { + "epoch": 0.04875011120007117, + "grad_norm": 0.19067083299160004, + "learning_rate": 0.0009962345909392356, + "loss": 3.1327, + "step": 1644 + }, + { + "epoch": 0.04877976455238265, + "grad_norm": 0.17511940002441406, + "learning_rate": 0.000996228825254268, + "loss": 3.1469, + "step": 1645 + }, + { + "epoch": 0.048809417904694126, + "grad_norm": 0.3993642032146454, + "learning_rate": 0.0009962230551751091, + "loss": 3.1319, + "step": 1646 + }, + { + "epoch": 0.0488390712570056, + "grad_norm": 0.18246501684188843, + "learning_rate": 0.0009962172807018096, + "loss": 3.132, + "step": 1647 + }, + { + "epoch": 0.048868724609317085, + "grad_norm": 0.17241697013378143, + "learning_rate": 0.000996211501834421, + "loss": 3.119, + "step": 1648 + }, + { + "epoch": 0.04889837796162856, + "grad_norm": 0.1904931217432022, + "learning_rate": 0.0009962057185729945, + "loss": 3.1548, + "step": 1649 + }, + { + "epoch": 0.04892803131394004, + "grad_norm": 0.15110984444618225, + "learning_rate": 0.000996199930917581, + "loss": 3.0828, + "step": 1650 + }, + { + "epoch": 0.04895768466625152, + "grad_norm": 0.148568257689476, + "learning_rate": 0.000996194138868232, + "loss": 3.1195, + "step": 1651 + }, + { + "epoch": 0.048987338018563, + "grad_norm": 0.18546465039253235, + "learning_rate": 0.0009961883424249986, + "loss": 3.1198, + "step": 1652 + }, + { + "epoch": 0.04901699137087448, + "grad_norm": 0.15864241123199463, + "learning_rate": 0.0009961825415879325, + "loss": 3.0975, + "step": 1653 + }, + { + "epoch": 0.049046644723185956, + "grad_norm": 0.17199623584747314, + "learning_rate": 0.0009961767363570848, + "loss": 3.1106, + "step": 1654 + }, + { + "epoch": 0.04907629807549743, + "grad_norm": 0.1658650040626526, + "learning_rate": 0.0009961709267325067, + "loss": 3.1057, + "step": 1655 + }, + { + "epoch": 0.049105951427808915, + "grad_norm": 0.16719132661819458, + "learning_rate": 0.00099616511271425, + "loss": 3.1316, + "step": 1656 + }, + { + "epoch": 0.04913560478012039, + "grad_norm": 0.19271376729011536, + "learning_rate": 0.0009961592943023663, + "loss": 3.1319, + "step": 1657 + }, + { + "epoch": 0.049165258132431874, + "grad_norm": 0.2002442479133606, + "learning_rate": 0.0009961534714969067, + "loss": 3.0999, + "step": 1658 + }, + { + "epoch": 0.04919491148474335, + "grad_norm": 0.22681471705436707, + "learning_rate": 0.000996147644297923, + "loss": 3.1309, + "step": 1659 + }, + { + "epoch": 0.04922456483705483, + "grad_norm": 0.2086062878370285, + "learning_rate": 0.0009961418127054666, + "loss": 3.1055, + "step": 1660 + }, + { + "epoch": 0.04925421818936631, + "grad_norm": 0.2041211873292923, + "learning_rate": 0.0009961359767195893, + "loss": 3.1598, + "step": 1661 + }, + { + "epoch": 0.049283871541677786, + "grad_norm": 0.19097095727920532, + "learning_rate": 0.0009961301363403427, + "loss": 3.1145, + "step": 1662 + }, + { + "epoch": 0.04931352489398926, + "grad_norm": 0.19221125543117523, + "learning_rate": 0.0009961242915677787, + "loss": 3.0996, + "step": 1663 + }, + { + "epoch": 0.049343178246300745, + "grad_norm": 0.19069737195968628, + "learning_rate": 0.000996118442401949, + "loss": 3.1117, + "step": 1664 + }, + { + "epoch": 0.04937283159861222, + "grad_norm": 0.16755472123622894, + "learning_rate": 0.0009961125888429054, + "loss": 3.1345, + "step": 1665 + }, + { + "epoch": 0.049402484950923704, + "grad_norm": 0.21382398903369904, + "learning_rate": 0.0009961067308906994, + "loss": 3.1071, + "step": 1666 + }, + { + "epoch": 0.04943213830323518, + "grad_norm": 0.14088962972164154, + "learning_rate": 0.0009961008685453834, + "loss": 3.115, + "step": 1667 + }, + { + "epoch": 0.04946179165554666, + "grad_norm": 0.15195348858833313, + "learning_rate": 0.000996095001807009, + "loss": 3.1302, + "step": 1668 + }, + { + "epoch": 0.04949144500785814, + "grad_norm": 0.16357681155204773, + "learning_rate": 0.0009960891306756282, + "loss": 3.1171, + "step": 1669 + }, + { + "epoch": 0.049521098360169616, + "grad_norm": 0.16318175196647644, + "learning_rate": 0.000996083255151293, + "loss": 3.1023, + "step": 1670 + }, + { + "epoch": 0.0495507517124811, + "grad_norm": 0.15163388848304749, + "learning_rate": 0.0009960773752340554, + "loss": 3.1107, + "step": 1671 + }, + { + "epoch": 0.049580405064792575, + "grad_norm": 0.16183674335479736, + "learning_rate": 0.0009960714909239673, + "loss": 3.1256, + "step": 1672 + }, + { + "epoch": 0.04961005841710405, + "grad_norm": 0.16018499433994293, + "learning_rate": 0.0009960656022210811, + "loss": 3.1446, + "step": 1673 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 0.16964177787303925, + "learning_rate": 0.000996059709125449, + "loss": 3.1327, + "step": 1674 + }, + { + "epoch": 0.04966936512172701, + "grad_norm": 0.17629221081733704, + "learning_rate": 0.000996053811637123, + "loss": 3.0903, + "step": 1675 + }, + { + "epoch": 0.04969901847403849, + "grad_norm": 0.17536292970180511, + "learning_rate": 0.0009960479097561553, + "loss": 3.0871, + "step": 1676 + }, + { + "epoch": 0.04972867182634997, + "grad_norm": 0.16054311394691467, + "learning_rate": 0.000996042003482598, + "loss": 3.1238, + "step": 1677 + }, + { + "epoch": 0.049758325178661446, + "grad_norm": 0.1563349813222885, + "learning_rate": 0.0009960360928165039, + "loss": 3.0832, + "step": 1678 + }, + { + "epoch": 0.04978797853097293, + "grad_norm": 0.1640584021806717, + "learning_rate": 0.000996030177757925, + "loss": 3.1177, + "step": 1679 + }, + { + "epoch": 0.049817631883284405, + "grad_norm": 0.16089558601379395, + "learning_rate": 0.0009960242583069137, + "loss": 3.0903, + "step": 1680 + }, + { + "epoch": 0.04984728523559588, + "grad_norm": 0.20156683027744293, + "learning_rate": 0.0009960183344635226, + "loss": 3.1306, + "step": 1681 + }, + { + "epoch": 0.049876938587907364, + "grad_norm": 0.17595869302749634, + "learning_rate": 0.0009960124062278037, + "loss": 3.1151, + "step": 1682 + }, + { + "epoch": 0.04990659194021884, + "grad_norm": 0.16643716394901276, + "learning_rate": 0.00099600647359981, + "loss": 3.1299, + "step": 1683 + }, + { + "epoch": 0.049936245292530324, + "grad_norm": 0.16023625433444977, + "learning_rate": 0.0009960005365795938, + "loss": 3.0656, + "step": 1684 + }, + { + "epoch": 0.0499658986448418, + "grad_norm": 0.18001927435398102, + "learning_rate": 0.0009959945951672079, + "loss": 3.0775, + "step": 1685 + }, + { + "epoch": 0.049995551997153276, + "grad_norm": 0.18972264230251312, + "learning_rate": 0.0009959886493627044, + "loss": 3.1294, + "step": 1686 + }, + { + "epoch": 0.05002520534946476, + "grad_norm": 0.22636544704437256, + "learning_rate": 0.0009959826991661365, + "loss": 3.1163, + "step": 1687 + }, + { + "epoch": 0.050054858701776235, + "grad_norm": 0.2711211144924164, + "learning_rate": 0.0009959767445775565, + "loss": 3.1671, + "step": 1688 + }, + { + "epoch": 0.05008451205408771, + "grad_norm": 0.2247464507818222, + "learning_rate": 0.0009959707855970174, + "loss": 3.1219, + "step": 1689 + }, + { + "epoch": 0.050114165406399194, + "grad_norm": 0.19962520897388458, + "learning_rate": 0.0009959648222245719, + "loss": 3.1368, + "step": 1690 + }, + { + "epoch": 0.05014381875871067, + "grad_norm": 0.19455958902835846, + "learning_rate": 0.0009959588544602726, + "loss": 3.0848, + "step": 1691 + }, + { + "epoch": 0.050173472111022153, + "grad_norm": 0.1919725239276886, + "learning_rate": 0.0009959528823041727, + "loss": 3.1346, + "step": 1692 + }, + { + "epoch": 0.05020312546333363, + "grad_norm": 0.16332896053791046, + "learning_rate": 0.0009959469057563247, + "loss": 3.115, + "step": 1693 + }, + { + "epoch": 0.050232778815645106, + "grad_norm": 0.17940038442611694, + "learning_rate": 0.0009959409248167818, + "loss": 3.1147, + "step": 1694 + }, + { + "epoch": 0.05026243216795659, + "grad_norm": 0.18509243428707123, + "learning_rate": 0.000995934939485597, + "loss": 3.0989, + "step": 1695 + }, + { + "epoch": 0.050292085520268065, + "grad_norm": 0.16578498482704163, + "learning_rate": 0.0009959289497628232, + "loss": 3.1018, + "step": 1696 + }, + { + "epoch": 0.05032173887257955, + "grad_norm": 0.17529244720935822, + "learning_rate": 0.0009959229556485132, + "loss": 3.1052, + "step": 1697 + }, + { + "epoch": 0.050351392224891024, + "grad_norm": 0.1451672911643982, + "learning_rate": 0.0009959169571427205, + "loss": 3.0949, + "step": 1698 + }, + { + "epoch": 0.0503810455772025, + "grad_norm": 0.15592600405216217, + "learning_rate": 0.000995910954245498, + "loss": 3.0917, + "step": 1699 + }, + { + "epoch": 0.05041069892951398, + "grad_norm": 0.1687522679567337, + "learning_rate": 0.000995904946956899, + "loss": 3.116, + "step": 1700 + }, + { + "epoch": 0.05044035228182546, + "grad_norm": 0.17695362865924835, + "learning_rate": 0.0009958989352769761, + "loss": 3.1157, + "step": 1701 + }, + { + "epoch": 0.050470005634136936, + "grad_norm": 0.18753978610038757, + "learning_rate": 0.0009958929192057835, + "loss": 3.0994, + "step": 1702 + }, + { + "epoch": 0.05049965898644842, + "grad_norm": 0.2131645530462265, + "learning_rate": 0.0009958868987433736, + "loss": 3.1114, + "step": 1703 + }, + { + "epoch": 0.050529312338759895, + "grad_norm": 0.16248515248298645, + "learning_rate": 0.0009958808738898004, + "loss": 3.1085, + "step": 1704 + }, + { + "epoch": 0.05055896569107138, + "grad_norm": 0.16717086732387543, + "learning_rate": 0.0009958748446451168, + "loss": 3.1073, + "step": 1705 + }, + { + "epoch": 0.050588619043382854, + "grad_norm": 0.16506871581077576, + "learning_rate": 0.0009958688110093764, + "loss": 3.0875, + "step": 1706 + }, + { + "epoch": 0.05061827239569433, + "grad_norm": 0.15031488239765167, + "learning_rate": 0.0009958627729826325, + "loss": 3.108, + "step": 1707 + }, + { + "epoch": 0.05064792574800581, + "grad_norm": 0.17414788901805878, + "learning_rate": 0.0009958567305649387, + "loss": 3.1074, + "step": 1708 + }, + { + "epoch": 0.05067757910031729, + "grad_norm": 0.18905456364154816, + "learning_rate": 0.0009958506837563484, + "loss": 3.1155, + "step": 1709 + }, + { + "epoch": 0.05070723245262877, + "grad_norm": 0.1981566995382309, + "learning_rate": 0.0009958446325569151, + "loss": 3.1246, + "step": 1710 + }, + { + "epoch": 0.05073688580494025, + "grad_norm": 0.17116260528564453, + "learning_rate": 0.0009958385769666927, + "loss": 3.1067, + "step": 1711 + }, + { + "epoch": 0.050766539157251725, + "grad_norm": 0.1592789590358734, + "learning_rate": 0.0009958325169857343, + "loss": 3.0948, + "step": 1712 + }, + { + "epoch": 0.05079619250956321, + "grad_norm": 0.20621265470981598, + "learning_rate": 0.0009958264526140942, + "loss": 3.1343, + "step": 1713 + }, + { + "epoch": 0.050825845861874684, + "grad_norm": 0.21098284423351288, + "learning_rate": 0.0009958203838518255, + "loss": 3.1575, + "step": 1714 + }, + { + "epoch": 0.05085549921418616, + "grad_norm": 0.21817785501480103, + "learning_rate": 0.0009958143106989822, + "loss": 3.1028, + "step": 1715 + }, + { + "epoch": 0.05088515256649764, + "grad_norm": 0.21249234676361084, + "learning_rate": 0.000995808233155618, + "loss": 3.118, + "step": 1716 + }, + { + "epoch": 0.05091480591880912, + "grad_norm": 0.1916252076625824, + "learning_rate": 0.0009958021512217869, + "loss": 3.1104, + "step": 1717 + }, + { + "epoch": 0.0509444592711206, + "grad_norm": 0.15968909859657288, + "learning_rate": 0.0009957960648975428, + "loss": 3.1182, + "step": 1718 + }, + { + "epoch": 0.05097411262343208, + "grad_norm": 0.17430919408798218, + "learning_rate": 0.0009957899741829394, + "loss": 3.1458, + "step": 1719 + }, + { + "epoch": 0.051003765975743555, + "grad_norm": 0.1791965514421463, + "learning_rate": 0.0009957838790780305, + "loss": 3.1193, + "step": 1720 + }, + { + "epoch": 0.05103341932805504, + "grad_norm": 0.16589109599590302, + "learning_rate": 0.0009957777795828703, + "loss": 3.1084, + "step": 1721 + }, + { + "epoch": 0.051063072680366514, + "grad_norm": 0.1469266265630722, + "learning_rate": 0.0009957716756975128, + "loss": 3.0433, + "step": 1722 + }, + { + "epoch": 0.051092726032678, + "grad_norm": 0.14038509130477905, + "learning_rate": 0.000995765567422012, + "loss": 3.1095, + "step": 1723 + }, + { + "epoch": 0.05112237938498947, + "grad_norm": 0.13625434041023254, + "learning_rate": 0.000995759454756422, + "loss": 3.0869, + "step": 1724 + }, + { + "epoch": 0.05115203273730095, + "grad_norm": 0.15043964982032776, + "learning_rate": 0.000995753337700797, + "loss": 3.1142, + "step": 1725 + }, + { + "epoch": 0.05118168608961243, + "grad_norm": 0.15257583558559418, + "learning_rate": 0.000995747216255191, + "loss": 3.0768, + "step": 1726 + }, + { + "epoch": 0.05121133944192391, + "grad_norm": 0.16030754148960114, + "learning_rate": 0.0009957410904196584, + "loss": 3.1293, + "step": 1727 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 0.1519380360841751, + "learning_rate": 0.0009957349601942532, + "loss": 3.1009, + "step": 1728 + }, + { + "epoch": 0.05127064614654687, + "grad_norm": 0.15510718524456024, + "learning_rate": 0.00099572882557903, + "loss": 3.0933, + "step": 1729 + }, + { + "epoch": 0.051300299498858344, + "grad_norm": 0.1627625674009323, + "learning_rate": 0.000995722686574043, + "loss": 3.0948, + "step": 1730 + }, + { + "epoch": 0.05132995285116983, + "grad_norm": 0.1703173667192459, + "learning_rate": 0.0009957165431793463, + "loss": 3.1083, + "step": 1731 + }, + { + "epoch": 0.0513596062034813, + "grad_norm": 0.1804959625005722, + "learning_rate": 0.0009957103953949947, + "loss": 3.1609, + "step": 1732 + }, + { + "epoch": 0.05138925955579278, + "grad_norm": 0.22162504494190216, + "learning_rate": 0.0009957042432210423, + "loss": 3.1245, + "step": 1733 + }, + { + "epoch": 0.05141891290810426, + "grad_norm": 0.23673060536384583, + "learning_rate": 0.0009956980866575437, + "loss": 3.1122, + "step": 1734 + }, + { + "epoch": 0.05144856626041574, + "grad_norm": 0.22650350630283356, + "learning_rate": 0.0009956919257045537, + "loss": 3.0841, + "step": 1735 + }, + { + "epoch": 0.05147821961272722, + "grad_norm": 0.2593044340610504, + "learning_rate": 0.0009956857603621266, + "loss": 3.1312, + "step": 1736 + }, + { + "epoch": 0.0515078729650387, + "grad_norm": 0.21994300186634064, + "learning_rate": 0.000995679590630317, + "loss": 3.1161, + "step": 1737 + }, + { + "epoch": 0.051537526317350174, + "grad_norm": 0.19219930469989777, + "learning_rate": 0.0009956734165091792, + "loss": 3.1207, + "step": 1738 + }, + { + "epoch": 0.05156717966966166, + "grad_norm": 0.21482834219932556, + "learning_rate": 0.0009956672379987685, + "loss": 3.1036, + "step": 1739 + }, + { + "epoch": 0.05159683302197313, + "grad_norm": 0.20744670927524567, + "learning_rate": 0.0009956610550991393, + "loss": 3.0775, + "step": 1740 + }, + { + "epoch": 0.05162648637428461, + "grad_norm": 0.17769861221313477, + "learning_rate": 0.0009956548678103465, + "loss": 3.1326, + "step": 1741 + }, + { + "epoch": 0.05165613972659609, + "grad_norm": 0.1753624826669693, + "learning_rate": 0.0009956486761324445, + "loss": 3.1352, + "step": 1742 + }, + { + "epoch": 0.05168579307890757, + "grad_norm": 0.16337907314300537, + "learning_rate": 0.0009956424800654886, + "loss": 3.1194, + "step": 1743 + }, + { + "epoch": 0.05171544643121905, + "grad_norm": 0.1561734527349472, + "learning_rate": 0.0009956362796095335, + "loss": 3.1148, + "step": 1744 + }, + { + "epoch": 0.05174509978353053, + "grad_norm": 0.145893856883049, + "learning_rate": 0.0009956300747646339, + "loss": 3.1056, + "step": 1745 + }, + { + "epoch": 0.051774753135842004, + "grad_norm": 0.15025199949741364, + "learning_rate": 0.000995623865530845, + "loss": 3.1031, + "step": 1746 + }, + { + "epoch": 0.05180440648815349, + "grad_norm": 0.13090330362319946, + "learning_rate": 0.0009956176519082217, + "loss": 3.1121, + "step": 1747 + }, + { + "epoch": 0.05183405984046496, + "grad_norm": 0.14036719501018524, + "learning_rate": 0.000995611433896819, + "loss": 3.1065, + "step": 1748 + }, + { + "epoch": 0.051863713192776446, + "grad_norm": 0.15299059450626373, + "learning_rate": 0.000995605211496692, + "loss": 3.0857, + "step": 1749 + }, + { + "epoch": 0.05189336654508792, + "grad_norm": 0.16244757175445557, + "learning_rate": 0.0009955989847078958, + "loss": 3.0697, + "step": 1750 + }, + { + "epoch": 0.0519230198973994, + "grad_norm": 0.19965127110481262, + "learning_rate": 0.0009955927535304854, + "loss": 3.1213, + "step": 1751 + }, + { + "epoch": 0.05195267324971088, + "grad_norm": 0.20277361571788788, + "learning_rate": 0.0009955865179645162, + "loss": 3.0993, + "step": 1752 + }, + { + "epoch": 0.05198232660202236, + "grad_norm": 0.19303324818611145, + "learning_rate": 0.0009955802780100434, + "loss": 3.0822, + "step": 1753 + }, + { + "epoch": 0.052011979954333834, + "grad_norm": 0.16656363010406494, + "learning_rate": 0.0009955740336671222, + "loss": 3.0673, + "step": 1754 + }, + { + "epoch": 0.05204163330664532, + "grad_norm": 0.15461033582687378, + "learning_rate": 0.0009955677849358077, + "loss": 3.0816, + "step": 1755 + }, + { + "epoch": 0.05207128665895679, + "grad_norm": 0.1831149309873581, + "learning_rate": 0.0009955615318161554, + "loss": 3.0985, + "step": 1756 + }, + { + "epoch": 0.052100940011268276, + "grad_norm": 0.20593401789665222, + "learning_rate": 0.0009955552743082209, + "loss": 3.0926, + "step": 1757 + }, + { + "epoch": 0.05213059336357975, + "grad_norm": 0.23348873853683472, + "learning_rate": 0.000995549012412059, + "loss": 3.0666, + "step": 1758 + }, + { + "epoch": 0.05216024671589123, + "grad_norm": 0.22225497663021088, + "learning_rate": 0.0009955427461277259, + "loss": 3.1187, + "step": 1759 + }, + { + "epoch": 0.05218990006820271, + "grad_norm": 0.20137187838554382, + "learning_rate": 0.0009955364754552765, + "loss": 3.085, + "step": 1760 + }, + { + "epoch": 0.05221955342051419, + "grad_norm": 0.21902471780776978, + "learning_rate": 0.0009955302003947666, + "loss": 3.1148, + "step": 1761 + }, + { + "epoch": 0.05224920677282567, + "grad_norm": 0.1996917575597763, + "learning_rate": 0.0009955239209462519, + "loss": 3.0987, + "step": 1762 + }, + { + "epoch": 0.05227886012513715, + "grad_norm": 0.1687667816877365, + "learning_rate": 0.0009955176371097877, + "loss": 3.0997, + "step": 1763 + }, + { + "epoch": 0.05230851347744862, + "grad_norm": 0.1664511114358902, + "learning_rate": 0.0009955113488854296, + "loss": 3.1236, + "step": 1764 + }, + { + "epoch": 0.052338166829760106, + "grad_norm": 0.19474440813064575, + "learning_rate": 0.0009955050562732335, + "loss": 3.0572, + "step": 1765 + }, + { + "epoch": 0.05236782018207158, + "grad_norm": 0.1624297797679901, + "learning_rate": 0.0009954987592732552, + "loss": 3.0783, + "step": 1766 + }, + { + "epoch": 0.05239747353438306, + "grad_norm": 0.14908233284950256, + "learning_rate": 0.0009954924578855504, + "loss": 3.1357, + "step": 1767 + }, + { + "epoch": 0.05242712688669454, + "grad_norm": 0.17700998485088348, + "learning_rate": 0.0009954861521101748, + "loss": 3.0888, + "step": 1768 + }, + { + "epoch": 0.05245678023900602, + "grad_norm": 0.19183704257011414, + "learning_rate": 0.000995479841947184, + "loss": 3.0699, + "step": 1769 + }, + { + "epoch": 0.0524864335913175, + "grad_norm": 0.15927816927433014, + "learning_rate": 0.0009954735273966344, + "loss": 3.0688, + "step": 1770 + }, + { + "epoch": 0.05251608694362898, + "grad_norm": 0.1374296396970749, + "learning_rate": 0.0009954672084585817, + "loss": 3.0702, + "step": 1771 + }, + { + "epoch": 0.05254574029594045, + "grad_norm": 0.13431242108345032, + "learning_rate": 0.0009954608851330817, + "loss": 3.0786, + "step": 1772 + }, + { + "epoch": 0.052575393648251936, + "grad_norm": 0.1400938630104065, + "learning_rate": 0.0009954545574201905, + "loss": 3.1009, + "step": 1773 + }, + { + "epoch": 0.05260504700056341, + "grad_norm": 0.1411624550819397, + "learning_rate": 0.0009954482253199644, + "loss": 3.1197, + "step": 1774 + }, + { + "epoch": 0.052634700352874896, + "grad_norm": 0.15944938361644745, + "learning_rate": 0.000995441888832459, + "loss": 3.0872, + "step": 1775 + }, + { + "epoch": 0.05266435370518637, + "grad_norm": 0.15625493228435516, + "learning_rate": 0.0009954355479577306, + "loss": 3.0865, + "step": 1776 + }, + { + "epoch": 0.05269400705749785, + "grad_norm": 0.17248424887657166, + "learning_rate": 0.0009954292026958355, + "loss": 3.0702, + "step": 1777 + }, + { + "epoch": 0.05272366040980933, + "grad_norm": 0.15945543348789215, + "learning_rate": 0.0009954228530468297, + "loss": 3.0913, + "step": 1778 + }, + { + "epoch": 0.05275331376212081, + "grad_norm": 0.15648961067199707, + "learning_rate": 0.0009954164990107694, + "loss": 3.0775, + "step": 1779 + }, + { + "epoch": 0.05278296711443228, + "grad_norm": 0.1742032915353775, + "learning_rate": 0.0009954101405877111, + "loss": 3.0858, + "step": 1780 + }, + { + "epoch": 0.052812620466743766, + "grad_norm": 0.1997382491827011, + "learning_rate": 0.0009954037777777111, + "loss": 3.0542, + "step": 1781 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 0.2059880793094635, + "learning_rate": 0.0009953974105808255, + "loss": 3.107, + "step": 1782 + }, + { + "epoch": 0.052871927171366725, + "grad_norm": 0.20747223496437073, + "learning_rate": 0.0009953910389971109, + "loss": 3.1228, + "step": 1783 + }, + { + "epoch": 0.0529015805236782, + "grad_norm": 0.2099684476852417, + "learning_rate": 0.0009953846630266234, + "loss": 3.098, + "step": 1784 + }, + { + "epoch": 0.05293123387598968, + "grad_norm": 0.2045886218547821, + "learning_rate": 0.0009953782826694197, + "loss": 3.1199, + "step": 1785 + }, + { + "epoch": 0.05296088722830116, + "grad_norm": 0.18480640649795532, + "learning_rate": 0.0009953718979255563, + "loss": 3.0668, + "step": 1786 + }, + { + "epoch": 0.05299054058061264, + "grad_norm": 0.1872214823961258, + "learning_rate": 0.0009953655087950896, + "loss": 3.0799, + "step": 1787 + }, + { + "epoch": 0.05302019393292412, + "grad_norm": 0.21952037513256073, + "learning_rate": 0.0009953591152780765, + "loss": 3.0638, + "step": 1788 + }, + { + "epoch": 0.053049847285235596, + "grad_norm": 0.23347632586956024, + "learning_rate": 0.0009953527173745735, + "loss": 3.0842, + "step": 1789 + }, + { + "epoch": 0.05307950063754707, + "grad_norm": 0.16780933737754822, + "learning_rate": 0.000995346315084637, + "loss": 3.0607, + "step": 1790 + }, + { + "epoch": 0.053109153989858555, + "grad_norm": 0.17597083747386932, + "learning_rate": 0.0009953399084083239, + "loss": 3.0766, + "step": 1791 + }, + { + "epoch": 0.05313880734217003, + "grad_norm": 0.19523678719997406, + "learning_rate": 0.0009953334973456908, + "loss": 3.0963, + "step": 1792 + }, + { + "epoch": 0.05316846069448151, + "grad_norm": 0.17164060473442078, + "learning_rate": 0.0009953270818967945, + "loss": 3.0951, + "step": 1793 + }, + { + "epoch": 0.05319811404679299, + "grad_norm": 0.1521032452583313, + "learning_rate": 0.0009953206620616922, + "loss": 3.0715, + "step": 1794 + }, + { + "epoch": 0.05322776739910447, + "grad_norm": 0.17163042724132538, + "learning_rate": 0.00099531423784044, + "loss": 3.108, + "step": 1795 + }, + { + "epoch": 0.05325742075141595, + "grad_norm": 0.18188728392124176, + "learning_rate": 0.0009953078092330954, + "loss": 3.0944, + "step": 1796 + }, + { + "epoch": 0.053287074103727426, + "grad_norm": 0.19201667606830597, + "learning_rate": 0.0009953013762397152, + "loss": 3.1136, + "step": 1797 + }, + { + "epoch": 0.0533167274560389, + "grad_norm": 0.2127690464258194, + "learning_rate": 0.0009952949388603563, + "loss": 3.0888, + "step": 1798 + }, + { + "epoch": 0.053346380808350385, + "grad_norm": 0.23782919347286224, + "learning_rate": 0.0009952884970950756, + "loss": 3.061, + "step": 1799 + }, + { + "epoch": 0.05337603416066186, + "grad_norm": 0.18288296461105347, + "learning_rate": 0.0009952820509439302, + "loss": 3.1002, + "step": 1800 + }, + { + "epoch": 0.053405687512973345, + "grad_norm": 0.17112863063812256, + "learning_rate": 0.0009952756004069775, + "loss": 3.0965, + "step": 1801 + }, + { + "epoch": 0.05343534086528482, + "grad_norm": 0.42452484369277954, + "learning_rate": 0.000995269145484274, + "loss": 3.0794, + "step": 1802 + }, + { + "epoch": 0.0534649942175963, + "grad_norm": 0.14568987488746643, + "learning_rate": 0.0009952626861758774, + "loss": 3.1241, + "step": 1803 + }, + { + "epoch": 0.05349464756990778, + "grad_norm": 0.27598053216934204, + "learning_rate": 0.0009952562224818447, + "loss": 3.0851, + "step": 1804 + }, + { + "epoch": 0.053524300922219256, + "grad_norm": 0.15830934047698975, + "learning_rate": 0.0009952497544022329, + "loss": 3.1165, + "step": 1805 + }, + { + "epoch": 0.05355395427453073, + "grad_norm": 0.14719414710998535, + "learning_rate": 0.0009952432819370998, + "loss": 3.1057, + "step": 1806 + }, + { + "epoch": 0.053583607626842215, + "grad_norm": 0.16749964654445648, + "learning_rate": 0.0009952368050865023, + "loss": 3.1303, + "step": 1807 + }, + { + "epoch": 0.05361326097915369, + "grad_norm": 0.19447526335716248, + "learning_rate": 0.000995230323850498, + "loss": 3.1002, + "step": 1808 + }, + { + "epoch": 0.053642914331465175, + "grad_norm": 0.18822556734085083, + "learning_rate": 0.000995223838229144, + "loss": 3.0981, + "step": 1809 + }, + { + "epoch": 0.05367256768377665, + "grad_norm": 0.1883774995803833, + "learning_rate": 0.000995217348222498, + "loss": 3.0733, + "step": 1810 + }, + { + "epoch": 0.05370222103608813, + "grad_norm": 0.17871177196502686, + "learning_rate": 0.0009952108538306176, + "loss": 3.0764, + "step": 1811 + }, + { + "epoch": 0.05373187438839961, + "grad_norm": 0.15905585885047913, + "learning_rate": 0.0009952043550535597, + "loss": 3.1117, + "step": 1812 + }, + { + "epoch": 0.053761527740711086, + "grad_norm": 0.14690850675106049, + "learning_rate": 0.0009951978518913825, + "loss": 3.0636, + "step": 1813 + }, + { + "epoch": 0.05379118109302257, + "grad_norm": 0.15325774252414703, + "learning_rate": 0.0009951913443441431, + "loss": 3.0794, + "step": 1814 + }, + { + "epoch": 0.053820834445334045, + "grad_norm": 0.14061696827411652, + "learning_rate": 0.0009951848324118995, + "loss": 3.0803, + "step": 1815 + }, + { + "epoch": 0.05385048779764552, + "grad_norm": 0.12847144901752472, + "learning_rate": 0.0009951783160947092, + "loss": 3.0644, + "step": 1816 + }, + { + "epoch": 0.053880141149957005, + "grad_norm": 0.12896117568016052, + "learning_rate": 0.00099517179539263, + "loss": 3.0221, + "step": 1817 + }, + { + "epoch": 0.05390979450226848, + "grad_norm": 0.14026671648025513, + "learning_rate": 0.0009951652703057195, + "loss": 3.0766, + "step": 1818 + }, + { + "epoch": 0.05393944785457996, + "grad_norm": 0.1579778641462326, + "learning_rate": 0.0009951587408340355, + "loss": 3.1168, + "step": 1819 + }, + { + "epoch": 0.05396910120689144, + "grad_norm": 0.17304983735084534, + "learning_rate": 0.0009951522069776358, + "loss": 3.063, + "step": 1820 + }, + { + "epoch": 0.053998754559202916, + "grad_norm": 0.1953476071357727, + "learning_rate": 0.0009951456687365783, + "loss": 3.1081, + "step": 1821 + }, + { + "epoch": 0.0540284079115144, + "grad_norm": 0.17762361466884613, + "learning_rate": 0.000995139126110921, + "loss": 3.0799, + "step": 1822 + }, + { + "epoch": 0.054058061263825875, + "grad_norm": 0.14527244865894318, + "learning_rate": 0.0009951325791007217, + "loss": 3.0522, + "step": 1823 + }, + { + "epoch": 0.05408771461613735, + "grad_norm": 0.1635226309299469, + "learning_rate": 0.0009951260277060385, + "loss": 3.0827, + "step": 1824 + }, + { + "epoch": 0.054117367968448835, + "grad_norm": 0.17333130538463593, + "learning_rate": 0.0009951194719269292, + "loss": 3.0706, + "step": 1825 + }, + { + "epoch": 0.05414702132076031, + "grad_norm": 0.1669687032699585, + "learning_rate": 0.000995112911763452, + "loss": 3.0702, + "step": 1826 + }, + { + "epoch": 0.054176674673071794, + "grad_norm": 0.167738676071167, + "learning_rate": 0.0009951063472156652, + "loss": 3.0816, + "step": 1827 + }, + { + "epoch": 0.05420632802538327, + "grad_norm": 0.15448737144470215, + "learning_rate": 0.0009950997782836267, + "loss": 3.0904, + "step": 1828 + }, + { + "epoch": 0.054235981377694746, + "grad_norm": 0.1569412797689438, + "learning_rate": 0.0009950932049673945, + "loss": 3.129, + "step": 1829 + }, + { + "epoch": 0.05426563473000623, + "grad_norm": 0.18781717121601105, + "learning_rate": 0.000995086627267027, + "loss": 3.16, + "step": 1830 + }, + { + "epoch": 0.054295288082317705, + "grad_norm": 0.19064030051231384, + "learning_rate": 0.0009950800451825825, + "loss": 3.1004, + "step": 1831 + }, + { + "epoch": 0.05432494143462918, + "grad_norm": 0.2005058228969574, + "learning_rate": 0.0009950734587141192, + "loss": 3.0862, + "step": 1832 + }, + { + "epoch": 0.054354594786940665, + "grad_norm": 0.1838034838438034, + "learning_rate": 0.0009950668678616954, + "loss": 3.1107, + "step": 1833 + }, + { + "epoch": 0.05438424813925214, + "grad_norm": 0.24339447915554047, + "learning_rate": 0.0009950602726253696, + "loss": 3.0998, + "step": 1834 + }, + { + "epoch": 0.054413901491563624, + "grad_norm": 0.17452137172222137, + "learning_rate": 0.0009950536730052, + "loss": 3.0727, + "step": 1835 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 0.185299813747406, + "learning_rate": 0.0009950470690012452, + "loss": 3.0679, + "step": 1836 + }, + { + "epoch": 0.054473208196186576, + "grad_norm": 0.16117972135543823, + "learning_rate": 0.0009950404606135638, + "loss": 3.0984, + "step": 1837 + }, + { + "epoch": 0.05450286154849806, + "grad_norm": 0.18541282415390015, + "learning_rate": 0.0009950338478422137, + "loss": 3.0994, + "step": 1838 + }, + { + "epoch": 0.054532514900809535, + "grad_norm": 0.1729278564453125, + "learning_rate": 0.0009950272306872543, + "loss": 3.0494, + "step": 1839 + }, + { + "epoch": 0.05456216825312102, + "grad_norm": 0.14438515901565552, + "learning_rate": 0.000995020609148744, + "loss": 3.0674, + "step": 1840 + }, + { + "epoch": 0.054591821605432495, + "grad_norm": 0.15297824144363403, + "learning_rate": 0.0009950139832267408, + "loss": 3.0629, + "step": 1841 + }, + { + "epoch": 0.05462147495774397, + "grad_norm": 0.1796818971633911, + "learning_rate": 0.000995007352921304, + "loss": 3.0766, + "step": 1842 + }, + { + "epoch": 0.054651128310055454, + "grad_norm": 0.18416868150234222, + "learning_rate": 0.000995000718232492, + "loss": 3.0603, + "step": 1843 + }, + { + "epoch": 0.05468078166236693, + "grad_norm": 0.1546630859375, + "learning_rate": 0.0009949940791603637, + "loss": 3.1104, + "step": 1844 + }, + { + "epoch": 0.054710435014678406, + "grad_norm": 0.16656431555747986, + "learning_rate": 0.0009949874357049779, + "loss": 3.0822, + "step": 1845 + }, + { + "epoch": 0.05474008836698989, + "grad_norm": 0.19802260398864746, + "learning_rate": 0.0009949807878663936, + "loss": 3.0716, + "step": 1846 + }, + { + "epoch": 0.054769741719301365, + "grad_norm": 0.17723259329795837, + "learning_rate": 0.0009949741356446691, + "loss": 3.0682, + "step": 1847 + }, + { + "epoch": 0.05479939507161285, + "grad_norm": 0.20602738857269287, + "learning_rate": 0.0009949674790398638, + "loss": 3.0997, + "step": 1848 + }, + { + "epoch": 0.054829048423924324, + "grad_norm": 0.21657855808734894, + "learning_rate": 0.0009949608180520365, + "loss": 3.0624, + "step": 1849 + }, + { + "epoch": 0.0548587017762358, + "grad_norm": 0.16460980474948883, + "learning_rate": 0.0009949541526812463, + "loss": 3.0603, + "step": 1850 + }, + { + "epoch": 0.054888355128547284, + "grad_norm": 0.1803317368030548, + "learning_rate": 0.000994947482927552, + "loss": 3.0546, + "step": 1851 + }, + { + "epoch": 0.05491800848085876, + "grad_norm": 0.17463670670986176, + "learning_rate": 0.0009949408087910128, + "loss": 3.0732, + "step": 1852 + }, + { + "epoch": 0.05494766183317024, + "grad_norm": 0.18250437080860138, + "learning_rate": 0.0009949341302716878, + "loss": 3.1173, + "step": 1853 + }, + { + "epoch": 0.05497731518548172, + "grad_norm": 0.20034746825695038, + "learning_rate": 0.0009949274473696362, + "loss": 3.0639, + "step": 1854 + }, + { + "epoch": 0.055006968537793195, + "grad_norm": 0.1757134646177292, + "learning_rate": 0.0009949207600849171, + "loss": 3.1014, + "step": 1855 + }, + { + "epoch": 0.05503662189010468, + "grad_norm": 0.1622852385044098, + "learning_rate": 0.0009949140684175897, + "loss": 3.0683, + "step": 1856 + }, + { + "epoch": 0.055066275242416154, + "grad_norm": 0.15313588082790375, + "learning_rate": 0.0009949073723677132, + "loss": 3.0753, + "step": 1857 + }, + { + "epoch": 0.05509592859472763, + "grad_norm": 0.15707805752754211, + "learning_rate": 0.0009949006719353472, + "loss": 3.0729, + "step": 1858 + }, + { + "epoch": 0.055125581947039114, + "grad_norm": 0.16334202885627747, + "learning_rate": 0.0009948939671205505, + "loss": 3.0702, + "step": 1859 + }, + { + "epoch": 0.05515523529935059, + "grad_norm": 0.16559572517871857, + "learning_rate": 0.0009948872579233828, + "loss": 3.1073, + "step": 1860 + }, + { + "epoch": 0.05518488865166207, + "grad_norm": 0.205597922205925, + "learning_rate": 0.0009948805443439036, + "loss": 3.0442, + "step": 1861 + }, + { + "epoch": 0.05521454200397355, + "grad_norm": 0.17766594886779785, + "learning_rate": 0.0009948738263821722, + "loss": 3.0665, + "step": 1862 + }, + { + "epoch": 0.055244195356285025, + "grad_norm": 0.1669096052646637, + "learning_rate": 0.0009948671040382483, + "loss": 3.0669, + "step": 1863 + }, + { + "epoch": 0.05527384870859651, + "grad_norm": 0.1481359452009201, + "learning_rate": 0.000994860377312191, + "loss": 3.0946, + "step": 1864 + }, + { + "epoch": 0.055303502060907984, + "grad_norm": 0.18607915937900543, + "learning_rate": 0.00099485364620406, + "loss": 3.0953, + "step": 1865 + }, + { + "epoch": 0.05533315541321947, + "grad_norm": 0.21183578670024872, + "learning_rate": 0.0009948469107139153, + "loss": 3.049, + "step": 1866 + }, + { + "epoch": 0.055362808765530944, + "grad_norm": 0.18748503923416138, + "learning_rate": 0.0009948401708418163, + "loss": 3.0635, + "step": 1867 + }, + { + "epoch": 0.05539246211784242, + "grad_norm": 0.18682153522968292, + "learning_rate": 0.0009948334265878224, + "loss": 3.1046, + "step": 1868 + }, + { + "epoch": 0.0554221154701539, + "grad_norm": 0.1780429184436798, + "learning_rate": 0.0009948266779519937, + "loss": 3.075, + "step": 1869 + }, + { + "epoch": 0.05545176882246538, + "grad_norm": 0.18736860156059265, + "learning_rate": 0.0009948199249343898, + "loss": 3.0771, + "step": 1870 + }, + { + "epoch": 0.055481422174776855, + "grad_norm": 0.1702619343996048, + "learning_rate": 0.0009948131675350707, + "loss": 3.106, + "step": 1871 + }, + { + "epoch": 0.05551107552708834, + "grad_norm": 0.2040405571460724, + "learning_rate": 0.000994806405754096, + "loss": 3.101, + "step": 1872 + }, + { + "epoch": 0.055540728879399814, + "grad_norm": 0.2407662570476532, + "learning_rate": 0.0009947996395915253, + "loss": 3.0995, + "step": 1873 + }, + { + "epoch": 0.0555703822317113, + "grad_norm": 0.28407904505729675, + "learning_rate": 0.0009947928690474193, + "loss": 3.1163, + "step": 1874 + }, + { + "epoch": 0.055600035584022774, + "grad_norm": 0.2730367183685303, + "learning_rate": 0.0009947860941218374, + "loss": 3.0701, + "step": 1875 + }, + { + "epoch": 0.05562968893633425, + "grad_norm": 0.2304537296295166, + "learning_rate": 0.0009947793148148397, + "loss": 3.1124, + "step": 1876 + }, + { + "epoch": 0.05565934228864573, + "grad_norm": 0.1793505847454071, + "learning_rate": 0.0009947725311264862, + "loss": 3.1191, + "step": 1877 + }, + { + "epoch": 0.05568899564095721, + "grad_norm": 0.1850803792476654, + "learning_rate": 0.0009947657430568369, + "loss": 3.1106, + "step": 1878 + }, + { + "epoch": 0.05571864899326869, + "grad_norm": 0.1735653132200241, + "learning_rate": 0.0009947589506059521, + "loss": 3.1183, + "step": 1879 + }, + { + "epoch": 0.05574830234558017, + "grad_norm": 0.19895072281360626, + "learning_rate": 0.0009947521537738918, + "loss": 3.1011, + "step": 1880 + }, + { + "epoch": 0.055777955697891644, + "grad_norm": 0.20481450855731964, + "learning_rate": 0.0009947453525607163, + "loss": 3.0711, + "step": 1881 + }, + { + "epoch": 0.05580760905020313, + "grad_norm": 0.1686088591814041, + "learning_rate": 0.000994738546966486, + "loss": 3.0866, + "step": 1882 + }, + { + "epoch": 0.055837262402514604, + "grad_norm": 0.2001541405916214, + "learning_rate": 0.0009947317369912608, + "loss": 3.0772, + "step": 1883 + }, + { + "epoch": 0.05586691575482608, + "grad_norm": 0.17152468860149384, + "learning_rate": 0.0009947249226351011, + "loss": 3.0909, + "step": 1884 + }, + { + "epoch": 0.05589656910713756, + "grad_norm": 0.16230517625808716, + "learning_rate": 0.0009947181038980674, + "loss": 3.0866, + "step": 1885 + }, + { + "epoch": 0.05592622245944904, + "grad_norm": 0.15139354765415192, + "learning_rate": 0.00099471128078022, + "loss": 3.0434, + "step": 1886 + }, + { + "epoch": 0.05595587581176052, + "grad_norm": 0.15267489850521088, + "learning_rate": 0.000994704453281619, + "loss": 3.0974, + "step": 1887 + }, + { + "epoch": 0.055985529164072, + "grad_norm": 0.1452474147081375, + "learning_rate": 0.0009946976214023253, + "loss": 3.0365, + "step": 1888 + }, + { + "epoch": 0.056015182516383474, + "grad_norm": 0.15976451337337494, + "learning_rate": 0.0009946907851423993, + "loss": 3.092, + "step": 1889 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 0.18579064309597015, + "learning_rate": 0.0009946839445019015, + "loss": 3.0987, + "step": 1890 + }, + { + "epoch": 0.056074489221006434, + "grad_norm": 0.16925452649593353, + "learning_rate": 0.0009946770994808925, + "loss": 3.1198, + "step": 1891 + }, + { + "epoch": 0.05610414257331792, + "grad_norm": 0.15642833709716797, + "learning_rate": 0.0009946702500794327, + "loss": 3.076, + "step": 1892 + }, + { + "epoch": 0.05613379592562939, + "grad_norm": 0.14062829315662384, + "learning_rate": 0.000994663396297583, + "loss": 3.0867, + "step": 1893 + }, + { + "epoch": 0.05616344927794087, + "grad_norm": 0.12682944536209106, + "learning_rate": 0.000994656538135404, + "loss": 3.0595, + "step": 1894 + }, + { + "epoch": 0.05619310263025235, + "grad_norm": 0.14625082910060883, + "learning_rate": 0.0009946496755929566, + "loss": 3.1059, + "step": 1895 + }, + { + "epoch": 0.05622275598256383, + "grad_norm": 0.1334090232849121, + "learning_rate": 0.0009946428086703013, + "loss": 3.06, + "step": 1896 + }, + { + "epoch": 0.056252409334875304, + "grad_norm": 0.12959429621696472, + "learning_rate": 0.000994635937367499, + "loss": 3.0658, + "step": 1897 + }, + { + "epoch": 0.05628206268718679, + "grad_norm": 0.15431952476501465, + "learning_rate": 0.0009946290616846107, + "loss": 3.0932, + "step": 1898 + }, + { + "epoch": 0.056311716039498264, + "grad_norm": 0.18689201772212982, + "learning_rate": 0.000994622181621697, + "loss": 3.0832, + "step": 1899 + }, + { + "epoch": 0.05634136939180975, + "grad_norm": 0.20167088508605957, + "learning_rate": 0.000994615297178819, + "loss": 3.0745, + "step": 1900 + }, + { + "epoch": 0.05637102274412122, + "grad_norm": 0.17669782042503357, + "learning_rate": 0.000994608408356038, + "loss": 3.0504, + "step": 1901 + }, + { + "epoch": 0.0564006760964327, + "grad_norm": 0.1759978085756302, + "learning_rate": 0.0009946015151534142, + "loss": 3.0705, + "step": 1902 + }, + { + "epoch": 0.05643032944874418, + "grad_norm": 0.1724768728017807, + "learning_rate": 0.0009945946175710092, + "loss": 3.0667, + "step": 1903 + }, + { + "epoch": 0.05645998280105566, + "grad_norm": 0.18715260922908783, + "learning_rate": 0.000994587715608884, + "loss": 3.1256, + "step": 1904 + }, + { + "epoch": 0.05648963615336714, + "grad_norm": 0.1753177046775818, + "learning_rate": 0.0009945808092670996, + "loss": 3.0347, + "step": 1905 + }, + { + "epoch": 0.05651928950567862, + "grad_norm": 0.18573398888111115, + "learning_rate": 0.0009945738985457173, + "loss": 3.0764, + "step": 1906 + }, + { + "epoch": 0.056548942857990094, + "grad_norm": 0.19643180072307587, + "learning_rate": 0.0009945669834447981, + "loss": 3.0553, + "step": 1907 + }, + { + "epoch": 0.05657859621030158, + "grad_norm": 0.17877143621444702, + "learning_rate": 0.0009945600639644037, + "loss": 3.0632, + "step": 1908 + }, + { + "epoch": 0.05660824956261305, + "grad_norm": 0.15670183300971985, + "learning_rate": 0.0009945531401045948, + "loss": 3.0498, + "step": 1909 + }, + { + "epoch": 0.05663790291492453, + "grad_norm": 0.21555320918560028, + "learning_rate": 0.000994546211865433, + "loss": 3.0964, + "step": 1910 + }, + { + "epoch": 0.05666755626723601, + "grad_norm": 0.24640290439128876, + "learning_rate": 0.0009945392792469797, + "loss": 3.0421, + "step": 1911 + }, + { + "epoch": 0.05669720961954749, + "grad_norm": 0.2607918381690979, + "learning_rate": 0.000994532342249296, + "loss": 3.0706, + "step": 1912 + }, + { + "epoch": 0.05672686297185897, + "grad_norm": 0.20688726007938385, + "learning_rate": 0.0009945254008724438, + "loss": 3.0855, + "step": 1913 + }, + { + "epoch": 0.05675651632417045, + "grad_norm": 0.20212508738040924, + "learning_rate": 0.000994518455116484, + "loss": 3.0405, + "step": 1914 + }, + { + "epoch": 0.056786169676481923, + "grad_norm": 0.21057556569576263, + "learning_rate": 0.0009945115049814785, + "loss": 3.0416, + "step": 1915 + }, + { + "epoch": 0.05681582302879341, + "grad_norm": 0.19000846147537231, + "learning_rate": 0.0009945045504674889, + "loss": 3.1253, + "step": 1916 + }, + { + "epoch": 0.05684547638110488, + "grad_norm": 0.2009722739458084, + "learning_rate": 0.0009944975915745764, + "loss": 3.1101, + "step": 1917 + }, + { + "epoch": 0.056875129733416366, + "grad_norm": 0.20954914391040802, + "learning_rate": 0.000994490628302803, + "loss": 3.0583, + "step": 1918 + }, + { + "epoch": 0.05690478308572784, + "grad_norm": 0.23621079325675964, + "learning_rate": 0.0009944836606522302, + "loss": 3.0965, + "step": 1919 + }, + { + "epoch": 0.05693443643803932, + "grad_norm": 0.20948852598667145, + "learning_rate": 0.0009944766886229195, + "loss": 3.0765, + "step": 1920 + }, + { + "epoch": 0.0569640897903508, + "grad_norm": 0.2181585729122162, + "learning_rate": 0.000994469712214933, + "loss": 3.0786, + "step": 1921 + }, + { + "epoch": 0.05699374314266228, + "grad_norm": 0.22466593980789185, + "learning_rate": 0.0009944627314283324, + "loss": 3.0971, + "step": 1922 + }, + { + "epoch": 0.05702339649497375, + "grad_norm": 0.22071918845176697, + "learning_rate": 0.0009944557462631793, + "loss": 3.0851, + "step": 1923 + }, + { + "epoch": 0.05705304984728524, + "grad_norm": 0.18658463656902313, + "learning_rate": 0.000994448756719536, + "loss": 3.0579, + "step": 1924 + }, + { + "epoch": 0.05708270319959671, + "grad_norm": 0.18250398337841034, + "learning_rate": 0.0009944417627974639, + "loss": 3.093, + "step": 1925 + }, + { + "epoch": 0.057112356551908196, + "grad_norm": 0.16569751501083374, + "learning_rate": 0.000994434764497025, + "loss": 3.0421, + "step": 1926 + }, + { + "epoch": 0.05714200990421967, + "grad_norm": 0.15814189612865448, + "learning_rate": 0.0009944277618182814, + "loss": 3.073, + "step": 1927 + }, + { + "epoch": 0.05717166325653115, + "grad_norm": 0.1572141945362091, + "learning_rate": 0.0009944207547612951, + "loss": 3.0769, + "step": 1928 + }, + { + "epoch": 0.05720131660884263, + "grad_norm": 0.1422571986913681, + "learning_rate": 0.0009944137433261283, + "loss": 3.0687, + "step": 1929 + }, + { + "epoch": 0.05723096996115411, + "grad_norm": 0.13969112932682037, + "learning_rate": 0.0009944067275128427, + "loss": 3.0319, + "step": 1930 + }, + { + "epoch": 0.05726062331346559, + "grad_norm": 0.13981549441814423, + "learning_rate": 0.000994399707321501, + "loss": 3.1143, + "step": 1931 + }, + { + "epoch": 0.057290276665777067, + "grad_norm": 0.1579955518245697, + "learning_rate": 0.0009943926827521647, + "loss": 3.0646, + "step": 1932 + }, + { + "epoch": 0.05731993001808854, + "grad_norm": 0.14001260697841644, + "learning_rate": 0.0009943856538048965, + "loss": 3.0582, + "step": 1933 + }, + { + "epoch": 0.057349583370400026, + "grad_norm": 0.14049795269966125, + "learning_rate": 0.0009943786204797585, + "loss": 3.0881, + "step": 1934 + }, + { + "epoch": 0.0573792367227115, + "grad_norm": 0.13492760062217712, + "learning_rate": 0.0009943715827768129, + "loss": 3.1014, + "step": 1935 + }, + { + "epoch": 0.05740889007502298, + "grad_norm": 0.16598516702651978, + "learning_rate": 0.0009943645406961222, + "loss": 3.0575, + "step": 1936 + }, + { + "epoch": 0.05743854342733446, + "grad_norm": 0.1920858472585678, + "learning_rate": 0.0009943574942377486, + "loss": 3.0244, + "step": 1937 + }, + { + "epoch": 0.05746819677964594, + "grad_norm": 0.21237434446811676, + "learning_rate": 0.0009943504434017543, + "loss": 3.1033, + "step": 1938 + }, + { + "epoch": 0.05749785013195742, + "grad_norm": 0.2090446650981903, + "learning_rate": 0.000994343388188202, + "loss": 3.0504, + "step": 1939 + }, + { + "epoch": 0.057527503484268896, + "grad_norm": 0.21140769124031067, + "learning_rate": 0.0009943363285971544, + "loss": 3.0614, + "step": 1940 + }, + { + "epoch": 0.05755715683658037, + "grad_norm": 0.22269168496131897, + "learning_rate": 0.0009943292646286738, + "loss": 3.0678, + "step": 1941 + }, + { + "epoch": 0.057586810188891856, + "grad_norm": 0.15936626493930817, + "learning_rate": 0.0009943221962828224, + "loss": 3.0608, + "step": 1942 + }, + { + "epoch": 0.05761646354120333, + "grad_norm": 0.15284360945224762, + "learning_rate": 0.0009943151235596633, + "loss": 3.051, + "step": 1943 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 0.15876568853855133, + "learning_rate": 0.000994308046459259, + "loss": 3.0791, + "step": 1944 + }, + { + "epoch": 0.05767577024582629, + "grad_norm": 0.17226751148700714, + "learning_rate": 0.0009943009649816719, + "loss": 3.0662, + "step": 1945 + }, + { + "epoch": 0.05770542359813777, + "grad_norm": 0.14377716183662415, + "learning_rate": 0.0009942938791269648, + "loss": 3.0344, + "step": 1946 + }, + { + "epoch": 0.05773507695044925, + "grad_norm": 0.12645332515239716, + "learning_rate": 0.0009942867888952007, + "loss": 3.033, + "step": 1947 + }, + { + "epoch": 0.057764730302760726, + "grad_norm": 0.14577904343605042, + "learning_rate": 0.0009942796942864424, + "loss": 3.0388, + "step": 1948 + }, + { + "epoch": 0.0577943836550722, + "grad_norm": 0.12548032402992249, + "learning_rate": 0.0009942725953007525, + "loss": 3.0669, + "step": 1949 + }, + { + "epoch": 0.057824037007383686, + "grad_norm": 0.12263022363185883, + "learning_rate": 0.0009942654919381938, + "loss": 3.0524, + "step": 1950 + }, + { + "epoch": 0.05785369035969516, + "grad_norm": 0.15116800367832184, + "learning_rate": 0.0009942583841988295, + "loss": 3.0469, + "step": 1951 + }, + { + "epoch": 0.057883343712006645, + "grad_norm": 0.1827414631843567, + "learning_rate": 0.000994251272082722, + "loss": 3.0267, + "step": 1952 + }, + { + "epoch": 0.05791299706431812, + "grad_norm": 0.2007978856563568, + "learning_rate": 0.000994244155589935, + "loss": 3.0827, + "step": 1953 + }, + { + "epoch": 0.0579426504166296, + "grad_norm": 0.16665872931480408, + "learning_rate": 0.0009942370347205312, + "loss": 3.0699, + "step": 1954 + }, + { + "epoch": 0.05797230376894108, + "grad_norm": 0.16713857650756836, + "learning_rate": 0.0009942299094745737, + "loss": 3.0741, + "step": 1955 + }, + { + "epoch": 0.058001957121252556, + "grad_norm": 0.2154398411512375, + "learning_rate": 0.0009942227798521253, + "loss": 3.037, + "step": 1956 + }, + { + "epoch": 0.05803161047356404, + "grad_norm": 0.21614311635494232, + "learning_rate": 0.0009942156458532493, + "loss": 3.0267, + "step": 1957 + }, + { + "epoch": 0.058061263825875516, + "grad_norm": 0.2413703203201294, + "learning_rate": 0.000994208507478009, + "loss": 3.0372, + "step": 1958 + }, + { + "epoch": 0.05809091717818699, + "grad_norm": 0.23191791772842407, + "learning_rate": 0.0009942013647264677, + "loss": 3.0728, + "step": 1959 + }, + { + "epoch": 0.058120570530498475, + "grad_norm": 0.2399364858865738, + "learning_rate": 0.0009941942175986883, + "loss": 3.071, + "step": 1960 + }, + { + "epoch": 0.05815022388280995, + "grad_norm": 0.25208938121795654, + "learning_rate": 0.0009941870660947342, + "loss": 3.1073, + "step": 1961 + }, + { + "epoch": 0.05817987723512143, + "grad_norm": 0.21492381393909454, + "learning_rate": 0.0009941799102146688, + "loss": 3.0724, + "step": 1962 + }, + { + "epoch": 0.05820953058743291, + "grad_norm": 0.19591930508613586, + "learning_rate": 0.0009941727499585557, + "loss": 3.0706, + "step": 1963 + }, + { + "epoch": 0.058239183939744386, + "grad_norm": 0.17288023233413696, + "learning_rate": 0.0009941655853264579, + "loss": 3.0652, + "step": 1964 + }, + { + "epoch": 0.05826883729205587, + "grad_norm": 0.15196402370929718, + "learning_rate": 0.0009941584163184391, + "loss": 3.067, + "step": 1965 + }, + { + "epoch": 0.058298490644367346, + "grad_norm": 0.156283900141716, + "learning_rate": 0.0009941512429345626, + "loss": 3.0577, + "step": 1966 + }, + { + "epoch": 0.05832814399667882, + "grad_norm": 0.16499127447605133, + "learning_rate": 0.0009941440651748921, + "loss": 3.0989, + "step": 1967 + }, + { + "epoch": 0.058357797348990305, + "grad_norm": 0.19917573034763336, + "learning_rate": 0.0009941368830394912, + "loss": 3.0809, + "step": 1968 + }, + { + "epoch": 0.05838745070130178, + "grad_norm": 0.17515923082828522, + "learning_rate": 0.0009941296965284233, + "loss": 3.0853, + "step": 1969 + }, + { + "epoch": 0.058417104053613264, + "grad_norm": 0.16056880354881287, + "learning_rate": 0.000994122505641752, + "loss": 3.0543, + "step": 1970 + }, + { + "epoch": 0.05844675740592474, + "grad_norm": 0.17177647352218628, + "learning_rate": 0.0009941153103795414, + "loss": 3.0983, + "step": 1971 + }, + { + "epoch": 0.058476410758236216, + "grad_norm": 0.16657975316047668, + "learning_rate": 0.0009941081107418545, + "loss": 3.0534, + "step": 1972 + }, + { + "epoch": 0.0585060641105477, + "grad_norm": 0.1539945751428604, + "learning_rate": 0.0009941009067287558, + "loss": 3.0737, + "step": 1973 + }, + { + "epoch": 0.058535717462859176, + "grad_norm": 0.14053553342819214, + "learning_rate": 0.0009940936983403087, + "loss": 3.022, + "step": 1974 + }, + { + "epoch": 0.05856537081517065, + "grad_norm": 0.1255146712064743, + "learning_rate": 0.000994086485576577, + "loss": 3.0841, + "step": 1975 + }, + { + "epoch": 0.058595024167482135, + "grad_norm": 0.12449851632118225, + "learning_rate": 0.0009940792684376245, + "loss": 3.0965, + "step": 1976 + }, + { + "epoch": 0.05862467751979361, + "grad_norm": 0.14875508844852448, + "learning_rate": 0.0009940720469235156, + "loss": 3.0806, + "step": 1977 + }, + { + "epoch": 0.058654330872105094, + "grad_norm": 0.1913611739873886, + "learning_rate": 0.0009940648210343137, + "loss": 3.0593, + "step": 1978 + }, + { + "epoch": 0.05868398422441657, + "grad_norm": 0.17300665378570557, + "learning_rate": 0.000994057590770083, + "loss": 3.0652, + "step": 1979 + }, + { + "epoch": 0.058713637576728046, + "grad_norm": 0.15894204378128052, + "learning_rate": 0.0009940503561308876, + "loss": 3.0508, + "step": 1980 + }, + { + "epoch": 0.05874329092903953, + "grad_norm": 0.1917923092842102, + "learning_rate": 0.0009940431171167915, + "loss": 3.0473, + "step": 1981 + }, + { + "epoch": 0.058772944281351006, + "grad_norm": 0.15716570615768433, + "learning_rate": 0.0009940358737278588, + "loss": 3.0529, + "step": 1982 + }, + { + "epoch": 0.05880259763366249, + "grad_norm": 0.15968327224254608, + "learning_rate": 0.0009940286259641539, + "loss": 3.0672, + "step": 1983 + }, + { + "epoch": 0.058832250985973965, + "grad_norm": 0.16281577944755554, + "learning_rate": 0.0009940213738257402, + "loss": 3.1071, + "step": 1984 + }, + { + "epoch": 0.05886190433828544, + "grad_norm": 0.16779156029224396, + "learning_rate": 0.0009940141173126827, + "loss": 3.0523, + "step": 1985 + }, + { + "epoch": 0.058891557690596924, + "grad_norm": 0.16729597747325897, + "learning_rate": 0.0009940068564250454, + "loss": 3.0495, + "step": 1986 + }, + { + "epoch": 0.0589212110429084, + "grad_norm": 0.17508667707443237, + "learning_rate": 0.0009939995911628927, + "loss": 3.0834, + "step": 1987 + }, + { + "epoch": 0.058950864395219876, + "grad_norm": 0.2130952924489975, + "learning_rate": 0.0009939923215262886, + "loss": 3.0649, + "step": 1988 + }, + { + "epoch": 0.05898051774753136, + "grad_norm": 0.2062571793794632, + "learning_rate": 0.0009939850475152979, + "loss": 3.055, + "step": 1989 + }, + { + "epoch": 0.059010171099842836, + "grad_norm": 0.19814391434192657, + "learning_rate": 0.0009939777691299846, + "loss": 3.077, + "step": 1990 + }, + { + "epoch": 0.05903982445215432, + "grad_norm": 0.21472042798995972, + "learning_rate": 0.0009939704863704136, + "loss": 3.0583, + "step": 1991 + }, + { + "epoch": 0.059069477804465795, + "grad_norm": 0.21575427055358887, + "learning_rate": 0.000993963199236649, + "loss": 3.0848, + "step": 1992 + }, + { + "epoch": 0.05909913115677727, + "grad_norm": 0.18313530087471008, + "learning_rate": 0.0009939559077287554, + "loss": 3.0791, + "step": 1993 + }, + { + "epoch": 0.059128784509088754, + "grad_norm": 0.16300064325332642, + "learning_rate": 0.0009939486118467975, + "loss": 3.0658, + "step": 1994 + }, + { + "epoch": 0.05915843786140023, + "grad_norm": 0.14558972418308258, + "learning_rate": 0.00099394131159084, + "loss": 3.0875, + "step": 1995 + }, + { + "epoch": 0.05918809121371171, + "grad_norm": 0.15296433866024017, + "learning_rate": 0.0009939340069609474, + "loss": 3.0339, + "step": 1996 + }, + { + "epoch": 0.05921774456602319, + "grad_norm": 0.14253278076648712, + "learning_rate": 0.0009939266979571842, + "loss": 3.0462, + "step": 1997 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 0.16776874661445618, + "learning_rate": 0.0009939193845796156, + "loss": 3.0756, + "step": 1998 + }, + { + "epoch": 0.05927705127064615, + "grad_norm": 0.1759079247713089, + "learning_rate": 0.000993912066828306, + "loss": 3.038, + "step": 1999 + }, + { + "epoch": 0.059306704622957625, + "grad_norm": 0.2104906439781189, + "learning_rate": 0.00099390474470332, + "loss": 3.0742, + "step": 2000 + }, + { + "epoch": 0.0593363579752691, + "grad_norm": 0.2348456084728241, + "learning_rate": 0.0009938974182047227, + "loss": 3.0758, + "step": 2001 + }, + { + "epoch": 0.059366011327580584, + "grad_norm": 0.20416630804538727, + "learning_rate": 0.0009938900873325794, + "loss": 3.0748, + "step": 2002 + }, + { + "epoch": 0.05939566467989206, + "grad_norm": 0.20049673318862915, + "learning_rate": 0.0009938827520869543, + "loss": 3.0629, + "step": 2003 + }, + { + "epoch": 0.05942531803220354, + "grad_norm": 0.1826002597808838, + "learning_rate": 0.0009938754124679127, + "loss": 3.0504, + "step": 2004 + }, + { + "epoch": 0.05945497138451502, + "grad_norm": 0.18035131692886353, + "learning_rate": 0.0009938680684755195, + "loss": 3.0718, + "step": 2005 + }, + { + "epoch": 0.059484624736826495, + "grad_norm": 0.20703184604644775, + "learning_rate": 0.0009938607201098399, + "loss": 3.0637, + "step": 2006 + }, + { + "epoch": 0.05951427808913798, + "grad_norm": 0.23934674263000488, + "learning_rate": 0.0009938533673709386, + "loss": 3.0809, + "step": 2007 + }, + { + "epoch": 0.059543931441449455, + "grad_norm": 0.22381363809108734, + "learning_rate": 0.0009938460102588813, + "loss": 3.0054, + "step": 2008 + }, + { + "epoch": 0.05957358479376094, + "grad_norm": 0.20005151629447937, + "learning_rate": 0.0009938386487737326, + "loss": 3.0528, + "step": 2009 + }, + { + "epoch": 0.059603238146072414, + "grad_norm": 0.19467376172542572, + "learning_rate": 0.000993831282915558, + "loss": 3.0571, + "step": 2010 + }, + { + "epoch": 0.05963289149838389, + "grad_norm": 0.14918337762355804, + "learning_rate": 0.0009938239126844226, + "loss": 3.0738, + "step": 2011 + }, + { + "epoch": 0.05966254485069537, + "grad_norm": 0.19681605696678162, + "learning_rate": 0.0009938165380803917, + "loss": 3.0745, + "step": 2012 + }, + { + "epoch": 0.05969219820300685, + "grad_norm": 0.1765449047088623, + "learning_rate": 0.0009938091591035305, + "loss": 3.058, + "step": 2013 + }, + { + "epoch": 0.059721851555318325, + "grad_norm": 0.14611287415027618, + "learning_rate": 0.0009938017757539046, + "loss": 3.0597, + "step": 2014 + }, + { + "epoch": 0.05975150490762981, + "grad_norm": 0.15309473872184753, + "learning_rate": 0.0009937943880315792, + "loss": 3.0752, + "step": 2015 + }, + { + "epoch": 0.059781158259941285, + "grad_norm": 0.1570080667734146, + "learning_rate": 0.0009937869959366196, + "loss": 3.0909, + "step": 2016 + }, + { + "epoch": 0.05981081161225277, + "grad_norm": 0.18164712190628052, + "learning_rate": 0.0009937795994690915, + "loss": 3.0594, + "step": 2017 + }, + { + "epoch": 0.059840464964564244, + "grad_norm": 0.18455000221729279, + "learning_rate": 0.0009937721986290602, + "loss": 3.0261, + "step": 2018 + }, + { + "epoch": 0.05987011831687572, + "grad_norm": 0.19455979764461517, + "learning_rate": 0.0009937647934165914, + "loss": 3.0815, + "step": 2019 + }, + { + "epoch": 0.0598997716691872, + "grad_norm": 0.19551943242549896, + "learning_rate": 0.0009937573838317505, + "loss": 3.0783, + "step": 2020 + }, + { + "epoch": 0.05992942502149868, + "grad_norm": 0.1667933315038681, + "learning_rate": 0.0009937499698746033, + "loss": 3.1189, + "step": 2021 + }, + { + "epoch": 0.05995907837381016, + "grad_norm": 0.1831766664981842, + "learning_rate": 0.0009937425515452155, + "loss": 3.0694, + "step": 2022 + }, + { + "epoch": 0.05998873172612164, + "grad_norm": 0.18772365152835846, + "learning_rate": 0.0009937351288436523, + "loss": 3.0796, + "step": 2023 + }, + { + "epoch": 0.060018385078433115, + "grad_norm": 0.1893157660961151, + "learning_rate": 0.00099372770176998, + "loss": 3.0908, + "step": 2024 + }, + { + "epoch": 0.0600480384307446, + "grad_norm": 0.19021324813365936, + "learning_rate": 0.0009937202703242643, + "loss": 3.0391, + "step": 2025 + }, + { + "epoch": 0.060077691783056074, + "grad_norm": 0.18924863636493683, + "learning_rate": 0.0009937128345065707, + "loss": 3.0251, + "step": 2026 + }, + { + "epoch": 0.06010734513536755, + "grad_norm": 0.1694294810295105, + "learning_rate": 0.0009937053943169653, + "loss": 3.073, + "step": 2027 + }, + { + "epoch": 0.06013699848767903, + "grad_norm": 0.1755012422800064, + "learning_rate": 0.0009936979497555136, + "loss": 3.0587, + "step": 2028 + }, + { + "epoch": 0.06016665183999051, + "grad_norm": 0.16913190484046936, + "learning_rate": 0.000993690500822282, + "loss": 3.0179, + "step": 2029 + }, + { + "epoch": 0.06019630519230199, + "grad_norm": 0.18043088912963867, + "learning_rate": 0.0009936830475173364, + "loss": 3.0921, + "step": 2030 + }, + { + "epoch": 0.06022595854461347, + "grad_norm": 0.1570361703634262, + "learning_rate": 0.0009936755898407425, + "loss": 3.0334, + "step": 2031 + }, + { + "epoch": 0.060255611896924945, + "grad_norm": 0.17627547681331635, + "learning_rate": 0.0009936681277925665, + "loss": 3.0488, + "step": 2032 + }, + { + "epoch": 0.06028526524923643, + "grad_norm": 0.17974576354026794, + "learning_rate": 0.0009936606613728746, + "loss": 3.0286, + "step": 2033 + }, + { + "epoch": 0.060314918601547904, + "grad_norm": 0.18402160704135895, + "learning_rate": 0.0009936531905817328, + "loss": 3.037, + "step": 2034 + }, + { + "epoch": 0.06034457195385939, + "grad_norm": 0.16541947424411774, + "learning_rate": 0.0009936457154192074, + "loss": 3.0146, + "step": 2035 + }, + { + "epoch": 0.06037422530617086, + "grad_norm": 0.17464794218540192, + "learning_rate": 0.0009936382358853642, + "loss": 3.0861, + "step": 2036 + }, + { + "epoch": 0.06040387865848234, + "grad_norm": 0.1875174194574356, + "learning_rate": 0.0009936307519802698, + "loss": 3.0548, + "step": 2037 + }, + { + "epoch": 0.06043353201079382, + "grad_norm": 0.18356886506080627, + "learning_rate": 0.0009936232637039904, + "loss": 3.0304, + "step": 2038 + }, + { + "epoch": 0.0604631853631053, + "grad_norm": 0.1718379706144333, + "learning_rate": 0.000993615771056592, + "loss": 3.045, + "step": 2039 + }, + { + "epoch": 0.060492838715416775, + "grad_norm": 0.15559335052967072, + "learning_rate": 0.0009936082740381416, + "loss": 3.0922, + "step": 2040 + }, + { + "epoch": 0.06052249206772826, + "grad_norm": 0.15758389234542847, + "learning_rate": 0.000993600772648705, + "loss": 3.0599, + "step": 2041 + }, + { + "epoch": 0.060552145420039734, + "grad_norm": 0.1697024255990982, + "learning_rate": 0.000993593266888349, + "loss": 3.066, + "step": 2042 + }, + { + "epoch": 0.06058179877235122, + "grad_norm": 0.15769726037979126, + "learning_rate": 0.0009935857567571395, + "loss": 3.0664, + "step": 2043 + }, + { + "epoch": 0.06061145212466269, + "grad_norm": 0.15007343888282776, + "learning_rate": 0.0009935782422551438, + "loss": 3.0544, + "step": 2044 + }, + { + "epoch": 0.06064110547697417, + "grad_norm": 0.14303341507911682, + "learning_rate": 0.000993570723382428, + "loss": 3.0484, + "step": 2045 + }, + { + "epoch": 0.06067075882928565, + "grad_norm": 0.16661281883716583, + "learning_rate": 0.0009935632001390586, + "loss": 3.0214, + "step": 2046 + }, + { + "epoch": 0.06070041218159713, + "grad_norm": 0.17938107252120972, + "learning_rate": 0.0009935556725251024, + "loss": 3.0658, + "step": 2047 + }, + { + "epoch": 0.06073006553390861, + "grad_norm": 0.16755010187625885, + "learning_rate": 0.000993548140540626, + "loss": 3.0512, + "step": 2048 + }, + { + "epoch": 0.06075971888622009, + "grad_norm": 0.17167837917804718, + "learning_rate": 0.000993540604185696, + "loss": 3.0632, + "step": 2049 + }, + { + "epoch": 0.060789372238531564, + "grad_norm": 0.18132419884204865, + "learning_rate": 0.0009935330634603793, + "loss": 3.0516, + "step": 2050 + }, + { + "epoch": 0.06081902559084305, + "grad_norm": 0.17916782200336456, + "learning_rate": 0.0009935255183647427, + "loss": 3.0676, + "step": 2051 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 0.18609389662742615, + "learning_rate": 0.0009935179688988528, + "loss": 3.0567, + "step": 2052 + }, + { + "epoch": 0.060878332295466, + "grad_norm": 0.16487351059913635, + "learning_rate": 0.0009935104150627766, + "loss": 3.0536, + "step": 2053 + }, + { + "epoch": 0.06090798564777748, + "grad_norm": 0.16757525503635406, + "learning_rate": 0.0009935028568565812, + "loss": 3.0501, + "step": 2054 + }, + { + "epoch": 0.06093763900008896, + "grad_norm": 0.20018409192562103, + "learning_rate": 0.000993495294280333, + "loss": 3.0473, + "step": 2055 + }, + { + "epoch": 0.06096729235240044, + "grad_norm": 0.22816896438598633, + "learning_rate": 0.0009934877273340993, + "loss": 3.0614, + "step": 2056 + }, + { + "epoch": 0.06099694570471192, + "grad_norm": 0.20117589831352234, + "learning_rate": 0.0009934801560179472, + "loss": 3.0864, + "step": 2057 + }, + { + "epoch": 0.061026599057023394, + "grad_norm": 0.18039098381996155, + "learning_rate": 0.0009934725803319435, + "loss": 3.0534, + "step": 2058 + }, + { + "epoch": 0.06105625240933488, + "grad_norm": 0.16289149224758148, + "learning_rate": 0.0009934650002761554, + "loss": 3.0251, + "step": 2059 + }, + { + "epoch": 0.06108590576164635, + "grad_norm": 0.15285295248031616, + "learning_rate": 0.00099345741585065, + "loss": 3.0692, + "step": 2060 + }, + { + "epoch": 0.061115559113957836, + "grad_norm": 0.16974641382694244, + "learning_rate": 0.0009934498270554946, + "loss": 3.0424, + "step": 2061 + }, + { + "epoch": 0.06114521246626931, + "grad_norm": 0.1860116869211197, + "learning_rate": 0.0009934422338907564, + "loss": 3.0419, + "step": 2062 + }, + { + "epoch": 0.06117486581858079, + "grad_norm": 0.19707831740379333, + "learning_rate": 0.000993434636356502, + "loss": 3.0403, + "step": 2063 + }, + { + "epoch": 0.06120451917089227, + "grad_norm": 0.21881528198719025, + "learning_rate": 0.0009934270344527996, + "loss": 3.0631, + "step": 2064 + }, + { + "epoch": 0.06123417252320375, + "grad_norm": 0.222359299659729, + "learning_rate": 0.000993419428179716, + "loss": 3.0371, + "step": 2065 + }, + { + "epoch": 0.061263825875515224, + "grad_norm": 0.1816520243883133, + "learning_rate": 0.0009934118175373187, + "loss": 3.0442, + "step": 2066 + }, + { + "epoch": 0.06129347922782671, + "grad_norm": 0.16391035914421082, + "learning_rate": 0.0009934042025256753, + "loss": 3.0155, + "step": 2067 + }, + { + "epoch": 0.06132313258013818, + "grad_norm": 0.157710462808609, + "learning_rate": 0.0009933965831448526, + "loss": 3.0536, + "step": 2068 + }, + { + "epoch": 0.061352785932449666, + "grad_norm": 0.15852215886116028, + "learning_rate": 0.0009933889593949188, + "loss": 3.0535, + "step": 2069 + }, + { + "epoch": 0.06138243928476114, + "grad_norm": 0.16976195573806763, + "learning_rate": 0.0009933813312759407, + "loss": 3.0503, + "step": 2070 + }, + { + "epoch": 0.06141209263707262, + "grad_norm": 0.16547437012195587, + "learning_rate": 0.0009933736987879865, + "loss": 3.0879, + "step": 2071 + }, + { + "epoch": 0.0614417459893841, + "grad_norm": 0.17268984019756317, + "learning_rate": 0.0009933660619311235, + "loss": 3.0503, + "step": 2072 + }, + { + "epoch": 0.06147139934169558, + "grad_norm": 0.18982048332691193, + "learning_rate": 0.0009933584207054192, + "loss": 3.0399, + "step": 2073 + }, + { + "epoch": 0.06150105269400706, + "grad_norm": 0.1881730854511261, + "learning_rate": 0.0009933507751109416, + "loss": 3.0098, + "step": 2074 + }, + { + "epoch": 0.06153070604631854, + "grad_norm": 0.16444586217403412, + "learning_rate": 0.000993343125147758, + "loss": 3.0398, + "step": 2075 + }, + { + "epoch": 0.06156035939863001, + "grad_norm": 0.1725073605775833, + "learning_rate": 0.0009933354708159365, + "loss": 3.0672, + "step": 2076 + }, + { + "epoch": 0.061590012750941496, + "grad_norm": 0.19984383881092072, + "learning_rate": 0.0009933278121155447, + "loss": 3.062, + "step": 2077 + }, + { + "epoch": 0.06161966610325297, + "grad_norm": 0.22683566808700562, + "learning_rate": 0.0009933201490466502, + "loss": 3.0345, + "step": 2078 + }, + { + "epoch": 0.06164931945556445, + "grad_norm": 0.19601307809352875, + "learning_rate": 0.0009933124816093215, + "loss": 3.0699, + "step": 2079 + }, + { + "epoch": 0.06167897280787593, + "grad_norm": 0.16417649388313293, + "learning_rate": 0.0009933048098036258, + "loss": 3.0684, + "step": 2080 + }, + { + "epoch": 0.06170862616018741, + "grad_norm": 0.1604418307542801, + "learning_rate": 0.0009932971336296314, + "loss": 3.0574, + "step": 2081 + }, + { + "epoch": 0.06173827951249889, + "grad_norm": 0.16420690715312958, + "learning_rate": 0.0009932894530874064, + "loss": 3.0564, + "step": 2082 + }, + { + "epoch": 0.06176793286481037, + "grad_norm": 0.14379678666591644, + "learning_rate": 0.0009932817681770185, + "loss": 3.0418, + "step": 2083 + }, + { + "epoch": 0.06179758621712184, + "grad_norm": 0.14315985143184662, + "learning_rate": 0.0009932740788985356, + "loss": 3.0433, + "step": 2084 + }, + { + "epoch": 0.061827239569433326, + "grad_norm": 0.15767526626586914, + "learning_rate": 0.0009932663852520265, + "loss": 3.0498, + "step": 2085 + }, + { + "epoch": 0.0618568929217448, + "grad_norm": 0.17483732104301453, + "learning_rate": 0.0009932586872375586, + "loss": 3.1005, + "step": 2086 + }, + { + "epoch": 0.061886546274056285, + "grad_norm": 0.1658538579940796, + "learning_rate": 0.0009932509848552004, + "loss": 3.0391, + "step": 2087 + }, + { + "epoch": 0.06191619962636776, + "grad_norm": 0.13711662590503693, + "learning_rate": 0.0009932432781050203, + "loss": 3.0832, + "step": 2088 + }, + { + "epoch": 0.06194585297867924, + "grad_norm": 0.1396826207637787, + "learning_rate": 0.000993235566987086, + "loss": 3.0495, + "step": 2089 + }, + { + "epoch": 0.06197550633099072, + "grad_norm": 0.13619625568389893, + "learning_rate": 0.0009932278515014663, + "loss": 3.0637, + "step": 2090 + }, + { + "epoch": 0.0620051596833022, + "grad_norm": 0.16183850169181824, + "learning_rate": 0.0009932201316482292, + "loss": 2.9995, + "step": 2091 + }, + { + "epoch": 0.06203481303561367, + "grad_norm": 0.1830897182226181, + "learning_rate": 0.0009932124074274432, + "loss": 3.0062, + "step": 2092 + }, + { + "epoch": 0.062064466387925156, + "grad_norm": 0.18974189460277557, + "learning_rate": 0.0009932046788391766, + "loss": 3.0471, + "step": 2093 + }, + { + "epoch": 0.06209411974023663, + "grad_norm": 0.2250567227602005, + "learning_rate": 0.0009931969458834983, + "loss": 3.037, + "step": 2094 + }, + { + "epoch": 0.062123773092548115, + "grad_norm": 0.28038084506988525, + "learning_rate": 0.000993189208560476, + "loss": 3.0527, + "step": 2095 + }, + { + "epoch": 0.06215342644485959, + "grad_norm": 0.22547462582588196, + "learning_rate": 0.0009931814668701787, + "loss": 3.0697, + "step": 2096 + }, + { + "epoch": 0.06218307979717107, + "grad_norm": 0.21129384636878967, + "learning_rate": 0.0009931737208126747, + "loss": 3.0368, + "step": 2097 + }, + { + "epoch": 0.06221273314948255, + "grad_norm": 0.15031160414218903, + "learning_rate": 0.000993165970388033, + "loss": 3.0664, + "step": 2098 + }, + { + "epoch": 0.06224238650179403, + "grad_norm": 0.214495450258255, + "learning_rate": 0.0009931582155963217, + "loss": 3.0719, + "step": 2099 + }, + { + "epoch": 0.06227203985410551, + "grad_norm": 0.21265731751918793, + "learning_rate": 0.0009931504564376099, + "loss": 3.0446, + "step": 2100 + }, + { + "epoch": 0.062301693206416986, + "grad_norm": 0.17622162401676178, + "learning_rate": 0.0009931426929119663, + "loss": 3.0374, + "step": 2101 + }, + { + "epoch": 0.06233134655872846, + "grad_norm": 0.2060105949640274, + "learning_rate": 0.0009931349250194594, + "loss": 3.0603, + "step": 2102 + }, + { + "epoch": 0.062360999911039945, + "grad_norm": 0.18785575032234192, + "learning_rate": 0.000993127152760158, + "loss": 3.0347, + "step": 2103 + }, + { + "epoch": 0.06239065326335142, + "grad_norm": 0.18645218014717102, + "learning_rate": 0.000993119376134131, + "loss": 3.0045, + "step": 2104 + }, + { + "epoch": 0.0624203066156629, + "grad_norm": 0.19193856418132782, + "learning_rate": 0.0009931115951414475, + "loss": 3.0217, + "step": 2105 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 0.16422709822654724, + "learning_rate": 0.0009931038097821762, + "loss": 3.017, + "step": 2106 + }, + { + "epoch": 0.06247961332028586, + "grad_norm": 0.1612444519996643, + "learning_rate": 0.0009930960200563858, + "loss": 3.0365, + "step": 2107 + }, + { + "epoch": 0.06250926667259733, + "grad_norm": 0.18179255723953247, + "learning_rate": 0.0009930882259641457, + "loss": 3.0243, + "step": 2108 + }, + { + "epoch": 0.06253892002490882, + "grad_norm": 0.18933752179145813, + "learning_rate": 0.0009930804275055246, + "loss": 3.0658, + "step": 2109 + }, + { + "epoch": 0.0625685733772203, + "grad_norm": 0.1803157925605774, + "learning_rate": 0.0009930726246805916, + "loss": 3.055, + "step": 2110 + }, + { + "epoch": 0.06259822672953178, + "grad_norm": 0.15767474472522736, + "learning_rate": 0.0009930648174894159, + "loss": 3.0522, + "step": 2111 + }, + { + "epoch": 0.06262788008184325, + "grad_norm": 0.14969471096992493, + "learning_rate": 0.0009930570059320668, + "loss": 3.0363, + "step": 2112 + }, + { + "epoch": 0.06265753343415473, + "grad_norm": 0.1887224167585373, + "learning_rate": 0.000993049190008613, + "loss": 3.0399, + "step": 2113 + }, + { + "epoch": 0.0626871867864662, + "grad_norm": 0.1771204173564911, + "learning_rate": 0.0009930413697191243, + "loss": 3.0335, + "step": 2114 + }, + { + "epoch": 0.0627168401387777, + "grad_norm": 0.16974830627441406, + "learning_rate": 0.0009930335450636695, + "loss": 3.0476, + "step": 2115 + }, + { + "epoch": 0.06274649349108917, + "grad_norm": 0.1800994724035263, + "learning_rate": 0.000993025716042318, + "loss": 3.0538, + "step": 2116 + }, + { + "epoch": 0.06277614684340065, + "grad_norm": 0.19728723168373108, + "learning_rate": 0.000993017882655139, + "loss": 3.0329, + "step": 2117 + }, + { + "epoch": 0.06280580019571212, + "grad_norm": 0.18736286461353302, + "learning_rate": 0.0009930100449022023, + "loss": 3.0216, + "step": 2118 + }, + { + "epoch": 0.0628354535480236, + "grad_norm": 0.19551493227481842, + "learning_rate": 0.000993002202783577, + "loss": 3.0971, + "step": 2119 + }, + { + "epoch": 0.06286510690033509, + "grad_norm": 0.20856919884681702, + "learning_rate": 0.0009929943562993324, + "loss": 3.0638, + "step": 2120 + }, + { + "epoch": 0.06289476025264656, + "grad_norm": 0.17859527468681335, + "learning_rate": 0.0009929865054495383, + "loss": 3.0213, + "step": 2121 + }, + { + "epoch": 0.06292441360495804, + "grad_norm": 0.16105452179908752, + "learning_rate": 0.0009929786502342638, + "loss": 3.0177, + "step": 2122 + }, + { + "epoch": 0.06295406695726952, + "grad_norm": 0.17930398881435394, + "learning_rate": 0.0009929707906535792, + "loss": 3.0325, + "step": 2123 + }, + { + "epoch": 0.06298372030958099, + "grad_norm": 0.15958097577095032, + "learning_rate": 0.0009929629267075534, + "loss": 3.0611, + "step": 2124 + }, + { + "epoch": 0.06301337366189248, + "grad_norm": 0.15489377081394196, + "learning_rate": 0.0009929550583962562, + "loss": 3.0031, + "step": 2125 + }, + { + "epoch": 0.06304302701420396, + "grad_norm": 0.1616480052471161, + "learning_rate": 0.0009929471857197574, + "loss": 3.0692, + "step": 2126 + }, + { + "epoch": 0.06307268036651544, + "grad_norm": 0.19419316947460175, + "learning_rate": 0.0009929393086781267, + "loss": 3.0092, + "step": 2127 + }, + { + "epoch": 0.06310233371882691, + "grad_norm": 0.2190171480178833, + "learning_rate": 0.0009929314272714338, + "loss": 3.022, + "step": 2128 + }, + { + "epoch": 0.06313198707113839, + "grad_norm": 0.2035624384880066, + "learning_rate": 0.0009929235414997484, + "loss": 3.0416, + "step": 2129 + }, + { + "epoch": 0.06316164042344988, + "grad_norm": 0.17442356050014496, + "learning_rate": 0.0009929156513631405, + "loss": 3.0412, + "step": 2130 + }, + { + "epoch": 0.06319129377576135, + "grad_norm": 0.18747219443321228, + "learning_rate": 0.00099290775686168, + "loss": 3.033, + "step": 2131 + }, + { + "epoch": 0.06322094712807283, + "grad_norm": 0.19120295345783234, + "learning_rate": 0.0009928998579954364, + "loss": 3.0411, + "step": 2132 + }, + { + "epoch": 0.0632506004803843, + "grad_norm": 0.1545696258544922, + "learning_rate": 0.0009928919547644805, + "loss": 3.0198, + "step": 2133 + }, + { + "epoch": 0.06328025383269578, + "grad_norm": 0.1604296863079071, + "learning_rate": 0.000992884047168881, + "loss": 3.0108, + "step": 2134 + }, + { + "epoch": 0.06330990718500727, + "grad_norm": 0.17104420065879822, + "learning_rate": 0.0009928761352087092, + "loss": 3.0455, + "step": 2135 + }, + { + "epoch": 0.06333956053731875, + "grad_norm": 0.16573137044906616, + "learning_rate": 0.0009928682188840346, + "loss": 3.0373, + "step": 2136 + }, + { + "epoch": 0.06336921388963022, + "grad_norm": 0.174669548869133, + "learning_rate": 0.000992860298194927, + "loss": 3.0469, + "step": 2137 + }, + { + "epoch": 0.0633988672419417, + "grad_norm": 0.20679791271686554, + "learning_rate": 0.0009928523731414572, + "loss": 3.0212, + "step": 2138 + }, + { + "epoch": 0.06342852059425318, + "grad_norm": 0.19527803361415863, + "learning_rate": 0.0009928444437236948, + "loss": 3.0197, + "step": 2139 + }, + { + "epoch": 0.06345817394656465, + "grad_norm": 0.16130617260932922, + "learning_rate": 0.0009928365099417106, + "loss": 3.047, + "step": 2140 + }, + { + "epoch": 0.06348782729887614, + "grad_norm": 0.1453140527009964, + "learning_rate": 0.000992828571795574, + "loss": 3.0354, + "step": 2141 + }, + { + "epoch": 0.06351748065118762, + "grad_norm": 0.22707922756671906, + "learning_rate": 0.0009928206292853562, + "loss": 3.07, + "step": 2142 + }, + { + "epoch": 0.0635471340034991, + "grad_norm": 0.1796528697013855, + "learning_rate": 0.000992812682411127, + "loss": 3.0448, + "step": 2143 + }, + { + "epoch": 0.06357678735581057, + "grad_norm": 0.16480284929275513, + "learning_rate": 0.000992804731172957, + "loss": 3.0354, + "step": 2144 + }, + { + "epoch": 0.06360644070812205, + "grad_norm": 0.1498224288225174, + "learning_rate": 0.0009927967755709165, + "loss": 3.0396, + "step": 2145 + }, + { + "epoch": 0.06363609406043354, + "grad_norm": 0.20520487427711487, + "learning_rate": 0.0009927888156050758, + "loss": 3.0604, + "step": 2146 + }, + { + "epoch": 0.06366574741274501, + "grad_norm": 0.21572765707969666, + "learning_rate": 0.0009927808512755056, + "loss": 3.0328, + "step": 2147 + }, + { + "epoch": 0.06369540076505649, + "grad_norm": 0.19307737052440643, + "learning_rate": 0.0009927728825822764, + "loss": 3.0238, + "step": 2148 + }, + { + "epoch": 0.06372505411736797, + "grad_norm": 0.17908239364624023, + "learning_rate": 0.0009927649095254588, + "loss": 3.033, + "step": 2149 + }, + { + "epoch": 0.06375470746967944, + "grad_norm": 0.18890884518623352, + "learning_rate": 0.0009927569321051234, + "loss": 3.0351, + "step": 2150 + }, + { + "epoch": 0.06378436082199093, + "grad_norm": 0.18338856101036072, + "learning_rate": 0.0009927489503213404, + "loss": 3.0512, + "step": 2151 + }, + { + "epoch": 0.06381401417430241, + "grad_norm": 0.16985031962394714, + "learning_rate": 0.0009927409641741815, + "loss": 3.0129, + "step": 2152 + }, + { + "epoch": 0.06384366752661388, + "grad_norm": 0.21100600063800812, + "learning_rate": 0.0009927329736637164, + "loss": 3.0329, + "step": 2153 + }, + { + "epoch": 0.06387332087892536, + "grad_norm": 0.17464932799339294, + "learning_rate": 0.000992724978790016, + "loss": 3.025, + "step": 2154 + }, + { + "epoch": 0.06390297423123684, + "grad_norm": 0.15121546387672424, + "learning_rate": 0.0009927169795531517, + "loss": 3.0028, + "step": 2155 + }, + { + "epoch": 0.06393262758354833, + "grad_norm": 0.16610440611839294, + "learning_rate": 0.000992708975953194, + "loss": 3.1019, + "step": 2156 + }, + { + "epoch": 0.0639622809358598, + "grad_norm": 0.17304129898548126, + "learning_rate": 0.0009927009679902136, + "loss": 3.0491, + "step": 2157 + }, + { + "epoch": 0.06399193428817128, + "grad_norm": 0.17105230689048767, + "learning_rate": 0.0009926929556642815, + "loss": 2.9946, + "step": 2158 + }, + { + "epoch": 0.06402158764048275, + "grad_norm": 0.14552760124206543, + "learning_rate": 0.0009926849389754687, + "loss": 3.0353, + "step": 2159 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 0.1686764657497406, + "learning_rate": 0.0009926769179238466, + "loss": 3.0618, + "step": 2160 + }, + { + "epoch": 0.06408089434510572, + "grad_norm": 0.16615231335163116, + "learning_rate": 0.0009926688925094855, + "loss": 3.0616, + "step": 2161 + }, + { + "epoch": 0.0641105476974172, + "grad_norm": 0.16991658508777618, + "learning_rate": 0.0009926608627324567, + "loss": 3.0208, + "step": 2162 + }, + { + "epoch": 0.06414020104972867, + "grad_norm": 0.17564471065998077, + "learning_rate": 0.0009926528285928316, + "loss": 3.0185, + "step": 2163 + }, + { + "epoch": 0.06416985440204015, + "grad_norm": 0.15566006302833557, + "learning_rate": 0.000992644790090681, + "loss": 3.0389, + "step": 2164 + }, + { + "epoch": 0.06419950775435163, + "grad_norm": 0.16481739282608032, + "learning_rate": 0.0009926367472260762, + "loss": 3.0452, + "step": 2165 + }, + { + "epoch": 0.0642291611066631, + "grad_norm": 0.16047506034374237, + "learning_rate": 0.0009926286999990886, + "loss": 3.0454, + "step": 2166 + }, + { + "epoch": 0.06425881445897459, + "grad_norm": 0.16472727060317993, + "learning_rate": 0.0009926206484097892, + "loss": 3.0141, + "step": 2167 + }, + { + "epoch": 0.06428846781128607, + "grad_norm": 0.1609262228012085, + "learning_rate": 0.0009926125924582495, + "loss": 3.085, + "step": 2168 + }, + { + "epoch": 0.06431812116359754, + "grad_norm": 0.17496027052402496, + "learning_rate": 0.0009926045321445407, + "loss": 3.0854, + "step": 2169 + }, + { + "epoch": 0.06434777451590902, + "grad_norm": 0.1970663070678711, + "learning_rate": 0.0009925964674687342, + "loss": 2.9919, + "step": 2170 + }, + { + "epoch": 0.0643774278682205, + "grad_norm": 0.2031109780073166, + "learning_rate": 0.0009925883984309015, + "loss": 3.0105, + "step": 2171 + }, + { + "epoch": 0.06440708122053199, + "grad_norm": 0.21126581728458405, + "learning_rate": 0.0009925803250311136, + "loss": 3.0232, + "step": 2172 + }, + { + "epoch": 0.06443673457284346, + "grad_norm": 0.1877264827489853, + "learning_rate": 0.0009925722472694427, + "loss": 3.0265, + "step": 2173 + }, + { + "epoch": 0.06446638792515494, + "grad_norm": 0.18421852588653564, + "learning_rate": 0.00099256416514596, + "loss": 2.9892, + "step": 2174 + }, + { + "epoch": 0.06449604127746641, + "grad_norm": 0.20095688104629517, + "learning_rate": 0.0009925560786607371, + "loss": 3.0538, + "step": 2175 + }, + { + "epoch": 0.06452569462977789, + "grad_norm": 0.15520605444908142, + "learning_rate": 0.0009925479878138456, + "loss": 2.9787, + "step": 2176 + }, + { + "epoch": 0.06455534798208938, + "grad_norm": 0.16815347969532013, + "learning_rate": 0.000992539892605357, + "loss": 3.0419, + "step": 2177 + }, + { + "epoch": 0.06458500133440086, + "grad_norm": 0.19301044940948486, + "learning_rate": 0.000992531793035343, + "loss": 3.0111, + "step": 2178 + }, + { + "epoch": 0.06461465468671233, + "grad_norm": 0.18360577523708344, + "learning_rate": 0.0009925236891038757, + "loss": 3.019, + "step": 2179 + }, + { + "epoch": 0.06464430803902381, + "grad_norm": 0.20552627742290497, + "learning_rate": 0.0009925155808110265, + "loss": 3.0414, + "step": 2180 + }, + { + "epoch": 0.06467396139133529, + "grad_norm": 0.22144117951393127, + "learning_rate": 0.0009925074681568671, + "loss": 3.0371, + "step": 2181 + }, + { + "epoch": 0.06470361474364678, + "grad_norm": 0.23787733912467957, + "learning_rate": 0.0009924993511414696, + "loss": 3.0345, + "step": 2182 + }, + { + "epoch": 0.06473326809595825, + "grad_norm": 0.19800825417041779, + "learning_rate": 0.000992491229764906, + "loss": 3.0561, + "step": 2183 + }, + { + "epoch": 0.06476292144826973, + "grad_norm": 0.1581488847732544, + "learning_rate": 0.000992483104027248, + "loss": 3.0619, + "step": 2184 + }, + { + "epoch": 0.0647925748005812, + "grad_norm": 0.16628685593605042, + "learning_rate": 0.0009924749739285675, + "loss": 3.0154, + "step": 2185 + }, + { + "epoch": 0.06482222815289268, + "grad_norm": 0.14751970767974854, + "learning_rate": 0.0009924668394689364, + "loss": 3.0363, + "step": 2186 + }, + { + "epoch": 0.06485188150520416, + "grad_norm": 0.16046229004859924, + "learning_rate": 0.0009924587006484272, + "loss": 3.0826, + "step": 2187 + }, + { + "epoch": 0.06488153485751565, + "grad_norm": 0.14447806775569916, + "learning_rate": 0.0009924505574671115, + "loss": 3.0262, + "step": 2188 + }, + { + "epoch": 0.06491118820982712, + "grad_norm": 0.16653361916542053, + "learning_rate": 0.0009924424099250618, + "loss": 3.0588, + "step": 2189 + }, + { + "epoch": 0.0649408415621386, + "grad_norm": 0.17342223227024078, + "learning_rate": 0.0009924342580223497, + "loss": 3.041, + "step": 2190 + }, + { + "epoch": 0.06497049491445007, + "grad_norm": 0.21297158300876617, + "learning_rate": 0.0009924261017590478, + "loss": 3.0379, + "step": 2191 + }, + { + "epoch": 0.06500014826676155, + "grad_norm": 0.20133639872074127, + "learning_rate": 0.0009924179411352286, + "loss": 3.0319, + "step": 2192 + }, + { + "epoch": 0.06502980161907304, + "grad_norm": 0.18216504156589508, + "learning_rate": 0.0009924097761509637, + "loss": 3.0524, + "step": 2193 + }, + { + "epoch": 0.06505945497138452, + "grad_norm": 0.14311710000038147, + "learning_rate": 0.0009924016068063256, + "loss": 3.0133, + "step": 2194 + }, + { + "epoch": 0.065089108323696, + "grad_norm": 0.1571066528558731, + "learning_rate": 0.000992393433101387, + "loss": 3.0, + "step": 2195 + }, + { + "epoch": 0.06511876167600747, + "grad_norm": 0.15063391625881195, + "learning_rate": 0.00099238525503622, + "loss": 3.0139, + "step": 2196 + }, + { + "epoch": 0.06514841502831895, + "grad_norm": 0.1423439234495163, + "learning_rate": 0.000992377072610897, + "loss": 3.0372, + "step": 2197 + }, + { + "epoch": 0.06517806838063044, + "grad_norm": 0.1644069403409958, + "learning_rate": 0.0009923688858254904, + "loss": 3.0433, + "step": 2198 + }, + { + "epoch": 0.06520772173294191, + "grad_norm": 0.16722428798675537, + "learning_rate": 0.0009923606946800729, + "loss": 3.0655, + "step": 2199 + }, + { + "epoch": 0.06523737508525339, + "grad_norm": 0.21213462948799133, + "learning_rate": 0.0009923524991747171, + "loss": 3.0477, + "step": 2200 + }, + { + "epoch": 0.06526702843756486, + "grad_norm": 0.2667617201805115, + "learning_rate": 0.0009923442993094952, + "loss": 3.0179, + "step": 2201 + }, + { + "epoch": 0.06529668178987634, + "grad_norm": 0.2280375212430954, + "learning_rate": 0.00099233609508448, + "loss": 3.0077, + "step": 2202 + }, + { + "epoch": 0.06532633514218783, + "grad_norm": 0.18983137607574463, + "learning_rate": 0.0009923278864997442, + "loss": 3.0599, + "step": 2203 + }, + { + "epoch": 0.0653559884944993, + "grad_norm": 0.17173807322978973, + "learning_rate": 0.0009923196735553605, + "loss": 3.0637, + "step": 2204 + }, + { + "epoch": 0.06538564184681078, + "grad_norm": 0.13675548136234283, + "learning_rate": 0.0009923114562514015, + "loss": 3.0058, + "step": 2205 + }, + { + "epoch": 0.06541529519912226, + "grad_norm": 0.18195278942584991, + "learning_rate": 0.0009923032345879402, + "loss": 3.0257, + "step": 2206 + }, + { + "epoch": 0.06544494855143373, + "grad_norm": 0.17044036090373993, + "learning_rate": 0.000992295008565049, + "loss": 3.0572, + "step": 2207 + }, + { + "epoch": 0.06547460190374522, + "grad_norm": 0.18576394021511078, + "learning_rate": 0.0009922867781828014, + "loss": 2.9932, + "step": 2208 + }, + { + "epoch": 0.0655042552560567, + "grad_norm": 0.18010857701301575, + "learning_rate": 0.0009922785434412695, + "loss": 2.9863, + "step": 2209 + }, + { + "epoch": 0.06553390860836818, + "grad_norm": 0.14891734719276428, + "learning_rate": 0.000992270304340527, + "loss": 3.0066, + "step": 2210 + }, + { + "epoch": 0.06556356196067965, + "grad_norm": 0.15169815719127655, + "learning_rate": 0.000992262060880646, + "loss": 3.014, + "step": 2211 + }, + { + "epoch": 0.06559321531299113, + "grad_norm": 0.15664686262607574, + "learning_rate": 0.0009922538130617002, + "loss": 3.0607, + "step": 2212 + }, + { + "epoch": 0.0656228686653026, + "grad_norm": 0.15455453097820282, + "learning_rate": 0.0009922455608837623, + "loss": 3.0386, + "step": 2213 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 0.16342034935951233, + "learning_rate": 0.0009922373043469057, + "loss": 3.029, + "step": 2214 + }, + { + "epoch": 0.06568217536992557, + "grad_norm": 0.1902998685836792, + "learning_rate": 0.0009922290434512032, + "loss": 3.0497, + "step": 2215 + }, + { + "epoch": 0.06571182872223705, + "grad_norm": 0.22058072686195374, + "learning_rate": 0.0009922207781967278, + "loss": 3.0441, + "step": 2216 + }, + { + "epoch": 0.06574148207454852, + "grad_norm": 0.19167448580265045, + "learning_rate": 0.0009922125085835532, + "loss": 3.0424, + "step": 2217 + }, + { + "epoch": 0.06577113542686, + "grad_norm": 0.16457122564315796, + "learning_rate": 0.0009922042346117521, + "loss": 3.0396, + "step": 2218 + }, + { + "epoch": 0.06580078877917149, + "grad_norm": 0.19984008371829987, + "learning_rate": 0.0009921959562813982, + "loss": 2.9832, + "step": 2219 + }, + { + "epoch": 0.06583044213148297, + "grad_norm": 0.18658918142318726, + "learning_rate": 0.0009921876735925644, + "loss": 3.0149, + "step": 2220 + }, + { + "epoch": 0.06586009548379444, + "grad_norm": 0.20580491423606873, + "learning_rate": 0.0009921793865453245, + "loss": 3.033, + "step": 2221 + }, + { + "epoch": 0.06588974883610592, + "grad_norm": 0.25689950585365295, + "learning_rate": 0.0009921710951397516, + "loss": 3.0237, + "step": 2222 + }, + { + "epoch": 0.0659194021884174, + "grad_norm": 0.2967556416988373, + "learning_rate": 0.000992162799375919, + "loss": 3.0069, + "step": 2223 + }, + { + "epoch": 0.06594905554072888, + "grad_norm": 0.2740667462348938, + "learning_rate": 0.0009921544992539005, + "loss": 3.05, + "step": 2224 + }, + { + "epoch": 0.06597870889304036, + "grad_norm": 0.236906036734581, + "learning_rate": 0.0009921461947737696, + "loss": 3.0227, + "step": 2225 + }, + { + "epoch": 0.06600836224535184, + "grad_norm": 0.19108302891254425, + "learning_rate": 0.0009921378859355993, + "loss": 3.028, + "step": 2226 + }, + { + "epoch": 0.06603801559766331, + "grad_norm": 0.20221015810966492, + "learning_rate": 0.0009921295727394637, + "loss": 3.0343, + "step": 2227 + }, + { + "epoch": 0.06606766894997479, + "grad_norm": 0.15334168076515198, + "learning_rate": 0.0009921212551854365, + "loss": 3.0451, + "step": 2228 + }, + { + "epoch": 0.06609732230228628, + "grad_norm": 0.1430433690547943, + "learning_rate": 0.0009921129332735909, + "loss": 2.9962, + "step": 2229 + }, + { + "epoch": 0.06612697565459776, + "grad_norm": 0.14992813766002655, + "learning_rate": 0.0009921046070040006, + "loss": 3.0341, + "step": 2230 + }, + { + "epoch": 0.06615662900690923, + "grad_norm": 0.13715723156929016, + "learning_rate": 0.0009920962763767399, + "loss": 3.0151, + "step": 2231 + }, + { + "epoch": 0.06618628235922071, + "grad_norm": 0.1450931429862976, + "learning_rate": 0.000992087941391882, + "loss": 3.0098, + "step": 2232 + }, + { + "epoch": 0.06621593571153218, + "grad_norm": 0.12654170393943787, + "learning_rate": 0.0009920796020495008, + "loss": 3.0082, + "step": 2233 + }, + { + "epoch": 0.06624558906384367, + "grad_norm": 0.132157102227211, + "learning_rate": 0.0009920712583496704, + "loss": 3.0154, + "step": 2234 + }, + { + "epoch": 0.06627524241615515, + "grad_norm": 0.15466541051864624, + "learning_rate": 0.0009920629102924646, + "loss": 3.0266, + "step": 2235 + }, + { + "epoch": 0.06630489576846663, + "grad_norm": 0.15517586469650269, + "learning_rate": 0.0009920545578779572, + "loss": 3.0494, + "step": 2236 + }, + { + "epoch": 0.0663345491207781, + "grad_norm": 0.19623342156410217, + "learning_rate": 0.0009920462011062223, + "loss": 2.9906, + "step": 2237 + }, + { + "epoch": 0.06636420247308958, + "grad_norm": 0.1883232295513153, + "learning_rate": 0.0009920378399773339, + "loss": 3.047, + "step": 2238 + }, + { + "epoch": 0.06639385582540105, + "grad_norm": 0.15673291683197021, + "learning_rate": 0.0009920294744913659, + "loss": 2.9997, + "step": 2239 + }, + { + "epoch": 0.06642350917771254, + "grad_norm": 0.14607664942741394, + "learning_rate": 0.0009920211046483922, + "loss": 3.0518, + "step": 2240 + }, + { + "epoch": 0.06645316253002402, + "grad_norm": 0.13710811734199524, + "learning_rate": 0.0009920127304484873, + "loss": 2.9888, + "step": 2241 + }, + { + "epoch": 0.0664828158823355, + "grad_norm": 0.16306725144386292, + "learning_rate": 0.0009920043518917255, + "loss": 3.0167, + "step": 2242 + }, + { + "epoch": 0.06651246923464697, + "grad_norm": 0.1464441567659378, + "learning_rate": 0.0009919959689781803, + "loss": 3.0183, + "step": 2243 + }, + { + "epoch": 0.06654212258695845, + "grad_norm": 0.16050344705581665, + "learning_rate": 0.0009919875817079268, + "loss": 3.0203, + "step": 2244 + }, + { + "epoch": 0.06657177593926994, + "grad_norm": 0.1717090904712677, + "learning_rate": 0.0009919791900810384, + "loss": 3.0382, + "step": 2245 + }, + { + "epoch": 0.06660142929158142, + "grad_norm": 0.1760338395833969, + "learning_rate": 0.00099197079409759, + "loss": 3.0669, + "step": 2246 + }, + { + "epoch": 0.06663108264389289, + "grad_norm": 0.16908113658428192, + "learning_rate": 0.0009919623937576557, + "loss": 3.0142, + "step": 2247 + }, + { + "epoch": 0.06666073599620437, + "grad_norm": 0.16829966008663177, + "learning_rate": 0.0009919539890613101, + "loss": 3.0003, + "step": 2248 + }, + { + "epoch": 0.06669038934851584, + "grad_norm": 0.187338724732399, + "learning_rate": 0.0009919455800086274, + "loss": 3.0555, + "step": 2249 + }, + { + "epoch": 0.06672004270082733, + "grad_norm": 0.20811191201210022, + "learning_rate": 0.0009919371665996822, + "loss": 2.9906, + "step": 2250 + }, + { + "epoch": 0.06674969605313881, + "grad_norm": 0.1951189935207367, + "learning_rate": 0.0009919287488345488, + "loss": 3.0343, + "step": 2251 + }, + { + "epoch": 0.06677934940545029, + "grad_norm": 0.217795267701149, + "learning_rate": 0.0009919203267133021, + "loss": 3.0063, + "step": 2252 + }, + { + "epoch": 0.06680900275776176, + "grad_norm": 0.21362696588039398, + "learning_rate": 0.0009919119002360162, + "loss": 3.0183, + "step": 2253 + }, + { + "epoch": 0.06683865611007324, + "grad_norm": 0.17073391377925873, + "learning_rate": 0.0009919034694027661, + "loss": 3.0359, + "step": 2254 + }, + { + "epoch": 0.06686830946238473, + "grad_norm": 0.16989238560199738, + "learning_rate": 0.0009918950342136265, + "loss": 3.0361, + "step": 2255 + }, + { + "epoch": 0.0668979628146962, + "grad_norm": 0.16695985198020935, + "learning_rate": 0.0009918865946686717, + "loss": 3.0606, + "step": 2256 + }, + { + "epoch": 0.06692761616700768, + "grad_norm": 0.1856328248977661, + "learning_rate": 0.000991878150767977, + "loss": 3.0496, + "step": 2257 + }, + { + "epoch": 0.06695726951931916, + "grad_norm": 0.18865376710891724, + "learning_rate": 0.0009918697025116166, + "loss": 3.0222, + "step": 2258 + }, + { + "epoch": 0.06698692287163063, + "grad_norm": 0.20233690738677979, + "learning_rate": 0.0009918612498996655, + "loss": 3.0018, + "step": 2259 + }, + { + "epoch": 0.06701657622394212, + "grad_norm": 0.1925121545791626, + "learning_rate": 0.0009918527929321987, + "loss": 3.0001, + "step": 2260 + }, + { + "epoch": 0.0670462295762536, + "grad_norm": 0.15922988951206207, + "learning_rate": 0.0009918443316092912, + "loss": 3.0265, + "step": 2261 + }, + { + "epoch": 0.06707588292856508, + "grad_norm": 0.15721169114112854, + "learning_rate": 0.0009918358659310174, + "loss": 3.0164, + "step": 2262 + }, + { + "epoch": 0.06710553628087655, + "grad_norm": 0.143845334649086, + "learning_rate": 0.0009918273958974527, + "loss": 3.0443, + "step": 2263 + }, + { + "epoch": 0.06713518963318803, + "grad_norm": 0.15477505326271057, + "learning_rate": 0.0009918189215086719, + "loss": 3.0155, + "step": 2264 + }, + { + "epoch": 0.0671648429854995, + "grad_norm": 0.1666717827320099, + "learning_rate": 0.0009918104427647503, + "loss": 3.038, + "step": 2265 + }, + { + "epoch": 0.067194496337811, + "grad_norm": 0.17734432220458984, + "learning_rate": 0.0009918019596657627, + "loss": 3.0119, + "step": 2266 + }, + { + "epoch": 0.06722414969012247, + "grad_norm": 0.20742303133010864, + "learning_rate": 0.0009917934722117844, + "loss": 3.019, + "step": 2267 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 0.19910043478012085, + "learning_rate": 0.0009917849804028905, + "loss": 3.0319, + "step": 2268 + }, + { + "epoch": 0.06728345639474542, + "grad_norm": 0.17835365235805511, + "learning_rate": 0.0009917764842391561, + "loss": 2.9729, + "step": 2269 + }, + { + "epoch": 0.0673131097470569, + "grad_norm": 0.17548729479312897, + "learning_rate": 0.0009917679837206565, + "loss": 3.0131, + "step": 2270 + }, + { + "epoch": 0.06734276309936839, + "grad_norm": 0.18008145689964294, + "learning_rate": 0.0009917594788474671, + "loss": 3.0163, + "step": 2271 + }, + { + "epoch": 0.06737241645167986, + "grad_norm": 0.17372746765613556, + "learning_rate": 0.0009917509696196632, + "loss": 2.99, + "step": 2272 + }, + { + "epoch": 0.06740206980399134, + "grad_norm": 0.1690170168876648, + "learning_rate": 0.0009917424560373198, + "loss": 3.0598, + "step": 2273 + }, + { + "epoch": 0.06743172315630282, + "grad_norm": 0.15290755033493042, + "learning_rate": 0.0009917339381005127, + "loss": 3.0359, + "step": 2274 + }, + { + "epoch": 0.06746137650861429, + "grad_norm": 0.18562614917755127, + "learning_rate": 0.000991725415809317, + "loss": 3.0403, + "step": 2275 + }, + { + "epoch": 0.06749102986092578, + "grad_norm": 0.2041219025850296, + "learning_rate": 0.0009917168891638085, + "loss": 3.0058, + "step": 2276 + }, + { + "epoch": 0.06752068321323726, + "grad_norm": 0.21842604875564575, + "learning_rate": 0.0009917083581640626, + "loss": 3.0283, + "step": 2277 + }, + { + "epoch": 0.06755033656554874, + "grad_norm": 0.2519043982028961, + "learning_rate": 0.0009916998228101546, + "loss": 3.039, + "step": 2278 + }, + { + "epoch": 0.06757998991786021, + "grad_norm": 0.23245427012443542, + "learning_rate": 0.0009916912831021605, + "loss": 3.0083, + "step": 2279 + }, + { + "epoch": 0.06760964327017169, + "grad_norm": 0.16201049089431763, + "learning_rate": 0.0009916827390401555, + "loss": 3.0615, + "step": 2280 + }, + { + "epoch": 0.06763929662248318, + "grad_norm": 0.18077372014522552, + "learning_rate": 0.0009916741906242155, + "loss": 2.9986, + "step": 2281 + }, + { + "epoch": 0.06766894997479465, + "grad_norm": 0.15354131162166595, + "learning_rate": 0.0009916656378544163, + "loss": 3.0276, + "step": 2282 + }, + { + "epoch": 0.06769860332710613, + "grad_norm": 0.13746537268161774, + "learning_rate": 0.0009916570807308332, + "loss": 2.9865, + "step": 2283 + }, + { + "epoch": 0.0677282566794176, + "grad_norm": 0.14910247921943665, + "learning_rate": 0.0009916485192535424, + "loss": 2.9942, + "step": 2284 + }, + { + "epoch": 0.06775791003172908, + "grad_norm": 0.1632990837097168, + "learning_rate": 0.0009916399534226196, + "loss": 3.0543, + "step": 2285 + }, + { + "epoch": 0.06778756338404057, + "grad_norm": 0.18613839149475098, + "learning_rate": 0.0009916313832381406, + "loss": 3.0314, + "step": 2286 + }, + { + "epoch": 0.06781721673635205, + "grad_norm": 0.16808916628360748, + "learning_rate": 0.0009916228087001812, + "loss": 3.0277, + "step": 2287 + }, + { + "epoch": 0.06784687008866352, + "grad_norm": 0.16380882263183594, + "learning_rate": 0.0009916142298088176, + "loss": 3.0339, + "step": 2288 + }, + { + "epoch": 0.067876523440975, + "grad_norm": 0.17367346584796906, + "learning_rate": 0.0009916056465641256, + "loss": 3.0273, + "step": 2289 + }, + { + "epoch": 0.06790617679328648, + "grad_norm": 0.16978785395622253, + "learning_rate": 0.0009915970589661812, + "loss": 3.008, + "step": 2290 + }, + { + "epoch": 0.06793583014559795, + "grad_norm": 0.17816875874996185, + "learning_rate": 0.0009915884670150604, + "loss": 3.0125, + "step": 2291 + }, + { + "epoch": 0.06796548349790944, + "grad_norm": 0.1749923676252365, + "learning_rate": 0.0009915798707108394, + "loss": 3.0466, + "step": 2292 + }, + { + "epoch": 0.06799513685022092, + "grad_norm": 0.1652064174413681, + "learning_rate": 0.0009915712700535942, + "loss": 3.0168, + "step": 2293 + }, + { + "epoch": 0.0680247902025324, + "grad_norm": 0.17070534825325012, + "learning_rate": 0.000991562665043401, + "loss": 2.9968, + "step": 2294 + }, + { + "epoch": 0.06805444355484387, + "grad_norm": 0.18853330612182617, + "learning_rate": 0.0009915540556803364, + "loss": 3.0222, + "step": 2295 + }, + { + "epoch": 0.06808409690715535, + "grad_norm": 0.19146186113357544, + "learning_rate": 0.0009915454419644758, + "loss": 3.0374, + "step": 2296 + }, + { + "epoch": 0.06811375025946684, + "grad_norm": 0.15902483463287354, + "learning_rate": 0.000991536823895896, + "loss": 2.9777, + "step": 2297 + }, + { + "epoch": 0.06814340361177831, + "grad_norm": 0.1834462732076645, + "learning_rate": 0.0009915282014746735, + "loss": 3.0633, + "step": 2298 + }, + { + "epoch": 0.06817305696408979, + "grad_norm": 0.18455082178115845, + "learning_rate": 0.0009915195747008843, + "loss": 3.0089, + "step": 2299 + }, + { + "epoch": 0.06820271031640127, + "grad_norm": 0.1842135339975357, + "learning_rate": 0.0009915109435746049, + "loss": 3.0236, + "step": 2300 + }, + { + "epoch": 0.06823236366871274, + "grad_norm": 0.1827622652053833, + "learning_rate": 0.0009915023080959116, + "loss": 3.0176, + "step": 2301 + }, + { + "epoch": 0.06826201702102423, + "grad_norm": 0.2109857052564621, + "learning_rate": 0.0009914936682648811, + "loss": 3.0004, + "step": 2302 + }, + { + "epoch": 0.06829167037333571, + "grad_norm": 0.16086818277835846, + "learning_rate": 0.0009914850240815899, + "loss": 3.0268, + "step": 2303 + }, + { + "epoch": 0.06832132372564718, + "grad_norm": 0.1669674515724182, + "learning_rate": 0.0009914763755461142, + "loss": 3.0193, + "step": 2304 + }, + { + "epoch": 0.06835097707795866, + "grad_norm": 0.16931022703647614, + "learning_rate": 0.000991467722658531, + "loss": 3.034, + "step": 2305 + }, + { + "epoch": 0.06838063043027014, + "grad_norm": 0.16621704399585724, + "learning_rate": 0.000991459065418917, + "loss": 2.9792, + "step": 2306 + }, + { + "epoch": 0.06841028378258163, + "grad_norm": 0.16232973337173462, + "learning_rate": 0.0009914504038273481, + "loss": 3.0286, + "step": 2307 + }, + { + "epoch": 0.0684399371348931, + "grad_norm": 0.17302832007408142, + "learning_rate": 0.0009914417378839018, + "loss": 3.0251, + "step": 2308 + }, + { + "epoch": 0.06846959048720458, + "grad_norm": 0.182801753282547, + "learning_rate": 0.0009914330675886544, + "loss": 3.0056, + "step": 2309 + }, + { + "epoch": 0.06849924383951606, + "grad_norm": 0.19219143688678741, + "learning_rate": 0.0009914243929416832, + "loss": 2.9929, + "step": 2310 + }, + { + "epoch": 0.06852889719182753, + "grad_norm": 0.189886212348938, + "learning_rate": 0.0009914157139430642, + "loss": 3.0125, + "step": 2311 + }, + { + "epoch": 0.06855855054413902, + "grad_norm": 0.17879918217658997, + "learning_rate": 0.000991407030592875, + "loss": 3.0791, + "step": 2312 + }, + { + "epoch": 0.0685882038964505, + "grad_norm": 0.19362393021583557, + "learning_rate": 0.000991398342891192, + "loss": 3.0308, + "step": 2313 + }, + { + "epoch": 0.06861785724876197, + "grad_norm": 0.18941472470760345, + "learning_rate": 0.0009913896508380925, + "loss": 3.0474, + "step": 2314 + }, + { + "epoch": 0.06864751060107345, + "grad_norm": 0.17837713658809662, + "learning_rate": 0.0009913809544336532, + "loss": 3.0226, + "step": 2315 + }, + { + "epoch": 0.06867716395338493, + "grad_norm": 0.1856290102005005, + "learning_rate": 0.0009913722536779512, + "loss": 3.0575, + "step": 2316 + }, + { + "epoch": 0.0687068173056964, + "grad_norm": 0.1748974770307541, + "learning_rate": 0.0009913635485710637, + "loss": 2.9994, + "step": 2317 + }, + { + "epoch": 0.06873647065800789, + "grad_norm": 0.15144066512584686, + "learning_rate": 0.0009913548391130675, + "loss": 2.9953, + "step": 2318 + }, + { + "epoch": 0.06876612401031937, + "grad_norm": 0.17123056948184967, + "learning_rate": 0.0009913461253040399, + "loss": 3.0083, + "step": 2319 + }, + { + "epoch": 0.06879577736263084, + "grad_norm": 0.18417920172214508, + "learning_rate": 0.000991337407144058, + "loss": 2.9936, + "step": 2320 + }, + { + "epoch": 0.06882543071494232, + "grad_norm": 0.19266411662101746, + "learning_rate": 0.000991328684633199, + "loss": 2.9981, + "step": 2321 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 0.1671406328678131, + "learning_rate": 0.00099131995777154, + "loss": 3.0267, + "step": 2322 + }, + { + "epoch": 0.06888473741956529, + "grad_norm": 0.1473296731710434, + "learning_rate": 0.000991311226559159, + "loss": 2.998, + "step": 2323 + }, + { + "epoch": 0.06891439077187676, + "grad_norm": 0.14282028377056122, + "learning_rate": 0.000991302490996132, + "loss": 3.0122, + "step": 2324 + }, + { + "epoch": 0.06894404412418824, + "grad_norm": 0.1470119059085846, + "learning_rate": 0.0009912937510825376, + "loss": 3.034, + "step": 2325 + }, + { + "epoch": 0.06897369747649972, + "grad_norm": 0.18780545890331268, + "learning_rate": 0.0009912850068184527, + "loss": 3.0231, + "step": 2326 + }, + { + "epoch": 0.06900335082881119, + "grad_norm": 0.2172086536884308, + "learning_rate": 0.0009912762582039544, + "loss": 3.0402, + "step": 2327 + }, + { + "epoch": 0.06903300418112268, + "grad_norm": 0.2536363899707794, + "learning_rate": 0.0009912675052391208, + "loss": 3.0102, + "step": 2328 + }, + { + "epoch": 0.06906265753343416, + "grad_norm": 0.16989079117774963, + "learning_rate": 0.0009912587479240292, + "loss": 3.0073, + "step": 2329 + }, + { + "epoch": 0.06909231088574563, + "grad_norm": 0.14521749317646027, + "learning_rate": 0.000991249986258757, + "loss": 3.045, + "step": 2330 + }, + { + "epoch": 0.06912196423805711, + "grad_norm": 0.17394649982452393, + "learning_rate": 0.0009912412202433816, + "loss": 2.9887, + "step": 2331 + }, + { + "epoch": 0.06915161759036859, + "grad_norm": 0.15603972971439362, + "learning_rate": 0.000991232449877981, + "loss": 2.9818, + "step": 2332 + }, + { + "epoch": 0.06918127094268008, + "grad_norm": 0.17263881862163544, + "learning_rate": 0.000991223675162633, + "loss": 3.0555, + "step": 2333 + }, + { + "epoch": 0.06921092429499155, + "grad_norm": 0.18343821167945862, + "learning_rate": 0.0009912148960974146, + "loss": 3.0101, + "step": 2334 + }, + { + "epoch": 0.06924057764730303, + "grad_norm": 0.18518562614917755, + "learning_rate": 0.0009912061126824043, + "loss": 3.0029, + "step": 2335 + }, + { + "epoch": 0.0692702309996145, + "grad_norm": 0.15688781440258026, + "learning_rate": 0.0009911973249176794, + "loss": 3.0382, + "step": 2336 + }, + { + "epoch": 0.06929988435192598, + "grad_norm": 0.1323844939470291, + "learning_rate": 0.0009911885328033178, + "loss": 3.0088, + "step": 2337 + }, + { + "epoch": 0.06932953770423747, + "grad_norm": 0.16420842707157135, + "learning_rate": 0.0009911797363393977, + "loss": 2.9911, + "step": 2338 + }, + { + "epoch": 0.06935919105654895, + "grad_norm": 0.18109621107578278, + "learning_rate": 0.0009911709355259965, + "loss": 3.0298, + "step": 2339 + }, + { + "epoch": 0.06938884440886042, + "grad_norm": 0.1457805633544922, + "learning_rate": 0.0009911621303631925, + "loss": 2.9935, + "step": 2340 + }, + { + "epoch": 0.0694184977611719, + "grad_norm": 0.15319468080997467, + "learning_rate": 0.0009911533208510634, + "loss": 3.0012, + "step": 2341 + }, + { + "epoch": 0.06944815111348338, + "grad_norm": 0.18172425031661987, + "learning_rate": 0.0009911445069896877, + "loss": 3.023, + "step": 2342 + }, + { + "epoch": 0.06947780446579485, + "grad_norm": 0.1923753321170807, + "learning_rate": 0.0009911356887791426, + "loss": 3.0163, + "step": 2343 + }, + { + "epoch": 0.06950745781810634, + "grad_norm": 0.19137586653232574, + "learning_rate": 0.000991126866219507, + "loss": 3.0302, + "step": 2344 + }, + { + "epoch": 0.06953711117041782, + "grad_norm": 0.164087176322937, + "learning_rate": 0.0009911180393108586, + "loss": 2.9629, + "step": 2345 + }, + { + "epoch": 0.0695667645227293, + "grad_norm": 0.17903541028499603, + "learning_rate": 0.0009911092080532756, + "loss": 3.0298, + "step": 2346 + }, + { + "epoch": 0.06959641787504077, + "grad_norm": 0.19584299623966217, + "learning_rate": 0.0009911003724468361, + "loss": 3.0117, + "step": 2347 + }, + { + "epoch": 0.06962607122735225, + "grad_norm": 0.20398539304733276, + "learning_rate": 0.0009910915324916187, + "loss": 3.0124, + "step": 2348 + }, + { + "epoch": 0.06965572457966374, + "grad_norm": 0.19087785482406616, + "learning_rate": 0.0009910826881877016, + "loss": 3.0491, + "step": 2349 + }, + { + "epoch": 0.06968537793197521, + "grad_norm": 0.18152648210525513, + "learning_rate": 0.0009910738395351628, + "loss": 2.9987, + "step": 2350 + }, + { + "epoch": 0.06971503128428669, + "grad_norm": 0.1774802803993225, + "learning_rate": 0.000991064986534081, + "loss": 3.0375, + "step": 2351 + }, + { + "epoch": 0.06974468463659816, + "grad_norm": 0.17823675274848938, + "learning_rate": 0.0009910561291845345, + "loss": 3.0193, + "step": 2352 + }, + { + "epoch": 0.06977433798890964, + "grad_norm": 0.15597695112228394, + "learning_rate": 0.0009910472674866016, + "loss": 3.0172, + "step": 2353 + }, + { + "epoch": 0.06980399134122113, + "grad_norm": 0.1451733261346817, + "learning_rate": 0.0009910384014403608, + "loss": 2.972, + "step": 2354 + }, + { + "epoch": 0.0698336446935326, + "grad_norm": 0.13620789349079132, + "learning_rate": 0.0009910295310458907, + "loss": 2.9438, + "step": 2355 + }, + { + "epoch": 0.06986329804584408, + "grad_norm": 0.1298074871301651, + "learning_rate": 0.0009910206563032698, + "loss": 3.0236, + "step": 2356 + }, + { + "epoch": 0.06989295139815556, + "grad_norm": 0.13995428383350372, + "learning_rate": 0.0009910117772125768, + "loss": 3.005, + "step": 2357 + }, + { + "epoch": 0.06992260475046704, + "grad_norm": 0.1539401113986969, + "learning_rate": 0.0009910028937738901, + "loss": 3.0016, + "step": 2358 + }, + { + "epoch": 0.06995225810277853, + "grad_norm": 0.16133476793766022, + "learning_rate": 0.0009909940059872886, + "loss": 3.0178, + "step": 2359 + }, + { + "epoch": 0.06998191145509, + "grad_norm": 0.18912741541862488, + "learning_rate": 0.000990985113852851, + "loss": 2.9894, + "step": 2360 + }, + { + "epoch": 0.07001156480740148, + "grad_norm": 0.20612116158008575, + "learning_rate": 0.0009909762173706557, + "loss": 3.0127, + "step": 2361 + }, + { + "epoch": 0.07004121815971295, + "grad_norm": 0.18791310489177704, + "learning_rate": 0.0009909673165407818, + "loss": 2.9992, + "step": 2362 + }, + { + "epoch": 0.07007087151202443, + "grad_norm": 0.18829971551895142, + "learning_rate": 0.0009909584113633081, + "loss": 3.0203, + "step": 2363 + }, + { + "epoch": 0.07010052486433592, + "grad_norm": 0.21676141023635864, + "learning_rate": 0.0009909495018383134, + "loss": 3.0255, + "step": 2364 + }, + { + "epoch": 0.0701301782166474, + "grad_norm": 0.20979757606983185, + "learning_rate": 0.0009909405879658766, + "loss": 3.0167, + "step": 2365 + }, + { + "epoch": 0.07015983156895887, + "grad_norm": 0.14825496077537537, + "learning_rate": 0.0009909316697460765, + "loss": 3.009, + "step": 2366 + }, + { + "epoch": 0.07018948492127035, + "grad_norm": 0.15140599012374878, + "learning_rate": 0.0009909227471789923, + "loss": 3.018, + "step": 2367 + }, + { + "epoch": 0.07021913827358182, + "grad_norm": 0.1844102144241333, + "learning_rate": 0.000990913820264703, + "loss": 2.9966, + "step": 2368 + }, + { + "epoch": 0.0702487916258933, + "grad_norm": 0.1767953485250473, + "learning_rate": 0.0009909048890032874, + "loss": 2.9764, + "step": 2369 + }, + { + "epoch": 0.07027844497820479, + "grad_norm": 0.17067211866378784, + "learning_rate": 0.0009908959533948248, + "loss": 2.9819, + "step": 2370 + }, + { + "epoch": 0.07030809833051627, + "grad_norm": 0.16725075244903564, + "learning_rate": 0.000990887013439394, + "loss": 2.9559, + "step": 2371 + }, + { + "epoch": 0.07033775168282774, + "grad_norm": 0.1821320652961731, + "learning_rate": 0.000990878069137075, + "loss": 3.0101, + "step": 2372 + }, + { + "epoch": 0.07036740503513922, + "grad_norm": 0.21745719015598297, + "learning_rate": 0.0009908691204879461, + "loss": 3.0434, + "step": 2373 + }, + { + "epoch": 0.0703970583874507, + "grad_norm": 0.2500905990600586, + "learning_rate": 0.000990860167492087, + "loss": 3.0206, + "step": 2374 + }, + { + "epoch": 0.07042671173976219, + "grad_norm": 0.25020259618759155, + "learning_rate": 0.0009908512101495766, + "loss": 2.9934, + "step": 2375 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 0.24981054663658142, + "learning_rate": 0.0009908422484604946, + "loss": 3.009, + "step": 2376 + }, + { + "epoch": 0.07048601844438514, + "grad_norm": 0.21596497297286987, + "learning_rate": 0.0009908332824249205, + "loss": 2.9926, + "step": 2377 + }, + { + "epoch": 0.07051567179669661, + "grad_norm": 0.17701058089733124, + "learning_rate": 0.0009908243120429331, + "loss": 3.012, + "step": 2378 + }, + { + "epoch": 0.07054532514900809, + "grad_norm": 0.1880001574754715, + "learning_rate": 0.0009908153373146124, + "loss": 3.0289, + "step": 2379 + }, + { + "epoch": 0.07057497850131958, + "grad_norm": 0.20004528760910034, + "learning_rate": 0.0009908063582400376, + "loss": 2.9833, + "step": 2380 + }, + { + "epoch": 0.07060463185363106, + "grad_norm": 0.17450863122940063, + "learning_rate": 0.000990797374819288, + "loss": 2.9859, + "step": 2381 + }, + { + "epoch": 0.07063428520594253, + "grad_norm": 0.17424480617046356, + "learning_rate": 0.0009907883870524437, + "loss": 3.0367, + "step": 2382 + }, + { + "epoch": 0.07066393855825401, + "grad_norm": 0.20191647112369537, + "learning_rate": 0.0009907793949395839, + "loss": 3.0014, + "step": 2383 + }, + { + "epoch": 0.07069359191056548, + "grad_norm": 0.19630907475948334, + "learning_rate": 0.0009907703984807883, + "loss": 3.0327, + "step": 2384 + }, + { + "epoch": 0.07072324526287697, + "grad_norm": 0.15147651731967926, + "learning_rate": 0.0009907613976761365, + "loss": 2.987, + "step": 2385 + }, + { + "epoch": 0.07075289861518845, + "grad_norm": 0.1513381451368332, + "learning_rate": 0.0009907523925257085, + "loss": 2.9912, + "step": 2386 + }, + { + "epoch": 0.07078255196749993, + "grad_norm": 0.14703437685966492, + "learning_rate": 0.0009907433830295836, + "loss": 2.9806, + "step": 2387 + }, + { + "epoch": 0.0708122053198114, + "grad_norm": 0.13654185831546783, + "learning_rate": 0.0009907343691878418, + "loss": 2.98, + "step": 2388 + }, + { + "epoch": 0.07084185867212288, + "grad_norm": 0.1616266965866089, + "learning_rate": 0.000990725351000563, + "loss": 3.0052, + "step": 2389 + }, + { + "epoch": 0.07087151202443437, + "grad_norm": 0.17664460837841034, + "learning_rate": 0.0009907163284678271, + "loss": 3.007, + "step": 2390 + }, + { + "epoch": 0.07090116537674584, + "grad_norm": 0.16566480696201324, + "learning_rate": 0.0009907073015897139, + "loss": 3.0122, + "step": 2391 + }, + { + "epoch": 0.07093081872905732, + "grad_norm": 0.1443408727645874, + "learning_rate": 0.0009906982703663033, + "loss": 3.0406, + "step": 2392 + }, + { + "epoch": 0.0709604720813688, + "grad_norm": 0.15901000797748566, + "learning_rate": 0.0009906892347976751, + "loss": 2.9738, + "step": 2393 + }, + { + "epoch": 0.07099012543368027, + "grad_norm": 0.1475083827972412, + "learning_rate": 0.0009906801948839094, + "loss": 3.0261, + "step": 2394 + }, + { + "epoch": 0.07101977878599175, + "grad_norm": 0.13159172236919403, + "learning_rate": 0.0009906711506250867, + "loss": 2.9788, + "step": 2395 + }, + { + "epoch": 0.07104943213830324, + "grad_norm": 0.14637567102909088, + "learning_rate": 0.0009906621020212866, + "loss": 3.0254, + "step": 2396 + }, + { + "epoch": 0.07107908549061472, + "grad_norm": 0.17081180214881897, + "learning_rate": 0.0009906530490725893, + "loss": 2.9809, + "step": 2397 + }, + { + "epoch": 0.07110873884292619, + "grad_norm": 0.1646333932876587, + "learning_rate": 0.0009906439917790751, + "loss": 2.9771, + "step": 2398 + }, + { + "epoch": 0.07113839219523767, + "grad_norm": 0.1640044003725052, + "learning_rate": 0.0009906349301408242, + "loss": 3.0267, + "step": 2399 + }, + { + "epoch": 0.07116804554754914, + "grad_norm": 0.147353857755661, + "learning_rate": 0.0009906258641579166, + "loss": 2.9971, + "step": 2400 + }, + { + "epoch": 0.07119769889986063, + "grad_norm": 0.17045708000659943, + "learning_rate": 0.000990616793830433, + "loss": 3.004, + "step": 2401 + }, + { + "epoch": 0.07122735225217211, + "grad_norm": 0.16824473440647125, + "learning_rate": 0.0009906077191584532, + "loss": 3.0216, + "step": 2402 + }, + { + "epoch": 0.07125700560448359, + "grad_norm": 0.16488252580165863, + "learning_rate": 0.000990598640142058, + "loss": 2.9717, + "step": 2403 + }, + { + "epoch": 0.07128665895679506, + "grad_norm": 0.16276341676712036, + "learning_rate": 0.0009905895567813277, + "loss": 3.0325, + "step": 2404 + }, + { + "epoch": 0.07131631230910654, + "grad_norm": 0.1600695103406906, + "learning_rate": 0.0009905804690763425, + "loss": 2.9992, + "step": 2405 + }, + { + "epoch": 0.07134596566141803, + "grad_norm": 0.1682947278022766, + "learning_rate": 0.0009905713770271831, + "loss": 2.9784, + "step": 2406 + }, + { + "epoch": 0.0713756190137295, + "grad_norm": 0.15823784470558167, + "learning_rate": 0.00099056228063393, + "loss": 3.0195, + "step": 2407 + }, + { + "epoch": 0.07140527236604098, + "grad_norm": 0.15769945085048676, + "learning_rate": 0.0009905531798966637, + "loss": 2.9849, + "step": 2408 + }, + { + "epoch": 0.07143492571835246, + "grad_norm": 0.15032722055912018, + "learning_rate": 0.0009905440748154647, + "loss": 3.0273, + "step": 2409 + }, + { + "epoch": 0.07146457907066393, + "grad_norm": 0.17353709042072296, + "learning_rate": 0.0009905349653904136, + "loss": 3.0126, + "step": 2410 + }, + { + "epoch": 0.07149423242297542, + "grad_norm": 0.22661225497722626, + "learning_rate": 0.0009905258516215915, + "loss": 3.0277, + "step": 2411 + }, + { + "epoch": 0.0715238857752869, + "grad_norm": 0.2568579316139221, + "learning_rate": 0.0009905167335090787, + "loss": 3.0235, + "step": 2412 + }, + { + "epoch": 0.07155353912759838, + "grad_norm": 0.18976977467536926, + "learning_rate": 0.000990507611052956, + "loss": 2.9808, + "step": 2413 + }, + { + "epoch": 0.07158319247990985, + "grad_norm": 0.21569783985614777, + "learning_rate": 0.000990498484253304, + "loss": 3.007, + "step": 2414 + }, + { + "epoch": 0.07161284583222133, + "grad_norm": 0.22152191400527954, + "learning_rate": 0.0009904893531102038, + "loss": 2.9967, + "step": 2415 + }, + { + "epoch": 0.07164249918453282, + "grad_norm": 0.17959089577198029, + "learning_rate": 0.0009904802176237365, + "loss": 2.979, + "step": 2416 + }, + { + "epoch": 0.0716721525368443, + "grad_norm": 0.2022978961467743, + "learning_rate": 0.0009904710777939823, + "loss": 3.0079, + "step": 2417 + }, + { + "epoch": 0.07170180588915577, + "grad_norm": 0.20558923482894897, + "learning_rate": 0.0009904619336210227, + "loss": 3.0283, + "step": 2418 + }, + { + "epoch": 0.07173145924146725, + "grad_norm": 0.1785823404788971, + "learning_rate": 0.0009904527851049385, + "loss": 3.0196, + "step": 2419 + }, + { + "epoch": 0.07176111259377872, + "grad_norm": 0.14596013724803925, + "learning_rate": 0.0009904436322458107, + "loss": 2.9383, + "step": 2420 + }, + { + "epoch": 0.0717907659460902, + "grad_norm": 0.1503039002418518, + "learning_rate": 0.0009904344750437204, + "loss": 3.0153, + "step": 2421 + }, + { + "epoch": 0.07182041929840169, + "grad_norm": 0.15707528591156006, + "learning_rate": 0.0009904253134987485, + "loss": 2.9595, + "step": 2422 + }, + { + "epoch": 0.07185007265071316, + "grad_norm": 0.1480252742767334, + "learning_rate": 0.0009904161476109764, + "loss": 3.033, + "step": 2423 + }, + { + "epoch": 0.07187972600302464, + "grad_norm": 0.15117000043392181, + "learning_rate": 0.0009904069773804852, + "loss": 3.002, + "step": 2424 + }, + { + "epoch": 0.07190937935533612, + "grad_norm": 0.14897580444812775, + "learning_rate": 0.0009903978028073558, + "loss": 3.0185, + "step": 2425 + }, + { + "epoch": 0.0719390327076476, + "grad_norm": 0.16888387501239777, + "learning_rate": 0.0009903886238916697, + "loss": 2.9742, + "step": 2426 + }, + { + "epoch": 0.07196868605995908, + "grad_norm": 0.16346195340156555, + "learning_rate": 0.0009903794406335084, + "loss": 3.0224, + "step": 2427 + }, + { + "epoch": 0.07199833941227056, + "grad_norm": 0.16206619143486023, + "learning_rate": 0.0009903702530329528, + "loss": 2.9753, + "step": 2428 + }, + { + "epoch": 0.07202799276458204, + "grad_norm": 0.17093481123447418, + "learning_rate": 0.0009903610610900843, + "loss": 2.9752, + "step": 2429 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 0.18589498102664948, + "learning_rate": 0.0009903518648049848, + "loss": 3.0215, + "step": 2430 + }, + { + "epoch": 0.07208729946920499, + "grad_norm": 0.1784159243106842, + "learning_rate": 0.0009903426641777351, + "loss": 2.9741, + "step": 2431 + }, + { + "epoch": 0.07211695282151648, + "grad_norm": 0.17931033670902252, + "learning_rate": 0.000990333459208417, + "loss": 3.0118, + "step": 2432 + }, + { + "epoch": 0.07214660617382795, + "grad_norm": 0.1856592744588852, + "learning_rate": 0.000990324249897112, + "loss": 2.9715, + "step": 2433 + }, + { + "epoch": 0.07217625952613943, + "grad_norm": 0.1743941456079483, + "learning_rate": 0.0009903150362439018, + "loss": 2.9949, + "step": 2434 + }, + { + "epoch": 0.0722059128784509, + "grad_norm": 0.17774291336536407, + "learning_rate": 0.0009903058182488675, + "loss": 3.0055, + "step": 2435 + }, + { + "epoch": 0.07223556623076238, + "grad_norm": 0.18698319792747498, + "learning_rate": 0.000990296595912091, + "loss": 3.0016, + "step": 2436 + }, + { + "epoch": 0.07226521958307387, + "grad_norm": 0.1670137494802475, + "learning_rate": 0.0009902873692336541, + "loss": 2.9951, + "step": 2437 + }, + { + "epoch": 0.07229487293538535, + "grad_norm": 0.12912890315055847, + "learning_rate": 0.0009902781382136383, + "loss": 3.0061, + "step": 2438 + }, + { + "epoch": 0.07232452628769682, + "grad_norm": 0.1546470671892166, + "learning_rate": 0.0009902689028521256, + "loss": 3.0379, + "step": 2439 + }, + { + "epoch": 0.0723541796400083, + "grad_norm": 0.1532977819442749, + "learning_rate": 0.0009902596631491975, + "loss": 3.0178, + "step": 2440 + }, + { + "epoch": 0.07238383299231978, + "grad_norm": 0.1741519272327423, + "learning_rate": 0.000990250419104936, + "loss": 2.9894, + "step": 2441 + }, + { + "epoch": 0.07241348634463127, + "grad_norm": 0.19407762587070465, + "learning_rate": 0.0009902411707194228, + "loss": 2.9684, + "step": 2442 + }, + { + "epoch": 0.07244313969694274, + "grad_norm": 0.2128705233335495, + "learning_rate": 0.0009902319179927398, + "loss": 3.0152, + "step": 2443 + }, + { + "epoch": 0.07247279304925422, + "grad_norm": 0.24903322756290436, + "learning_rate": 0.000990222660924969, + "loss": 2.9578, + "step": 2444 + }, + { + "epoch": 0.0725024464015657, + "grad_norm": 0.276809424161911, + "learning_rate": 0.0009902133995161927, + "loss": 3.0122, + "step": 2445 + }, + { + "epoch": 0.07253209975387717, + "grad_norm": 0.23813137412071228, + "learning_rate": 0.0009902041337664924, + "loss": 3.0091, + "step": 2446 + }, + { + "epoch": 0.07256175310618865, + "grad_norm": 0.20263026654720306, + "learning_rate": 0.0009901948636759504, + "loss": 3.0118, + "step": 2447 + }, + { + "epoch": 0.07259140645850014, + "grad_norm": 0.18535946309566498, + "learning_rate": 0.0009901855892446487, + "loss": 3.0156, + "step": 2448 + }, + { + "epoch": 0.07262105981081161, + "grad_norm": 0.20431062579154968, + "learning_rate": 0.0009901763104726694, + "loss": 2.9852, + "step": 2449 + }, + { + "epoch": 0.07265071316312309, + "grad_norm": 0.191827192902565, + "learning_rate": 0.000990167027360095, + "loss": 3.0219, + "step": 2450 + }, + { + "epoch": 0.07268036651543457, + "grad_norm": 0.16999980807304382, + "learning_rate": 0.0009901577399070072, + "loss": 2.9982, + "step": 2451 + }, + { + "epoch": 0.07271001986774604, + "grad_norm": 0.1494646817445755, + "learning_rate": 0.0009901484481134885, + "loss": 3.0166, + "step": 2452 + }, + { + "epoch": 0.07273967322005753, + "grad_norm": 0.15212443470954895, + "learning_rate": 0.0009901391519796213, + "loss": 3.0097, + "step": 2453 + }, + { + "epoch": 0.07276932657236901, + "grad_norm": 0.1448695808649063, + "learning_rate": 0.0009901298515054878, + "loss": 2.9695, + "step": 2454 + }, + { + "epoch": 0.07279897992468048, + "grad_norm": 0.15465621650218964, + "learning_rate": 0.00099012054669117, + "loss": 2.9799, + "step": 2455 + }, + { + "epoch": 0.07282863327699196, + "grad_norm": 0.14660148322582245, + "learning_rate": 0.000990111237536751, + "loss": 2.9979, + "step": 2456 + }, + { + "epoch": 0.07285828662930344, + "grad_norm": 0.15623736381530762, + "learning_rate": 0.0009901019240423127, + "loss": 2.9915, + "step": 2457 + }, + { + "epoch": 0.07288793998161493, + "grad_norm": 0.1728622019290924, + "learning_rate": 0.0009900926062079377, + "loss": 3.0004, + "step": 2458 + }, + { + "epoch": 0.0729175933339264, + "grad_norm": 0.1622520238161087, + "learning_rate": 0.0009900832840337086, + "loss": 2.9903, + "step": 2459 + }, + { + "epoch": 0.07294724668623788, + "grad_norm": 0.14155668020248413, + "learning_rate": 0.0009900739575197078, + "loss": 3.0127, + "step": 2460 + }, + { + "epoch": 0.07297690003854936, + "grad_norm": 0.13749872148036957, + "learning_rate": 0.0009900646266660183, + "loss": 2.998, + "step": 2461 + }, + { + "epoch": 0.07300655339086083, + "grad_norm": 0.1346825808286667, + "learning_rate": 0.000990055291472722, + "loss": 2.9508, + "step": 2462 + }, + { + "epoch": 0.07303620674317232, + "grad_norm": 0.15241460502147675, + "learning_rate": 0.0009900459519399023, + "loss": 2.9951, + "step": 2463 + }, + { + "epoch": 0.0730658600954838, + "grad_norm": 0.17420001327991486, + "learning_rate": 0.0009900366080676413, + "loss": 3.0196, + "step": 2464 + }, + { + "epoch": 0.07309551344779527, + "grad_norm": 0.16600634157657623, + "learning_rate": 0.0009900272598560222, + "loss": 3.007, + "step": 2465 + }, + { + "epoch": 0.07312516680010675, + "grad_norm": 0.16743646562099457, + "learning_rate": 0.0009900179073051277, + "loss": 2.9911, + "step": 2466 + }, + { + "epoch": 0.07315482015241823, + "grad_norm": 0.23282787203788757, + "learning_rate": 0.0009900085504150403, + "loss": 2.9769, + "step": 2467 + }, + { + "epoch": 0.07318447350472972, + "grad_norm": 0.24437430500984192, + "learning_rate": 0.000989999189185843, + "loss": 2.9633, + "step": 2468 + }, + { + "epoch": 0.07321412685704119, + "grad_norm": 0.18505316972732544, + "learning_rate": 0.0009899898236176191, + "loss": 3.0107, + "step": 2469 + }, + { + "epoch": 0.07324378020935267, + "grad_norm": 0.22460077702999115, + "learning_rate": 0.000989980453710451, + "loss": 2.9947, + "step": 2470 + }, + { + "epoch": 0.07327343356166414, + "grad_norm": 0.2731972932815552, + "learning_rate": 0.0009899710794644219, + "loss": 3.0175, + "step": 2471 + }, + { + "epoch": 0.07330308691397562, + "grad_norm": 0.2005922794342041, + "learning_rate": 0.000989961700879615, + "loss": 3.0004, + "step": 2472 + }, + { + "epoch": 0.0733327402662871, + "grad_norm": 0.19016492366790771, + "learning_rate": 0.000989952317956113, + "loss": 2.9891, + "step": 2473 + }, + { + "epoch": 0.07336239361859859, + "grad_norm": 0.1880662739276886, + "learning_rate": 0.0009899429306939988, + "loss": 3.0333, + "step": 2474 + }, + { + "epoch": 0.07339204697091006, + "grad_norm": 0.1865164190530777, + "learning_rate": 0.0009899335390933562, + "loss": 2.9857, + "step": 2475 + }, + { + "epoch": 0.07342170032322154, + "grad_norm": 0.1595560759305954, + "learning_rate": 0.000989924143154268, + "loss": 3.0202, + "step": 2476 + }, + { + "epoch": 0.07345135367553302, + "grad_norm": 0.17340609431266785, + "learning_rate": 0.0009899147428768173, + "loss": 2.9838, + "step": 2477 + }, + { + "epoch": 0.07348100702784449, + "grad_norm": 0.15783703327178955, + "learning_rate": 0.0009899053382610877, + "loss": 2.9973, + "step": 2478 + }, + { + "epoch": 0.07351066038015598, + "grad_norm": 0.1333906501531601, + "learning_rate": 0.0009898959293071618, + "loss": 2.9407, + "step": 2479 + }, + { + "epoch": 0.07354031373246746, + "grad_norm": 0.16211199760437012, + "learning_rate": 0.0009898865160151238, + "loss": 3.0002, + "step": 2480 + }, + { + "epoch": 0.07356996708477893, + "grad_norm": 0.1602664440870285, + "learning_rate": 0.0009898770983850565, + "loss": 2.9755, + "step": 2481 + }, + { + "epoch": 0.07359962043709041, + "grad_norm": 0.1841450035572052, + "learning_rate": 0.0009898676764170432, + "loss": 2.989, + "step": 2482 + }, + { + "epoch": 0.07362927378940189, + "grad_norm": 0.2048049420118332, + "learning_rate": 0.0009898582501111676, + "loss": 2.9799, + "step": 2483 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 0.19821670651435852, + "learning_rate": 0.0009898488194675134, + "loss": 2.9966, + "step": 2484 + }, + { + "epoch": 0.07368858049402485, + "grad_norm": 0.14659754931926727, + "learning_rate": 0.0009898393844861636, + "loss": 3.0143, + "step": 2485 + }, + { + "epoch": 0.07371823384633633, + "grad_norm": 0.129456028342247, + "learning_rate": 0.000989829945167202, + "loss": 2.9735, + "step": 2486 + }, + { + "epoch": 0.0737478871986478, + "grad_norm": 0.17586594820022583, + "learning_rate": 0.000989820501510712, + "loss": 2.9865, + "step": 2487 + }, + { + "epoch": 0.07377754055095928, + "grad_norm": 0.19713379442691803, + "learning_rate": 0.0009898110535167775, + "loss": 2.9931, + "step": 2488 + }, + { + "epoch": 0.07380719390327077, + "grad_norm": 0.18598198890686035, + "learning_rate": 0.000989801601185482, + "loss": 2.9668, + "step": 2489 + }, + { + "epoch": 0.07383684725558225, + "grad_norm": 0.17605425417423248, + "learning_rate": 0.0009897921445169095, + "loss": 3.0211, + "step": 2490 + }, + { + "epoch": 0.07386650060789372, + "grad_norm": 0.17896494269371033, + "learning_rate": 0.0009897826835111431, + "loss": 2.9955, + "step": 2491 + }, + { + "epoch": 0.0738961539602052, + "grad_norm": 0.1780274212360382, + "learning_rate": 0.0009897732181682673, + "loss": 2.9841, + "step": 2492 + }, + { + "epoch": 0.07392580731251668, + "grad_norm": 0.149502694606781, + "learning_rate": 0.0009897637484883655, + "loss": 3.0195, + "step": 2493 + }, + { + "epoch": 0.07395546066482817, + "grad_norm": 0.14118914306163788, + "learning_rate": 0.0009897542744715215, + "loss": 2.9679, + "step": 2494 + }, + { + "epoch": 0.07398511401713964, + "grad_norm": 0.14288806915283203, + "learning_rate": 0.0009897447961178193, + "loss": 3.0328, + "step": 2495 + }, + { + "epoch": 0.07401476736945112, + "grad_norm": 0.15420259535312653, + "learning_rate": 0.0009897353134273432, + "loss": 2.9786, + "step": 2496 + }, + { + "epoch": 0.0740444207217626, + "grad_norm": 0.14952097833156586, + "learning_rate": 0.0009897258264001767, + "loss": 2.9958, + "step": 2497 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.16363990306854248, + "learning_rate": 0.0009897163350364039, + "loss": 3.0007, + "step": 2498 + }, + { + "epoch": 0.07410372742638555, + "grad_norm": 0.1874520182609558, + "learning_rate": 0.0009897068393361088, + "loss": 2.9933, + "step": 2499 + }, + { + "epoch": 0.07413338077869704, + "grad_norm": 0.20184428989887238, + "learning_rate": 0.0009896973392993756, + "loss": 2.9664, + "step": 2500 + }, + { + "epoch": 0.07416303413100851, + "grad_norm": 0.17872647941112518, + "learning_rate": 0.0009896878349262886, + "loss": 3.0108, + "step": 2501 + }, + { + "epoch": 0.07419268748331999, + "grad_norm": 0.19040051102638245, + "learning_rate": 0.0009896783262169316, + "loss": 3.0, + "step": 2502 + }, + { + "epoch": 0.07422234083563146, + "grad_norm": 0.18063268065452576, + "learning_rate": 0.000989668813171389, + "loss": 2.9643, + "step": 2503 + }, + { + "epoch": 0.07425199418794294, + "grad_norm": 0.18679028749465942, + "learning_rate": 0.000989659295789745, + "loss": 2.962, + "step": 2504 + }, + { + "epoch": 0.07428164754025443, + "grad_norm": 0.17819096148014069, + "learning_rate": 0.0009896497740720838, + "loss": 2.9798, + "step": 2505 + }, + { + "epoch": 0.0743113008925659, + "grad_norm": 0.17992904782295227, + "learning_rate": 0.00098964024801849, + "loss": 2.9929, + "step": 2506 + }, + { + "epoch": 0.07434095424487738, + "grad_norm": 0.19368216395378113, + "learning_rate": 0.0009896307176290476, + "loss": 3.0064, + "step": 2507 + }, + { + "epoch": 0.07437060759718886, + "grad_norm": 0.18694289028644562, + "learning_rate": 0.0009896211829038414, + "loss": 3.0011, + "step": 2508 + }, + { + "epoch": 0.07440026094950034, + "grad_norm": 0.17044296860694885, + "learning_rate": 0.0009896116438429551, + "loss": 2.9963, + "step": 2509 + }, + { + "epoch": 0.07442991430181183, + "grad_norm": 0.20194944739341736, + "learning_rate": 0.000989602100446474, + "loss": 2.9772, + "step": 2510 + }, + { + "epoch": 0.0744595676541233, + "grad_norm": 0.1666887402534485, + "learning_rate": 0.0009895925527144823, + "loss": 2.9895, + "step": 2511 + }, + { + "epoch": 0.07448922100643478, + "grad_norm": 0.16924570500850677, + "learning_rate": 0.0009895830006470645, + "loss": 2.981, + "step": 2512 + }, + { + "epoch": 0.07451887435874625, + "grad_norm": 0.1768084168434143, + "learning_rate": 0.0009895734442443049, + "loss": 3.0127, + "step": 2513 + }, + { + "epoch": 0.07454852771105773, + "grad_norm": 0.16158242523670197, + "learning_rate": 0.0009895638835062887, + "loss": 2.9853, + "step": 2514 + }, + { + "epoch": 0.07457818106336922, + "grad_norm": 0.14817474782466888, + "learning_rate": 0.0009895543184331001, + "loss": 3.0133, + "step": 2515 + }, + { + "epoch": 0.0746078344156807, + "grad_norm": 0.14360272884368896, + "learning_rate": 0.0009895447490248241, + "loss": 2.9997, + "step": 2516 + }, + { + "epoch": 0.07463748776799217, + "grad_norm": 0.14424945414066315, + "learning_rate": 0.0009895351752815453, + "loss": 2.9855, + "step": 2517 + }, + { + "epoch": 0.07466714112030365, + "grad_norm": 0.15466265380382538, + "learning_rate": 0.0009895255972033486, + "loss": 2.9928, + "step": 2518 + }, + { + "epoch": 0.07469679447261512, + "grad_norm": 0.16476908326148987, + "learning_rate": 0.0009895160147903187, + "loss": 2.9335, + "step": 2519 + }, + { + "epoch": 0.07472644782492661, + "grad_norm": 0.1697176992893219, + "learning_rate": 0.0009895064280425402, + "loss": 2.994, + "step": 2520 + }, + { + "epoch": 0.07475610117723809, + "grad_norm": 0.15843142569065094, + "learning_rate": 0.0009894968369600985, + "loss": 3.0407, + "step": 2521 + }, + { + "epoch": 0.07478575452954957, + "grad_norm": 0.14681679010391235, + "learning_rate": 0.0009894872415430783, + "loss": 2.9782, + "step": 2522 + }, + { + "epoch": 0.07481540788186104, + "grad_norm": 0.1520508974790573, + "learning_rate": 0.0009894776417915644, + "loss": 3.0216, + "step": 2523 + }, + { + "epoch": 0.07484506123417252, + "grad_norm": 0.16590340435504913, + "learning_rate": 0.000989468037705642, + "loss": 2.9939, + "step": 2524 + }, + { + "epoch": 0.074874714586484, + "grad_norm": 0.18238288164138794, + "learning_rate": 0.0009894584292853962, + "loss": 2.9399, + "step": 2525 + }, + { + "epoch": 0.07490436793879549, + "grad_norm": 0.17638103663921356, + "learning_rate": 0.0009894488165309119, + "loss": 3.0112, + "step": 2526 + }, + { + "epoch": 0.07493402129110696, + "grad_norm": 0.1913510411977768, + "learning_rate": 0.0009894391994422745, + "loss": 3.0038, + "step": 2527 + }, + { + "epoch": 0.07496367464341844, + "grad_norm": 0.22466887533664703, + "learning_rate": 0.0009894295780195688, + "loss": 3.0018, + "step": 2528 + }, + { + "epoch": 0.07499332799572991, + "grad_norm": 0.21194298565387726, + "learning_rate": 0.0009894199522628802, + "loss": 2.9879, + "step": 2529 + }, + { + "epoch": 0.07502298134804139, + "grad_norm": 0.203980952501297, + "learning_rate": 0.0009894103221722939, + "loss": 2.9615, + "step": 2530 + }, + { + "epoch": 0.07505263470035288, + "grad_norm": 0.24129410088062286, + "learning_rate": 0.0009894006877478952, + "loss": 2.9987, + "step": 2531 + }, + { + "epoch": 0.07508228805266436, + "grad_norm": 0.2124023288488388, + "learning_rate": 0.0009893910489897695, + "loss": 3.0141, + "step": 2532 + }, + { + "epoch": 0.07511194140497583, + "grad_norm": 0.21275562047958374, + "learning_rate": 0.000989381405898002, + "loss": 2.9546, + "step": 2533 + }, + { + "epoch": 0.07514159475728731, + "grad_norm": 0.22502128779888153, + "learning_rate": 0.000989371758472678, + "loss": 3.0024, + "step": 2534 + }, + { + "epoch": 0.07517124810959878, + "grad_norm": 0.22568091750144958, + "learning_rate": 0.0009893621067138833, + "loss": 2.9731, + "step": 2535 + }, + { + "epoch": 0.07520090146191027, + "grad_norm": 0.20522311329841614, + "learning_rate": 0.0009893524506217031, + "loss": 2.9666, + "step": 2536 + }, + { + "epoch": 0.07523055481422175, + "grad_norm": 0.17662203311920166, + "learning_rate": 0.000989342790196223, + "loss": 2.9279, + "step": 2537 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 0.16340240836143494, + "learning_rate": 0.0009893331254375282, + "loss": 2.9851, + "step": 2538 + }, + { + "epoch": 0.0752898615188447, + "grad_norm": 0.18912264704704285, + "learning_rate": 0.0009893234563457048, + "loss": 3.0072, + "step": 2539 + }, + { + "epoch": 0.07531951487115618, + "grad_norm": 0.17525237798690796, + "learning_rate": 0.0009893137829208383, + "loss": 2.9825, + "step": 2540 + }, + { + "epoch": 0.07534916822346767, + "grad_norm": 0.15307214856147766, + "learning_rate": 0.000989304105163014, + "loss": 2.9876, + "step": 2541 + }, + { + "epoch": 0.07537882157577915, + "grad_norm": 0.18323785066604614, + "learning_rate": 0.000989294423072318, + "loss": 3.0141, + "step": 2542 + }, + { + "epoch": 0.07540847492809062, + "grad_norm": 0.17391358315944672, + "learning_rate": 0.0009892847366488361, + "loss": 2.975, + "step": 2543 + }, + { + "epoch": 0.0754381282804021, + "grad_norm": 0.15648454427719116, + "learning_rate": 0.0009892750458926536, + "loss": 2.9821, + "step": 2544 + }, + { + "epoch": 0.07546778163271357, + "grad_norm": 0.16550765931606293, + "learning_rate": 0.0009892653508038567, + "loss": 2.9407, + "step": 2545 + }, + { + "epoch": 0.07549743498502506, + "grad_norm": 0.14064812660217285, + "learning_rate": 0.0009892556513825311, + "loss": 3.0235, + "step": 2546 + }, + { + "epoch": 0.07552708833733654, + "grad_norm": 0.12269081920385361, + "learning_rate": 0.0009892459476287627, + "loss": 2.9481, + "step": 2547 + }, + { + "epoch": 0.07555674168964802, + "grad_norm": 0.13380080461502075, + "learning_rate": 0.0009892362395426376, + "loss": 3.0148, + "step": 2548 + }, + { + "epoch": 0.07558639504195949, + "grad_norm": 0.15402407944202423, + "learning_rate": 0.0009892265271242416, + "loss": 2.9851, + "step": 2549 + }, + { + "epoch": 0.07561604839427097, + "grad_norm": 0.1755807250738144, + "learning_rate": 0.0009892168103736605, + "loss": 2.9674, + "step": 2550 + }, + { + "epoch": 0.07564570174658244, + "grad_norm": 0.19320081174373627, + "learning_rate": 0.0009892070892909808, + "loss": 2.9732, + "step": 2551 + }, + { + "epoch": 0.07567535509889393, + "grad_norm": 0.1921011060476303, + "learning_rate": 0.0009891973638762883, + "loss": 2.9545, + "step": 2552 + }, + { + "epoch": 0.07570500845120541, + "grad_norm": 0.17174884676933289, + "learning_rate": 0.000989187634129669, + "loss": 3.0307, + "step": 2553 + }, + { + "epoch": 0.07573466180351689, + "grad_norm": 0.1996627151966095, + "learning_rate": 0.0009891779000512093, + "loss": 2.9631, + "step": 2554 + }, + { + "epoch": 0.07576431515582836, + "grad_norm": 0.2346617728471756, + "learning_rate": 0.0009891681616409956, + "loss": 2.978, + "step": 2555 + }, + { + "epoch": 0.07579396850813984, + "grad_norm": 0.20546254515647888, + "learning_rate": 0.0009891584188991137, + "loss": 2.9839, + "step": 2556 + }, + { + "epoch": 0.07582362186045133, + "grad_norm": 0.18889737129211426, + "learning_rate": 0.00098914867182565, + "loss": 3.013, + "step": 2557 + }, + { + "epoch": 0.0758532752127628, + "grad_norm": 0.1942456215620041, + "learning_rate": 0.0009891389204206907, + "loss": 2.9468, + "step": 2558 + }, + { + "epoch": 0.07588292856507428, + "grad_norm": 0.16257219016551971, + "learning_rate": 0.0009891291646843227, + "loss": 2.9562, + "step": 2559 + }, + { + "epoch": 0.07591258191738576, + "grad_norm": 0.16555219888687134, + "learning_rate": 0.0009891194046166315, + "loss": 2.9971, + "step": 2560 + }, + { + "epoch": 0.07594223526969723, + "grad_norm": 0.18586525321006775, + "learning_rate": 0.0009891096402177043, + "loss": 3.0323, + "step": 2561 + }, + { + "epoch": 0.07597188862200872, + "grad_norm": 0.1771145462989807, + "learning_rate": 0.0009890998714876273, + "loss": 2.9863, + "step": 2562 + }, + { + "epoch": 0.0760015419743202, + "grad_norm": 0.16387052834033966, + "learning_rate": 0.000989090098426487, + "loss": 2.9892, + "step": 2563 + }, + { + "epoch": 0.07603119532663168, + "grad_norm": 0.15473172068595886, + "learning_rate": 0.0009890803210343697, + "loss": 2.9999, + "step": 2564 + }, + { + "epoch": 0.07606084867894315, + "grad_norm": 0.15005145967006683, + "learning_rate": 0.0009890705393113624, + "loss": 2.9737, + "step": 2565 + }, + { + "epoch": 0.07609050203125463, + "grad_norm": 0.1503332555294037, + "learning_rate": 0.0009890607532575515, + "loss": 2.9732, + "step": 2566 + }, + { + "epoch": 0.07612015538356612, + "grad_norm": 0.14537088572978973, + "learning_rate": 0.0009890509628730237, + "loss": 2.9582, + "step": 2567 + }, + { + "epoch": 0.0761498087358776, + "grad_norm": 0.14683139324188232, + "learning_rate": 0.0009890411681578656, + "loss": 2.9724, + "step": 2568 + }, + { + "epoch": 0.07617946208818907, + "grad_norm": 0.15255138278007507, + "learning_rate": 0.000989031369112164, + "loss": 3.0012, + "step": 2569 + }, + { + "epoch": 0.07620911544050055, + "grad_norm": 0.16550134122371674, + "learning_rate": 0.0009890215657360059, + "loss": 2.953, + "step": 2570 + }, + { + "epoch": 0.07623876879281202, + "grad_norm": 0.18219143152236938, + "learning_rate": 0.0009890117580294777, + "loss": 3.0148, + "step": 2571 + }, + { + "epoch": 0.07626842214512351, + "grad_norm": 0.2066718190908432, + "learning_rate": 0.0009890019459926666, + "loss": 2.9972, + "step": 2572 + }, + { + "epoch": 0.07629807549743499, + "grad_norm": 0.21990028023719788, + "learning_rate": 0.000988992129625659, + "loss": 3.007, + "step": 2573 + }, + { + "epoch": 0.07632772884974647, + "grad_norm": 0.1853928416967392, + "learning_rate": 0.0009889823089285425, + "loss": 2.9847, + "step": 2574 + }, + { + "epoch": 0.07635738220205794, + "grad_norm": 0.23144765198230743, + "learning_rate": 0.0009889724839014036, + "loss": 2.9724, + "step": 2575 + }, + { + "epoch": 0.07638703555436942, + "grad_norm": 0.21561695635318756, + "learning_rate": 0.0009889626545443292, + "loss": 3.0028, + "step": 2576 + }, + { + "epoch": 0.0764166889066809, + "grad_norm": 0.21683749556541443, + "learning_rate": 0.000988952820857407, + "loss": 3.0191, + "step": 2577 + }, + { + "epoch": 0.07644634225899238, + "grad_norm": 0.19198639690876007, + "learning_rate": 0.0009889429828407233, + "loss": 2.9724, + "step": 2578 + }, + { + "epoch": 0.07647599561130386, + "grad_norm": 0.1803869605064392, + "learning_rate": 0.0009889331404943657, + "loss": 3.0206, + "step": 2579 + }, + { + "epoch": 0.07650564896361534, + "grad_norm": 0.17359836399555206, + "learning_rate": 0.0009889232938184213, + "loss": 2.9938, + "step": 2580 + }, + { + "epoch": 0.07653530231592681, + "grad_norm": 0.15941305458545685, + "learning_rate": 0.0009889134428129772, + "loss": 3.01, + "step": 2581 + }, + { + "epoch": 0.07656495566823829, + "grad_norm": 0.15290546417236328, + "learning_rate": 0.0009889035874781206, + "loss": 2.9792, + "step": 2582 + }, + { + "epoch": 0.07659460902054978, + "grad_norm": 0.16960589587688446, + "learning_rate": 0.0009888937278139387, + "loss": 2.9969, + "step": 2583 + }, + { + "epoch": 0.07662426237286125, + "grad_norm": 0.17881281673908234, + "learning_rate": 0.0009888838638205189, + "loss": 3.0121, + "step": 2584 + }, + { + "epoch": 0.07665391572517273, + "grad_norm": 0.18444646894931793, + "learning_rate": 0.0009888739954979488, + "loss": 2.9836, + "step": 2585 + }, + { + "epoch": 0.0766835690774842, + "grad_norm": 0.18294140696525574, + "learning_rate": 0.0009888641228463153, + "loss": 2.9685, + "step": 2586 + }, + { + "epoch": 0.07671322242979568, + "grad_norm": 0.17330537736415863, + "learning_rate": 0.0009888542458657062, + "loss": 2.9901, + "step": 2587 + }, + { + "epoch": 0.07674287578210717, + "grad_norm": 0.15923623740673065, + "learning_rate": 0.0009888443645562088, + "loss": 3.0035, + "step": 2588 + }, + { + "epoch": 0.07677252913441865, + "grad_norm": 0.1595505028963089, + "learning_rate": 0.0009888344789179108, + "loss": 2.9502, + "step": 2589 + }, + { + "epoch": 0.07680218248673013, + "grad_norm": 0.14858457446098328, + "learning_rate": 0.0009888245889508995, + "loss": 2.9877, + "step": 2590 + }, + { + "epoch": 0.0768318358390416, + "grad_norm": 0.1516745239496231, + "learning_rate": 0.0009888146946552625, + "loss": 2.9589, + "step": 2591 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 0.13964590430259705, + "learning_rate": 0.0009888047960310874, + "loss": 2.9823, + "step": 2592 + }, + { + "epoch": 0.07689114254366457, + "grad_norm": 0.14223189651966095, + "learning_rate": 0.0009887948930784622, + "loss": 3.0082, + "step": 2593 + }, + { + "epoch": 0.07692079589597604, + "grad_norm": 0.1592486947774887, + "learning_rate": 0.0009887849857974742, + "loss": 2.993, + "step": 2594 + }, + { + "epoch": 0.07695044924828752, + "grad_norm": 0.17066511511802673, + "learning_rate": 0.000988775074188211, + "loss": 2.9776, + "step": 2595 + }, + { + "epoch": 0.076980102600599, + "grad_norm": 0.1626061648130417, + "learning_rate": 0.000988765158250761, + "loss": 2.9874, + "step": 2596 + }, + { + "epoch": 0.07700975595291047, + "grad_norm": 0.16676127910614014, + "learning_rate": 0.0009887552379852114, + "loss": 2.9834, + "step": 2597 + }, + { + "epoch": 0.07703940930522196, + "grad_norm": 0.17687740921974182, + "learning_rate": 0.0009887453133916504, + "loss": 3.0009, + "step": 2598 + }, + { + "epoch": 0.07706906265753344, + "grad_norm": 0.20366334915161133, + "learning_rate": 0.0009887353844701657, + "loss": 3.0044, + "step": 2599 + }, + { + "epoch": 0.07709871600984491, + "grad_norm": 0.24296240508556366, + "learning_rate": 0.0009887254512208452, + "loss": 2.9537, + "step": 2600 + }, + { + "epoch": 0.07712836936215639, + "grad_norm": 0.27139657735824585, + "learning_rate": 0.000988715513643777, + "loss": 2.9978, + "step": 2601 + }, + { + "epoch": 0.07715802271446787, + "grad_norm": 0.27887651324272156, + "learning_rate": 0.000988705571739049, + "loss": 2.9861, + "step": 2602 + }, + { + "epoch": 0.07718767606677934, + "grad_norm": 0.2340616136789322, + "learning_rate": 0.0009886956255067494, + "loss": 2.9759, + "step": 2603 + }, + { + "epoch": 0.07721732941909083, + "grad_norm": 0.2331950068473816, + "learning_rate": 0.0009886856749469662, + "loss": 2.965, + "step": 2604 + }, + { + "epoch": 0.07724698277140231, + "grad_norm": 0.21663296222686768, + "learning_rate": 0.0009886757200597872, + "loss": 2.9888, + "step": 2605 + }, + { + "epoch": 0.07727663612371378, + "grad_norm": 0.18647401034832, + "learning_rate": 0.000988665760845301, + "loss": 2.972, + "step": 2606 + }, + { + "epoch": 0.07730628947602526, + "grad_norm": 0.1937815248966217, + "learning_rate": 0.0009886557973035955, + "loss": 2.9855, + "step": 2607 + }, + { + "epoch": 0.07733594282833674, + "grad_norm": 0.16636167466640472, + "learning_rate": 0.0009886458294347592, + "loss": 2.9637, + "step": 2608 + }, + { + "epoch": 0.07736559618064823, + "grad_norm": 0.169729083776474, + "learning_rate": 0.00098863585723888, + "loss": 2.972, + "step": 2609 + }, + { + "epoch": 0.0773952495329597, + "grad_norm": 0.16979123651981354, + "learning_rate": 0.0009886258807160467, + "loss": 2.9957, + "step": 2610 + }, + { + "epoch": 0.07742490288527118, + "grad_norm": 0.13899898529052734, + "learning_rate": 0.0009886158998663472, + "loss": 2.9611, + "step": 2611 + }, + { + "epoch": 0.07745455623758266, + "grad_norm": 0.12694112956523895, + "learning_rate": 0.00098860591468987, + "loss": 2.9832, + "step": 2612 + }, + { + "epoch": 0.07748420958989413, + "grad_norm": 0.1460741013288498, + "learning_rate": 0.0009885959251867036, + "loss": 2.9481, + "step": 2613 + }, + { + "epoch": 0.07751386294220562, + "grad_norm": 0.15081383287906647, + "learning_rate": 0.0009885859313569364, + "loss": 2.9975, + "step": 2614 + }, + { + "epoch": 0.0775435162945171, + "grad_norm": 0.14999964833259583, + "learning_rate": 0.0009885759332006567, + "loss": 3.0035, + "step": 2615 + }, + { + "epoch": 0.07757316964682857, + "grad_norm": 0.15858030319213867, + "learning_rate": 0.0009885659307179535, + "loss": 2.9829, + "step": 2616 + }, + { + "epoch": 0.07760282299914005, + "grad_norm": 0.1738538295030594, + "learning_rate": 0.0009885559239089152, + "loss": 2.9673, + "step": 2617 + }, + { + "epoch": 0.07763247635145153, + "grad_norm": 0.16808851063251495, + "learning_rate": 0.00098854591277363, + "loss": 2.9792, + "step": 2618 + }, + { + "epoch": 0.07766212970376302, + "grad_norm": 0.17135930061340332, + "learning_rate": 0.0009885358973121872, + "loss": 2.9943, + "step": 2619 + }, + { + "epoch": 0.07769178305607449, + "grad_norm": 0.20302249491214752, + "learning_rate": 0.0009885258775246752, + "loss": 2.9885, + "step": 2620 + }, + { + "epoch": 0.07772143640838597, + "grad_norm": 0.21277384459972382, + "learning_rate": 0.0009885158534111824, + "loss": 2.9882, + "step": 2621 + }, + { + "epoch": 0.07775108976069744, + "grad_norm": 0.2034849226474762, + "learning_rate": 0.000988505824971798, + "loss": 3.0111, + "step": 2622 + }, + { + "epoch": 0.07778074311300892, + "grad_norm": 0.18333816528320312, + "learning_rate": 0.000988495792206611, + "loss": 2.9663, + "step": 2623 + }, + { + "epoch": 0.07781039646532041, + "grad_norm": 0.19248372316360474, + "learning_rate": 0.0009884857551157094, + "loss": 2.9642, + "step": 2624 + }, + { + "epoch": 0.07784004981763189, + "grad_norm": 0.2061699479818344, + "learning_rate": 0.0009884757136991828, + "loss": 2.983, + "step": 2625 + }, + { + "epoch": 0.07786970316994336, + "grad_norm": 0.22833245992660522, + "learning_rate": 0.00098846566795712, + "loss": 2.9438, + "step": 2626 + }, + { + "epoch": 0.07789935652225484, + "grad_norm": 0.21920348703861237, + "learning_rate": 0.0009884556178896098, + "loss": 2.9941, + "step": 2627 + }, + { + "epoch": 0.07792900987456632, + "grad_norm": 0.18845108151435852, + "learning_rate": 0.0009884455634967414, + "loss": 2.9987, + "step": 2628 + }, + { + "epoch": 0.07795866322687779, + "grad_norm": 0.16523855924606323, + "learning_rate": 0.0009884355047786034, + "loss": 3.0124, + "step": 2629 + }, + { + "epoch": 0.07798831657918928, + "grad_norm": 0.180614173412323, + "learning_rate": 0.0009884254417352853, + "loss": 2.9697, + "step": 2630 + }, + { + "epoch": 0.07801796993150076, + "grad_norm": 0.15463978052139282, + "learning_rate": 0.000988415374366876, + "loss": 2.985, + "step": 2631 + }, + { + "epoch": 0.07804762328381223, + "grad_norm": 0.15035557746887207, + "learning_rate": 0.0009884053026734648, + "loss": 2.9678, + "step": 2632 + }, + { + "epoch": 0.07807727663612371, + "grad_norm": 0.16988277435302734, + "learning_rate": 0.0009883952266551408, + "loss": 2.9682, + "step": 2633 + }, + { + "epoch": 0.07810692998843519, + "grad_norm": 0.1757243573665619, + "learning_rate": 0.0009883851463119934, + "loss": 3.0088, + "step": 2634 + }, + { + "epoch": 0.07813658334074668, + "grad_norm": 0.1491086483001709, + "learning_rate": 0.0009883750616441114, + "loss": 2.9805, + "step": 2635 + }, + { + "epoch": 0.07816623669305815, + "grad_norm": 0.15636491775512695, + "learning_rate": 0.0009883649726515847, + "loss": 2.984, + "step": 2636 + }, + { + "epoch": 0.07819589004536963, + "grad_norm": 0.1660081446170807, + "learning_rate": 0.000988354879334502, + "loss": 2.985, + "step": 2637 + }, + { + "epoch": 0.0782255433976811, + "grad_norm": 0.14874204993247986, + "learning_rate": 0.0009883447816929534, + "loss": 2.9712, + "step": 2638 + }, + { + "epoch": 0.07825519674999258, + "grad_norm": 0.15704143047332764, + "learning_rate": 0.0009883346797270277, + "loss": 2.9683, + "step": 2639 + }, + { + "epoch": 0.07828485010230407, + "grad_norm": 0.13299816846847534, + "learning_rate": 0.0009883245734368146, + "loss": 3.0035, + "step": 2640 + }, + { + "epoch": 0.07831450345461555, + "grad_norm": 0.14274707436561584, + "learning_rate": 0.0009883144628224036, + "loss": 2.9968, + "step": 2641 + }, + { + "epoch": 0.07834415680692702, + "grad_norm": 0.16036854684352875, + "learning_rate": 0.0009883043478838842, + "loss": 2.9848, + "step": 2642 + }, + { + "epoch": 0.0783738101592385, + "grad_norm": 0.14432212710380554, + "learning_rate": 0.000988294228621346, + "loss": 3.0051, + "step": 2643 + }, + { + "epoch": 0.07840346351154998, + "grad_norm": 0.15768150985240936, + "learning_rate": 0.0009882841050348787, + "loss": 2.982, + "step": 2644 + }, + { + "epoch": 0.07843311686386147, + "grad_norm": 0.1816752851009369, + "learning_rate": 0.0009882739771245716, + "loss": 3.0007, + "step": 2645 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 0.18820199370384216, + "learning_rate": 0.0009882638448905148, + "loss": 2.9846, + "step": 2646 + }, + { + "epoch": 0.07849242356848442, + "grad_norm": 0.18721021711826324, + "learning_rate": 0.000988253708332798, + "loss": 3.006, + "step": 2647 + }, + { + "epoch": 0.0785220769207959, + "grad_norm": 0.18942925333976746, + "learning_rate": 0.0009882435674515105, + "loss": 2.9982, + "step": 2648 + }, + { + "epoch": 0.07855173027310737, + "grad_norm": 0.18589423596858978, + "learning_rate": 0.0009882334222467426, + "loss": 2.9653, + "step": 2649 + }, + { + "epoch": 0.07858138362541886, + "grad_norm": 0.20513984560966492, + "learning_rate": 0.0009882232727185837, + "loss": 2.976, + "step": 2650 + }, + { + "epoch": 0.07861103697773034, + "grad_norm": 0.2126484364271164, + "learning_rate": 0.0009882131188671242, + "loss": 2.9699, + "step": 2651 + }, + { + "epoch": 0.07864069033004181, + "grad_norm": 0.1910134106874466, + "learning_rate": 0.0009882029606924536, + "loss": 2.9737, + "step": 2652 + }, + { + "epoch": 0.07867034368235329, + "grad_norm": 0.21302562952041626, + "learning_rate": 0.000988192798194662, + "loss": 2.9989, + "step": 2653 + }, + { + "epoch": 0.07869999703466476, + "grad_norm": 0.20946308970451355, + "learning_rate": 0.0009881826313738393, + "loss": 2.9705, + "step": 2654 + }, + { + "epoch": 0.07872965038697624, + "grad_norm": 0.18908852338790894, + "learning_rate": 0.0009881724602300757, + "loss": 3.0058, + "step": 2655 + }, + { + "epoch": 0.07875930373928773, + "grad_norm": 0.20090098679065704, + "learning_rate": 0.0009881622847634612, + "loss": 2.9604, + "step": 2656 + }, + { + "epoch": 0.07878895709159921, + "grad_norm": 0.16996446251869202, + "learning_rate": 0.0009881521049740858, + "loss": 2.9957, + "step": 2657 + }, + { + "epoch": 0.07881861044391068, + "grad_norm": 0.1688147783279419, + "learning_rate": 0.0009881419208620397, + "loss": 2.9947, + "step": 2658 + }, + { + "epoch": 0.07884826379622216, + "grad_norm": 0.17351587116718292, + "learning_rate": 0.0009881317324274132, + "loss": 2.9939, + "step": 2659 + }, + { + "epoch": 0.07887791714853364, + "grad_norm": 0.1798180490732193, + "learning_rate": 0.0009881215396702963, + "loss": 3.0007, + "step": 2660 + }, + { + "epoch": 0.07890757050084513, + "grad_norm": 0.16850414872169495, + "learning_rate": 0.0009881113425907796, + "loss": 2.9938, + "step": 2661 + }, + { + "epoch": 0.0789372238531566, + "grad_norm": 0.18981020152568817, + "learning_rate": 0.0009881011411889528, + "loss": 2.9654, + "step": 2662 + }, + { + "epoch": 0.07896687720546808, + "grad_norm": 0.16494157910346985, + "learning_rate": 0.000988090935464907, + "loss": 2.9839, + "step": 2663 + }, + { + "epoch": 0.07899653055777955, + "grad_norm": 0.14845548570156097, + "learning_rate": 0.000988080725418732, + "loss": 2.9339, + "step": 2664 + }, + { + "epoch": 0.07902618391009103, + "grad_norm": 0.13608062267303467, + "learning_rate": 0.0009880705110505183, + "loss": 2.9918, + "step": 2665 + }, + { + "epoch": 0.07905583726240252, + "grad_norm": 0.1623600870370865, + "learning_rate": 0.0009880602923603567, + "loss": 3.0068, + "step": 2666 + }, + { + "epoch": 0.079085490614714, + "grad_norm": 0.16730330884456635, + "learning_rate": 0.0009880500693483373, + "loss": 2.951, + "step": 2667 + }, + { + "epoch": 0.07911514396702547, + "grad_norm": 0.16194938123226166, + "learning_rate": 0.0009880398420145508, + "loss": 2.9684, + "step": 2668 + }, + { + "epoch": 0.07914479731933695, + "grad_norm": 0.1596331000328064, + "learning_rate": 0.0009880296103590876, + "loss": 2.9362, + "step": 2669 + }, + { + "epoch": 0.07917445067164842, + "grad_norm": 0.14718854427337646, + "learning_rate": 0.0009880193743820385, + "loss": 2.9837, + "step": 2670 + }, + { + "epoch": 0.07920410402395991, + "grad_norm": 0.1503051370382309, + "learning_rate": 0.000988009134083494, + "loss": 3.0024, + "step": 2671 + }, + { + "epoch": 0.07923375737627139, + "grad_norm": 0.1508837789297104, + "learning_rate": 0.000987998889463545, + "loss": 2.9647, + "step": 2672 + }, + { + "epoch": 0.07926341072858287, + "grad_norm": 0.16891801357269287, + "learning_rate": 0.000987988640522282, + "loss": 2.9629, + "step": 2673 + }, + { + "epoch": 0.07929306408089434, + "grad_norm": 0.18381771445274353, + "learning_rate": 0.000987978387259796, + "loss": 2.9875, + "step": 2674 + }, + { + "epoch": 0.07932271743320582, + "grad_norm": 0.18180473148822784, + "learning_rate": 0.0009879681296761774, + "loss": 2.9639, + "step": 2675 + }, + { + "epoch": 0.07935237078551731, + "grad_norm": 0.14992284774780273, + "learning_rate": 0.0009879578677715172, + "loss": 2.9847, + "step": 2676 + }, + { + "epoch": 0.07938202413782879, + "grad_norm": 0.16873548924922943, + "learning_rate": 0.0009879476015459065, + "loss": 2.9378, + "step": 2677 + }, + { + "epoch": 0.07941167749014026, + "grad_norm": 0.16613145172595978, + "learning_rate": 0.0009879373309994361, + "loss": 3.011, + "step": 2678 + }, + { + "epoch": 0.07944133084245174, + "grad_norm": 0.17402085661888123, + "learning_rate": 0.0009879270561321968, + "loss": 2.9736, + "step": 2679 + }, + { + "epoch": 0.07947098419476321, + "grad_norm": 0.17236508429050446, + "learning_rate": 0.0009879167769442797, + "loss": 2.9291, + "step": 2680 + }, + { + "epoch": 0.07950063754707469, + "grad_norm": 0.17778220772743225, + "learning_rate": 0.0009879064934357755, + "loss": 2.9532, + "step": 2681 + }, + { + "epoch": 0.07953029089938618, + "grad_norm": 0.1864854246377945, + "learning_rate": 0.000987896205606776, + "loss": 2.9722, + "step": 2682 + }, + { + "epoch": 0.07955994425169766, + "grad_norm": 0.16460385918617249, + "learning_rate": 0.0009878859134573714, + "loss": 3.0037, + "step": 2683 + }, + { + "epoch": 0.07958959760400913, + "grad_norm": 0.1755058765411377, + "learning_rate": 0.0009878756169876535, + "loss": 2.9887, + "step": 2684 + }, + { + "epoch": 0.07961925095632061, + "grad_norm": 0.1853102594614029, + "learning_rate": 0.0009878653161977133, + "loss": 3.0152, + "step": 2685 + }, + { + "epoch": 0.07964890430863208, + "grad_norm": 0.21454723179340363, + "learning_rate": 0.000987855011087642, + "loss": 3.04, + "step": 2686 + }, + { + "epoch": 0.07967855766094357, + "grad_norm": 0.20412389934062958, + "learning_rate": 0.000987844701657531, + "loss": 2.9642, + "step": 2687 + }, + { + "epoch": 0.07970821101325505, + "grad_norm": 0.18751893937587738, + "learning_rate": 0.000987834387907471, + "loss": 2.9623, + "step": 2688 + }, + { + "epoch": 0.07973786436556653, + "grad_norm": 0.20348544418811798, + "learning_rate": 0.0009878240698375541, + "loss": 2.9932, + "step": 2689 + }, + { + "epoch": 0.079767517717878, + "grad_norm": 0.18868649005889893, + "learning_rate": 0.0009878137474478713, + "loss": 2.97, + "step": 2690 + }, + { + "epoch": 0.07979717107018948, + "grad_norm": 0.18706709146499634, + "learning_rate": 0.000987803420738514, + "loss": 2.9449, + "step": 2691 + }, + { + "epoch": 0.07982682442250097, + "grad_norm": 0.14644014835357666, + "learning_rate": 0.0009877930897095736, + "loss": 2.9456, + "step": 2692 + }, + { + "epoch": 0.07985647777481245, + "grad_norm": 0.14589524269104004, + "learning_rate": 0.0009877827543611417, + "loss": 2.9906, + "step": 2693 + }, + { + "epoch": 0.07988613112712392, + "grad_norm": 0.14377596974372864, + "learning_rate": 0.0009877724146933097, + "loss": 2.9878, + "step": 2694 + }, + { + "epoch": 0.0799157844794354, + "grad_norm": 0.14086580276489258, + "learning_rate": 0.0009877620707061693, + "loss": 2.9795, + "step": 2695 + }, + { + "epoch": 0.07994543783174687, + "grad_norm": 0.13682527840137482, + "learning_rate": 0.000987751722399812, + "loss": 2.9852, + "step": 2696 + }, + { + "epoch": 0.07997509118405836, + "grad_norm": 0.15926557779312134, + "learning_rate": 0.0009877413697743295, + "loss": 2.9454, + "step": 2697 + }, + { + "epoch": 0.08000474453636984, + "grad_norm": 0.14335687458515167, + "learning_rate": 0.0009877310128298135, + "loss": 2.9979, + "step": 2698 + }, + { + "epoch": 0.08003439788868132, + "grad_norm": 0.15985292196273804, + "learning_rate": 0.0009877206515663556, + "loss": 3.0057, + "step": 2699 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 0.18128660321235657, + "learning_rate": 0.0009877102859840478, + "loss": 2.9638, + "step": 2700 + }, + { + "epoch": 0.08009370459330427, + "grad_norm": 0.2024649977684021, + "learning_rate": 0.0009876999160829817, + "loss": 3.0337, + "step": 2701 + }, + { + "epoch": 0.08012335794561576, + "grad_norm": 0.21756424009799957, + "learning_rate": 0.000987689541863249, + "loss": 2.9873, + "step": 2702 + }, + { + "epoch": 0.08015301129792723, + "grad_norm": 0.1899878978729248, + "learning_rate": 0.0009876791633249417, + "loss": 2.9647, + "step": 2703 + }, + { + "epoch": 0.08018266465023871, + "grad_norm": 0.1762421876192093, + "learning_rate": 0.0009876687804681516, + "loss": 2.9325, + "step": 2704 + }, + { + "epoch": 0.08021231800255019, + "grad_norm": 0.1854078620672226, + "learning_rate": 0.0009876583932929709, + "loss": 2.9806, + "step": 2705 + }, + { + "epoch": 0.08024197135486166, + "grad_norm": 0.2018187791109085, + "learning_rate": 0.0009876480017994914, + "loss": 2.952, + "step": 2706 + }, + { + "epoch": 0.08027162470717314, + "grad_norm": 0.21394337713718414, + "learning_rate": 0.000987637605987805, + "loss": 2.9905, + "step": 2707 + }, + { + "epoch": 0.08030127805948463, + "grad_norm": 0.21903419494628906, + "learning_rate": 0.000987627205858004, + "loss": 2.9456, + "step": 2708 + }, + { + "epoch": 0.0803309314117961, + "grad_norm": 0.21552503108978271, + "learning_rate": 0.0009876168014101806, + "loss": 2.9727, + "step": 2709 + }, + { + "epoch": 0.08036058476410758, + "grad_norm": 0.21179677546024323, + "learning_rate": 0.0009876063926444263, + "loss": 3.0029, + "step": 2710 + }, + { + "epoch": 0.08039023811641906, + "grad_norm": 0.17524398863315582, + "learning_rate": 0.000987595979560834, + "loss": 3.0263, + "step": 2711 + }, + { + "epoch": 0.08041989146873053, + "grad_norm": 0.16268454492092133, + "learning_rate": 0.0009875855621594954, + "loss": 2.9832, + "step": 2712 + }, + { + "epoch": 0.08044954482104202, + "grad_norm": 0.16098260879516602, + "learning_rate": 0.000987575140440503, + "loss": 2.9741, + "step": 2713 + }, + { + "epoch": 0.0804791981733535, + "grad_norm": 0.15104736387729645, + "learning_rate": 0.0009875647144039492, + "loss": 2.9606, + "step": 2714 + }, + { + "epoch": 0.08050885152566498, + "grad_norm": 0.15181730687618256, + "learning_rate": 0.000987554284049926, + "loss": 2.9698, + "step": 2715 + }, + { + "epoch": 0.08053850487797645, + "grad_norm": 0.1424308866262436, + "learning_rate": 0.000987543849378526, + "loss": 2.9657, + "step": 2716 + }, + { + "epoch": 0.08056815823028793, + "grad_norm": 0.13149291276931763, + "learning_rate": 0.0009875334103898418, + "loss": 2.9491, + "step": 2717 + }, + { + "epoch": 0.08059781158259942, + "grad_norm": 0.15842469036579132, + "learning_rate": 0.0009875229670839652, + "loss": 2.9972, + "step": 2718 + }, + { + "epoch": 0.0806274649349109, + "grad_norm": 0.1731119602918625, + "learning_rate": 0.000987512519460989, + "loss": 2.9733, + "step": 2719 + }, + { + "epoch": 0.08065711828722237, + "grad_norm": 0.16398996114730835, + "learning_rate": 0.000987502067521006, + "loss": 2.9372, + "step": 2720 + }, + { + "epoch": 0.08068677163953385, + "grad_norm": 0.1749027520418167, + "learning_rate": 0.0009874916112641084, + "loss": 2.9384, + "step": 2721 + }, + { + "epoch": 0.08071642499184532, + "grad_norm": 0.1750650852918625, + "learning_rate": 0.0009874811506903891, + "loss": 2.9342, + "step": 2722 + }, + { + "epoch": 0.08074607834415681, + "grad_norm": 0.1791299432516098, + "learning_rate": 0.0009874706857999403, + "loss": 2.9572, + "step": 2723 + }, + { + "epoch": 0.08077573169646829, + "grad_norm": 0.1868058741092682, + "learning_rate": 0.0009874602165928549, + "loss": 2.9465, + "step": 2724 + }, + { + "epoch": 0.08080538504877977, + "grad_norm": 0.18466442823410034, + "learning_rate": 0.0009874497430692258, + "loss": 2.9415, + "step": 2725 + }, + { + "epoch": 0.08083503840109124, + "grad_norm": 0.1669052094221115, + "learning_rate": 0.0009874392652291452, + "loss": 2.9557, + "step": 2726 + }, + { + "epoch": 0.08086469175340272, + "grad_norm": 0.17768587172031403, + "learning_rate": 0.0009874287830727066, + "loss": 2.9845, + "step": 2727 + }, + { + "epoch": 0.08089434510571421, + "grad_norm": 0.16911545395851135, + "learning_rate": 0.0009874182966000023, + "loss": 2.9974, + "step": 2728 + }, + { + "epoch": 0.08092399845802568, + "grad_norm": 0.1423010677099228, + "learning_rate": 0.0009874078058111253, + "loss": 2.994, + "step": 2729 + }, + { + "epoch": 0.08095365181033716, + "grad_norm": 0.14343777298927307, + "learning_rate": 0.0009873973107061686, + "loss": 2.9871, + "step": 2730 + }, + { + "epoch": 0.08098330516264864, + "grad_norm": 0.17451341450214386, + "learning_rate": 0.000987386811285225, + "loss": 3.034, + "step": 2731 + }, + { + "epoch": 0.08101295851496011, + "grad_norm": 0.16221554577350616, + "learning_rate": 0.0009873763075483877, + "loss": 2.9637, + "step": 2732 + }, + { + "epoch": 0.08104261186727159, + "grad_norm": 0.16966792941093445, + "learning_rate": 0.0009873657994957491, + "loss": 2.9754, + "step": 2733 + }, + { + "epoch": 0.08107226521958308, + "grad_norm": 0.16762816905975342, + "learning_rate": 0.000987355287127403, + "loss": 2.9505, + "step": 2734 + }, + { + "epoch": 0.08110191857189455, + "grad_norm": 0.1643103063106537, + "learning_rate": 0.0009873447704434424, + "loss": 2.9421, + "step": 2735 + }, + { + "epoch": 0.08113157192420603, + "grad_norm": 0.16972093284130096, + "learning_rate": 0.00098733424944396, + "loss": 2.9951, + "step": 2736 + }, + { + "epoch": 0.0811612252765175, + "grad_norm": 0.17549824714660645, + "learning_rate": 0.0009873237241290488, + "loss": 2.9892, + "step": 2737 + }, + { + "epoch": 0.08119087862882898, + "grad_norm": 0.18161624670028687, + "learning_rate": 0.000987313194498803, + "loss": 2.9343, + "step": 2738 + }, + { + "epoch": 0.08122053198114047, + "grad_norm": 0.18881914019584656, + "learning_rate": 0.0009873026605533148, + "loss": 2.9416, + "step": 2739 + }, + { + "epoch": 0.08125018533345195, + "grad_norm": 0.15772698819637299, + "learning_rate": 0.000987292122292678, + "loss": 2.9758, + "step": 2740 + }, + { + "epoch": 0.08127983868576343, + "grad_norm": 0.1640419363975525, + "learning_rate": 0.0009872815797169858, + "loss": 2.9524, + "step": 2741 + }, + { + "epoch": 0.0813094920380749, + "grad_norm": 0.16316886246204376, + "learning_rate": 0.0009872710328263318, + "loss": 2.9815, + "step": 2742 + }, + { + "epoch": 0.08133914539038638, + "grad_norm": 0.16636979579925537, + "learning_rate": 0.0009872604816208088, + "loss": 2.9776, + "step": 2743 + }, + { + "epoch": 0.08136879874269787, + "grad_norm": 0.15891264379024506, + "learning_rate": 0.0009872499261005109, + "loss": 2.9475, + "step": 2744 + }, + { + "epoch": 0.08139845209500934, + "grad_norm": 0.1575651615858078, + "learning_rate": 0.000987239366265531, + "loss": 2.9386, + "step": 2745 + }, + { + "epoch": 0.08142810544732082, + "grad_norm": 0.17565779387950897, + "learning_rate": 0.000987228802115963, + "loss": 2.9867, + "step": 2746 + }, + { + "epoch": 0.0814577587996323, + "grad_norm": 0.21283413469791412, + "learning_rate": 0.0009872182336519003, + "loss": 3.0035, + "step": 2747 + }, + { + "epoch": 0.08148741215194377, + "grad_norm": 0.2280060350894928, + "learning_rate": 0.0009872076608734366, + "loss": 2.9432, + "step": 2748 + }, + { + "epoch": 0.08151706550425526, + "grad_norm": 0.24072466790676117, + "learning_rate": 0.0009871970837806653, + "loss": 2.9934, + "step": 2749 + }, + { + "epoch": 0.08154671885656674, + "grad_norm": 0.22207212448120117, + "learning_rate": 0.0009871865023736803, + "loss": 2.9698, + "step": 2750 + }, + { + "epoch": 0.08157637220887821, + "grad_norm": 0.1952158659696579, + "learning_rate": 0.0009871759166525753, + "loss": 2.9555, + "step": 2751 + }, + { + "epoch": 0.08160602556118969, + "grad_norm": 0.21573598682880402, + "learning_rate": 0.000987165326617444, + "loss": 2.9902, + "step": 2752 + }, + { + "epoch": 0.08163567891350117, + "grad_norm": 0.21187373995780945, + "learning_rate": 0.00098715473226838, + "loss": 2.986, + "step": 2753 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 0.19594809412956238, + "learning_rate": 0.0009871441336054769, + "loss": 2.9814, + "step": 2754 + }, + { + "epoch": 0.08169498561812413, + "grad_norm": 0.16009089350700378, + "learning_rate": 0.0009871335306288291, + "loss": 2.9825, + "step": 2755 + }, + { + "epoch": 0.08172463897043561, + "grad_norm": 0.1572500318288803, + "learning_rate": 0.0009871229233385304, + "loss": 2.9434, + "step": 2756 + }, + { + "epoch": 0.08175429232274709, + "grad_norm": 0.1517653912305832, + "learning_rate": 0.0009871123117346744, + "loss": 2.958, + "step": 2757 + }, + { + "epoch": 0.08178394567505856, + "grad_norm": 0.14786173403263092, + "learning_rate": 0.0009871016958173553, + "loss": 2.9713, + "step": 2758 + }, + { + "epoch": 0.08181359902737004, + "grad_norm": 0.15476244688034058, + "learning_rate": 0.0009870910755866674, + "loss": 2.935, + "step": 2759 + }, + { + "epoch": 0.08184325237968153, + "grad_norm": 0.1584484577178955, + "learning_rate": 0.000987080451042704, + "loss": 2.9766, + "step": 2760 + }, + { + "epoch": 0.081872905731993, + "grad_norm": 0.1562964767217636, + "learning_rate": 0.0009870698221855598, + "loss": 2.9889, + "step": 2761 + }, + { + "epoch": 0.08190255908430448, + "grad_norm": 0.15103448927402496, + "learning_rate": 0.0009870591890153285, + "loss": 2.9272, + "step": 2762 + }, + { + "epoch": 0.08193221243661596, + "grad_norm": 0.1499788463115692, + "learning_rate": 0.0009870485515321048, + "loss": 2.9788, + "step": 2763 + }, + { + "epoch": 0.08196186578892743, + "grad_norm": 0.15418055653572083, + "learning_rate": 0.0009870379097359826, + "loss": 2.9648, + "step": 2764 + }, + { + "epoch": 0.08199151914123892, + "grad_norm": 0.1496022790670395, + "learning_rate": 0.0009870272636270559, + "loss": 2.9733, + "step": 2765 + }, + { + "epoch": 0.0820211724935504, + "grad_norm": 0.13981501758098602, + "learning_rate": 0.0009870166132054192, + "loss": 2.9769, + "step": 2766 + }, + { + "epoch": 0.08205082584586187, + "grad_norm": 0.15343530476093292, + "learning_rate": 0.0009870059584711668, + "loss": 2.964, + "step": 2767 + }, + { + "epoch": 0.08208047919817335, + "grad_norm": 0.18254616856575012, + "learning_rate": 0.0009869952994243931, + "loss": 2.9805, + "step": 2768 + }, + { + "epoch": 0.08211013255048483, + "grad_norm": 0.18156185746192932, + "learning_rate": 0.0009869846360651925, + "loss": 2.9549, + "step": 2769 + }, + { + "epoch": 0.08213978590279632, + "grad_norm": 0.16611669957637787, + "learning_rate": 0.0009869739683936592, + "loss": 2.964, + "step": 2770 + }, + { + "epoch": 0.08216943925510779, + "grad_norm": 0.19667406380176544, + "learning_rate": 0.0009869632964098877, + "loss": 2.9679, + "step": 2771 + }, + { + "epoch": 0.08219909260741927, + "grad_norm": 0.16685035824775696, + "learning_rate": 0.000986952620113973, + "loss": 2.9487, + "step": 2772 + }, + { + "epoch": 0.08222874595973075, + "grad_norm": 0.1532350480556488, + "learning_rate": 0.0009869419395060089, + "loss": 2.9601, + "step": 2773 + }, + { + "epoch": 0.08225839931204222, + "grad_norm": 0.16022448241710663, + "learning_rate": 0.0009869312545860906, + "loss": 2.9862, + "step": 2774 + }, + { + "epoch": 0.08228805266435371, + "grad_norm": 0.14278025925159454, + "learning_rate": 0.0009869205653543123, + "loss": 2.9841, + "step": 2775 + }, + { + "epoch": 0.08231770601666519, + "grad_norm": 0.20334039628505707, + "learning_rate": 0.000986909871810769, + "loss": 2.9519, + "step": 2776 + }, + { + "epoch": 0.08234735936897666, + "grad_norm": 0.20936909317970276, + "learning_rate": 0.000986899173955555, + "loss": 3.0258, + "step": 2777 + }, + { + "epoch": 0.08237701272128814, + "grad_norm": 0.21223638951778412, + "learning_rate": 0.0009868884717887654, + "loss": 2.9674, + "step": 2778 + }, + { + "epoch": 0.08240666607359962, + "grad_norm": 0.18921732902526855, + "learning_rate": 0.0009868777653104948, + "loss": 2.9708, + "step": 2779 + }, + { + "epoch": 0.0824363194259111, + "grad_norm": 0.1795913279056549, + "learning_rate": 0.000986867054520838, + "loss": 2.9672, + "step": 2780 + }, + { + "epoch": 0.08246597277822258, + "grad_norm": 0.18711160123348236, + "learning_rate": 0.0009868563394198897, + "loss": 2.988, + "step": 2781 + }, + { + "epoch": 0.08249562613053406, + "grad_norm": 0.18242625892162323, + "learning_rate": 0.000986845620007745, + "loss": 2.9649, + "step": 2782 + }, + { + "epoch": 0.08252527948284553, + "grad_norm": 0.19408269226551056, + "learning_rate": 0.000986834896284499, + "loss": 2.9749, + "step": 2783 + }, + { + "epoch": 0.08255493283515701, + "grad_norm": 0.19539807736873627, + "learning_rate": 0.000986824168250246, + "loss": 2.9577, + "step": 2784 + }, + { + "epoch": 0.08258458618746849, + "grad_norm": 0.19472827017307281, + "learning_rate": 0.0009868134359050818, + "loss": 2.9658, + "step": 2785 + }, + { + "epoch": 0.08261423953977998, + "grad_norm": 0.18351610004901886, + "learning_rate": 0.000986802699249101, + "loss": 2.9695, + "step": 2786 + }, + { + "epoch": 0.08264389289209145, + "grad_norm": 0.18783816695213318, + "learning_rate": 0.0009867919582823988, + "loss": 2.9627, + "step": 2787 + }, + { + "epoch": 0.08267354624440293, + "grad_norm": 0.1954375058412552, + "learning_rate": 0.0009867812130050701, + "loss": 2.934, + "step": 2788 + }, + { + "epoch": 0.0827031995967144, + "grad_norm": 0.18173889815807343, + "learning_rate": 0.0009867704634172103, + "loss": 2.9868, + "step": 2789 + }, + { + "epoch": 0.08273285294902588, + "grad_norm": 0.16696450114250183, + "learning_rate": 0.0009867597095189146, + "loss": 2.9456, + "step": 2790 + }, + { + "epoch": 0.08276250630133737, + "grad_norm": 0.15037646889686584, + "learning_rate": 0.000986748951310278, + "loss": 2.9625, + "step": 2791 + }, + { + "epoch": 0.08279215965364885, + "grad_norm": 0.15562565624713898, + "learning_rate": 0.0009867381887913957, + "loss": 3.0007, + "step": 2792 + }, + { + "epoch": 0.08282181300596032, + "grad_norm": 0.1563323736190796, + "learning_rate": 0.0009867274219623632, + "loss": 2.9812, + "step": 2793 + }, + { + "epoch": 0.0828514663582718, + "grad_norm": 0.16709673404693604, + "learning_rate": 0.000986716650823276, + "loss": 2.993, + "step": 2794 + }, + { + "epoch": 0.08288111971058328, + "grad_norm": 0.17248359322547913, + "learning_rate": 0.0009867058753742293, + "loss": 2.9398, + "step": 2795 + }, + { + "epoch": 0.08291077306289477, + "grad_norm": 0.1782355010509491, + "learning_rate": 0.0009866950956153187, + "loss": 2.951, + "step": 2796 + }, + { + "epoch": 0.08294042641520624, + "grad_norm": 0.17612288892269135, + "learning_rate": 0.0009866843115466392, + "loss": 2.9448, + "step": 2797 + }, + { + "epoch": 0.08297007976751772, + "grad_norm": 0.16573551297187805, + "learning_rate": 0.0009866735231682867, + "loss": 2.964, + "step": 2798 + }, + { + "epoch": 0.0829997331198292, + "grad_norm": 0.1546497642993927, + "learning_rate": 0.0009866627304803567, + "loss": 2.9846, + "step": 2799 + }, + { + "epoch": 0.08302938647214067, + "grad_norm": 0.15708819031715393, + "learning_rate": 0.0009866519334829444, + "loss": 2.9583, + "step": 2800 + }, + { + "epoch": 0.08305903982445216, + "grad_norm": 0.16476213932037354, + "learning_rate": 0.000986641132176146, + "loss": 2.945, + "step": 2801 + }, + { + "epoch": 0.08308869317676364, + "grad_norm": 0.1921926736831665, + "learning_rate": 0.0009866303265600568, + "loss": 2.9648, + "step": 2802 + }, + { + "epoch": 0.08311834652907511, + "grad_norm": 0.18730942904949188, + "learning_rate": 0.0009866195166347725, + "loss": 2.9811, + "step": 2803 + }, + { + "epoch": 0.08314799988138659, + "grad_norm": 0.17793339490890503, + "learning_rate": 0.0009866087024003889, + "loss": 2.9209, + "step": 2804 + }, + { + "epoch": 0.08317765323369807, + "grad_norm": 0.14609481394290924, + "learning_rate": 0.0009865978838570016, + "loss": 2.9306, + "step": 2805 + }, + { + "epoch": 0.08320730658600956, + "grad_norm": 0.154218852519989, + "learning_rate": 0.0009865870610047064, + "loss": 2.9555, + "step": 2806 + }, + { + "epoch": 0.08323695993832103, + "grad_norm": 0.16043294966220856, + "learning_rate": 0.0009865762338435995, + "loss": 2.9863, + "step": 2807 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 0.16100046038627625, + "learning_rate": 0.0009865654023737765, + "loss": 2.9495, + "step": 2808 + }, + { + "epoch": 0.08329626664294398, + "grad_norm": 0.1754685938358307, + "learning_rate": 0.0009865545665953333, + "loss": 3.0006, + "step": 2809 + }, + { + "epoch": 0.08332591999525546, + "grad_norm": 0.17794688045978546, + "learning_rate": 0.0009865437265083657, + "loss": 2.9674, + "step": 2810 + }, + { + "epoch": 0.08335557334756694, + "grad_norm": 0.19880418479442596, + "learning_rate": 0.0009865328821129702, + "loss": 2.9985, + "step": 2811 + }, + { + "epoch": 0.08338522669987843, + "grad_norm": 0.1891280710697174, + "learning_rate": 0.0009865220334092425, + "loss": 2.9484, + "step": 2812 + }, + { + "epoch": 0.0834148800521899, + "grad_norm": 0.19070248305797577, + "learning_rate": 0.0009865111803972783, + "loss": 2.9547, + "step": 2813 + }, + { + "epoch": 0.08344453340450138, + "grad_norm": 0.1861138939857483, + "learning_rate": 0.0009865003230771746, + "loss": 2.9734, + "step": 2814 + }, + { + "epoch": 0.08347418675681285, + "grad_norm": 0.18323402106761932, + "learning_rate": 0.0009864894614490267, + "loss": 2.9626, + "step": 2815 + }, + { + "epoch": 0.08350384010912433, + "grad_norm": 0.1946316510438919, + "learning_rate": 0.0009864785955129311, + "loss": 2.9832, + "step": 2816 + }, + { + "epoch": 0.08353349346143582, + "grad_norm": 0.1841481477022171, + "learning_rate": 0.0009864677252689842, + "loss": 2.9995, + "step": 2817 + }, + { + "epoch": 0.0835631468137473, + "grad_norm": 0.20030349493026733, + "learning_rate": 0.000986456850717282, + "loss": 2.9573, + "step": 2818 + }, + { + "epoch": 0.08359280016605877, + "grad_norm": 0.22943030297756195, + "learning_rate": 0.0009864459718579208, + "loss": 2.9557, + "step": 2819 + }, + { + "epoch": 0.08362245351837025, + "grad_norm": 0.2112683802843094, + "learning_rate": 0.0009864350886909972, + "loss": 2.9616, + "step": 2820 + }, + { + "epoch": 0.08365210687068173, + "grad_norm": 0.19432294368743896, + "learning_rate": 0.0009864242012166075, + "loss": 2.9934, + "step": 2821 + }, + { + "epoch": 0.08368176022299322, + "grad_norm": 0.19932889938354492, + "learning_rate": 0.0009864133094348478, + "loss": 2.9508, + "step": 2822 + }, + { + "epoch": 0.08371141357530469, + "grad_norm": 0.18478235602378845, + "learning_rate": 0.0009864024133458148, + "loss": 2.971, + "step": 2823 + }, + { + "epoch": 0.08374106692761617, + "grad_norm": 0.18678773939609528, + "learning_rate": 0.0009863915129496048, + "loss": 2.9413, + "step": 2824 + }, + { + "epoch": 0.08377072027992764, + "grad_norm": 0.18042384088039398, + "learning_rate": 0.0009863806082463147, + "loss": 2.9747, + "step": 2825 + }, + { + "epoch": 0.08380037363223912, + "grad_norm": 0.15785619616508484, + "learning_rate": 0.0009863696992360408, + "loss": 2.9666, + "step": 2826 + }, + { + "epoch": 0.08383002698455061, + "grad_norm": 0.16390512883663177, + "learning_rate": 0.0009863587859188796, + "loss": 2.9795, + "step": 2827 + }, + { + "epoch": 0.08385968033686209, + "grad_norm": 0.19084492325782776, + "learning_rate": 0.0009863478682949278, + "loss": 2.9637, + "step": 2828 + }, + { + "epoch": 0.08388933368917356, + "grad_norm": 0.1833401918411255, + "learning_rate": 0.0009863369463642823, + "loss": 2.978, + "step": 2829 + }, + { + "epoch": 0.08391898704148504, + "grad_norm": 0.17615880072116852, + "learning_rate": 0.0009863260201270396, + "loss": 2.9672, + "step": 2830 + }, + { + "epoch": 0.08394864039379651, + "grad_norm": 0.19366900622844696, + "learning_rate": 0.0009863150895832965, + "loss": 2.9663, + "step": 2831 + }, + { + "epoch": 0.083978293746108, + "grad_norm": 0.16155259311199188, + "learning_rate": 0.00098630415473315, + "loss": 2.9459, + "step": 2832 + }, + { + "epoch": 0.08400794709841948, + "grad_norm": 0.15905405580997467, + "learning_rate": 0.0009862932155766965, + "loss": 2.951, + "step": 2833 + }, + { + "epoch": 0.08403760045073096, + "grad_norm": 0.16154778003692627, + "learning_rate": 0.0009862822721140331, + "loss": 2.9728, + "step": 2834 + }, + { + "epoch": 0.08406725380304243, + "grad_norm": 0.1500644087791443, + "learning_rate": 0.0009862713243452568, + "loss": 2.9659, + "step": 2835 + }, + { + "epoch": 0.08409690715535391, + "grad_norm": 0.155793696641922, + "learning_rate": 0.0009862603722704645, + "loss": 2.9402, + "step": 2836 + }, + { + "epoch": 0.08412656050766538, + "grad_norm": 0.17949999868869781, + "learning_rate": 0.000986249415889753, + "loss": 2.9488, + "step": 2837 + }, + { + "epoch": 0.08415621385997687, + "grad_norm": 0.17273399233818054, + "learning_rate": 0.0009862384552032197, + "loss": 2.9331, + "step": 2838 + }, + { + "epoch": 0.08418586721228835, + "grad_norm": 0.17652037739753723, + "learning_rate": 0.0009862274902109611, + "loss": 2.9585, + "step": 2839 + }, + { + "epoch": 0.08421552056459983, + "grad_norm": 0.2040611356496811, + "learning_rate": 0.0009862165209130749, + "loss": 2.9366, + "step": 2840 + }, + { + "epoch": 0.0842451739169113, + "grad_norm": 0.22223417460918427, + "learning_rate": 0.0009862055473096579, + "loss": 2.97, + "step": 2841 + }, + { + "epoch": 0.08427482726922278, + "grad_norm": 0.21305575966835022, + "learning_rate": 0.0009861945694008072, + "loss": 2.9815, + "step": 2842 + }, + { + "epoch": 0.08430448062153427, + "grad_norm": 0.1420731246471405, + "learning_rate": 0.0009861835871866204, + "loss": 2.9545, + "step": 2843 + }, + { + "epoch": 0.08433413397384575, + "grad_norm": 0.16133061051368713, + "learning_rate": 0.0009861726006671942, + "loss": 2.9381, + "step": 2844 + }, + { + "epoch": 0.08436378732615722, + "grad_norm": 0.15504330396652222, + "learning_rate": 0.000986161609842626, + "loss": 2.989, + "step": 2845 + }, + { + "epoch": 0.0843934406784687, + "grad_norm": 0.13724058866500854, + "learning_rate": 0.0009861506147130137, + "loss": 3.0086, + "step": 2846 + }, + { + "epoch": 0.08442309403078017, + "grad_norm": 0.15449821949005127, + "learning_rate": 0.000986139615278454, + "loss": 2.9397, + "step": 2847 + }, + { + "epoch": 0.08445274738309166, + "grad_norm": 0.1565295308828354, + "learning_rate": 0.0009861286115390446, + "loss": 2.993, + "step": 2848 + }, + { + "epoch": 0.08448240073540314, + "grad_norm": 0.15516003966331482, + "learning_rate": 0.000986117603494883, + "loss": 2.9222, + "step": 2849 + }, + { + "epoch": 0.08451205408771462, + "grad_norm": 0.16732123494148254, + "learning_rate": 0.0009861065911460666, + "loss": 2.924, + "step": 2850 + }, + { + "epoch": 0.08454170744002609, + "grad_norm": 0.17545291781425476, + "learning_rate": 0.0009860955744926928, + "loss": 2.9528, + "step": 2851 + }, + { + "epoch": 0.08457136079233757, + "grad_norm": 0.16937164962291718, + "learning_rate": 0.0009860845535348594, + "loss": 3.0092, + "step": 2852 + }, + { + "epoch": 0.08460101414464906, + "grad_norm": 0.1583477109670639, + "learning_rate": 0.0009860735282726635, + "loss": 2.9593, + "step": 2853 + }, + { + "epoch": 0.08463066749696053, + "grad_norm": 0.1518852412700653, + "learning_rate": 0.0009860624987062035, + "loss": 2.938, + "step": 2854 + }, + { + "epoch": 0.08466032084927201, + "grad_norm": 0.15828067064285278, + "learning_rate": 0.0009860514648355764, + "loss": 2.9624, + "step": 2855 + }, + { + "epoch": 0.08468997420158349, + "grad_norm": 0.17907392978668213, + "learning_rate": 0.0009860404266608803, + "loss": 2.95, + "step": 2856 + }, + { + "epoch": 0.08471962755389496, + "grad_norm": 0.19239169359207153, + "learning_rate": 0.0009860293841822125, + "loss": 2.9606, + "step": 2857 + }, + { + "epoch": 0.08474928090620645, + "grad_norm": 0.1950083076953888, + "learning_rate": 0.0009860183373996713, + "loss": 2.9835, + "step": 2858 + }, + { + "epoch": 0.08477893425851793, + "grad_norm": 0.20015300810337067, + "learning_rate": 0.0009860072863133544, + "loss": 2.9646, + "step": 2859 + }, + { + "epoch": 0.0848085876108294, + "grad_norm": 0.2001620978116989, + "learning_rate": 0.0009859962309233595, + "loss": 2.9987, + "step": 2860 + }, + { + "epoch": 0.08483824096314088, + "grad_norm": 0.1810777187347412, + "learning_rate": 0.0009859851712297845, + "loss": 2.9662, + "step": 2861 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 0.18742765486240387, + "learning_rate": 0.0009859741072327274, + "loss": 2.9525, + "step": 2862 + }, + { + "epoch": 0.08489754766776383, + "grad_norm": 0.22381167113780975, + "learning_rate": 0.000985963038932286, + "loss": 2.9687, + "step": 2863 + }, + { + "epoch": 0.08492720102007532, + "grad_norm": 0.21967263519763947, + "learning_rate": 0.0009859519663285585, + "loss": 3.0012, + "step": 2864 + }, + { + "epoch": 0.0849568543723868, + "grad_norm": 0.1749613732099533, + "learning_rate": 0.0009859408894216431, + "loss": 2.9532, + "step": 2865 + }, + { + "epoch": 0.08498650772469828, + "grad_norm": 0.18381471931934357, + "learning_rate": 0.0009859298082116378, + "loss": 2.9839, + "step": 2866 + }, + { + "epoch": 0.08501616107700975, + "grad_norm": 0.18857519328594208, + "learning_rate": 0.0009859187226986404, + "loss": 2.9405, + "step": 2867 + }, + { + "epoch": 0.08504581442932123, + "grad_norm": 0.16987422108650208, + "learning_rate": 0.0009859076328827495, + "loss": 2.9656, + "step": 2868 + }, + { + "epoch": 0.08507546778163272, + "grad_norm": 0.15156397223472595, + "learning_rate": 0.0009858965387640629, + "loss": 2.9658, + "step": 2869 + }, + { + "epoch": 0.0851051211339442, + "grad_norm": 0.15302634239196777, + "learning_rate": 0.0009858854403426791, + "loss": 2.9615, + "step": 2870 + }, + { + "epoch": 0.08513477448625567, + "grad_norm": 0.15321943163871765, + "learning_rate": 0.0009858743376186963, + "loss": 2.9468, + "step": 2871 + }, + { + "epoch": 0.08516442783856715, + "grad_norm": 0.16215574741363525, + "learning_rate": 0.000985863230592213, + "loss": 2.9686, + "step": 2872 + }, + { + "epoch": 0.08519408119087862, + "grad_norm": 0.23250038921833038, + "learning_rate": 0.0009858521192633272, + "loss": 2.9571, + "step": 2873 + }, + { + "epoch": 0.08522373454319011, + "grad_norm": 0.1604142189025879, + "learning_rate": 0.0009858410036321377, + "loss": 3.0078, + "step": 2874 + }, + { + "epoch": 0.08525338789550159, + "grad_norm": 0.13142450153827667, + "learning_rate": 0.0009858298836987426, + "loss": 2.9366, + "step": 2875 + }, + { + "epoch": 0.08528304124781307, + "grad_norm": 0.14131656289100647, + "learning_rate": 0.0009858187594632404, + "loss": 2.9641, + "step": 2876 + }, + { + "epoch": 0.08531269460012454, + "grad_norm": 0.14888747036457062, + "learning_rate": 0.0009858076309257298, + "loss": 2.9523, + "step": 2877 + }, + { + "epoch": 0.08534234795243602, + "grad_norm": 0.12778577208518982, + "learning_rate": 0.0009857964980863093, + "loss": 2.94, + "step": 2878 + }, + { + "epoch": 0.08537200130474751, + "grad_norm": 0.12989233434200287, + "learning_rate": 0.0009857853609450775, + "loss": 2.945, + "step": 2879 + }, + { + "epoch": 0.08540165465705898, + "grad_norm": 0.13707736134529114, + "learning_rate": 0.0009857742195021326, + "loss": 2.9933, + "step": 2880 + }, + { + "epoch": 0.08543130800937046, + "grad_norm": 0.13851453363895416, + "learning_rate": 0.000985763073757574, + "loss": 2.963, + "step": 2881 + }, + { + "epoch": 0.08546096136168194, + "grad_norm": 0.14659862220287323, + "learning_rate": 0.0009857519237114999, + "loss": 2.9283, + "step": 2882 + }, + { + "epoch": 0.08549061471399341, + "grad_norm": 0.14007717370986938, + "learning_rate": 0.0009857407693640088, + "loss": 2.9795, + "step": 2883 + }, + { + "epoch": 0.0855202680663049, + "grad_norm": 0.14781010150909424, + "learning_rate": 0.0009857296107152003, + "loss": 2.9513, + "step": 2884 + }, + { + "epoch": 0.08554992141861638, + "grad_norm": 0.15907689929008484, + "learning_rate": 0.0009857184477651724, + "loss": 2.9895, + "step": 2885 + }, + { + "epoch": 0.08557957477092785, + "grad_norm": 0.200156569480896, + "learning_rate": 0.0009857072805140243, + "loss": 2.9549, + "step": 2886 + }, + { + "epoch": 0.08560922812323933, + "grad_norm": 0.241535946726799, + "learning_rate": 0.0009856961089618548, + "loss": 2.965, + "step": 2887 + }, + { + "epoch": 0.08563888147555081, + "grad_norm": 0.24595440924167633, + "learning_rate": 0.0009856849331087631, + "loss": 2.998, + "step": 2888 + }, + { + "epoch": 0.08566853482786228, + "grad_norm": 0.21272996068000793, + "learning_rate": 0.0009856737529548478, + "loss": 2.97, + "step": 2889 + }, + { + "epoch": 0.08569818818017377, + "grad_norm": 0.1845160871744156, + "learning_rate": 0.0009856625685002083, + "loss": 2.9739, + "step": 2890 + }, + { + "epoch": 0.08572784153248525, + "grad_norm": 0.1806887984275818, + "learning_rate": 0.000985651379744943, + "loss": 2.9482, + "step": 2891 + }, + { + "epoch": 0.08575749488479673, + "grad_norm": 0.19978944957256317, + "learning_rate": 0.0009856401866891516, + "loss": 2.9682, + "step": 2892 + }, + { + "epoch": 0.0857871482371082, + "grad_norm": 0.20107898116111755, + "learning_rate": 0.000985628989332933, + "loss": 2.9228, + "step": 2893 + }, + { + "epoch": 0.08581680158941968, + "grad_norm": 0.1945868581533432, + "learning_rate": 0.0009856177876763864, + "loss": 2.9529, + "step": 2894 + }, + { + "epoch": 0.08584645494173117, + "grad_norm": 0.1476285606622696, + "learning_rate": 0.0009856065817196106, + "loss": 2.9592, + "step": 2895 + }, + { + "epoch": 0.08587610829404264, + "grad_norm": 0.164979487657547, + "learning_rate": 0.0009855953714627056, + "loss": 2.9961, + "step": 2896 + }, + { + "epoch": 0.08590576164635412, + "grad_norm": 0.17660799622535706, + "learning_rate": 0.00098558415690577, + "loss": 2.9657, + "step": 2897 + }, + { + "epoch": 0.0859354149986656, + "grad_norm": 0.14744660258293152, + "learning_rate": 0.0009855729380489034, + "loss": 2.9954, + "step": 2898 + }, + { + "epoch": 0.08596506835097707, + "grad_norm": 0.1645917147397995, + "learning_rate": 0.000985561714892205, + "loss": 2.9452, + "step": 2899 + }, + { + "epoch": 0.08599472170328856, + "grad_norm": 0.17694911360740662, + "learning_rate": 0.0009855504874357744, + "loss": 2.9268, + "step": 2900 + }, + { + "epoch": 0.08602437505560004, + "grad_norm": 0.21855445206165314, + "learning_rate": 0.0009855392556797108, + "loss": 2.9428, + "step": 2901 + }, + { + "epoch": 0.08605402840791151, + "grad_norm": 0.23464125394821167, + "learning_rate": 0.0009855280196241138, + "loss": 2.9921, + "step": 2902 + }, + { + "epoch": 0.08608368176022299, + "grad_norm": 0.21412810683250427, + "learning_rate": 0.0009855167792690827, + "loss": 2.9792, + "step": 2903 + }, + { + "epoch": 0.08611333511253447, + "grad_norm": 0.21806730329990387, + "learning_rate": 0.0009855055346147173, + "loss": 2.9372, + "step": 2904 + }, + { + "epoch": 0.08614298846484596, + "grad_norm": 0.1921733170747757, + "learning_rate": 0.0009854942856611171, + "loss": 2.95, + "step": 2905 + }, + { + "epoch": 0.08617264181715743, + "grad_norm": 0.1862458437681198, + "learning_rate": 0.0009854830324083816, + "loss": 2.9483, + "step": 2906 + }, + { + "epoch": 0.08620229516946891, + "grad_norm": 0.1753748506307602, + "learning_rate": 0.0009854717748566106, + "loss": 2.9701, + "step": 2907 + }, + { + "epoch": 0.08623194852178039, + "grad_norm": 0.1743539720773697, + "learning_rate": 0.0009854605130059035, + "loss": 2.9291, + "step": 2908 + }, + { + "epoch": 0.08626160187409186, + "grad_norm": 0.1894117295742035, + "learning_rate": 0.0009854492468563604, + "loss": 2.9711, + "step": 2909 + }, + { + "epoch": 0.08629125522640335, + "grad_norm": 0.2235548496246338, + "learning_rate": 0.000985437976408081, + "loss": 2.9688, + "step": 2910 + }, + { + "epoch": 0.08632090857871483, + "grad_norm": 0.19818395376205444, + "learning_rate": 0.000985426701661165, + "loss": 2.9615, + "step": 2911 + }, + { + "epoch": 0.0863505619310263, + "grad_norm": 0.16403135657310486, + "learning_rate": 0.0009854154226157119, + "loss": 2.9188, + "step": 2912 + }, + { + "epoch": 0.08638021528333778, + "grad_norm": 0.16877834498882294, + "learning_rate": 0.000985404139271822, + "loss": 2.9759, + "step": 2913 + }, + { + "epoch": 0.08640986863564926, + "grad_norm": 0.1645197719335556, + "learning_rate": 0.0009853928516295953, + "loss": 2.9194, + "step": 2914 + }, + { + "epoch": 0.08643952198796073, + "grad_norm": 0.1634291559457779, + "learning_rate": 0.0009853815596891316, + "loss": 2.9501, + "step": 2915 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 0.1393069326877594, + "learning_rate": 0.0009853702634505309, + "loss": 2.9433, + "step": 2916 + }, + { + "epoch": 0.0864988286925837, + "grad_norm": 0.14800746738910675, + "learning_rate": 0.000985358962913893, + "loss": 2.9667, + "step": 2917 + }, + { + "epoch": 0.08652848204489517, + "grad_norm": 0.15134121477603912, + "learning_rate": 0.0009853476580793182, + "loss": 2.9319, + "step": 2918 + }, + { + "epoch": 0.08655813539720665, + "grad_norm": 0.15050137042999268, + "learning_rate": 0.0009853363489469068, + "loss": 2.9652, + "step": 2919 + }, + { + "epoch": 0.08658778874951813, + "grad_norm": 0.14524945616722107, + "learning_rate": 0.0009853250355167584, + "loss": 2.9517, + "step": 2920 + }, + { + "epoch": 0.08661744210182962, + "grad_norm": 0.15437224507331848, + "learning_rate": 0.0009853137177889737, + "loss": 2.9313, + "step": 2921 + }, + { + "epoch": 0.0866470954541411, + "grad_norm": 0.15417219698429108, + "learning_rate": 0.0009853023957636527, + "loss": 2.9789, + "step": 2922 + }, + { + "epoch": 0.08667674880645257, + "grad_norm": 0.17415277659893036, + "learning_rate": 0.0009852910694408956, + "loss": 2.9327, + "step": 2923 + }, + { + "epoch": 0.08670640215876405, + "grad_norm": 0.13933514058589935, + "learning_rate": 0.000985279738820803, + "loss": 2.9311, + "step": 2924 + }, + { + "epoch": 0.08673605551107552, + "grad_norm": 0.13836905360221863, + "learning_rate": 0.0009852684039034748, + "loss": 2.9332, + "step": 2925 + }, + { + "epoch": 0.08676570886338701, + "grad_norm": 0.15569688379764557, + "learning_rate": 0.0009852570646890118, + "loss": 2.9429, + "step": 2926 + }, + { + "epoch": 0.08679536221569849, + "grad_norm": 0.16670286655426025, + "learning_rate": 0.000985245721177514, + "loss": 2.9548, + "step": 2927 + }, + { + "epoch": 0.08682501556800996, + "grad_norm": 0.19882486760616302, + "learning_rate": 0.0009852343733690822, + "loss": 2.9344, + "step": 2928 + }, + { + "epoch": 0.08685466892032144, + "grad_norm": 0.21727631986141205, + "learning_rate": 0.0009852230212638164, + "loss": 2.9858, + "step": 2929 + }, + { + "epoch": 0.08688432227263292, + "grad_norm": 0.21058042347431183, + "learning_rate": 0.0009852116648618179, + "loss": 2.9849, + "step": 2930 + }, + { + "epoch": 0.0869139756249444, + "grad_norm": 0.18087562918663025, + "learning_rate": 0.0009852003041631865, + "loss": 2.965, + "step": 2931 + }, + { + "epoch": 0.08694362897725588, + "grad_norm": 0.21216802299022675, + "learning_rate": 0.0009851889391680234, + "loss": 2.9485, + "step": 2932 + }, + { + "epoch": 0.08697328232956736, + "grad_norm": 0.22541575133800507, + "learning_rate": 0.0009851775698764287, + "loss": 2.9434, + "step": 2933 + }, + { + "epoch": 0.08700293568187883, + "grad_norm": 0.20295172929763794, + "learning_rate": 0.0009851661962885035, + "loss": 2.9452, + "step": 2934 + }, + { + "epoch": 0.08703258903419031, + "grad_norm": 0.17794013023376465, + "learning_rate": 0.0009851548184043484, + "loss": 2.9364, + "step": 2935 + }, + { + "epoch": 0.0870622423865018, + "grad_norm": 0.1690993309020996, + "learning_rate": 0.000985143436224064, + "loss": 2.8928, + "step": 2936 + }, + { + "epoch": 0.08709189573881328, + "grad_norm": 0.14826293289661407, + "learning_rate": 0.0009851320497477512, + "loss": 2.9825, + "step": 2937 + }, + { + "epoch": 0.08712154909112475, + "grad_norm": 0.1307847946882248, + "learning_rate": 0.0009851206589755108, + "loss": 2.9395, + "step": 2938 + }, + { + "epoch": 0.08715120244343623, + "grad_norm": 0.1272217035293579, + "learning_rate": 0.0009851092639074437, + "loss": 2.9586, + "step": 2939 + }, + { + "epoch": 0.0871808557957477, + "grad_norm": 0.15567398071289062, + "learning_rate": 0.0009850978645436507, + "loss": 2.9608, + "step": 2940 + }, + { + "epoch": 0.08721050914805918, + "grad_norm": 0.15263943374156952, + "learning_rate": 0.0009850864608842332, + "loss": 2.9296, + "step": 2941 + }, + { + "epoch": 0.08724016250037067, + "grad_norm": 0.15848363935947418, + "learning_rate": 0.0009850750529292916, + "loss": 2.9669, + "step": 2942 + }, + { + "epoch": 0.08726981585268215, + "grad_norm": 0.1764773428440094, + "learning_rate": 0.000985063640678927, + "loss": 2.959, + "step": 2943 + }, + { + "epoch": 0.08729946920499362, + "grad_norm": 0.18939314782619476, + "learning_rate": 0.000985052224133241, + "loss": 2.9669, + "step": 2944 + }, + { + "epoch": 0.0873291225573051, + "grad_norm": 0.19923003017902374, + "learning_rate": 0.0009850408032923339, + "loss": 3.0065, + "step": 2945 + }, + { + "epoch": 0.08735877590961658, + "grad_norm": 0.1933009922504425, + "learning_rate": 0.0009850293781563074, + "loss": 2.9603, + "step": 2946 + }, + { + "epoch": 0.08738842926192807, + "grad_norm": 0.19548705220222473, + "learning_rate": 0.0009850179487252623, + "loss": 2.9633, + "step": 2947 + }, + { + "epoch": 0.08741808261423954, + "grad_norm": 0.1992463767528534, + "learning_rate": 0.0009850065149993004, + "loss": 2.9741, + "step": 2948 + }, + { + "epoch": 0.08744773596655102, + "grad_norm": 0.1827349066734314, + "learning_rate": 0.0009849950769785223, + "loss": 2.9655, + "step": 2949 + }, + { + "epoch": 0.0874773893188625, + "grad_norm": 0.17850390076637268, + "learning_rate": 0.0009849836346630296, + "loss": 2.9593, + "step": 2950 + }, + { + "epoch": 0.08750704267117397, + "grad_norm": 0.16935566067695618, + "learning_rate": 0.0009849721880529237, + "loss": 2.9607, + "step": 2951 + }, + { + "epoch": 0.08753669602348546, + "grad_norm": 0.16211506724357605, + "learning_rate": 0.0009849607371483055, + "loss": 2.928, + "step": 2952 + }, + { + "epoch": 0.08756634937579694, + "grad_norm": 0.1392783671617508, + "learning_rate": 0.000984949281949277, + "loss": 2.9418, + "step": 2953 + }, + { + "epoch": 0.08759600272810841, + "grad_norm": 0.13994146883487701, + "learning_rate": 0.0009849378224559393, + "loss": 2.9238, + "step": 2954 + }, + { + "epoch": 0.08762565608041989, + "grad_norm": 0.14185668528079987, + "learning_rate": 0.000984926358668394, + "loss": 2.9717, + "step": 2955 + }, + { + "epoch": 0.08765530943273137, + "grad_norm": 0.14324578642845154, + "learning_rate": 0.0009849148905867425, + "loss": 2.9217, + "step": 2956 + }, + { + "epoch": 0.08768496278504286, + "grad_norm": 0.14822055399417877, + "learning_rate": 0.0009849034182110863, + "loss": 2.9623, + "step": 2957 + }, + { + "epoch": 0.08771461613735433, + "grad_norm": 0.16712163388729095, + "learning_rate": 0.0009848919415415274, + "loss": 2.8976, + "step": 2958 + }, + { + "epoch": 0.08774426948966581, + "grad_norm": 0.17658653855323792, + "learning_rate": 0.0009848804605781668, + "loss": 2.9645, + "step": 2959 + }, + { + "epoch": 0.08777392284197728, + "grad_norm": 0.2010660618543625, + "learning_rate": 0.0009848689753211067, + "loss": 2.9702, + "step": 2960 + }, + { + "epoch": 0.08780357619428876, + "grad_norm": 0.21453021466732025, + "learning_rate": 0.0009848574857704484, + "loss": 2.9556, + "step": 2961 + }, + { + "epoch": 0.08783322954660025, + "grad_norm": 0.20287588238716125, + "learning_rate": 0.000984845991926294, + "loss": 2.9394, + "step": 2962 + }, + { + "epoch": 0.08786288289891173, + "grad_norm": 0.2165525257587433, + "learning_rate": 0.000984834493788745, + "loss": 2.9694, + "step": 2963 + }, + { + "epoch": 0.0878925362512232, + "grad_norm": 0.23038718104362488, + "learning_rate": 0.0009848229913579035, + "loss": 2.9365, + "step": 2964 + }, + { + "epoch": 0.08792218960353468, + "grad_norm": 0.23466740548610687, + "learning_rate": 0.0009848114846338712, + "loss": 2.9537, + "step": 2965 + }, + { + "epoch": 0.08795184295584615, + "grad_norm": 0.22862300276756287, + "learning_rate": 0.0009847999736167497, + "loss": 2.9689, + "step": 2966 + }, + { + "epoch": 0.08798149630815763, + "grad_norm": 0.17338210344314575, + "learning_rate": 0.0009847884583066414, + "loss": 2.9533, + "step": 2967 + }, + { + "epoch": 0.08801114966046912, + "grad_norm": 0.16033709049224854, + "learning_rate": 0.0009847769387036482, + "loss": 2.9807, + "step": 2968 + }, + { + "epoch": 0.0880408030127806, + "grad_norm": 0.16409319639205933, + "learning_rate": 0.000984765414807872, + "loss": 2.9489, + "step": 2969 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 0.14175789058208466, + "learning_rate": 0.0009847538866194147, + "loss": 2.931, + "step": 2970 + }, + { + "epoch": 0.08810010971740355, + "grad_norm": 0.17494043707847595, + "learning_rate": 0.0009847423541383786, + "loss": 2.9639, + "step": 2971 + }, + { + "epoch": 0.08812976306971503, + "grad_norm": 0.1630551517009735, + "learning_rate": 0.0009847308173648657, + "loss": 2.9079, + "step": 2972 + }, + { + "epoch": 0.08815941642202652, + "grad_norm": 0.15705548226833344, + "learning_rate": 0.0009847192762989783, + "loss": 2.9202, + "step": 2973 + }, + { + "epoch": 0.08818906977433799, + "grad_norm": 0.16335248947143555, + "learning_rate": 0.0009847077309408183, + "loss": 2.9235, + "step": 2974 + }, + { + "epoch": 0.08821872312664947, + "grad_norm": 0.1550016552209854, + "learning_rate": 0.0009846961812904882, + "loss": 2.9264, + "step": 2975 + }, + { + "epoch": 0.08824837647896094, + "grad_norm": 0.15806114673614502, + "learning_rate": 0.0009846846273480904, + "loss": 2.9621, + "step": 2976 + }, + { + "epoch": 0.08827802983127242, + "grad_norm": 0.14348363876342773, + "learning_rate": 0.0009846730691137268, + "loss": 2.9428, + "step": 2977 + }, + { + "epoch": 0.08830768318358391, + "grad_norm": 0.1533510833978653, + "learning_rate": 0.0009846615065875002, + "loss": 2.9641, + "step": 2978 + }, + { + "epoch": 0.08833733653589539, + "grad_norm": 0.17567935585975647, + "learning_rate": 0.0009846499397695128, + "loss": 2.9688, + "step": 2979 + }, + { + "epoch": 0.08836698988820686, + "grad_norm": 0.1946767419576645, + "learning_rate": 0.0009846383686598669, + "loss": 2.9155, + "step": 2980 + }, + { + "epoch": 0.08839664324051834, + "grad_norm": 0.18458236753940582, + "learning_rate": 0.0009846267932586649, + "loss": 2.9084, + "step": 2981 + }, + { + "epoch": 0.08842629659282981, + "grad_norm": 0.16382789611816406, + "learning_rate": 0.0009846152135660096, + "loss": 2.9284, + "step": 2982 + }, + { + "epoch": 0.0884559499451413, + "grad_norm": 0.16472792625427246, + "learning_rate": 0.0009846036295820034, + "loss": 2.9498, + "step": 2983 + }, + { + "epoch": 0.08848560329745278, + "grad_norm": 0.18800196051597595, + "learning_rate": 0.0009845920413067489, + "loss": 2.9406, + "step": 2984 + }, + { + "epoch": 0.08851525664976426, + "grad_norm": 0.18590500950813293, + "learning_rate": 0.0009845804487403488, + "loss": 2.948, + "step": 2985 + }, + { + "epoch": 0.08854491000207573, + "grad_norm": 0.1967320591211319, + "learning_rate": 0.0009845688518829053, + "loss": 2.9704, + "step": 2986 + }, + { + "epoch": 0.08857456335438721, + "grad_norm": 0.1760290563106537, + "learning_rate": 0.0009845572507345217, + "loss": 2.96, + "step": 2987 + }, + { + "epoch": 0.0886042167066987, + "grad_norm": 0.17524199187755585, + "learning_rate": 0.0009845456452953003, + "loss": 2.9476, + "step": 2988 + }, + { + "epoch": 0.08863387005901018, + "grad_norm": 0.19588322937488556, + "learning_rate": 0.0009845340355653443, + "loss": 2.9456, + "step": 2989 + }, + { + "epoch": 0.08866352341132165, + "grad_norm": 0.1902867555618286, + "learning_rate": 0.0009845224215447562, + "loss": 2.9705, + "step": 2990 + }, + { + "epoch": 0.08869317676363313, + "grad_norm": 0.16642296314239502, + "learning_rate": 0.0009845108032336387, + "loss": 2.9515, + "step": 2991 + }, + { + "epoch": 0.0887228301159446, + "grad_norm": 0.1441364735364914, + "learning_rate": 0.000984499180632095, + "loss": 2.9379, + "step": 2992 + }, + { + "epoch": 0.08875248346825608, + "grad_norm": 0.16083760559558868, + "learning_rate": 0.0009844875537402278, + "loss": 2.9385, + "step": 2993 + }, + { + "epoch": 0.08878213682056757, + "grad_norm": 0.14669057726860046, + "learning_rate": 0.0009844759225581402, + "loss": 2.9693, + "step": 2994 + }, + { + "epoch": 0.08881179017287905, + "grad_norm": 0.1612098515033722, + "learning_rate": 0.0009844642870859353, + "loss": 2.9658, + "step": 2995 + }, + { + "epoch": 0.08884144352519052, + "grad_norm": 0.16548804938793182, + "learning_rate": 0.0009844526473237157, + "loss": 2.9383, + "step": 2996 + }, + { + "epoch": 0.088871096877502, + "grad_norm": 0.14544615149497986, + "learning_rate": 0.000984441003271585, + "loss": 2.9566, + "step": 2997 + }, + { + "epoch": 0.08890075022981347, + "grad_norm": 0.14271849393844604, + "learning_rate": 0.000984429354929646, + "loss": 2.9369, + "step": 2998 + }, + { + "epoch": 0.08893040358212496, + "grad_norm": 0.13076724112033844, + "learning_rate": 0.0009844177022980017, + "loss": 2.942, + "step": 2999 + }, + { + "epoch": 0.08896005693443644, + "grad_norm": 0.13856977224349976, + "learning_rate": 0.0009844060453767557, + "loss": 2.9308, + "step": 3000 + }, + { + "epoch": 0.08898971028674792, + "grad_norm": 0.1423429697751999, + "learning_rate": 0.000984394384166011, + "loss": 2.9418, + "step": 3001 + }, + { + "epoch": 0.08901936363905939, + "grad_norm": 0.14697381854057312, + "learning_rate": 0.000984382718665871, + "loss": 2.9588, + "step": 3002 + }, + { + "epoch": 0.08904901699137087, + "grad_norm": 0.16239894926548004, + "learning_rate": 0.0009843710488764386, + "loss": 2.9498, + "step": 3003 + }, + { + "epoch": 0.08907867034368236, + "grad_norm": 0.15628738701343536, + "learning_rate": 0.0009843593747978178, + "loss": 2.9401, + "step": 3004 + }, + { + "epoch": 0.08910832369599384, + "grad_norm": 0.1820513755083084, + "learning_rate": 0.0009843476964301113, + "loss": 2.9472, + "step": 3005 + }, + { + "epoch": 0.08913797704830531, + "grad_norm": 0.18504582345485687, + "learning_rate": 0.0009843360137734227, + "loss": 2.9227, + "step": 3006 + }, + { + "epoch": 0.08916763040061679, + "grad_norm": 0.20574131608009338, + "learning_rate": 0.000984324326827856, + "loss": 2.912, + "step": 3007 + }, + { + "epoch": 0.08919728375292826, + "grad_norm": 0.24606646597385406, + "learning_rate": 0.0009843126355935138, + "loss": 2.9806, + "step": 3008 + }, + { + "epoch": 0.08922693710523975, + "grad_norm": 0.2229660302400589, + "learning_rate": 0.0009843009400705004, + "loss": 2.951, + "step": 3009 + }, + { + "epoch": 0.08925659045755123, + "grad_norm": 0.21301598846912384, + "learning_rate": 0.0009842892402589188, + "loss": 2.9699, + "step": 3010 + }, + { + "epoch": 0.0892862438098627, + "grad_norm": 0.19520026445388794, + "learning_rate": 0.000984277536158873, + "loss": 2.9814, + "step": 3011 + }, + { + "epoch": 0.08931589716217418, + "grad_norm": 0.17224563658237457, + "learning_rate": 0.0009842658277704665, + "loss": 2.9517, + "step": 3012 + }, + { + "epoch": 0.08934555051448566, + "grad_norm": 0.17255407571792603, + "learning_rate": 0.000984254115093803, + "loss": 2.9679, + "step": 3013 + }, + { + "epoch": 0.08937520386679715, + "grad_norm": 0.1670924425125122, + "learning_rate": 0.0009842423981289861, + "loss": 2.9479, + "step": 3014 + }, + { + "epoch": 0.08940485721910862, + "grad_norm": 0.19498120248317719, + "learning_rate": 0.0009842306768761196, + "loss": 2.9477, + "step": 3015 + }, + { + "epoch": 0.0894345105714201, + "grad_norm": 0.1853788197040558, + "learning_rate": 0.0009842189513353074, + "loss": 2.9286, + "step": 3016 + }, + { + "epoch": 0.08946416392373158, + "grad_norm": 0.18569804728031158, + "learning_rate": 0.0009842072215066533, + "loss": 2.9412, + "step": 3017 + }, + { + "epoch": 0.08949381727604305, + "grad_norm": 0.1485544890165329, + "learning_rate": 0.0009841954873902612, + "loss": 2.9387, + "step": 3018 + }, + { + "epoch": 0.08952347062835453, + "grad_norm": 0.15242482721805573, + "learning_rate": 0.0009841837489862348, + "loss": 2.9212, + "step": 3019 + }, + { + "epoch": 0.08955312398066602, + "grad_norm": 0.17238226532936096, + "learning_rate": 0.0009841720062946783, + "loss": 2.963, + "step": 3020 + }, + { + "epoch": 0.0895827773329775, + "grad_norm": 0.18819309771060944, + "learning_rate": 0.0009841602593156956, + "loss": 2.9795, + "step": 3021 + }, + { + "epoch": 0.08961243068528897, + "grad_norm": 0.201044961810112, + "learning_rate": 0.0009841485080493905, + "loss": 2.9929, + "step": 3022 + }, + { + "epoch": 0.08964208403760045, + "grad_norm": 0.18524177372455597, + "learning_rate": 0.0009841367524958675, + "loss": 2.9572, + "step": 3023 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 0.16514301300048828, + "learning_rate": 0.0009841249926552302, + "loss": 2.9287, + "step": 3024 + }, + { + "epoch": 0.08970139074222341, + "grad_norm": 0.18158523738384247, + "learning_rate": 0.0009841132285275834, + "loss": 2.9208, + "step": 3025 + }, + { + "epoch": 0.08973104409453489, + "grad_norm": 0.18950983881950378, + "learning_rate": 0.0009841014601130304, + "loss": 2.9499, + "step": 3026 + }, + { + "epoch": 0.08976069744684637, + "grad_norm": 0.19469483196735382, + "learning_rate": 0.0009840896874116762, + "loss": 2.9407, + "step": 3027 + }, + { + "epoch": 0.08979035079915784, + "grad_norm": 0.18356071412563324, + "learning_rate": 0.0009840779104236246, + "loss": 2.9412, + "step": 3028 + }, + { + "epoch": 0.08982000415146932, + "grad_norm": 0.1785203367471695, + "learning_rate": 0.0009840661291489802, + "loss": 2.9591, + "step": 3029 + }, + { + "epoch": 0.08984965750378081, + "grad_norm": 0.16620050370693207, + "learning_rate": 0.0009840543435878468, + "loss": 2.9282, + "step": 3030 + }, + { + "epoch": 0.08987931085609228, + "grad_norm": 0.18363481760025024, + "learning_rate": 0.0009840425537403293, + "loss": 2.943, + "step": 3031 + }, + { + "epoch": 0.08990896420840376, + "grad_norm": 0.1716415286064148, + "learning_rate": 0.0009840307596065319, + "loss": 2.9662, + "step": 3032 + }, + { + "epoch": 0.08993861756071524, + "grad_norm": 0.1507306843996048, + "learning_rate": 0.0009840189611865589, + "loss": 2.9478, + "step": 3033 + }, + { + "epoch": 0.08996827091302671, + "grad_norm": 0.17529195547103882, + "learning_rate": 0.000984007158480515, + "loss": 2.9359, + "step": 3034 + }, + { + "epoch": 0.0899979242653382, + "grad_norm": 0.17260490357875824, + "learning_rate": 0.0009839953514885046, + "loss": 2.9238, + "step": 3035 + }, + { + "epoch": 0.09002757761764968, + "grad_norm": 0.17912189662456512, + "learning_rate": 0.0009839835402106324, + "loss": 2.9444, + "step": 3036 + }, + { + "epoch": 0.09005723096996116, + "grad_norm": 0.17557835578918457, + "learning_rate": 0.0009839717246470027, + "loss": 2.9604, + "step": 3037 + }, + { + "epoch": 0.09008688432227263, + "grad_norm": 0.1672462671995163, + "learning_rate": 0.0009839599047977206, + "loss": 2.9559, + "step": 3038 + }, + { + "epoch": 0.09011653767458411, + "grad_norm": 0.1729012131690979, + "learning_rate": 0.0009839480806628901, + "loss": 2.9315, + "step": 3039 + }, + { + "epoch": 0.0901461910268956, + "grad_norm": 0.17738613486289978, + "learning_rate": 0.0009839362522426167, + "loss": 2.952, + "step": 3040 + }, + { + "epoch": 0.09017584437920707, + "grad_norm": 0.1697739064693451, + "learning_rate": 0.0009839244195370045, + "loss": 2.9332, + "step": 3041 + }, + { + "epoch": 0.09020549773151855, + "grad_norm": 0.15259288251399994, + "learning_rate": 0.0009839125825461584, + "loss": 2.929, + "step": 3042 + }, + { + "epoch": 0.09023515108383003, + "grad_norm": 0.16173055768013, + "learning_rate": 0.0009839007412701835, + "loss": 2.9673, + "step": 3043 + }, + { + "epoch": 0.0902648044361415, + "grad_norm": 0.1931421160697937, + "learning_rate": 0.0009838888957091844, + "loss": 2.9814, + "step": 3044 + }, + { + "epoch": 0.09029445778845298, + "grad_norm": 0.17180192470550537, + "learning_rate": 0.000983877045863266, + "loss": 2.9444, + "step": 3045 + }, + { + "epoch": 0.09032411114076447, + "grad_norm": 0.16659137606620789, + "learning_rate": 0.0009838651917325335, + "loss": 2.9359, + "step": 3046 + }, + { + "epoch": 0.09035376449307594, + "grad_norm": 0.18387529253959656, + "learning_rate": 0.0009838533333170914, + "loss": 2.9448, + "step": 3047 + }, + { + "epoch": 0.09038341784538742, + "grad_norm": 0.18601065874099731, + "learning_rate": 0.0009838414706170452, + "loss": 2.9076, + "step": 3048 + }, + { + "epoch": 0.0904130711976989, + "grad_norm": 0.18922747671604156, + "learning_rate": 0.0009838296036324995, + "loss": 2.9095, + "step": 3049 + }, + { + "epoch": 0.09044272455001037, + "grad_norm": 0.1635635942220688, + "learning_rate": 0.0009838177323635597, + "loss": 2.9282, + "step": 3050 + }, + { + "epoch": 0.09047237790232186, + "grad_norm": 0.13983039557933807, + "learning_rate": 0.000983805856810331, + "loss": 2.9769, + "step": 3051 + }, + { + "epoch": 0.09050203125463334, + "grad_norm": 0.14182542264461517, + "learning_rate": 0.0009837939769729183, + "loss": 2.9651, + "step": 3052 + }, + { + "epoch": 0.09053168460694482, + "grad_norm": 0.15496517717838287, + "learning_rate": 0.0009837820928514267, + "loss": 2.9436, + "step": 3053 + }, + { + "epoch": 0.09056133795925629, + "grad_norm": 0.1590854823589325, + "learning_rate": 0.000983770204445962, + "loss": 2.9519, + "step": 3054 + }, + { + "epoch": 0.09059099131156777, + "grad_norm": 0.1619018167257309, + "learning_rate": 0.000983758311756629, + "loss": 2.9439, + "step": 3055 + }, + { + "epoch": 0.09062064466387926, + "grad_norm": 0.1935766190290451, + "learning_rate": 0.0009837464147835328, + "loss": 2.9438, + "step": 3056 + }, + { + "epoch": 0.09065029801619073, + "grad_norm": 0.21808785200119019, + "learning_rate": 0.0009837345135267793, + "loss": 2.951, + "step": 3057 + }, + { + "epoch": 0.09067995136850221, + "grad_norm": 0.21968618035316467, + "learning_rate": 0.0009837226079864737, + "loss": 2.9361, + "step": 3058 + }, + { + "epoch": 0.09070960472081369, + "grad_norm": 0.18400992453098297, + "learning_rate": 0.0009837106981627213, + "loss": 2.9244, + "step": 3059 + }, + { + "epoch": 0.09073925807312516, + "grad_norm": 0.18532368540763855, + "learning_rate": 0.0009836987840556276, + "loss": 2.991, + "step": 3060 + }, + { + "epoch": 0.09076891142543665, + "grad_norm": 0.18320783972740173, + "learning_rate": 0.0009836868656652982, + "loss": 2.9413, + "step": 3061 + }, + { + "epoch": 0.09079856477774813, + "grad_norm": 0.16891717910766602, + "learning_rate": 0.0009836749429918386, + "loss": 2.927, + "step": 3062 + }, + { + "epoch": 0.0908282181300596, + "grad_norm": 0.18898439407348633, + "learning_rate": 0.0009836630160353543, + "loss": 2.9668, + "step": 3063 + }, + { + "epoch": 0.09085787148237108, + "grad_norm": 0.21621201932430267, + "learning_rate": 0.000983651084795951, + "loss": 2.9785, + "step": 3064 + }, + { + "epoch": 0.09088752483468256, + "grad_norm": 0.20374716818332672, + "learning_rate": 0.0009836391492737343, + "loss": 2.9362, + "step": 3065 + }, + { + "epoch": 0.09091717818699405, + "grad_norm": 0.1887224018573761, + "learning_rate": 0.00098362720946881, + "loss": 2.9429, + "step": 3066 + }, + { + "epoch": 0.09094683153930552, + "grad_norm": 0.17591719329357147, + "learning_rate": 0.0009836152653812838, + "loss": 2.9239, + "step": 3067 + }, + { + "epoch": 0.090976484891617, + "grad_norm": 0.15553945302963257, + "learning_rate": 0.0009836033170112612, + "loss": 2.9406, + "step": 3068 + }, + { + "epoch": 0.09100613824392847, + "grad_norm": 0.1399427354335785, + "learning_rate": 0.0009835913643588482, + "loss": 2.9408, + "step": 3069 + }, + { + "epoch": 0.09103579159623995, + "grad_norm": 0.15315411984920502, + "learning_rate": 0.0009835794074241509, + "loss": 2.957, + "step": 3070 + }, + { + "epoch": 0.09106544494855143, + "grad_norm": 0.1711503118276596, + "learning_rate": 0.0009835674462072748, + "loss": 2.9634, + "step": 3071 + }, + { + "epoch": 0.09109509830086292, + "grad_norm": 0.19696393609046936, + "learning_rate": 0.0009835554807083261, + "loss": 2.9459, + "step": 3072 + }, + { + "epoch": 0.0911247516531744, + "grad_norm": 0.19468599557876587, + "learning_rate": 0.0009835435109274105, + "loss": 2.9442, + "step": 3073 + }, + { + "epoch": 0.09115440500548587, + "grad_norm": 0.16690482199192047, + "learning_rate": 0.000983531536864634, + "loss": 2.9368, + "step": 3074 + }, + { + "epoch": 0.09118405835779735, + "grad_norm": 0.15525685250759125, + "learning_rate": 0.000983519558520103, + "loss": 2.9388, + "step": 3075 + }, + { + "epoch": 0.09121371171010882, + "grad_norm": 0.16277918219566345, + "learning_rate": 0.000983507575893923, + "loss": 2.9118, + "step": 3076 + }, + { + "epoch": 0.09124336506242031, + "grad_norm": 0.1702195703983307, + "learning_rate": 0.0009834955889862008, + "loss": 2.9587, + "step": 3077 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 0.1872839629650116, + "learning_rate": 0.0009834835977970417, + "loss": 2.9163, + "step": 3078 + }, + { + "epoch": 0.09130267176704326, + "grad_norm": 0.16533078253269196, + "learning_rate": 0.0009834716023265527, + "loss": 2.8989, + "step": 3079 + }, + { + "epoch": 0.09133232511935474, + "grad_norm": 0.17254450917243958, + "learning_rate": 0.0009834596025748397, + "loss": 2.9209, + "step": 3080 + }, + { + "epoch": 0.09136197847166622, + "grad_norm": 0.1644214391708374, + "learning_rate": 0.0009834475985420088, + "loss": 2.9353, + "step": 3081 + }, + { + "epoch": 0.0913916318239777, + "grad_norm": 0.1257067769765854, + "learning_rate": 0.0009834355902281664, + "loss": 2.905, + "step": 3082 + }, + { + "epoch": 0.09142128517628918, + "grad_norm": 0.14490346610546112, + "learning_rate": 0.000983423577633419, + "loss": 2.9433, + "step": 3083 + }, + { + "epoch": 0.09145093852860066, + "grad_norm": 0.16286805272102356, + "learning_rate": 0.0009834115607578727, + "loss": 2.9351, + "step": 3084 + }, + { + "epoch": 0.09148059188091213, + "grad_norm": 0.15263649821281433, + "learning_rate": 0.0009833995396016342, + "loss": 2.9713, + "step": 3085 + }, + { + "epoch": 0.09151024523322361, + "grad_norm": 0.15528936684131622, + "learning_rate": 0.0009833875141648097, + "loss": 2.9479, + "step": 3086 + }, + { + "epoch": 0.0915398985855351, + "grad_norm": 0.1879042536020279, + "learning_rate": 0.0009833754844475059, + "loss": 2.9745, + "step": 3087 + }, + { + "epoch": 0.09156955193784658, + "grad_norm": 0.21355637907981873, + "learning_rate": 0.000983363450449829, + "loss": 2.9681, + "step": 3088 + }, + { + "epoch": 0.09159920529015805, + "grad_norm": 0.23716720938682556, + "learning_rate": 0.000983351412171886, + "loss": 2.9681, + "step": 3089 + }, + { + "epoch": 0.09162885864246953, + "grad_norm": 0.20627669990062714, + "learning_rate": 0.0009833393696137831, + "loss": 2.9234, + "step": 3090 + }, + { + "epoch": 0.091658511994781, + "grad_norm": 0.19356019794940948, + "learning_rate": 0.0009833273227756272, + "loss": 2.8855, + "step": 3091 + }, + { + "epoch": 0.0916881653470925, + "grad_norm": 0.19747090339660645, + "learning_rate": 0.0009833152716575248, + "loss": 2.9695, + "step": 3092 + }, + { + "epoch": 0.09171781869940397, + "grad_norm": 0.18987534940242767, + "learning_rate": 0.000983303216259583, + "loss": 2.9371, + "step": 3093 + }, + { + "epoch": 0.09174747205171545, + "grad_norm": 0.173346608877182, + "learning_rate": 0.000983291156581908, + "loss": 2.9301, + "step": 3094 + }, + { + "epoch": 0.09177712540402692, + "grad_norm": 0.19489076733589172, + "learning_rate": 0.0009832790926246069, + "loss": 2.9443, + "step": 3095 + }, + { + "epoch": 0.0918067787563384, + "grad_norm": 0.19089244306087494, + "learning_rate": 0.0009832670243877866, + "loss": 2.9666, + "step": 3096 + }, + { + "epoch": 0.09183643210864988, + "grad_norm": 0.19137287139892578, + "learning_rate": 0.0009832549518715536, + "loss": 2.9465, + "step": 3097 + }, + { + "epoch": 0.09186608546096137, + "grad_norm": 0.18030281364917755, + "learning_rate": 0.0009832428750760152, + "loss": 2.9273, + "step": 3098 + }, + { + "epoch": 0.09189573881327284, + "grad_norm": 0.19707348942756653, + "learning_rate": 0.0009832307940012782, + "loss": 2.9695, + "step": 3099 + }, + { + "epoch": 0.09192539216558432, + "grad_norm": 0.19370503723621368, + "learning_rate": 0.0009832187086474496, + "loss": 2.9399, + "step": 3100 + }, + { + "epoch": 0.0919550455178958, + "grad_norm": 0.20649564266204834, + "learning_rate": 0.0009832066190146363, + "loss": 2.9343, + "step": 3101 + }, + { + "epoch": 0.09198469887020727, + "grad_norm": 0.22402286529541016, + "learning_rate": 0.0009831945251029457, + "loss": 2.9176, + "step": 3102 + }, + { + "epoch": 0.09201435222251876, + "grad_norm": 0.1777580827474594, + "learning_rate": 0.0009831824269124843, + "loss": 2.9039, + "step": 3103 + }, + { + "epoch": 0.09204400557483024, + "grad_norm": 0.15253989398479462, + "learning_rate": 0.0009831703244433598, + "loss": 2.9587, + "step": 3104 + }, + { + "epoch": 0.09207365892714171, + "grad_norm": 0.15576520562171936, + "learning_rate": 0.000983158217695679, + "loss": 2.938, + "step": 3105 + }, + { + "epoch": 0.09210331227945319, + "grad_norm": 0.1548379510641098, + "learning_rate": 0.0009831461066695493, + "loss": 2.945, + "step": 3106 + }, + { + "epoch": 0.09213296563176467, + "grad_norm": 0.146333247423172, + "learning_rate": 0.0009831339913650779, + "loss": 2.918, + "step": 3107 + }, + { + "epoch": 0.09216261898407616, + "grad_norm": 0.16313189268112183, + "learning_rate": 0.0009831218717823722, + "loss": 2.9284, + "step": 3108 + }, + { + "epoch": 0.09219227233638763, + "grad_norm": 0.1563093513250351, + "learning_rate": 0.0009831097479215392, + "loss": 2.9497, + "step": 3109 + }, + { + "epoch": 0.09222192568869911, + "grad_norm": 0.15541008114814758, + "learning_rate": 0.0009830976197826866, + "loss": 2.9389, + "step": 3110 + }, + { + "epoch": 0.09225157904101058, + "grad_norm": 0.1270742267370224, + "learning_rate": 0.0009830854873659216, + "loss": 2.9352, + "step": 3111 + }, + { + "epoch": 0.09228123239332206, + "grad_norm": 0.1413872241973877, + "learning_rate": 0.0009830733506713517, + "loss": 2.9777, + "step": 3112 + }, + { + "epoch": 0.09231088574563355, + "grad_norm": 0.1524813324213028, + "learning_rate": 0.0009830612096990844, + "loss": 2.9311, + "step": 3113 + }, + { + "epoch": 0.09234053909794503, + "grad_norm": 0.18775606155395508, + "learning_rate": 0.000983049064449227, + "loss": 2.9823, + "step": 3114 + }, + { + "epoch": 0.0923701924502565, + "grad_norm": 0.18389888107776642, + "learning_rate": 0.0009830369149218874, + "loss": 2.9395, + "step": 3115 + }, + { + "epoch": 0.09239984580256798, + "grad_norm": 0.16688604652881622, + "learning_rate": 0.0009830247611171729, + "loss": 2.9195, + "step": 3116 + }, + { + "epoch": 0.09242949915487945, + "grad_norm": 0.16323217749595642, + "learning_rate": 0.0009830126030351913, + "loss": 2.9543, + "step": 3117 + }, + { + "epoch": 0.09245915250719094, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.0009830004406760503, + "loss": 2.9464, + "step": 3118 + }, + { + "epoch": 0.09248880585950242, + "grad_norm": 0.1807989627122879, + "learning_rate": 0.0009829882740398572, + "loss": 2.9433, + "step": 3119 + }, + { + "epoch": 0.0925184592118139, + "grad_norm": 0.1828482747077942, + "learning_rate": 0.00098297610312672, + "loss": 2.9685, + "step": 3120 + }, + { + "epoch": 0.09254811256412537, + "grad_norm": 0.1852722465991974, + "learning_rate": 0.0009829639279367469, + "loss": 2.9425, + "step": 3121 + }, + { + "epoch": 0.09257776591643685, + "grad_norm": 0.17327512800693512, + "learning_rate": 0.0009829517484700452, + "loss": 2.9426, + "step": 3122 + }, + { + "epoch": 0.09260741926874833, + "grad_norm": 0.163056418299675, + "learning_rate": 0.0009829395647267226, + "loss": 2.9472, + "step": 3123 + }, + { + "epoch": 0.09263707262105982, + "grad_norm": 0.17272639274597168, + "learning_rate": 0.0009829273767068874, + "loss": 2.9578, + "step": 3124 + }, + { + "epoch": 0.09266672597337129, + "grad_norm": 0.19692255556583405, + "learning_rate": 0.0009829151844106476, + "loss": 2.9273, + "step": 3125 + }, + { + "epoch": 0.09269637932568277, + "grad_norm": 0.2119758129119873, + "learning_rate": 0.0009829029878381107, + "loss": 2.9598, + "step": 3126 + }, + { + "epoch": 0.09272603267799424, + "grad_norm": 0.19423195719718933, + "learning_rate": 0.000982890786989385, + "loss": 2.9703, + "step": 3127 + }, + { + "epoch": 0.09275568603030572, + "grad_norm": 0.1745556741952896, + "learning_rate": 0.0009828785818645786, + "loss": 2.9478, + "step": 3128 + }, + { + "epoch": 0.09278533938261721, + "grad_norm": 0.19252605736255646, + "learning_rate": 0.0009828663724637994, + "loss": 2.9475, + "step": 3129 + }, + { + "epoch": 0.09281499273492869, + "grad_norm": 0.18349242210388184, + "learning_rate": 0.0009828541587871555, + "loss": 2.9503, + "step": 3130 + }, + { + "epoch": 0.09284464608724016, + "grad_norm": 0.1485726237297058, + "learning_rate": 0.0009828419408347553, + "loss": 2.9706, + "step": 3131 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 0.14535535871982574, + "learning_rate": 0.0009828297186067069, + "loss": 2.964, + "step": 3132 + }, + { + "epoch": 0.09290395279186311, + "grad_norm": 0.15224100649356842, + "learning_rate": 0.000982817492103118, + "loss": 2.9214, + "step": 3133 + }, + { + "epoch": 0.0929336061441746, + "grad_norm": 0.14946581423282623, + "learning_rate": 0.0009828052613240978, + "loss": 2.9579, + "step": 3134 + }, + { + "epoch": 0.09296325949648608, + "grad_norm": 0.15588563680648804, + "learning_rate": 0.000982793026269754, + "loss": 2.9694, + "step": 3135 + }, + { + "epoch": 0.09299291284879756, + "grad_norm": 0.1506616771221161, + "learning_rate": 0.000982780786940195, + "loss": 2.9589, + "step": 3136 + }, + { + "epoch": 0.09302256620110903, + "grad_norm": 0.15757757425308228, + "learning_rate": 0.0009827685433355295, + "loss": 2.9881, + "step": 3137 + }, + { + "epoch": 0.09305221955342051, + "grad_norm": 0.1643669605255127, + "learning_rate": 0.0009827562954558655, + "loss": 2.9297, + "step": 3138 + }, + { + "epoch": 0.093081872905732, + "grad_norm": 0.14989010989665985, + "learning_rate": 0.0009827440433013116, + "loss": 2.96, + "step": 3139 + }, + { + "epoch": 0.09311152625804348, + "grad_norm": 0.15436473488807678, + "learning_rate": 0.0009827317868719766, + "loss": 2.8663, + "step": 3140 + }, + { + "epoch": 0.09314117961035495, + "grad_norm": 0.1472577601671219, + "learning_rate": 0.0009827195261679685, + "loss": 2.9283, + "step": 3141 + }, + { + "epoch": 0.09317083296266643, + "grad_norm": 0.1716058850288391, + "learning_rate": 0.000982707261189396, + "loss": 2.9014, + "step": 3142 + }, + { + "epoch": 0.0932004863149779, + "grad_norm": 0.16170772910118103, + "learning_rate": 0.0009826949919363682, + "loss": 2.9659, + "step": 3143 + }, + { + "epoch": 0.0932301396672894, + "grad_norm": 0.16631507873535156, + "learning_rate": 0.0009826827184089932, + "loss": 2.9667, + "step": 3144 + }, + { + "epoch": 0.09325979301960087, + "grad_norm": 0.16762201488018036, + "learning_rate": 0.00098267044060738, + "loss": 2.9265, + "step": 3145 + }, + { + "epoch": 0.09328944637191235, + "grad_norm": 0.18594764173030853, + "learning_rate": 0.000982658158531637, + "loss": 2.943, + "step": 3146 + }, + { + "epoch": 0.09331909972422382, + "grad_norm": 0.2518799602985382, + "learning_rate": 0.0009826458721818735, + "loss": 2.9543, + "step": 3147 + }, + { + "epoch": 0.0933487530765353, + "grad_norm": 0.25176742672920227, + "learning_rate": 0.0009826335815581975, + "loss": 2.9244, + "step": 3148 + }, + { + "epoch": 0.09337840642884677, + "grad_norm": 0.19689206779003143, + "learning_rate": 0.0009826212866607185, + "loss": 2.927, + "step": 3149 + }, + { + "epoch": 0.09340805978115826, + "grad_norm": 0.1877928525209427, + "learning_rate": 0.0009826089874895453, + "loss": 2.9312, + "step": 3150 + }, + { + "epoch": 0.09343771313346974, + "grad_norm": 0.2067270427942276, + "learning_rate": 0.0009825966840447866, + "loss": 2.9374, + "step": 3151 + }, + { + "epoch": 0.09346736648578122, + "grad_norm": 0.1595538705587387, + "learning_rate": 0.0009825843763265514, + "loss": 2.9425, + "step": 3152 + }, + { + "epoch": 0.0934970198380927, + "grad_norm": 0.18810105323791504, + "learning_rate": 0.0009825720643349487, + "loss": 2.92, + "step": 3153 + }, + { + "epoch": 0.09352667319040417, + "grad_norm": 0.17562809586524963, + "learning_rate": 0.0009825597480700875, + "loss": 2.9713, + "step": 3154 + }, + { + "epoch": 0.09355632654271566, + "grad_norm": 0.20664028823375702, + "learning_rate": 0.0009825474275320769, + "loss": 2.9238, + "step": 3155 + }, + { + "epoch": 0.09358597989502714, + "grad_norm": 0.19352693855762482, + "learning_rate": 0.000982535102721026, + "loss": 2.9333, + "step": 3156 + }, + { + "epoch": 0.09361563324733861, + "grad_norm": 0.17519870400428772, + "learning_rate": 0.0009825227736370442, + "loss": 2.9489, + "step": 3157 + }, + { + "epoch": 0.09364528659965009, + "grad_norm": 0.1619558334350586, + "learning_rate": 0.00098251044028024, + "loss": 2.9706, + "step": 3158 + }, + { + "epoch": 0.09367493995196156, + "grad_norm": 0.15871991217136383, + "learning_rate": 0.0009824981026507235, + "loss": 2.9427, + "step": 3159 + }, + { + "epoch": 0.09370459330427305, + "grad_norm": 0.17164908349514008, + "learning_rate": 0.0009824857607486032, + "loss": 2.9889, + "step": 3160 + }, + { + "epoch": 0.09373424665658453, + "grad_norm": 0.1827775239944458, + "learning_rate": 0.0009824734145739886, + "loss": 2.9699, + "step": 3161 + }, + { + "epoch": 0.093763900008896, + "grad_norm": 0.16065120697021484, + "learning_rate": 0.000982461064126989, + "loss": 2.9366, + "step": 3162 + }, + { + "epoch": 0.09379355336120748, + "grad_norm": 0.1675449162721634, + "learning_rate": 0.0009824487094077143, + "loss": 2.92, + "step": 3163 + }, + { + "epoch": 0.09382320671351896, + "grad_norm": 0.19661065936088562, + "learning_rate": 0.0009824363504162732, + "loss": 2.9417, + "step": 3164 + }, + { + "epoch": 0.09385286006583045, + "grad_norm": 0.15437926352024078, + "learning_rate": 0.0009824239871527754, + "loss": 2.9307, + "step": 3165 + }, + { + "epoch": 0.09388251341814192, + "grad_norm": 0.16705475747585297, + "learning_rate": 0.0009824116196173304, + "loss": 2.9281, + "step": 3166 + }, + { + "epoch": 0.0939121667704534, + "grad_norm": 0.16648118197917938, + "learning_rate": 0.0009823992478100476, + "loss": 2.9906, + "step": 3167 + }, + { + "epoch": 0.09394182012276488, + "grad_norm": 0.14958631992340088, + "learning_rate": 0.000982386871731037, + "loss": 2.9216, + "step": 3168 + }, + { + "epoch": 0.09397147347507635, + "grad_norm": 0.1369066685438156, + "learning_rate": 0.0009823744913804076, + "loss": 2.9183, + "step": 3169 + }, + { + "epoch": 0.09400112682738784, + "grad_norm": 0.1364826261997223, + "learning_rate": 0.0009823621067582692, + "loss": 2.8817, + "step": 3170 + }, + { + "epoch": 0.09403078017969932, + "grad_norm": 0.21263089776039124, + "learning_rate": 0.0009823497178647316, + "loss": 2.9129, + "step": 3171 + }, + { + "epoch": 0.0940604335320108, + "grad_norm": 0.1663166731595993, + "learning_rate": 0.0009823373246999046, + "loss": 2.9349, + "step": 3172 + }, + { + "epoch": 0.09409008688432227, + "grad_norm": 0.1504797488451004, + "learning_rate": 0.0009823249272638977, + "loss": 2.9275, + "step": 3173 + }, + { + "epoch": 0.09411974023663375, + "grad_norm": 0.16347192227840424, + "learning_rate": 0.0009823125255568209, + "loss": 2.9416, + "step": 3174 + }, + { + "epoch": 0.09414939358894522, + "grad_norm": 0.1595231592655182, + "learning_rate": 0.0009823001195787837, + "loss": 2.931, + "step": 3175 + }, + { + "epoch": 0.09417904694125671, + "grad_norm": 0.1470792144536972, + "learning_rate": 0.0009822877093298961, + "loss": 2.9293, + "step": 3176 + }, + { + "epoch": 0.09420870029356819, + "grad_norm": 0.14862364530563354, + "learning_rate": 0.0009822752948102683, + "loss": 2.9241, + "step": 3177 + }, + { + "epoch": 0.09423835364587967, + "grad_norm": 0.17165066301822662, + "learning_rate": 0.0009822628760200098, + "loss": 2.9728, + "step": 3178 + }, + { + "epoch": 0.09426800699819114, + "grad_norm": 0.18937237560749054, + "learning_rate": 0.000982250452959231, + "loss": 2.9165, + "step": 3179 + }, + { + "epoch": 0.09429766035050262, + "grad_norm": 0.19763433933258057, + "learning_rate": 0.0009822380256280412, + "loss": 2.945, + "step": 3180 + }, + { + "epoch": 0.09432731370281411, + "grad_norm": 0.18406246602535248, + "learning_rate": 0.0009822255940265512, + "loss": 2.927, + "step": 3181 + }, + { + "epoch": 0.09435696705512558, + "grad_norm": 0.175408735871315, + "learning_rate": 0.000982213158154871, + "loss": 2.9411, + "step": 3182 + }, + { + "epoch": 0.09438662040743706, + "grad_norm": 0.1969306915998459, + "learning_rate": 0.0009822007180131103, + "loss": 2.9668, + "step": 3183 + }, + { + "epoch": 0.09441627375974854, + "grad_norm": 0.18989504873752594, + "learning_rate": 0.0009821882736013793, + "loss": 2.9316, + "step": 3184 + }, + { + "epoch": 0.09444592711206001, + "grad_norm": 0.17609849572181702, + "learning_rate": 0.0009821758249197886, + "loss": 2.9398, + "step": 3185 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 0.18185456097126007, + "learning_rate": 0.000982163371968448, + "loss": 2.9282, + "step": 3186 + }, + { + "epoch": 0.09450523381668298, + "grad_norm": 0.19829344749450684, + "learning_rate": 0.0009821509147474683, + "loss": 2.9377, + "step": 3187 + }, + { + "epoch": 0.09453488716899446, + "grad_norm": 0.17168506979942322, + "learning_rate": 0.0009821384532569593, + "loss": 2.947, + "step": 3188 + }, + { + "epoch": 0.09456454052130593, + "grad_norm": 0.15925350785255432, + "learning_rate": 0.0009821259874970316, + "loss": 2.9533, + "step": 3189 + }, + { + "epoch": 0.09459419387361741, + "grad_norm": 0.16552317142486572, + "learning_rate": 0.0009821135174677956, + "loss": 2.9266, + "step": 3190 + }, + { + "epoch": 0.0946238472259289, + "grad_norm": 0.1612582951784134, + "learning_rate": 0.0009821010431693613, + "loss": 2.9584, + "step": 3191 + }, + { + "epoch": 0.09465350057824037, + "grad_norm": 0.14798441529273987, + "learning_rate": 0.0009820885646018398, + "loss": 2.9213, + "step": 3192 + }, + { + "epoch": 0.09468315393055185, + "grad_norm": 0.15689052641391754, + "learning_rate": 0.0009820760817653414, + "loss": 2.9215, + "step": 3193 + }, + { + "epoch": 0.09471280728286333, + "grad_norm": 0.14760856330394745, + "learning_rate": 0.0009820635946599762, + "loss": 2.9293, + "step": 3194 + }, + { + "epoch": 0.0947424606351748, + "grad_norm": 0.16629254817962646, + "learning_rate": 0.0009820511032858554, + "loss": 2.9241, + "step": 3195 + }, + { + "epoch": 0.09477211398748629, + "grad_norm": 0.16470083594322205, + "learning_rate": 0.0009820386076430892, + "loss": 2.8977, + "step": 3196 + }, + { + "epoch": 0.09480176733979777, + "grad_norm": 0.16380347311496735, + "learning_rate": 0.0009820261077317885, + "loss": 2.9312, + "step": 3197 + }, + { + "epoch": 0.09483142069210924, + "grad_norm": 0.16311274468898773, + "learning_rate": 0.000982013603552064, + "loss": 2.9644, + "step": 3198 + }, + { + "epoch": 0.09486107404442072, + "grad_norm": 0.16167476773262024, + "learning_rate": 0.000982001095104026, + "loss": 2.9222, + "step": 3199 + }, + { + "epoch": 0.0948907273967322, + "grad_norm": 0.17060118913650513, + "learning_rate": 0.0009819885823877856, + "loss": 2.9192, + "step": 3200 + }, + { + "epoch": 0.09492038074904367, + "grad_norm": 0.17809997498989105, + "learning_rate": 0.0009819760654034538, + "loss": 2.9169, + "step": 3201 + }, + { + "epoch": 0.09495003410135516, + "grad_norm": 0.17410808801651, + "learning_rate": 0.000981963544151141, + "loss": 2.9135, + "step": 3202 + }, + { + "epoch": 0.09497968745366664, + "grad_norm": 0.18262793123722076, + "learning_rate": 0.0009819510186309583, + "loss": 2.9285, + "step": 3203 + }, + { + "epoch": 0.09500934080597812, + "grad_norm": 0.187385693192482, + "learning_rate": 0.0009819384888430168, + "loss": 2.9099, + "step": 3204 + }, + { + "epoch": 0.09503899415828959, + "grad_norm": 0.16927766799926758, + "learning_rate": 0.000981925954787427, + "loss": 2.9209, + "step": 3205 + }, + { + "epoch": 0.09506864751060107, + "grad_norm": 0.1753767877817154, + "learning_rate": 0.0009819134164643004, + "loss": 2.9405, + "step": 3206 + }, + { + "epoch": 0.09509830086291256, + "grad_norm": 0.1744181215763092, + "learning_rate": 0.0009819008738737476, + "loss": 2.9404, + "step": 3207 + }, + { + "epoch": 0.09512795421522403, + "grad_norm": 0.19749517738819122, + "learning_rate": 0.00098188832701588, + "loss": 2.9522, + "step": 3208 + }, + { + "epoch": 0.09515760756753551, + "grad_norm": 0.18648836016654968, + "learning_rate": 0.0009818757758908085, + "loss": 2.9243, + "step": 3209 + }, + { + "epoch": 0.09518726091984699, + "grad_norm": 0.19196459650993347, + "learning_rate": 0.0009818632204986442, + "loss": 2.936, + "step": 3210 + }, + { + "epoch": 0.09521691427215846, + "grad_norm": 0.18143728375434875, + "learning_rate": 0.0009818506608394984, + "loss": 2.9171, + "step": 3211 + }, + { + "epoch": 0.09524656762446995, + "grad_norm": 0.18036498129367828, + "learning_rate": 0.0009818380969134823, + "loss": 2.9281, + "step": 3212 + }, + { + "epoch": 0.09527622097678143, + "grad_norm": 0.23270151019096375, + "learning_rate": 0.0009818255287207072, + "loss": 2.9554, + "step": 3213 + }, + { + "epoch": 0.0953058743290929, + "grad_norm": 0.24109968543052673, + "learning_rate": 0.0009818129562612842, + "loss": 2.9602, + "step": 3214 + }, + { + "epoch": 0.09533552768140438, + "grad_norm": 0.25109514594078064, + "learning_rate": 0.000981800379535325, + "loss": 2.9007, + "step": 3215 + }, + { + "epoch": 0.09536518103371586, + "grad_norm": 0.19357874989509583, + "learning_rate": 0.0009817877985429406, + "loss": 2.9254, + "step": 3216 + }, + { + "epoch": 0.09539483438602735, + "grad_norm": 0.16956239938735962, + "learning_rate": 0.0009817752132842425, + "loss": 2.8892, + "step": 3217 + }, + { + "epoch": 0.09542448773833882, + "grad_norm": 0.1781914383172989, + "learning_rate": 0.0009817626237593423, + "loss": 2.9379, + "step": 3218 + }, + { + "epoch": 0.0954541410906503, + "grad_norm": 0.19108811020851135, + "learning_rate": 0.0009817500299683514, + "loss": 2.9514, + "step": 3219 + }, + { + "epoch": 0.09548379444296178, + "grad_norm": 0.17252713441848755, + "learning_rate": 0.000981737431911381, + "loss": 2.9392, + "step": 3220 + }, + { + "epoch": 0.09551344779527325, + "grad_norm": 0.14815235137939453, + "learning_rate": 0.000981724829588543, + "loss": 2.9797, + "step": 3221 + }, + { + "epoch": 0.09554310114758474, + "grad_norm": 0.13881129026412964, + "learning_rate": 0.0009817122229999493, + "loss": 2.9048, + "step": 3222 + }, + { + "epoch": 0.09557275449989622, + "grad_norm": 0.17717400193214417, + "learning_rate": 0.000981699612145711, + "loss": 2.9341, + "step": 3223 + }, + { + "epoch": 0.0956024078522077, + "grad_norm": 0.17378099262714386, + "learning_rate": 0.00098168699702594, + "loss": 2.936, + "step": 3224 + }, + { + "epoch": 0.09563206120451917, + "grad_norm": 0.14929725229740143, + "learning_rate": 0.0009816743776407478, + "loss": 2.9659, + "step": 3225 + }, + { + "epoch": 0.09566171455683065, + "grad_norm": 0.1335277259349823, + "learning_rate": 0.0009816617539902463, + "loss": 2.9331, + "step": 3226 + }, + { + "epoch": 0.09569136790914212, + "grad_norm": 0.1368490308523178, + "learning_rate": 0.0009816491260745475, + "loss": 2.9376, + "step": 3227 + }, + { + "epoch": 0.09572102126145361, + "grad_norm": 0.13824111223220825, + "learning_rate": 0.000981636493893763, + "loss": 2.9439, + "step": 3228 + }, + { + "epoch": 0.09575067461376509, + "grad_norm": 0.1407158076763153, + "learning_rate": 0.0009816238574480046, + "loss": 2.946, + "step": 3229 + }, + { + "epoch": 0.09578032796607656, + "grad_norm": 0.1412843018770218, + "learning_rate": 0.000981611216737384, + "loss": 2.9022, + "step": 3230 + }, + { + "epoch": 0.09580998131838804, + "grad_norm": 0.15741802752017975, + "learning_rate": 0.0009815985717620138, + "loss": 2.9234, + "step": 3231 + }, + { + "epoch": 0.09583963467069952, + "grad_norm": 0.19551701843738556, + "learning_rate": 0.0009815859225220055, + "loss": 2.9132, + "step": 3232 + }, + { + "epoch": 0.095869288023011, + "grad_norm": 0.22581841051578522, + "learning_rate": 0.000981573269017471, + "loss": 2.9672, + "step": 3233 + }, + { + "epoch": 0.09589894137532248, + "grad_norm": 0.24430204927921295, + "learning_rate": 0.0009815606112485227, + "loss": 2.9707, + "step": 3234 + }, + { + "epoch": 0.09592859472763396, + "grad_norm": 0.26180770993232727, + "learning_rate": 0.0009815479492152725, + "loss": 2.9172, + "step": 3235 + }, + { + "epoch": 0.09595824807994544, + "grad_norm": 0.25770190358161926, + "learning_rate": 0.0009815352829178326, + "loss": 2.9279, + "step": 3236 + }, + { + "epoch": 0.09598790143225691, + "grad_norm": 0.19068637490272522, + "learning_rate": 0.000981522612356315, + "loss": 2.8985, + "step": 3237 + }, + { + "epoch": 0.0960175547845684, + "grad_norm": 0.1842270940542221, + "learning_rate": 0.0009815099375308322, + "loss": 2.9215, + "step": 3238 + }, + { + "epoch": 0.09604720813687988, + "grad_norm": 0.1797284483909607, + "learning_rate": 0.000981497258441496, + "loss": 2.89, + "step": 3239 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 0.14384301006793976, + "learning_rate": 0.000981484575088419, + "loss": 2.9251, + "step": 3240 + }, + { + "epoch": 0.09610651484150283, + "grad_norm": 0.14981703460216522, + "learning_rate": 0.0009814718874717138, + "loss": 2.9205, + "step": 3241 + }, + { + "epoch": 0.0961361681938143, + "grad_norm": 0.16512161493301392, + "learning_rate": 0.000981459195591492, + "loss": 2.9201, + "step": 3242 + }, + { + "epoch": 0.0961658215461258, + "grad_norm": 0.1588679999113083, + "learning_rate": 0.0009814464994478665, + "loss": 2.9112, + "step": 3243 + }, + { + "epoch": 0.09619547489843727, + "grad_norm": 0.1630876064300537, + "learning_rate": 0.0009814337990409496, + "loss": 2.9427, + "step": 3244 + }, + { + "epoch": 0.09622512825074875, + "grad_norm": 0.16890664398670197, + "learning_rate": 0.000981421094370854, + "loss": 2.9441, + "step": 3245 + }, + { + "epoch": 0.09625478160306022, + "grad_norm": 0.18087072670459747, + "learning_rate": 0.0009814083854376915, + "loss": 2.9456, + "step": 3246 + }, + { + "epoch": 0.0962844349553717, + "grad_norm": 0.19897930324077606, + "learning_rate": 0.0009813956722415755, + "loss": 2.9374, + "step": 3247 + }, + { + "epoch": 0.09631408830768319, + "grad_norm": 0.18618088960647583, + "learning_rate": 0.000981382954782618, + "loss": 2.9528, + "step": 3248 + }, + { + "epoch": 0.09634374165999467, + "grad_norm": 0.2098691314458847, + "learning_rate": 0.000981370233060932, + "loss": 2.9208, + "step": 3249 + }, + { + "epoch": 0.09637339501230614, + "grad_norm": 0.17898806929588318, + "learning_rate": 0.0009813575070766296, + "loss": 2.9301, + "step": 3250 + }, + { + "epoch": 0.09640304836461762, + "grad_norm": 0.16711077094078064, + "learning_rate": 0.000981344776829824, + "loss": 2.9535, + "step": 3251 + }, + { + "epoch": 0.0964327017169291, + "grad_norm": 0.18414516746997833, + "learning_rate": 0.000981332042320628, + "loss": 2.9282, + "step": 3252 + }, + { + "epoch": 0.09646235506924057, + "grad_norm": 0.1598568707704544, + "learning_rate": 0.000981319303549154, + "loss": 2.9256, + "step": 3253 + }, + { + "epoch": 0.09649200842155206, + "grad_norm": 0.1400379091501236, + "learning_rate": 0.000981306560515515, + "loss": 2.9449, + "step": 3254 + }, + { + "epoch": 0.09652166177386354, + "grad_norm": 0.1599055677652359, + "learning_rate": 0.0009812938132198238, + "loss": 2.9385, + "step": 3255 + }, + { + "epoch": 0.09655131512617501, + "grad_norm": 0.15745119750499725, + "learning_rate": 0.000981281061662193, + "loss": 2.9504, + "step": 3256 + }, + { + "epoch": 0.09658096847848649, + "grad_norm": 0.1574297696352005, + "learning_rate": 0.000981268305842736, + "loss": 2.8999, + "step": 3257 + }, + { + "epoch": 0.09661062183079797, + "grad_norm": 0.14595802128314972, + "learning_rate": 0.0009812555457615656, + "loss": 2.9216, + "step": 3258 + }, + { + "epoch": 0.09664027518310946, + "grad_norm": 0.13544835150241852, + "learning_rate": 0.0009812427814187947, + "loss": 2.9541, + "step": 3259 + }, + { + "epoch": 0.09666992853542093, + "grad_norm": 0.14669829607009888, + "learning_rate": 0.0009812300128145364, + "loss": 2.9317, + "step": 3260 + }, + { + "epoch": 0.09669958188773241, + "grad_norm": 0.13139735162258148, + "learning_rate": 0.0009812172399489036, + "loss": 2.9174, + "step": 3261 + }, + { + "epoch": 0.09672923524004388, + "grad_norm": 0.1677769273519516, + "learning_rate": 0.0009812044628220095, + "loss": 2.9023, + "step": 3262 + }, + { + "epoch": 0.09675888859235536, + "grad_norm": 0.1961059421300888, + "learning_rate": 0.0009811916814339676, + "loss": 2.8889, + "step": 3263 + }, + { + "epoch": 0.09678854194466685, + "grad_norm": 0.2049960047006607, + "learning_rate": 0.0009811788957848906, + "loss": 2.9426, + "step": 3264 + }, + { + "epoch": 0.09681819529697833, + "grad_norm": 0.19310711324214935, + "learning_rate": 0.0009811661058748916, + "loss": 2.943, + "step": 3265 + }, + { + "epoch": 0.0968478486492898, + "grad_norm": 0.18464834988117218, + "learning_rate": 0.0009811533117040844, + "loss": 2.9244, + "step": 3266 + }, + { + "epoch": 0.09687750200160128, + "grad_norm": 0.2054729163646698, + "learning_rate": 0.0009811405132725821, + "loss": 2.9273, + "step": 3267 + }, + { + "epoch": 0.09690715535391276, + "grad_norm": 0.21564336121082306, + "learning_rate": 0.000981127710580498, + "loss": 2.9426, + "step": 3268 + }, + { + "epoch": 0.09693680870622425, + "grad_norm": 0.21081863343715668, + "learning_rate": 0.0009811149036279454, + "loss": 2.9348, + "step": 3269 + }, + { + "epoch": 0.09696646205853572, + "grad_norm": 0.22448676824569702, + "learning_rate": 0.0009811020924150376, + "loss": 2.9437, + "step": 3270 + }, + { + "epoch": 0.0969961154108472, + "grad_norm": 0.19407443702220917, + "learning_rate": 0.0009810892769418881, + "loss": 2.9217, + "step": 3271 + }, + { + "epoch": 0.09702576876315867, + "grad_norm": 0.16237027943134308, + "learning_rate": 0.0009810764572086108, + "loss": 2.9424, + "step": 3272 + }, + { + "epoch": 0.09705542211547015, + "grad_norm": 0.17345422506332397, + "learning_rate": 0.0009810636332153188, + "loss": 2.9421, + "step": 3273 + }, + { + "epoch": 0.09708507546778163, + "grad_norm": 0.15125803649425507, + "learning_rate": 0.0009810508049621256, + "loss": 2.9207, + "step": 3274 + }, + { + "epoch": 0.09711472882009312, + "grad_norm": 0.18748866021633148, + "learning_rate": 0.0009810379724491452, + "loss": 2.9654, + "step": 3275 + }, + { + "epoch": 0.09714438217240459, + "grad_norm": 0.17725123465061188, + "learning_rate": 0.0009810251356764908, + "loss": 2.9354, + "step": 3276 + }, + { + "epoch": 0.09717403552471607, + "grad_norm": 0.16953563690185547, + "learning_rate": 0.0009810122946442763, + "loss": 2.9377, + "step": 3277 + }, + { + "epoch": 0.09720368887702754, + "grad_norm": 0.15869084000587463, + "learning_rate": 0.0009809994493526152, + "loss": 2.9112, + "step": 3278 + }, + { + "epoch": 0.09723334222933902, + "grad_norm": 0.15320755541324615, + "learning_rate": 0.0009809865998016217, + "loss": 2.9088, + "step": 3279 + }, + { + "epoch": 0.09726299558165051, + "grad_norm": 0.16321073472499847, + "learning_rate": 0.000980973745991409, + "loss": 2.9359, + "step": 3280 + }, + { + "epoch": 0.09729264893396199, + "grad_norm": 0.17243261635303497, + "learning_rate": 0.0009809608879220914, + "loss": 2.9205, + "step": 3281 + }, + { + "epoch": 0.09732230228627346, + "grad_norm": 0.2096569687128067, + "learning_rate": 0.0009809480255937827, + "loss": 2.9106, + "step": 3282 + }, + { + "epoch": 0.09735195563858494, + "grad_norm": 0.19732870161533356, + "learning_rate": 0.0009809351590065966, + "loss": 2.9776, + "step": 3283 + }, + { + "epoch": 0.09738160899089641, + "grad_norm": 0.2043343484401703, + "learning_rate": 0.000980922288160647, + "loss": 2.9277, + "step": 3284 + }, + { + "epoch": 0.0974112623432079, + "grad_norm": 0.21314287185668945, + "learning_rate": 0.000980909413056048, + "loss": 2.9332, + "step": 3285 + }, + { + "epoch": 0.09744091569551938, + "grad_norm": 0.21708352863788605, + "learning_rate": 0.0009808965336929136, + "loss": 2.9667, + "step": 3286 + }, + { + "epoch": 0.09747056904783086, + "grad_norm": 0.20956923067569733, + "learning_rate": 0.000980883650071358, + "loss": 2.9355, + "step": 3287 + }, + { + "epoch": 0.09750022240014233, + "grad_norm": 0.15861716866493225, + "learning_rate": 0.000980870762191495, + "loss": 2.9139, + "step": 3288 + }, + { + "epoch": 0.09752987575245381, + "grad_norm": 0.16323669254779816, + "learning_rate": 0.000980857870053439, + "loss": 2.9137, + "step": 3289 + }, + { + "epoch": 0.0975595291047653, + "grad_norm": 0.17458687722682953, + "learning_rate": 0.000980844973657304, + "loss": 2.9552, + "step": 3290 + }, + { + "epoch": 0.09758918245707678, + "grad_norm": 0.1524134874343872, + "learning_rate": 0.000980832073003204, + "loss": 2.9213, + "step": 3291 + }, + { + "epoch": 0.09761883580938825, + "grad_norm": 0.14472231268882751, + "learning_rate": 0.0009808191680912536, + "loss": 2.9396, + "step": 3292 + }, + { + "epoch": 0.09764848916169973, + "grad_norm": 0.13655225932598114, + "learning_rate": 0.000980806258921567, + "loss": 2.9329, + "step": 3293 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 0.12346582114696503, + "learning_rate": 0.0009807933454942584, + "loss": 2.9267, + "step": 3294 + }, + { + "epoch": 0.0977077958663227, + "grad_norm": 0.14007335901260376, + "learning_rate": 0.000980780427809442, + "loss": 2.9095, + "step": 3295 + }, + { + "epoch": 0.09773744921863417, + "grad_norm": 0.12620317935943604, + "learning_rate": 0.0009807675058672327, + "loss": 2.9274, + "step": 3296 + }, + { + "epoch": 0.09776710257094565, + "grad_norm": 0.11624950915575027, + "learning_rate": 0.0009807545796677445, + "loss": 2.9565, + "step": 3297 + }, + { + "epoch": 0.09779675592325712, + "grad_norm": 0.15722699463367462, + "learning_rate": 0.0009807416492110918, + "loss": 2.9435, + "step": 3298 + }, + { + "epoch": 0.0978264092755686, + "grad_norm": 0.18118730187416077, + "learning_rate": 0.0009807287144973896, + "loss": 2.9335, + "step": 3299 + }, + { + "epoch": 0.09785606262788007, + "grad_norm": 0.19543981552124023, + "learning_rate": 0.000980715775526752, + "loss": 2.9376, + "step": 3300 + }, + { + "epoch": 0.09788571598019156, + "grad_norm": 0.1997646987438202, + "learning_rate": 0.0009807028322992937, + "loss": 2.9727, + "step": 3301 + }, + { + "epoch": 0.09791536933250304, + "grad_norm": 0.21248558163642883, + "learning_rate": 0.000980689884815129, + "loss": 2.8885, + "step": 3302 + }, + { + "epoch": 0.09794502268481452, + "grad_norm": 0.2274380326271057, + "learning_rate": 0.0009806769330743732, + "loss": 2.9369, + "step": 3303 + }, + { + "epoch": 0.097974676037126, + "grad_norm": 0.20742563903331757, + "learning_rate": 0.0009806639770771407, + "loss": 2.8838, + "step": 3304 + }, + { + "epoch": 0.09800432938943747, + "grad_norm": 0.18025508522987366, + "learning_rate": 0.0009806510168235462, + "loss": 2.9239, + "step": 3305 + }, + { + "epoch": 0.09803398274174896, + "grad_norm": 0.16919633746147156, + "learning_rate": 0.0009806380523137042, + "loss": 2.9158, + "step": 3306 + }, + { + "epoch": 0.09806363609406044, + "grad_norm": 0.17458505928516388, + "learning_rate": 0.0009806250835477297, + "loss": 2.9356, + "step": 3307 + }, + { + "epoch": 0.09809328944637191, + "grad_norm": 0.16107258200645447, + "learning_rate": 0.0009806121105257377, + "loss": 2.9484, + "step": 3308 + }, + { + "epoch": 0.09812294279868339, + "grad_norm": 0.17722533643245697, + "learning_rate": 0.000980599133247843, + "loss": 2.9191, + "step": 3309 + }, + { + "epoch": 0.09815259615099486, + "grad_norm": 0.17985416948795319, + "learning_rate": 0.0009805861517141606, + "loss": 2.9352, + "step": 3310 + }, + { + "epoch": 0.09818224950330635, + "grad_norm": 0.15449096262454987, + "learning_rate": 0.0009805731659248051, + "loss": 2.9519, + "step": 3311 + }, + { + "epoch": 0.09821190285561783, + "grad_norm": 0.14101463556289673, + "learning_rate": 0.000980560175879892, + "loss": 2.8999, + "step": 3312 + }, + { + "epoch": 0.0982415562079293, + "grad_norm": 0.1435285359621048, + "learning_rate": 0.0009805471815795357, + "loss": 2.9186, + "step": 3313 + }, + { + "epoch": 0.09827120956024078, + "grad_norm": 0.13567213714122772, + "learning_rate": 0.0009805341830238519, + "loss": 2.9026, + "step": 3314 + }, + { + "epoch": 0.09830086291255226, + "grad_norm": 0.12696880102157593, + "learning_rate": 0.0009805211802129553, + "loss": 2.9236, + "step": 3315 + }, + { + "epoch": 0.09833051626486375, + "grad_norm": 0.12512415647506714, + "learning_rate": 0.0009805081731469611, + "loss": 2.9406, + "step": 3316 + }, + { + "epoch": 0.09836016961717522, + "grad_norm": 0.14533670246601105, + "learning_rate": 0.0009804951618259848, + "loss": 2.9188, + "step": 3317 + }, + { + "epoch": 0.0983898229694867, + "grad_norm": 0.18702521920204163, + "learning_rate": 0.0009804821462501413, + "loss": 2.8908, + "step": 3318 + }, + { + "epoch": 0.09841947632179818, + "grad_norm": 0.2175694704055786, + "learning_rate": 0.0009804691264195457, + "loss": 2.9348, + "step": 3319 + }, + { + "epoch": 0.09844912967410965, + "grad_norm": 0.21753400564193726, + "learning_rate": 0.0009804561023343138, + "loss": 2.8902, + "step": 3320 + }, + { + "epoch": 0.09847878302642114, + "grad_norm": 0.20609425008296967, + "learning_rate": 0.0009804430739945604, + "loss": 2.9514, + "step": 3321 + }, + { + "epoch": 0.09850843637873262, + "grad_norm": 0.2092685103416443, + "learning_rate": 0.0009804300414004014, + "loss": 2.9177, + "step": 3322 + }, + { + "epoch": 0.0985380897310441, + "grad_norm": 0.22290708124637604, + "learning_rate": 0.0009804170045519516, + "loss": 2.8973, + "step": 3323 + }, + { + "epoch": 0.09856774308335557, + "grad_norm": 0.19342611730098724, + "learning_rate": 0.000980403963449327, + "loss": 2.9137, + "step": 3324 + }, + { + "epoch": 0.09859739643566705, + "grad_norm": 0.17117564380168915, + "learning_rate": 0.0009803909180926425, + "loss": 2.9049, + "step": 3325 + }, + { + "epoch": 0.09862704978797852, + "grad_norm": 0.18755652010440826, + "learning_rate": 0.0009803778684820143, + "loss": 2.9378, + "step": 3326 + }, + { + "epoch": 0.09865670314029001, + "grad_norm": 0.17023205757141113, + "learning_rate": 0.0009803648146175575, + "loss": 2.8662, + "step": 3327 + }, + { + "epoch": 0.09868635649260149, + "grad_norm": 0.1617126613855362, + "learning_rate": 0.0009803517564993878, + "loss": 2.9139, + "step": 3328 + }, + { + "epoch": 0.09871600984491297, + "grad_norm": 0.17147918045520782, + "learning_rate": 0.000980338694127621, + "loss": 2.9447, + "step": 3329 + }, + { + "epoch": 0.09874566319722444, + "grad_norm": 0.15828949213027954, + "learning_rate": 0.0009803256275023723, + "loss": 2.9112, + "step": 3330 + }, + { + "epoch": 0.09877531654953592, + "grad_norm": 0.17495551705360413, + "learning_rate": 0.000980312556623758, + "loss": 2.8956, + "step": 3331 + }, + { + "epoch": 0.09880496990184741, + "grad_norm": 0.1778465211391449, + "learning_rate": 0.0009802994814918935, + "loss": 2.9267, + "step": 3332 + }, + { + "epoch": 0.09883462325415888, + "grad_norm": 0.16869725286960602, + "learning_rate": 0.0009802864021068946, + "loss": 2.9244, + "step": 3333 + }, + { + "epoch": 0.09886427660647036, + "grad_norm": 0.17896799743175507, + "learning_rate": 0.0009802733184688772, + "loss": 2.8954, + "step": 3334 + }, + { + "epoch": 0.09889392995878184, + "grad_norm": 0.1700315922498703, + "learning_rate": 0.0009802602305779573, + "loss": 2.9286, + "step": 3335 + }, + { + "epoch": 0.09892358331109331, + "grad_norm": 0.14866510033607483, + "learning_rate": 0.0009802471384342505, + "loss": 2.9495, + "step": 3336 + }, + { + "epoch": 0.0989532366634048, + "grad_norm": 0.15286938846111298, + "learning_rate": 0.0009802340420378725, + "loss": 2.8636, + "step": 3337 + }, + { + "epoch": 0.09898289001571628, + "grad_norm": 0.1966034322977066, + "learning_rate": 0.00098022094138894, + "loss": 2.927, + "step": 3338 + }, + { + "epoch": 0.09901254336802776, + "grad_norm": 0.22983959317207336, + "learning_rate": 0.0009802078364875685, + "loss": 2.9264, + "step": 3339 + }, + { + "epoch": 0.09904219672033923, + "grad_norm": 0.16674838960170746, + "learning_rate": 0.0009801947273338743, + "loss": 2.9355, + "step": 3340 + }, + { + "epoch": 0.09907185007265071, + "grad_norm": 0.21136713027954102, + "learning_rate": 0.0009801816139279732, + "loss": 2.8988, + "step": 3341 + }, + { + "epoch": 0.0991015034249622, + "grad_norm": 0.2014012634754181, + "learning_rate": 0.0009801684962699817, + "loss": 2.9152, + "step": 3342 + }, + { + "epoch": 0.09913115677727367, + "grad_norm": 0.15757060050964355, + "learning_rate": 0.0009801553743600158, + "loss": 2.9477, + "step": 3343 + }, + { + "epoch": 0.09916081012958515, + "grad_norm": 0.15852975845336914, + "learning_rate": 0.0009801422481981914, + "loss": 2.8963, + "step": 3344 + }, + { + "epoch": 0.09919046348189663, + "grad_norm": 0.1902856081724167, + "learning_rate": 0.000980129117784625, + "loss": 2.9158, + "step": 3345 + }, + { + "epoch": 0.0992201168342081, + "grad_norm": 0.18452860414981842, + "learning_rate": 0.000980115983119433, + "loss": 2.9249, + "step": 3346 + }, + { + "epoch": 0.09924977018651959, + "grad_norm": 0.20876307785511017, + "learning_rate": 0.0009801028442027316, + "loss": 2.9297, + "step": 3347 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 0.22115576267242432, + "learning_rate": 0.0009800897010346368, + "loss": 2.898, + "step": 3348 + }, + { + "epoch": 0.09930907689114254, + "grad_norm": 0.18375231325626373, + "learning_rate": 0.0009800765536152657, + "loss": 2.9125, + "step": 3349 + }, + { + "epoch": 0.09933873024345402, + "grad_norm": 0.16561377048492432, + "learning_rate": 0.0009800634019447338, + "loss": 2.9243, + "step": 3350 + }, + { + "epoch": 0.0993683835957655, + "grad_norm": 0.18420957028865814, + "learning_rate": 0.0009800502460231583, + "loss": 2.942, + "step": 3351 + }, + { + "epoch": 0.09939803694807697, + "grad_norm": 0.17364417016506195, + "learning_rate": 0.0009800370858506558, + "loss": 2.9502, + "step": 3352 + }, + { + "epoch": 0.09942769030038846, + "grad_norm": 0.1657547950744629, + "learning_rate": 0.000980023921427342, + "loss": 2.9252, + "step": 3353 + }, + { + "epoch": 0.09945734365269994, + "grad_norm": 0.1696198582649231, + "learning_rate": 0.0009800107527533344, + "loss": 2.9532, + "step": 3354 + }, + { + "epoch": 0.09948699700501142, + "grad_norm": 0.1534772515296936, + "learning_rate": 0.000979997579828749, + "loss": 2.951, + "step": 3355 + }, + { + "epoch": 0.09951665035732289, + "grad_norm": 0.1865791231393814, + "learning_rate": 0.0009799844026537026, + "loss": 2.9174, + "step": 3356 + }, + { + "epoch": 0.09954630370963437, + "grad_norm": 0.20825767517089844, + "learning_rate": 0.000979971221228312, + "loss": 2.922, + "step": 3357 + }, + { + "epoch": 0.09957595706194586, + "grad_norm": 0.17634454369544983, + "learning_rate": 0.0009799580355526938, + "loss": 2.9053, + "step": 3358 + }, + { + "epoch": 0.09960561041425733, + "grad_norm": 0.16137434542179108, + "learning_rate": 0.0009799448456269649, + "loss": 2.8968, + "step": 3359 + }, + { + "epoch": 0.09963526376656881, + "grad_norm": 0.1535472273826599, + "learning_rate": 0.000979931651451242, + "loss": 2.8913, + "step": 3360 + }, + { + "epoch": 0.09966491711888029, + "grad_norm": 0.14924848079681396, + "learning_rate": 0.0009799184530256417, + "loss": 2.9156, + "step": 3361 + }, + { + "epoch": 0.09969457047119176, + "grad_norm": 0.15831781923770905, + "learning_rate": 0.0009799052503502814, + "loss": 2.9036, + "step": 3362 + }, + { + "epoch": 0.09972422382350325, + "grad_norm": 0.16255363821983337, + "learning_rate": 0.0009798920434252777, + "loss": 2.9085, + "step": 3363 + }, + { + "epoch": 0.09975387717581473, + "grad_norm": 0.18711498379707336, + "learning_rate": 0.0009798788322507475, + "loss": 2.9848, + "step": 3364 + }, + { + "epoch": 0.0997835305281262, + "grad_norm": 0.1698204129934311, + "learning_rate": 0.0009798656168268078, + "loss": 2.9112, + "step": 3365 + }, + { + "epoch": 0.09981318388043768, + "grad_norm": 0.1712159663438797, + "learning_rate": 0.0009798523971535759, + "loss": 2.9256, + "step": 3366 + }, + { + "epoch": 0.09984283723274916, + "grad_norm": 0.182750403881073, + "learning_rate": 0.0009798391732311685, + "loss": 2.9274, + "step": 3367 + }, + { + "epoch": 0.09987249058506065, + "grad_norm": 0.17147238552570343, + "learning_rate": 0.000979825945059703, + "loss": 2.9571, + "step": 3368 + }, + { + "epoch": 0.09990214393737212, + "grad_norm": 0.17750591039657593, + "learning_rate": 0.0009798127126392964, + "loss": 2.9238, + "step": 3369 + }, + { + "epoch": 0.0999317972896836, + "grad_norm": 0.15444055199623108, + "learning_rate": 0.0009797994759700656, + "loss": 2.923, + "step": 3370 + }, + { + "epoch": 0.09996145064199508, + "grad_norm": 0.15357713401317596, + "learning_rate": 0.0009797862350521282, + "loss": 2.9494, + "step": 3371 + }, + { + "epoch": 0.09999110399430655, + "grad_norm": 0.1572432667016983, + "learning_rate": 0.0009797729898856015, + "loss": 2.9484, + "step": 3372 + }, + { + "epoch": 0.10002075734661804, + "grad_norm": 0.1635265350341797, + "learning_rate": 0.0009797597404706026, + "loss": 2.9157, + "step": 3373 + }, + { + "epoch": 0.10005041069892952, + "grad_norm": 0.1873333901166916, + "learning_rate": 0.0009797464868072487, + "loss": 2.9164, + "step": 3374 + }, + { + "epoch": 0.100080064051241, + "grad_norm": 0.16181440651416779, + "learning_rate": 0.0009797332288956574, + "loss": 2.9332, + "step": 3375 + }, + { + "epoch": 0.10010971740355247, + "grad_norm": 0.14716988801956177, + "learning_rate": 0.000979719966735946, + "loss": 2.9387, + "step": 3376 + }, + { + "epoch": 0.10013937075586395, + "grad_norm": 0.1304091066122055, + "learning_rate": 0.000979706700328232, + "loss": 2.8836, + "step": 3377 + }, + { + "epoch": 0.10016902410817542, + "grad_norm": 0.15069544315338135, + "learning_rate": 0.0009796934296726328, + "loss": 2.9225, + "step": 3378 + }, + { + "epoch": 0.10019867746048691, + "grad_norm": 0.15278759598731995, + "learning_rate": 0.0009796801547692657, + "loss": 2.9403, + "step": 3379 + }, + { + "epoch": 0.10022833081279839, + "grad_norm": 0.1649685502052307, + "learning_rate": 0.0009796668756182488, + "loss": 2.9285, + "step": 3380 + }, + { + "epoch": 0.10025798416510986, + "grad_norm": 0.15694838762283325, + "learning_rate": 0.0009796535922196993, + "loss": 2.9226, + "step": 3381 + }, + { + "epoch": 0.10028763751742134, + "grad_norm": 0.15563684701919556, + "learning_rate": 0.000979640304573735, + "loss": 2.9299, + "step": 3382 + }, + { + "epoch": 0.10031729086973282, + "grad_norm": 0.19072836637496948, + "learning_rate": 0.0009796270126804735, + "loss": 2.9127, + "step": 3383 + }, + { + "epoch": 0.10034694422204431, + "grad_norm": 0.23428404331207275, + "learning_rate": 0.0009796137165400322, + "loss": 2.8892, + "step": 3384 + }, + { + "epoch": 0.10037659757435578, + "grad_norm": 0.24100781977176666, + "learning_rate": 0.000979600416152529, + "loss": 2.9314, + "step": 3385 + }, + { + "epoch": 0.10040625092666726, + "grad_norm": 0.19143790006637573, + "learning_rate": 0.0009795871115180824, + "loss": 2.922, + "step": 3386 + }, + { + "epoch": 0.10043590427897874, + "grad_norm": 0.1982809156179428, + "learning_rate": 0.000979573802636809, + "loss": 2.9354, + "step": 3387 + }, + { + "epoch": 0.10046555763129021, + "grad_norm": 0.2223173826932907, + "learning_rate": 0.0009795604895088278, + "loss": 2.953, + "step": 3388 + }, + { + "epoch": 0.1004952109836017, + "grad_norm": 0.22446918487548828, + "learning_rate": 0.0009795471721342557, + "loss": 2.864, + "step": 3389 + }, + { + "epoch": 0.10052486433591318, + "grad_norm": 0.2371692657470703, + "learning_rate": 0.0009795338505132114, + "loss": 2.9295, + "step": 3390 + }, + { + "epoch": 0.10055451768822465, + "grad_norm": 0.21183614432811737, + "learning_rate": 0.0009795205246458123, + "loss": 2.9213, + "step": 3391 + }, + { + "epoch": 0.10058417104053613, + "grad_norm": 0.18006367981433868, + "learning_rate": 0.0009795071945321767, + "loss": 2.9357, + "step": 3392 + }, + { + "epoch": 0.1006138243928476, + "grad_norm": 0.14191976189613342, + "learning_rate": 0.0009794938601724226, + "loss": 2.9508, + "step": 3393 + }, + { + "epoch": 0.1006434777451591, + "grad_norm": 0.15811333060264587, + "learning_rate": 0.0009794805215666681, + "loss": 2.9102, + "step": 3394 + }, + { + "epoch": 0.10067313109747057, + "grad_norm": 0.14219166338443756, + "learning_rate": 0.0009794671787150314, + "loss": 2.8881, + "step": 3395 + }, + { + "epoch": 0.10070278444978205, + "grad_norm": 0.12737508118152618, + "learning_rate": 0.0009794538316176304, + "loss": 2.9429, + "step": 3396 + }, + { + "epoch": 0.10073243780209352, + "grad_norm": 0.14861740171909332, + "learning_rate": 0.0009794404802745834, + "loss": 2.9424, + "step": 3397 + }, + { + "epoch": 0.100762091154405, + "grad_norm": 0.16463957726955414, + "learning_rate": 0.0009794271246860086, + "loss": 2.9275, + "step": 3398 + }, + { + "epoch": 0.10079174450671649, + "grad_norm": 0.15715013444423676, + "learning_rate": 0.0009794137648520245, + "loss": 2.887, + "step": 3399 + }, + { + "epoch": 0.10082139785902797, + "grad_norm": 0.1388375610113144, + "learning_rate": 0.000979400400772749, + "loss": 2.9191, + "step": 3400 + }, + { + "epoch": 0.10085105121133944, + "grad_norm": 0.1427650898694992, + "learning_rate": 0.000979387032448301, + "loss": 2.9782, + "step": 3401 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 0.15009984374046326, + "learning_rate": 0.000979373659878798, + "loss": 2.8763, + "step": 3402 + }, + { + "epoch": 0.1009103579159624, + "grad_norm": 0.1608206182718277, + "learning_rate": 0.0009793602830643596, + "loss": 2.9178, + "step": 3403 + }, + { + "epoch": 0.10094001126827387, + "grad_norm": 0.14636413753032684, + "learning_rate": 0.000979346902005103, + "loss": 2.9071, + "step": 3404 + }, + { + "epoch": 0.10096966462058536, + "grad_norm": 0.12967351078987122, + "learning_rate": 0.0009793335167011478, + "loss": 2.8936, + "step": 3405 + }, + { + "epoch": 0.10099931797289684, + "grad_norm": 0.15090174973011017, + "learning_rate": 0.0009793201271526117, + "loss": 2.9423, + "step": 3406 + }, + { + "epoch": 0.10102897132520831, + "grad_norm": 0.17428860068321228, + "learning_rate": 0.0009793067333596138, + "loss": 2.9392, + "step": 3407 + }, + { + "epoch": 0.10105862467751979, + "grad_norm": 0.20670028030872345, + "learning_rate": 0.0009792933353222726, + "loss": 2.9092, + "step": 3408 + }, + { + "epoch": 0.10108827802983127, + "grad_norm": 0.21306023001670837, + "learning_rate": 0.0009792799330407063, + "loss": 2.9496, + "step": 3409 + }, + { + "epoch": 0.10111793138214276, + "grad_norm": 0.2020617425441742, + "learning_rate": 0.0009792665265150342, + "loss": 2.9174, + "step": 3410 + }, + { + "epoch": 0.10114758473445423, + "grad_norm": 0.2087898999452591, + "learning_rate": 0.0009792531157453745, + "loss": 2.9486, + "step": 3411 + }, + { + "epoch": 0.10117723808676571, + "grad_norm": 0.21217580139636993, + "learning_rate": 0.0009792397007318465, + "loss": 2.9336, + "step": 3412 + }, + { + "epoch": 0.10120689143907718, + "grad_norm": 0.20440934598445892, + "learning_rate": 0.0009792262814745684, + "loss": 2.9213, + "step": 3413 + }, + { + "epoch": 0.10123654479138866, + "grad_norm": 0.19338646531105042, + "learning_rate": 0.0009792128579736595, + "loss": 2.9188, + "step": 3414 + }, + { + "epoch": 0.10126619814370015, + "grad_norm": 0.18762293457984924, + "learning_rate": 0.0009791994302292386, + "loss": 2.933, + "step": 3415 + }, + { + "epoch": 0.10129585149601163, + "grad_norm": 0.16812072694301605, + "learning_rate": 0.0009791859982414242, + "loss": 2.9342, + "step": 3416 + }, + { + "epoch": 0.1013255048483231, + "grad_norm": 0.15382888913154602, + "learning_rate": 0.0009791725620103358, + "loss": 2.905, + "step": 3417 + }, + { + "epoch": 0.10135515820063458, + "grad_norm": 0.1714504510164261, + "learning_rate": 0.0009791591215360918, + "loss": 2.9483, + "step": 3418 + }, + { + "epoch": 0.10138481155294606, + "grad_norm": 0.17125235497951508, + "learning_rate": 0.0009791456768188118, + "loss": 2.9315, + "step": 3419 + }, + { + "epoch": 0.10141446490525755, + "grad_norm": 0.15858110785484314, + "learning_rate": 0.0009791322278586145, + "loss": 2.8818, + "step": 3420 + }, + { + "epoch": 0.10144411825756902, + "grad_norm": 0.14311303198337555, + "learning_rate": 0.0009791187746556191, + "loss": 2.8993, + "step": 3421 + }, + { + "epoch": 0.1014737716098805, + "grad_norm": 0.15586809813976288, + "learning_rate": 0.0009791053172099446, + "loss": 2.9355, + "step": 3422 + }, + { + "epoch": 0.10150342496219197, + "grad_norm": 0.18697109818458557, + "learning_rate": 0.0009790918555217106, + "loss": 2.9058, + "step": 3423 + }, + { + "epoch": 0.10153307831450345, + "grad_norm": 0.21415115892887115, + "learning_rate": 0.0009790783895910356, + "loss": 2.9081, + "step": 3424 + }, + { + "epoch": 0.10156273166681494, + "grad_norm": 0.20272399485111237, + "learning_rate": 0.0009790649194180395, + "loss": 2.9022, + "step": 3425 + }, + { + "epoch": 0.10159238501912642, + "grad_norm": 0.18809784948825836, + "learning_rate": 0.000979051445002841, + "loss": 2.9579, + "step": 3426 + }, + { + "epoch": 0.10162203837143789, + "grad_norm": 0.1849660873413086, + "learning_rate": 0.0009790379663455599, + "loss": 2.9038, + "step": 3427 + }, + { + "epoch": 0.10165169172374937, + "grad_norm": 0.15107429027557373, + "learning_rate": 0.0009790244834463155, + "loss": 2.9247, + "step": 3428 + }, + { + "epoch": 0.10168134507606084, + "grad_norm": 0.16532054543495178, + "learning_rate": 0.0009790109963052271, + "loss": 2.9565, + "step": 3429 + }, + { + "epoch": 0.10171099842837232, + "grad_norm": 0.18621087074279785, + "learning_rate": 0.0009789975049224139, + "loss": 2.9097, + "step": 3430 + }, + { + "epoch": 0.10174065178068381, + "grad_norm": 0.17299236357212067, + "learning_rate": 0.0009789840092979958, + "loss": 2.877, + "step": 3431 + }, + { + "epoch": 0.10177030513299529, + "grad_norm": 0.15311723947525024, + "learning_rate": 0.0009789705094320918, + "loss": 2.9379, + "step": 3432 + }, + { + "epoch": 0.10179995848530676, + "grad_norm": 0.1596776694059372, + "learning_rate": 0.0009789570053248219, + "loss": 2.9222, + "step": 3433 + }, + { + "epoch": 0.10182961183761824, + "grad_norm": 0.16440729796886444, + "learning_rate": 0.0009789434969763055, + "loss": 2.9069, + "step": 3434 + }, + { + "epoch": 0.10185926518992972, + "grad_norm": 0.15108785033226013, + "learning_rate": 0.0009789299843866622, + "loss": 2.9286, + "step": 3435 + }, + { + "epoch": 0.1018889185422412, + "grad_norm": 0.17196397483348846, + "learning_rate": 0.0009789164675560115, + "loss": 2.9327, + "step": 3436 + }, + { + "epoch": 0.10191857189455268, + "grad_norm": 0.17681831121444702, + "learning_rate": 0.0009789029464844737, + "loss": 2.947, + "step": 3437 + }, + { + "epoch": 0.10194822524686416, + "grad_norm": 0.22088196873664856, + "learning_rate": 0.0009788894211721678, + "loss": 2.9283, + "step": 3438 + }, + { + "epoch": 0.10197787859917563, + "grad_norm": 0.18746225535869598, + "learning_rate": 0.000978875891619214, + "loss": 2.8956, + "step": 3439 + }, + { + "epoch": 0.10200753195148711, + "grad_norm": 0.15104793012142181, + "learning_rate": 0.0009788623578257318, + "loss": 2.8899, + "step": 3440 + }, + { + "epoch": 0.1020371853037986, + "grad_norm": 0.1850367784500122, + "learning_rate": 0.0009788488197918414, + "loss": 2.9244, + "step": 3441 + }, + { + "epoch": 0.10206683865611008, + "grad_norm": 0.20804081857204437, + "learning_rate": 0.0009788352775176622, + "loss": 2.9209, + "step": 3442 + }, + { + "epoch": 0.10209649200842155, + "grad_norm": 0.19116805493831635, + "learning_rate": 0.000978821731003315, + "loss": 2.901, + "step": 3443 + }, + { + "epoch": 0.10212614536073303, + "grad_norm": 0.1492908000946045, + "learning_rate": 0.0009788081802489187, + "loss": 2.9131, + "step": 3444 + }, + { + "epoch": 0.1021557987130445, + "grad_norm": 0.15138646960258484, + "learning_rate": 0.000978794625254594, + "loss": 2.9133, + "step": 3445 + }, + { + "epoch": 0.102185452065356, + "grad_norm": 0.18271037936210632, + "learning_rate": 0.0009787810660204605, + "loss": 2.9203, + "step": 3446 + }, + { + "epoch": 0.10221510541766747, + "grad_norm": 0.18182361125946045, + "learning_rate": 0.0009787675025466388, + "loss": 2.8829, + "step": 3447 + }, + { + "epoch": 0.10224475876997895, + "grad_norm": 0.15438061952590942, + "learning_rate": 0.0009787539348332485, + "loss": 2.9173, + "step": 3448 + }, + { + "epoch": 0.10227441212229042, + "grad_norm": 0.1310954988002777, + "learning_rate": 0.0009787403628804098, + "loss": 2.8989, + "step": 3449 + }, + { + "epoch": 0.1023040654746019, + "grad_norm": 0.14696024358272552, + "learning_rate": 0.0009787267866882433, + "loss": 2.9307, + "step": 3450 + }, + { + "epoch": 0.10233371882691339, + "grad_norm": 0.15179885923862457, + "learning_rate": 0.0009787132062568688, + "loss": 2.9374, + "step": 3451 + }, + { + "epoch": 0.10236337217922487, + "grad_norm": 0.153042271733284, + "learning_rate": 0.0009786996215864067, + "loss": 2.918, + "step": 3452 + }, + { + "epoch": 0.10239302553153634, + "grad_norm": 0.13213017582893372, + "learning_rate": 0.000978686032676977, + "loss": 2.9135, + "step": 3453 + }, + { + "epoch": 0.10242267888384782, + "grad_norm": 0.1810387670993805, + "learning_rate": 0.0009786724395287009, + "loss": 2.8758, + "step": 3454 + }, + { + "epoch": 0.1024523322361593, + "grad_norm": 0.17021982371807098, + "learning_rate": 0.0009786588421416978, + "loss": 2.9014, + "step": 3455 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 0.1732563078403473, + "learning_rate": 0.0009786452405160884, + "loss": 2.9142, + "step": 3456 + }, + { + "epoch": 0.10251163894078226, + "grad_norm": 0.1847667396068573, + "learning_rate": 0.0009786316346519935, + "loss": 2.9579, + "step": 3457 + }, + { + "epoch": 0.10254129229309374, + "grad_norm": 0.1822827160358429, + "learning_rate": 0.000978618024549533, + "loss": 2.9097, + "step": 3458 + }, + { + "epoch": 0.10257094564540521, + "grad_norm": 0.2233656793832779, + "learning_rate": 0.0009786044102088282, + "loss": 2.9139, + "step": 3459 + }, + { + "epoch": 0.10260059899771669, + "grad_norm": 0.2150331288576126, + "learning_rate": 0.0009785907916299989, + "loss": 2.9277, + "step": 3460 + }, + { + "epoch": 0.10263025235002816, + "grad_norm": 0.19260303676128387, + "learning_rate": 0.000978577168813166, + "loss": 2.9073, + "step": 3461 + }, + { + "epoch": 0.10265990570233965, + "grad_norm": 0.1929052174091339, + "learning_rate": 0.0009785635417584502, + "loss": 2.9559, + "step": 3462 + }, + { + "epoch": 0.10268955905465113, + "grad_norm": 0.1713527888059616, + "learning_rate": 0.000978549910465972, + "loss": 2.9155, + "step": 3463 + }, + { + "epoch": 0.1027192124069626, + "grad_norm": 0.1745004951953888, + "learning_rate": 0.0009785362749358522, + "loss": 2.8938, + "step": 3464 + }, + { + "epoch": 0.10274886575927408, + "grad_norm": 0.16576506197452545, + "learning_rate": 0.0009785226351682115, + "loss": 2.9373, + "step": 3465 + }, + { + "epoch": 0.10277851911158556, + "grad_norm": 0.16256333887577057, + "learning_rate": 0.000978508991163171, + "loss": 2.916, + "step": 3466 + }, + { + "epoch": 0.10280817246389705, + "grad_norm": 0.15386413037776947, + "learning_rate": 0.000978495342920851, + "loss": 2.9116, + "step": 3467 + }, + { + "epoch": 0.10283782581620853, + "grad_norm": 0.14582566916942596, + "learning_rate": 0.0009784816904413727, + "loss": 2.8987, + "step": 3468 + }, + { + "epoch": 0.10286747916852, + "grad_norm": 0.14325660467147827, + "learning_rate": 0.0009784680337248568, + "loss": 2.9236, + "step": 3469 + }, + { + "epoch": 0.10289713252083148, + "grad_norm": 0.16744527220726013, + "learning_rate": 0.0009784543727714245, + "loss": 2.9314, + "step": 3470 + }, + { + "epoch": 0.10292678587314295, + "grad_norm": 0.18629617989063263, + "learning_rate": 0.0009784407075811965, + "loss": 2.9075, + "step": 3471 + }, + { + "epoch": 0.10295643922545444, + "grad_norm": 0.1822882890701294, + "learning_rate": 0.000978427038154294, + "loss": 2.8905, + "step": 3472 + }, + { + "epoch": 0.10298609257776592, + "grad_norm": 0.19421015679836273, + "learning_rate": 0.0009784133644908377, + "loss": 2.9014, + "step": 3473 + }, + { + "epoch": 0.1030157459300774, + "grad_norm": 0.18319270014762878, + "learning_rate": 0.0009783996865909493, + "loss": 2.9303, + "step": 3474 + }, + { + "epoch": 0.10304539928238887, + "grad_norm": 0.18111121654510498, + "learning_rate": 0.0009783860044547492, + "loss": 2.9092, + "step": 3475 + }, + { + "epoch": 0.10307505263470035, + "grad_norm": 0.19670553505420685, + "learning_rate": 0.0009783723180823592, + "loss": 2.9165, + "step": 3476 + }, + { + "epoch": 0.10310470598701184, + "grad_norm": 0.20195765793323517, + "learning_rate": 0.0009783586274739, + "loss": 2.9218, + "step": 3477 + }, + { + "epoch": 0.10313435933932331, + "grad_norm": 0.17109832167625427, + "learning_rate": 0.0009783449326294933, + "loss": 2.9439, + "step": 3478 + }, + { + "epoch": 0.10316401269163479, + "grad_norm": 0.1520020216703415, + "learning_rate": 0.0009783312335492598, + "loss": 2.9156, + "step": 3479 + }, + { + "epoch": 0.10319366604394627, + "grad_norm": 0.1517459899187088, + "learning_rate": 0.0009783175302333215, + "loss": 2.8741, + "step": 3480 + }, + { + "epoch": 0.10322331939625774, + "grad_norm": 0.13617023825645447, + "learning_rate": 0.000978303822681799, + "loss": 2.9089, + "step": 3481 + }, + { + "epoch": 0.10325297274856922, + "grad_norm": 0.14150942862033844, + "learning_rate": 0.0009782901108948143, + "loss": 2.9201, + "step": 3482 + }, + { + "epoch": 0.10328262610088071, + "grad_norm": 0.15487582981586456, + "learning_rate": 0.0009782763948724884, + "loss": 2.9254, + "step": 3483 + }, + { + "epoch": 0.10331227945319219, + "grad_norm": 0.13358867168426514, + "learning_rate": 0.000978262674614943, + "loss": 2.9015, + "step": 3484 + }, + { + "epoch": 0.10334193280550366, + "grad_norm": 0.14715243875980377, + "learning_rate": 0.0009782489501222996, + "loss": 2.9255, + "step": 3485 + }, + { + "epoch": 0.10337158615781514, + "grad_norm": 0.17092128098011017, + "learning_rate": 0.0009782352213946795, + "loss": 2.9175, + "step": 3486 + }, + { + "epoch": 0.10340123951012661, + "grad_norm": 0.1936461180448532, + "learning_rate": 0.0009782214884322047, + "loss": 2.9435, + "step": 3487 + }, + { + "epoch": 0.1034308928624381, + "grad_norm": 0.19133776426315308, + "learning_rate": 0.0009782077512349963, + "loss": 2.9467, + "step": 3488 + }, + { + "epoch": 0.10346054621474958, + "grad_norm": 0.16182319819927216, + "learning_rate": 0.000978194009803176, + "loss": 2.9327, + "step": 3489 + }, + { + "epoch": 0.10349019956706106, + "grad_norm": 0.1674753576517105, + "learning_rate": 0.000978180264136866, + "loss": 2.896, + "step": 3490 + }, + { + "epoch": 0.10351985291937253, + "grad_norm": 0.19516131281852722, + "learning_rate": 0.0009781665142361876, + "loss": 2.9139, + "step": 3491 + }, + { + "epoch": 0.10354950627168401, + "grad_norm": 0.18403542041778564, + "learning_rate": 0.0009781527601012627, + "loss": 2.9113, + "step": 3492 + }, + { + "epoch": 0.1035791596239955, + "grad_norm": 0.16924959421157837, + "learning_rate": 0.000978139001732213, + "loss": 2.9033, + "step": 3493 + }, + { + "epoch": 0.10360881297630697, + "grad_norm": 0.18981380760669708, + "learning_rate": 0.00097812523912916, + "loss": 2.8738, + "step": 3494 + }, + { + "epoch": 0.10363846632861845, + "grad_norm": 0.19493132829666138, + "learning_rate": 0.0009781114722922264, + "loss": 2.9258, + "step": 3495 + }, + { + "epoch": 0.10366811968092993, + "grad_norm": 0.23217007517814636, + "learning_rate": 0.0009780977012215336, + "loss": 2.9291, + "step": 3496 + }, + { + "epoch": 0.1036977730332414, + "grad_norm": 0.21308697760105133, + "learning_rate": 0.0009780839259172034, + "loss": 2.9274, + "step": 3497 + }, + { + "epoch": 0.10372742638555289, + "grad_norm": 0.1856391876935959, + "learning_rate": 0.000978070146379358, + "loss": 2.92, + "step": 3498 + }, + { + "epoch": 0.10375707973786437, + "grad_norm": 0.21065005660057068, + "learning_rate": 0.0009780563626081197, + "loss": 2.898, + "step": 3499 + }, + { + "epoch": 0.10378673309017585, + "grad_norm": 0.19648385047912598, + "learning_rate": 0.0009780425746036098, + "loss": 2.8905, + "step": 3500 + }, + { + "epoch": 0.10381638644248732, + "grad_norm": 0.18598216772079468, + "learning_rate": 0.000978028782365951, + "loss": 2.9152, + "step": 3501 + }, + { + "epoch": 0.1038460397947988, + "grad_norm": 0.16945812106132507, + "learning_rate": 0.0009780149858952653, + "loss": 2.9027, + "step": 3502 + }, + { + "epoch": 0.10387569314711029, + "grad_norm": 0.1562819629907608, + "learning_rate": 0.000978001185191675, + "loss": 2.8911, + "step": 3503 + }, + { + "epoch": 0.10390534649942176, + "grad_norm": 0.18341222405433655, + "learning_rate": 0.0009779873802553021, + "loss": 2.9076, + "step": 3504 + }, + { + "epoch": 0.10393499985173324, + "grad_norm": 0.15130071341991425, + "learning_rate": 0.0009779735710862688, + "loss": 2.9588, + "step": 3505 + }, + { + "epoch": 0.10396465320404472, + "grad_norm": 0.13848835229873657, + "learning_rate": 0.0009779597576846976, + "loss": 2.9124, + "step": 3506 + }, + { + "epoch": 0.10399430655635619, + "grad_norm": 0.15420624613761902, + "learning_rate": 0.0009779459400507105, + "loss": 2.8997, + "step": 3507 + }, + { + "epoch": 0.10402395990866767, + "grad_norm": 0.15012365579605103, + "learning_rate": 0.0009779321181844303, + "loss": 2.8996, + "step": 3508 + }, + { + "epoch": 0.10405361326097916, + "grad_norm": 0.13397428393363953, + "learning_rate": 0.000977918292085979, + "loss": 2.895, + "step": 3509 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 0.1587110459804535, + "learning_rate": 0.0009779044617554793, + "loss": 2.9287, + "step": 3510 + }, + { + "epoch": 0.10411291996560211, + "grad_norm": 0.15951132774353027, + "learning_rate": 0.0009778906271930535, + "loss": 2.9004, + "step": 3511 + }, + { + "epoch": 0.10414257331791359, + "grad_norm": 0.2018638551235199, + "learning_rate": 0.0009778767883988242, + "loss": 2.9232, + "step": 3512 + }, + { + "epoch": 0.10417222667022506, + "grad_norm": 0.23652800917625427, + "learning_rate": 0.0009778629453729138, + "loss": 2.8952, + "step": 3513 + }, + { + "epoch": 0.10420188002253655, + "grad_norm": 0.18292294442653656, + "learning_rate": 0.000977849098115445, + "loss": 2.9725, + "step": 3514 + }, + { + "epoch": 0.10423153337484803, + "grad_norm": 0.1815694272518158, + "learning_rate": 0.0009778352466265406, + "loss": 2.8645, + "step": 3515 + }, + { + "epoch": 0.1042611867271595, + "grad_norm": 0.18644319474697113, + "learning_rate": 0.000977821390906323, + "loss": 2.9377, + "step": 3516 + }, + { + "epoch": 0.10429084007947098, + "grad_norm": 0.16492319107055664, + "learning_rate": 0.000977807530954915, + "loss": 2.9028, + "step": 3517 + }, + { + "epoch": 0.10432049343178246, + "grad_norm": 0.17379014194011688, + "learning_rate": 0.0009777936667724391, + "loss": 2.8917, + "step": 3518 + }, + { + "epoch": 0.10435014678409395, + "grad_norm": 0.17828626930713654, + "learning_rate": 0.0009777797983590185, + "loss": 2.9116, + "step": 3519 + }, + { + "epoch": 0.10437980013640542, + "grad_norm": 0.1789001226425171, + "learning_rate": 0.0009777659257147757, + "loss": 2.9088, + "step": 3520 + }, + { + "epoch": 0.1044094534887169, + "grad_norm": 0.1610165536403656, + "learning_rate": 0.0009777520488398336, + "loss": 2.9445, + "step": 3521 + }, + { + "epoch": 0.10443910684102838, + "grad_norm": 0.1435767561197281, + "learning_rate": 0.000977738167734315, + "loss": 2.9378, + "step": 3522 + }, + { + "epoch": 0.10446876019333985, + "grad_norm": 0.16713540256023407, + "learning_rate": 0.0009777242823983431, + "loss": 2.9273, + "step": 3523 + }, + { + "epoch": 0.10449841354565134, + "grad_norm": 0.17077718675136566, + "learning_rate": 0.0009777103928320405, + "loss": 2.9171, + "step": 3524 + }, + { + "epoch": 0.10452806689796282, + "grad_norm": 0.1794116050004959, + "learning_rate": 0.0009776964990355307, + "loss": 2.9156, + "step": 3525 + }, + { + "epoch": 0.1045577202502743, + "grad_norm": 0.19025588035583496, + "learning_rate": 0.000977682601008936, + "loss": 2.9176, + "step": 3526 + }, + { + "epoch": 0.10458737360258577, + "grad_norm": 0.1741243153810501, + "learning_rate": 0.00097766869875238, + "loss": 2.9193, + "step": 3527 + }, + { + "epoch": 0.10461702695489725, + "grad_norm": 0.15701517462730408, + "learning_rate": 0.0009776547922659858, + "loss": 2.902, + "step": 3528 + }, + { + "epoch": 0.10464668030720874, + "grad_norm": 0.18280334770679474, + "learning_rate": 0.0009776408815498764, + "loss": 2.9153, + "step": 3529 + }, + { + "epoch": 0.10467633365952021, + "grad_norm": 0.19805914163589478, + "learning_rate": 0.0009776269666041748, + "loss": 2.9353, + "step": 3530 + }, + { + "epoch": 0.10470598701183169, + "grad_norm": 0.20267170667648315, + "learning_rate": 0.0009776130474290047, + "loss": 2.9215, + "step": 3531 + }, + { + "epoch": 0.10473564036414316, + "grad_norm": 0.19649474322795868, + "learning_rate": 0.000977599124024489, + "loss": 2.9138, + "step": 3532 + }, + { + "epoch": 0.10476529371645464, + "grad_norm": 0.18778769671916962, + "learning_rate": 0.000977585196390751, + "loss": 2.9217, + "step": 3533 + }, + { + "epoch": 0.10479494706876612, + "grad_norm": 0.17853254079818726, + "learning_rate": 0.0009775712645279143, + "loss": 2.9034, + "step": 3534 + }, + { + "epoch": 0.10482460042107761, + "grad_norm": 0.1699894815683365, + "learning_rate": 0.0009775573284361019, + "loss": 2.9284, + "step": 3535 + }, + { + "epoch": 0.10485425377338908, + "grad_norm": 0.16734102368354797, + "learning_rate": 0.0009775433881154373, + "loss": 2.9292, + "step": 3536 + }, + { + "epoch": 0.10488390712570056, + "grad_norm": 0.1486494243144989, + "learning_rate": 0.0009775294435660441, + "loss": 2.8847, + "step": 3537 + }, + { + "epoch": 0.10491356047801204, + "grad_norm": 0.14462411403656006, + "learning_rate": 0.000977515494788046, + "loss": 2.8931, + "step": 3538 + }, + { + "epoch": 0.10494321383032351, + "grad_norm": 0.15955613553524017, + "learning_rate": 0.000977501541781566, + "loss": 2.9068, + "step": 3539 + }, + { + "epoch": 0.104972867182635, + "grad_norm": 0.16419170796871185, + "learning_rate": 0.0009774875845467278, + "loss": 2.9389, + "step": 3540 + }, + { + "epoch": 0.10500252053494648, + "grad_norm": 0.20142723619937897, + "learning_rate": 0.0009774736230836552, + "loss": 2.9053, + "step": 3541 + }, + { + "epoch": 0.10503217388725795, + "grad_norm": 0.2289096564054489, + "learning_rate": 0.0009774596573924716, + "loss": 2.9018, + "step": 3542 + }, + { + "epoch": 0.10506182723956943, + "grad_norm": 0.197881817817688, + "learning_rate": 0.000977445687473301, + "loss": 2.9297, + "step": 3543 + }, + { + "epoch": 0.1050914805918809, + "grad_norm": 0.18287472426891327, + "learning_rate": 0.0009774317133262667, + "loss": 2.9061, + "step": 3544 + }, + { + "epoch": 0.1051211339441924, + "grad_norm": 0.1446806937456131, + "learning_rate": 0.0009774177349514926, + "loss": 2.9063, + "step": 3545 + }, + { + "epoch": 0.10515078729650387, + "grad_norm": 0.1733686774969101, + "learning_rate": 0.0009774037523491027, + "loss": 2.9299, + "step": 3546 + }, + { + "epoch": 0.10518044064881535, + "grad_norm": 0.1753138303756714, + "learning_rate": 0.0009773897655192204, + "loss": 2.8677, + "step": 3547 + }, + { + "epoch": 0.10521009400112682, + "grad_norm": 0.1662297248840332, + "learning_rate": 0.00097737577446197, + "loss": 2.9125, + "step": 3548 + }, + { + "epoch": 0.1052397473534383, + "grad_norm": 0.16773171722888947, + "learning_rate": 0.0009773617791774749, + "loss": 2.9479, + "step": 3549 + }, + { + "epoch": 0.10526940070574979, + "grad_norm": 0.1664315015077591, + "learning_rate": 0.0009773477796658596, + "loss": 2.9088, + "step": 3550 + }, + { + "epoch": 0.10529905405806127, + "grad_norm": 0.1823274791240692, + "learning_rate": 0.0009773337759272475, + "loss": 2.9401, + "step": 3551 + }, + { + "epoch": 0.10532870741037274, + "grad_norm": 0.16655179858207703, + "learning_rate": 0.0009773197679617631, + "loss": 2.9374, + "step": 3552 + }, + { + "epoch": 0.10535836076268422, + "grad_norm": 0.17483893036842346, + "learning_rate": 0.0009773057557695302, + "loss": 2.8938, + "step": 3553 + }, + { + "epoch": 0.1053880141149957, + "grad_norm": 0.18948185443878174, + "learning_rate": 0.0009772917393506728, + "loss": 2.9084, + "step": 3554 + }, + { + "epoch": 0.10541766746730719, + "grad_norm": 0.15934956073760986, + "learning_rate": 0.0009772777187053152, + "loss": 2.9288, + "step": 3555 + }, + { + "epoch": 0.10544732081961866, + "grad_norm": 0.1573401242494583, + "learning_rate": 0.0009772636938335814, + "loss": 2.8972, + "step": 3556 + }, + { + "epoch": 0.10547697417193014, + "grad_norm": 0.16440646350383759, + "learning_rate": 0.000977249664735596, + "loss": 2.8914, + "step": 3557 + }, + { + "epoch": 0.10550662752424161, + "grad_norm": 0.1650739461183548, + "learning_rate": 0.0009772356314114825, + "loss": 2.9007, + "step": 3558 + }, + { + "epoch": 0.10553628087655309, + "grad_norm": 0.1735934019088745, + "learning_rate": 0.0009772215938613656, + "loss": 2.8942, + "step": 3559 + }, + { + "epoch": 0.10556593422886457, + "grad_norm": 0.1553967148065567, + "learning_rate": 0.0009772075520853697, + "loss": 2.9363, + "step": 3560 + }, + { + "epoch": 0.10559558758117606, + "grad_norm": 0.14001688361167908, + "learning_rate": 0.000977193506083619, + "loss": 2.9205, + "step": 3561 + }, + { + "epoch": 0.10562524093348753, + "grad_norm": 0.14674140512943268, + "learning_rate": 0.0009771794558562379, + "loss": 2.9217, + "step": 3562 + }, + { + "epoch": 0.10565489428579901, + "grad_norm": 0.16458015143871307, + "learning_rate": 0.0009771654014033508, + "loss": 2.8942, + "step": 3563 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 0.16236642003059387, + "learning_rate": 0.0009771513427250821, + "loss": 2.9007, + "step": 3564 + }, + { + "epoch": 0.10571420099042196, + "grad_norm": 0.2077052891254425, + "learning_rate": 0.0009771372798215564, + "loss": 2.9256, + "step": 3565 + }, + { + "epoch": 0.10574385434273345, + "grad_norm": 0.22731907665729523, + "learning_rate": 0.0009771232126928981, + "loss": 2.9005, + "step": 3566 + }, + { + "epoch": 0.10577350769504493, + "grad_norm": 0.20788946747779846, + "learning_rate": 0.000977109141339232, + "loss": 2.9049, + "step": 3567 + }, + { + "epoch": 0.1058031610473564, + "grad_norm": 0.21580944955348969, + "learning_rate": 0.0009770950657606826, + "loss": 2.9226, + "step": 3568 + }, + { + "epoch": 0.10583281439966788, + "grad_norm": 0.23453816771507263, + "learning_rate": 0.0009770809859573743, + "loss": 2.8965, + "step": 3569 + }, + { + "epoch": 0.10586246775197936, + "grad_norm": 0.18152791261672974, + "learning_rate": 0.0009770669019294324, + "loss": 2.9099, + "step": 3570 + }, + { + "epoch": 0.10589212110429085, + "grad_norm": 0.14741283655166626, + "learning_rate": 0.0009770528136769808, + "loss": 2.9059, + "step": 3571 + }, + { + "epoch": 0.10592177445660232, + "grad_norm": 0.1725383698940277, + "learning_rate": 0.0009770387212001447, + "loss": 2.9009, + "step": 3572 + }, + { + "epoch": 0.1059514278089138, + "grad_norm": 0.14771439135074615, + "learning_rate": 0.0009770246244990488, + "loss": 2.8831, + "step": 3573 + }, + { + "epoch": 0.10598108116122527, + "grad_norm": 0.169560506939888, + "learning_rate": 0.000977010523573818, + "loss": 2.9041, + "step": 3574 + }, + { + "epoch": 0.10601073451353675, + "grad_norm": 0.20065659284591675, + "learning_rate": 0.0009769964184245773, + "loss": 2.9228, + "step": 3575 + }, + { + "epoch": 0.10604038786584824, + "grad_norm": 0.19384688138961792, + "learning_rate": 0.0009769823090514513, + "loss": 2.8988, + "step": 3576 + }, + { + "epoch": 0.10607004121815972, + "grad_norm": 0.17457802593708038, + "learning_rate": 0.000976968195454565, + "loss": 2.9288, + "step": 3577 + }, + { + "epoch": 0.10609969457047119, + "grad_norm": 0.18008680641651154, + "learning_rate": 0.0009769540776340434, + "loss": 2.9326, + "step": 3578 + }, + { + "epoch": 0.10612934792278267, + "grad_norm": 0.17785586416721344, + "learning_rate": 0.0009769399555900119, + "loss": 2.898, + "step": 3579 + }, + { + "epoch": 0.10615900127509414, + "grad_norm": 0.17897328734397888, + "learning_rate": 0.0009769258293225952, + "loss": 2.8986, + "step": 3580 + }, + { + "epoch": 0.10618865462740563, + "grad_norm": 0.16219188272953033, + "learning_rate": 0.0009769116988319181, + "loss": 2.8862, + "step": 3581 + }, + { + "epoch": 0.10621830797971711, + "grad_norm": 0.15248671174049377, + "learning_rate": 0.0009768975641181064, + "loss": 2.9082, + "step": 3582 + }, + { + "epoch": 0.10624796133202859, + "grad_norm": 0.13833098113536835, + "learning_rate": 0.0009768834251812845, + "loss": 2.9371, + "step": 3583 + }, + { + "epoch": 0.10627761468434006, + "grad_norm": 0.14386726915836334, + "learning_rate": 0.0009768692820215784, + "loss": 2.9014, + "step": 3584 + }, + { + "epoch": 0.10630726803665154, + "grad_norm": 0.14868158102035522, + "learning_rate": 0.0009768551346391128, + "loss": 2.8895, + "step": 3585 + }, + { + "epoch": 0.10633692138896302, + "grad_norm": 0.1448124200105667, + "learning_rate": 0.0009768409830340132, + "loss": 2.9012, + "step": 3586 + }, + { + "epoch": 0.1063665747412745, + "grad_norm": 0.14138802886009216, + "learning_rate": 0.0009768268272064048, + "loss": 2.905, + "step": 3587 + }, + { + "epoch": 0.10639622809358598, + "grad_norm": 0.16406428813934326, + "learning_rate": 0.0009768126671564129, + "loss": 2.9391, + "step": 3588 + }, + { + "epoch": 0.10642588144589746, + "grad_norm": 0.1700470745563507, + "learning_rate": 0.000976798502884163, + "loss": 2.9236, + "step": 3589 + }, + { + "epoch": 0.10645553479820893, + "grad_norm": 0.15987585484981537, + "learning_rate": 0.0009767843343897807, + "loss": 2.907, + "step": 3590 + }, + { + "epoch": 0.10648518815052041, + "grad_norm": 0.14379450678825378, + "learning_rate": 0.000976770161673391, + "loss": 2.8799, + "step": 3591 + }, + { + "epoch": 0.1065148415028319, + "grad_norm": 0.14489513635635376, + "learning_rate": 0.00097675598473512, + "loss": 2.8999, + "step": 3592 + }, + { + "epoch": 0.10654449485514338, + "grad_norm": 0.13059058785438538, + "learning_rate": 0.0009767418035750927, + "loss": 2.8969, + "step": 3593 + }, + { + "epoch": 0.10657414820745485, + "grad_norm": 0.1436566561460495, + "learning_rate": 0.000976727618193435, + "loss": 2.8811, + "step": 3594 + }, + { + "epoch": 0.10660380155976633, + "grad_norm": 0.16111914813518524, + "learning_rate": 0.0009767134285902724, + "loss": 2.9255, + "step": 3595 + }, + { + "epoch": 0.1066334549120778, + "grad_norm": 0.20377150177955627, + "learning_rate": 0.0009766992347657307, + "loss": 2.8784, + "step": 3596 + }, + { + "epoch": 0.1066631082643893, + "grad_norm": 0.2646561563014984, + "learning_rate": 0.0009766850367199352, + "loss": 2.8999, + "step": 3597 + }, + { + "epoch": 0.10669276161670077, + "grad_norm": 0.27136480808258057, + "learning_rate": 0.000976670834453012, + "loss": 2.9192, + "step": 3598 + }, + { + "epoch": 0.10672241496901225, + "grad_norm": 0.246387779712677, + "learning_rate": 0.0009766566279650866, + "loss": 2.9171, + "step": 3599 + }, + { + "epoch": 0.10675206832132372, + "grad_norm": 0.2854137122631073, + "learning_rate": 0.000976642417256285, + "loss": 2.9071, + "step": 3600 + }, + { + "epoch": 0.1067817216736352, + "grad_norm": 0.2483680695295334, + "learning_rate": 0.0009766282023267333, + "loss": 2.9589, + "step": 3601 + }, + { + "epoch": 0.10681137502594669, + "grad_norm": 0.23914997279644012, + "learning_rate": 0.0009766139831765565, + "loss": 2.9305, + "step": 3602 + }, + { + "epoch": 0.10684102837825817, + "grad_norm": 0.22591356933116913, + "learning_rate": 0.0009765997598058815, + "loss": 2.9144, + "step": 3603 + }, + { + "epoch": 0.10687068173056964, + "grad_norm": 0.20920580625534058, + "learning_rate": 0.0009765855322148337, + "loss": 2.8954, + "step": 3604 + }, + { + "epoch": 0.10690033508288112, + "grad_norm": 0.19519291818141937, + "learning_rate": 0.0009765713004035391, + "loss": 2.9102, + "step": 3605 + }, + { + "epoch": 0.1069299884351926, + "grad_norm": 0.167585089802742, + "learning_rate": 0.000976557064372124, + "loss": 2.946, + "step": 3606 + }, + { + "epoch": 0.10695964178750408, + "grad_norm": 0.1638226956129074, + "learning_rate": 0.0009765428241207142, + "loss": 2.887, + "step": 3607 + }, + { + "epoch": 0.10698929513981556, + "grad_norm": 0.14056414365768433, + "learning_rate": 0.0009765285796494359, + "loss": 2.911, + "step": 3608 + }, + { + "epoch": 0.10701894849212704, + "grad_norm": 0.1538623571395874, + "learning_rate": 0.0009765143309584152, + "loss": 2.9261, + "step": 3609 + }, + { + "epoch": 0.10704860184443851, + "grad_norm": 0.13067355751991272, + "learning_rate": 0.0009765000780477784, + "loss": 2.9107, + "step": 3610 + }, + { + "epoch": 0.10707825519674999, + "grad_norm": 0.1492467224597931, + "learning_rate": 0.0009764858209176517, + "loss": 2.8984, + "step": 3611 + }, + { + "epoch": 0.10710790854906146, + "grad_norm": 0.152786523103714, + "learning_rate": 0.000976471559568161, + "loss": 2.8818, + "step": 3612 + }, + { + "epoch": 0.10713756190137295, + "grad_norm": 0.14945919811725616, + "learning_rate": 0.0009764572939994331, + "loss": 2.9033, + "step": 3613 + }, + { + "epoch": 0.10716721525368443, + "grad_norm": 0.1398804485797882, + "learning_rate": 0.0009764430242115939, + "loss": 2.9305, + "step": 3614 + }, + { + "epoch": 0.10719686860599591, + "grad_norm": 0.15302257239818573, + "learning_rate": 0.00097642875020477, + "loss": 2.9416, + "step": 3615 + }, + { + "epoch": 0.10722652195830738, + "grad_norm": 0.15636087954044342, + "learning_rate": 0.0009764144719790878, + "loss": 2.9475, + "step": 3616 + }, + { + "epoch": 0.10725617531061886, + "grad_norm": 0.14750638604164124, + "learning_rate": 0.0009764001895346736, + "loss": 2.8715, + "step": 3617 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 0.14643032848834991, + "learning_rate": 0.0009763859028716539, + "loss": 2.9057, + "step": 3618 + }, + { + "epoch": 0.10731548201524183, + "grad_norm": 0.1456802487373352, + "learning_rate": 0.0009763716119901555, + "loss": 2.8746, + "step": 3619 + }, + { + "epoch": 0.1073451353675533, + "grad_norm": 0.14624008536338806, + "learning_rate": 0.0009763573168903045, + "loss": 2.9014, + "step": 3620 + }, + { + "epoch": 0.10737478871986478, + "grad_norm": 0.1760273426771164, + "learning_rate": 0.0009763430175722277, + "loss": 2.895, + "step": 3621 + }, + { + "epoch": 0.10740444207217625, + "grad_norm": 0.18124054372310638, + "learning_rate": 0.0009763287140360517, + "loss": 2.9393, + "step": 3622 + }, + { + "epoch": 0.10743409542448774, + "grad_norm": 0.16786634922027588, + "learning_rate": 0.0009763144062819033, + "loss": 2.9095, + "step": 3623 + }, + { + "epoch": 0.10746374877679922, + "grad_norm": 0.18365272879600525, + "learning_rate": 0.000976300094309909, + "loss": 2.9389, + "step": 3624 + }, + { + "epoch": 0.1074934021291107, + "grad_norm": 0.17971748113632202, + "learning_rate": 0.0009762857781201956, + "loss": 2.9161, + "step": 3625 + }, + { + "epoch": 0.10752305548142217, + "grad_norm": 0.19403082132339478, + "learning_rate": 0.00097627145771289, + "loss": 2.9258, + "step": 3626 + }, + { + "epoch": 0.10755270883373365, + "grad_norm": 0.2187681496143341, + "learning_rate": 0.0009762571330881187, + "loss": 2.9163, + "step": 3627 + }, + { + "epoch": 0.10758236218604514, + "grad_norm": 0.21266630291938782, + "learning_rate": 0.0009762428042460088, + "loss": 2.8957, + "step": 3628 + }, + { + "epoch": 0.10761201553835661, + "grad_norm": 0.1931651085615158, + "learning_rate": 0.0009762284711866871, + "loss": 2.922, + "step": 3629 + }, + { + "epoch": 0.10764166889066809, + "grad_norm": 0.21226000785827637, + "learning_rate": 0.0009762141339102805, + "loss": 2.9131, + "step": 3630 + }, + { + "epoch": 0.10767132224297957, + "grad_norm": 0.19252315163612366, + "learning_rate": 0.0009761997924169162, + "loss": 2.9001, + "step": 3631 + }, + { + "epoch": 0.10770097559529104, + "grad_norm": 0.171283558011055, + "learning_rate": 0.000976185446706721, + "loss": 2.8966, + "step": 3632 + }, + { + "epoch": 0.10773062894760253, + "grad_norm": 0.19910891354084015, + "learning_rate": 0.0009761710967798217, + "loss": 2.9092, + "step": 3633 + }, + { + "epoch": 0.10776028229991401, + "grad_norm": 0.16098976135253906, + "learning_rate": 0.0009761567426363458, + "loss": 2.9256, + "step": 3634 + }, + { + "epoch": 0.10778993565222549, + "grad_norm": 0.14869576692581177, + "learning_rate": 0.0009761423842764201, + "loss": 2.896, + "step": 3635 + }, + { + "epoch": 0.10781958900453696, + "grad_norm": 0.13816525042057037, + "learning_rate": 0.0009761280217001719, + "loss": 2.9121, + "step": 3636 + }, + { + "epoch": 0.10784924235684844, + "grad_norm": 0.12022869288921356, + "learning_rate": 0.0009761136549077283, + "loss": 2.8911, + "step": 3637 + }, + { + "epoch": 0.10787889570915991, + "grad_norm": 0.1477530300617218, + "learning_rate": 0.0009760992838992167, + "loss": 2.9301, + "step": 3638 + }, + { + "epoch": 0.1079085490614714, + "grad_norm": 0.17044496536254883, + "learning_rate": 0.0009760849086747641, + "loss": 2.9125, + "step": 3639 + }, + { + "epoch": 0.10793820241378288, + "grad_norm": 0.15249648690223694, + "learning_rate": 0.0009760705292344979, + "loss": 2.9176, + "step": 3640 + }, + { + "epoch": 0.10796785576609436, + "grad_norm": 0.16932357847690582, + "learning_rate": 0.0009760561455785455, + "loss": 2.917, + "step": 3641 + }, + { + "epoch": 0.10799750911840583, + "grad_norm": 0.19078129529953003, + "learning_rate": 0.0009760417577070341, + "loss": 2.8941, + "step": 3642 + }, + { + "epoch": 0.10802716247071731, + "grad_norm": 0.21095645427703857, + "learning_rate": 0.0009760273656200915, + "loss": 2.9083, + "step": 3643 + }, + { + "epoch": 0.1080568158230288, + "grad_norm": 0.18660637736320496, + "learning_rate": 0.0009760129693178446, + "loss": 2.8712, + "step": 3644 + }, + { + "epoch": 0.10808646917534027, + "grad_norm": 0.1722399890422821, + "learning_rate": 0.0009759985688004214, + "loss": 2.9195, + "step": 3645 + }, + { + "epoch": 0.10811612252765175, + "grad_norm": 0.17259569466114044, + "learning_rate": 0.0009759841640679488, + "loss": 2.9287, + "step": 3646 + }, + { + "epoch": 0.10814577587996323, + "grad_norm": 0.16427652537822723, + "learning_rate": 0.0009759697551205551, + "loss": 2.8898, + "step": 3647 + }, + { + "epoch": 0.1081754292322747, + "grad_norm": 0.14680318534374237, + "learning_rate": 0.0009759553419583674, + "loss": 2.8998, + "step": 3648 + }, + { + "epoch": 0.10820508258458619, + "grad_norm": 0.15250007808208466, + "learning_rate": 0.0009759409245815132, + "loss": 2.8925, + "step": 3649 + }, + { + "epoch": 0.10823473593689767, + "grad_norm": 0.1832078993320465, + "learning_rate": 0.0009759265029901208, + "loss": 2.9318, + "step": 3650 + }, + { + "epoch": 0.10826438928920915, + "grad_norm": 0.19567584991455078, + "learning_rate": 0.0009759120771843173, + "loss": 2.9135, + "step": 3651 + }, + { + "epoch": 0.10829404264152062, + "grad_norm": 0.21653573215007782, + "learning_rate": 0.0009758976471642307, + "loss": 2.9272, + "step": 3652 + }, + { + "epoch": 0.1083236959938321, + "grad_norm": 0.21117214858531952, + "learning_rate": 0.0009758832129299888, + "loss": 2.8732, + "step": 3653 + }, + { + "epoch": 0.10835334934614359, + "grad_norm": 0.1586422324180603, + "learning_rate": 0.0009758687744817193, + "loss": 2.9027, + "step": 3654 + }, + { + "epoch": 0.10838300269845506, + "grad_norm": 0.1876296103000641, + "learning_rate": 0.0009758543318195501, + "loss": 2.9091, + "step": 3655 + }, + { + "epoch": 0.10841265605076654, + "grad_norm": 0.2259896695613861, + "learning_rate": 0.0009758398849436092, + "loss": 2.9041, + "step": 3656 + }, + { + "epoch": 0.10844230940307802, + "grad_norm": 0.18062180280685425, + "learning_rate": 0.0009758254338540245, + "loss": 2.9019, + "step": 3657 + }, + { + "epoch": 0.10847196275538949, + "grad_norm": 0.17694123089313507, + "learning_rate": 0.0009758109785509237, + "loss": 2.9174, + "step": 3658 + }, + { + "epoch": 0.10850161610770098, + "grad_norm": 0.1708173304796219, + "learning_rate": 0.0009757965190344351, + "loss": 2.9186, + "step": 3659 + }, + { + "epoch": 0.10853126946001246, + "grad_norm": 0.14636333286762238, + "learning_rate": 0.0009757820553046867, + "loss": 2.9208, + "step": 3660 + }, + { + "epoch": 0.10856092281232393, + "grad_norm": 0.13724170625209808, + "learning_rate": 0.0009757675873618067, + "loss": 2.9103, + "step": 3661 + }, + { + "epoch": 0.10859057616463541, + "grad_norm": 0.12811784446239471, + "learning_rate": 0.0009757531152059227, + "loss": 2.8674, + "step": 3662 + }, + { + "epoch": 0.10862022951694689, + "grad_norm": 0.1502103954553604, + "learning_rate": 0.0009757386388371634, + "loss": 2.9063, + "step": 3663 + }, + { + "epoch": 0.10864988286925836, + "grad_norm": 0.14568373560905457, + "learning_rate": 0.0009757241582556567, + "loss": 2.902, + "step": 3664 + }, + { + "epoch": 0.10867953622156985, + "grad_norm": 0.13583101332187653, + "learning_rate": 0.0009757096734615311, + "loss": 2.876, + "step": 3665 + }, + { + "epoch": 0.10870918957388133, + "grad_norm": 0.14951585233211517, + "learning_rate": 0.0009756951844549145, + "loss": 2.8593, + "step": 3666 + }, + { + "epoch": 0.1087388429261928, + "grad_norm": 0.1323622167110443, + "learning_rate": 0.0009756806912359355, + "loss": 2.8953, + "step": 3667 + }, + { + "epoch": 0.10876849627850428, + "grad_norm": 0.14227907359600067, + "learning_rate": 0.0009756661938047222, + "loss": 2.9223, + "step": 3668 + }, + { + "epoch": 0.10879814963081576, + "grad_norm": 0.15100742876529694, + "learning_rate": 0.0009756516921614033, + "loss": 2.9162, + "step": 3669 + }, + { + "epoch": 0.10882780298312725, + "grad_norm": 0.15408901870250702, + "learning_rate": 0.0009756371863061068, + "loss": 2.9199, + "step": 3670 + }, + { + "epoch": 0.10885745633543872, + "grad_norm": 0.16150425374507904, + "learning_rate": 0.0009756226762389615, + "loss": 2.8941, + "step": 3671 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 0.16421987116336823, + "learning_rate": 0.0009756081619600958, + "loss": 2.9235, + "step": 3672 + }, + { + "epoch": 0.10891676304006168, + "grad_norm": 0.1628073751926422, + "learning_rate": 0.0009755936434696382, + "loss": 2.9098, + "step": 3673 + }, + { + "epoch": 0.10894641639237315, + "grad_norm": 0.17594295740127563, + "learning_rate": 0.0009755791207677172, + "loss": 2.8868, + "step": 3674 + }, + { + "epoch": 0.10897606974468464, + "grad_norm": 0.17971044778823853, + "learning_rate": 0.0009755645938544615, + "loss": 2.9253, + "step": 3675 + }, + { + "epoch": 0.10900572309699612, + "grad_norm": 0.1753111630678177, + "learning_rate": 0.0009755500627299996, + "loss": 2.9173, + "step": 3676 + }, + { + "epoch": 0.1090353764493076, + "grad_norm": 0.15191836655139923, + "learning_rate": 0.0009755355273944603, + "loss": 2.9321, + "step": 3677 + }, + { + "epoch": 0.10906502980161907, + "grad_norm": 0.15360203385353088, + "learning_rate": 0.0009755209878479723, + "loss": 2.901, + "step": 3678 + }, + { + "epoch": 0.10909468315393055, + "grad_norm": 0.17740142345428467, + "learning_rate": 0.0009755064440906642, + "loss": 2.8996, + "step": 3679 + }, + { + "epoch": 0.10912433650624204, + "grad_norm": 0.17588140070438385, + "learning_rate": 0.0009754918961226651, + "loss": 2.9165, + "step": 3680 + }, + { + "epoch": 0.10915398985855351, + "grad_norm": 0.20226994156837463, + "learning_rate": 0.0009754773439441035, + "loss": 2.9006, + "step": 3681 + }, + { + "epoch": 0.10918364321086499, + "grad_norm": 0.1772398203611374, + "learning_rate": 0.0009754627875551085, + "loss": 2.9072, + "step": 3682 + }, + { + "epoch": 0.10921329656317647, + "grad_norm": 0.15547263622283936, + "learning_rate": 0.0009754482269558089, + "loss": 2.8984, + "step": 3683 + }, + { + "epoch": 0.10924294991548794, + "grad_norm": 0.2040461003780365, + "learning_rate": 0.0009754336621463337, + "loss": 2.9072, + "step": 3684 + }, + { + "epoch": 0.10927260326779943, + "grad_norm": 0.2210390716791153, + "learning_rate": 0.0009754190931268118, + "loss": 2.8725, + "step": 3685 + }, + { + "epoch": 0.10930225662011091, + "grad_norm": 0.31208503246307373, + "learning_rate": 0.0009754045198973721, + "loss": 2.9164, + "step": 3686 + }, + { + "epoch": 0.10933190997242238, + "grad_norm": 0.17686761915683746, + "learning_rate": 0.0009753899424581439, + "loss": 2.9199, + "step": 3687 + }, + { + "epoch": 0.10936156332473386, + "grad_norm": 0.18986459076404572, + "learning_rate": 0.0009753753608092561, + "loss": 2.9044, + "step": 3688 + }, + { + "epoch": 0.10939121667704534, + "grad_norm": 0.18243762850761414, + "learning_rate": 0.0009753607749508379, + "loss": 2.9127, + "step": 3689 + }, + { + "epoch": 0.10942087002935681, + "grad_norm": 0.19144317507743835, + "learning_rate": 0.0009753461848830183, + "loss": 2.8694, + "step": 3690 + }, + { + "epoch": 0.1094505233816683, + "grad_norm": 0.18541014194488525, + "learning_rate": 0.0009753315906059268, + "loss": 2.9286, + "step": 3691 + }, + { + "epoch": 0.10948017673397978, + "grad_norm": 0.1907634735107422, + "learning_rate": 0.0009753169921196924, + "loss": 2.9124, + "step": 3692 + }, + { + "epoch": 0.10950983008629125, + "grad_norm": 0.181043341755867, + "learning_rate": 0.0009753023894244446, + "loss": 2.8837, + "step": 3693 + }, + { + "epoch": 0.10953948343860273, + "grad_norm": 0.1685120016336441, + "learning_rate": 0.0009752877825203124, + "loss": 2.9202, + "step": 3694 + }, + { + "epoch": 0.1095691367909142, + "grad_norm": 0.17429213225841522, + "learning_rate": 0.0009752731714074254, + "loss": 2.9117, + "step": 3695 + }, + { + "epoch": 0.1095987901432257, + "grad_norm": 0.15261127054691315, + "learning_rate": 0.0009752585560859129, + "loss": 2.9327, + "step": 3696 + }, + { + "epoch": 0.10962844349553717, + "grad_norm": 0.14089109003543854, + "learning_rate": 0.0009752439365559041, + "loss": 2.946, + "step": 3697 + }, + { + "epoch": 0.10965809684784865, + "grad_norm": 0.14934056997299194, + "learning_rate": 0.0009752293128175289, + "loss": 2.8974, + "step": 3698 + }, + { + "epoch": 0.10968775020016013, + "grad_norm": 0.14915713667869568, + "learning_rate": 0.0009752146848709165, + "loss": 2.8974, + "step": 3699 + }, + { + "epoch": 0.1097174035524716, + "grad_norm": 0.16807885468006134, + "learning_rate": 0.0009752000527161964, + "loss": 2.9415, + "step": 3700 + }, + { + "epoch": 0.10974705690478309, + "grad_norm": 0.19155237078666687, + "learning_rate": 0.0009751854163534984, + "loss": 2.8727, + "step": 3701 + }, + { + "epoch": 0.10977671025709457, + "grad_norm": 0.18287397921085358, + "learning_rate": 0.000975170775782952, + "loss": 2.8996, + "step": 3702 + }, + { + "epoch": 0.10980636360940604, + "grad_norm": 0.16137124598026276, + "learning_rate": 0.0009751561310046866, + "loss": 2.9059, + "step": 3703 + }, + { + "epoch": 0.10983601696171752, + "grad_norm": 0.1418571174144745, + "learning_rate": 0.0009751414820188322, + "loss": 2.9094, + "step": 3704 + }, + { + "epoch": 0.109865670314029, + "grad_norm": 0.1511652022600174, + "learning_rate": 0.0009751268288255186, + "loss": 2.8804, + "step": 3705 + }, + { + "epoch": 0.10989532366634049, + "grad_norm": 0.15629807114601135, + "learning_rate": 0.0009751121714248751, + "loss": 2.8935, + "step": 3706 + }, + { + "epoch": 0.10992497701865196, + "grad_norm": 0.1648366004228592, + "learning_rate": 0.0009750975098170321, + "loss": 2.9057, + "step": 3707 + }, + { + "epoch": 0.10995463037096344, + "grad_norm": 0.1495400071144104, + "learning_rate": 0.0009750828440021188, + "loss": 2.8948, + "step": 3708 + }, + { + "epoch": 0.10998428372327491, + "grad_norm": 0.1457461416721344, + "learning_rate": 0.0009750681739802654, + "loss": 2.9043, + "step": 3709 + }, + { + "epoch": 0.11001393707558639, + "grad_norm": 0.15738391876220703, + "learning_rate": 0.0009750534997516019, + "loss": 2.8861, + "step": 3710 + }, + { + "epoch": 0.11004359042789788, + "grad_norm": 0.1662086844444275, + "learning_rate": 0.000975038821316258, + "loss": 2.9139, + "step": 3711 + }, + { + "epoch": 0.11007324378020936, + "grad_norm": 0.16727665066719055, + "learning_rate": 0.0009750241386743639, + "loss": 2.9032, + "step": 3712 + }, + { + "epoch": 0.11010289713252083, + "grad_norm": 0.17939341068267822, + "learning_rate": 0.0009750094518260495, + "loss": 2.9066, + "step": 3713 + }, + { + "epoch": 0.11013255048483231, + "grad_norm": 0.19725126028060913, + "learning_rate": 0.0009749947607714449, + "loss": 2.9044, + "step": 3714 + }, + { + "epoch": 0.11016220383714379, + "grad_norm": 0.2240574061870575, + "learning_rate": 0.0009749800655106801, + "loss": 2.8827, + "step": 3715 + }, + { + "epoch": 0.11019185718945526, + "grad_norm": 0.21679946780204773, + "learning_rate": 0.0009749653660438853, + "loss": 2.9184, + "step": 3716 + }, + { + "epoch": 0.11022151054176675, + "grad_norm": 0.22529347240924835, + "learning_rate": 0.0009749506623711906, + "loss": 2.8463, + "step": 3717 + }, + { + "epoch": 0.11025116389407823, + "grad_norm": 0.19160693883895874, + "learning_rate": 0.0009749359544927263, + "loss": 2.8917, + "step": 3718 + }, + { + "epoch": 0.1102808172463897, + "grad_norm": 0.1641804426908493, + "learning_rate": 0.0009749212424086227, + "loss": 2.9047, + "step": 3719 + }, + { + "epoch": 0.11031047059870118, + "grad_norm": 0.16622614860534668, + "learning_rate": 0.0009749065261190099, + "loss": 2.9241, + "step": 3720 + }, + { + "epoch": 0.11034012395101266, + "grad_norm": 0.17987845838069916, + "learning_rate": 0.0009748918056240182, + "loss": 2.8994, + "step": 3721 + }, + { + "epoch": 0.11036977730332415, + "grad_norm": 0.21632112562656403, + "learning_rate": 0.0009748770809237782, + "loss": 2.8894, + "step": 3722 + }, + { + "epoch": 0.11039943065563562, + "grad_norm": 0.20798753201961517, + "learning_rate": 0.0009748623520184201, + "loss": 2.9013, + "step": 3723 + }, + { + "epoch": 0.1104290840079471, + "grad_norm": 0.18937817215919495, + "learning_rate": 0.0009748476189080744, + "loss": 2.9171, + "step": 3724 + }, + { + "epoch": 0.11045873736025857, + "grad_norm": 0.17356114089488983, + "learning_rate": 0.0009748328815928713, + "loss": 2.9146, + "step": 3725 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 0.1750592142343521, + "learning_rate": 0.0009748181400729418, + "loss": 2.9146, + "step": 3726 + }, + { + "epoch": 0.11051804406488154, + "grad_norm": 0.17546825110912323, + "learning_rate": 0.000974803394348416, + "loss": 2.865, + "step": 3727 + }, + { + "epoch": 0.11054769741719302, + "grad_norm": 0.14283791184425354, + "learning_rate": 0.0009747886444194247, + "loss": 2.887, + "step": 3728 + }, + { + "epoch": 0.11057735076950449, + "grad_norm": 0.13881686329841614, + "learning_rate": 0.0009747738902860985, + "loss": 2.9044, + "step": 3729 + }, + { + "epoch": 0.11060700412181597, + "grad_norm": 0.15191207826137543, + "learning_rate": 0.000974759131948568, + "loss": 2.9048, + "step": 3730 + }, + { + "epoch": 0.11063665747412745, + "grad_norm": 0.14713367819786072, + "learning_rate": 0.0009747443694069638, + "loss": 2.858, + "step": 3731 + }, + { + "epoch": 0.11066631082643894, + "grad_norm": 0.1494884341955185, + "learning_rate": 0.0009747296026614169, + "loss": 2.9347, + "step": 3732 + }, + { + "epoch": 0.11069596417875041, + "grad_norm": 0.13923783600330353, + "learning_rate": 0.0009747148317120577, + "loss": 2.9176, + "step": 3733 + }, + { + "epoch": 0.11072561753106189, + "grad_norm": 0.15214011073112488, + "learning_rate": 0.0009747000565590172, + "loss": 2.863, + "step": 3734 + }, + { + "epoch": 0.11075527088337336, + "grad_norm": 0.15075740218162537, + "learning_rate": 0.0009746852772024264, + "loss": 2.9218, + "step": 3735 + }, + { + "epoch": 0.11078492423568484, + "grad_norm": 0.16887298226356506, + "learning_rate": 0.0009746704936424158, + "loss": 2.9145, + "step": 3736 + }, + { + "epoch": 0.11081457758799633, + "grad_norm": 0.17415449023246765, + "learning_rate": 0.0009746557058791166, + "loss": 2.9075, + "step": 3737 + }, + { + "epoch": 0.1108442309403078, + "grad_norm": 0.19925259053707123, + "learning_rate": 0.0009746409139126596, + "loss": 2.8994, + "step": 3738 + }, + { + "epoch": 0.11087388429261928, + "grad_norm": 0.21497343480587006, + "learning_rate": 0.0009746261177431759, + "loss": 2.9033, + "step": 3739 + }, + { + "epoch": 0.11090353764493076, + "grad_norm": 0.19475966691970825, + "learning_rate": 0.0009746113173707963, + "loss": 2.8729, + "step": 3740 + }, + { + "epoch": 0.11093319099724223, + "grad_norm": 0.17462965846061707, + "learning_rate": 0.0009745965127956522, + "loss": 2.8707, + "step": 3741 + }, + { + "epoch": 0.11096284434955371, + "grad_norm": 0.18817925453186035, + "learning_rate": 0.0009745817040178744, + "loss": 2.9071, + "step": 3742 + }, + { + "epoch": 0.1109924977018652, + "grad_norm": 0.18391567468643188, + "learning_rate": 0.0009745668910375942, + "loss": 2.8646, + "step": 3743 + }, + { + "epoch": 0.11102215105417668, + "grad_norm": 0.16598302125930786, + "learning_rate": 0.0009745520738549427, + "loss": 2.9236, + "step": 3744 + }, + { + "epoch": 0.11105180440648815, + "grad_norm": 0.1472875475883484, + "learning_rate": 0.0009745372524700512, + "loss": 2.8843, + "step": 3745 + }, + { + "epoch": 0.11108145775879963, + "grad_norm": 0.1532631665468216, + "learning_rate": 0.0009745224268830508, + "loss": 2.907, + "step": 3746 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.17347657680511475, + "learning_rate": 0.0009745075970940729, + "loss": 2.8912, + "step": 3747 + }, + { + "epoch": 0.1111407644634226, + "grad_norm": 0.1709812432527542, + "learning_rate": 0.0009744927631032488, + "loss": 2.8804, + "step": 3748 + }, + { + "epoch": 0.11117041781573407, + "grad_norm": 0.15787099301815033, + "learning_rate": 0.0009744779249107097, + "loss": 2.9282, + "step": 3749 + }, + { + "epoch": 0.11120007116804555, + "grad_norm": 0.1463298499584198, + "learning_rate": 0.0009744630825165874, + "loss": 2.889, + "step": 3750 + }, + { + "epoch": 0.11122972452035702, + "grad_norm": 0.14878959953784943, + "learning_rate": 0.0009744482359210127, + "loss": 2.9272, + "step": 3751 + }, + { + "epoch": 0.1112593778726685, + "grad_norm": 0.14877638220787048, + "learning_rate": 0.0009744333851241177, + "loss": 2.8957, + "step": 3752 + }, + { + "epoch": 0.11128903122497999, + "grad_norm": 0.15843161940574646, + "learning_rate": 0.0009744185301260336, + "loss": 2.8683, + "step": 3753 + }, + { + "epoch": 0.11131868457729147, + "grad_norm": 0.16532647609710693, + "learning_rate": 0.0009744036709268919, + "loss": 2.9287, + "step": 3754 + }, + { + "epoch": 0.11134833792960294, + "grad_norm": 0.18262940645217896, + "learning_rate": 0.0009743888075268243, + "loss": 2.9067, + "step": 3755 + }, + { + "epoch": 0.11137799128191442, + "grad_norm": 0.18332701921463013, + "learning_rate": 0.0009743739399259624, + "loss": 2.8955, + "step": 3756 + }, + { + "epoch": 0.1114076446342259, + "grad_norm": 0.16881899535655975, + "learning_rate": 0.0009743590681244379, + "loss": 2.9214, + "step": 3757 + }, + { + "epoch": 0.11143729798653738, + "grad_norm": 0.18278934061527252, + "learning_rate": 0.0009743441921223824, + "loss": 2.9102, + "step": 3758 + }, + { + "epoch": 0.11146695133884886, + "grad_norm": 0.18741659820079803, + "learning_rate": 0.0009743293119199276, + "loss": 2.8914, + "step": 3759 + }, + { + "epoch": 0.11149660469116034, + "grad_norm": 0.18927718698978424, + "learning_rate": 0.0009743144275172053, + "loss": 2.9011, + "step": 3760 + }, + { + "epoch": 0.11152625804347181, + "grad_norm": 0.19706487655639648, + "learning_rate": 0.0009742995389143474, + "loss": 2.9311, + "step": 3761 + }, + { + "epoch": 0.11155591139578329, + "grad_norm": 0.16351354122161865, + "learning_rate": 0.0009742846461114856, + "loss": 2.9114, + "step": 3762 + }, + { + "epoch": 0.11158556474809478, + "grad_norm": 0.14916382730007172, + "learning_rate": 0.000974269749108752, + "loss": 2.9238, + "step": 3763 + }, + { + "epoch": 0.11161521810040625, + "grad_norm": 0.13666614890098572, + "learning_rate": 0.0009742548479062783, + "loss": 2.892, + "step": 3764 + }, + { + "epoch": 0.11164487145271773, + "grad_norm": 0.13777229189872742, + "learning_rate": 0.0009742399425041963, + "loss": 2.8928, + "step": 3765 + }, + { + "epoch": 0.11167452480502921, + "grad_norm": 0.15644696354866028, + "learning_rate": 0.0009742250329026385, + "loss": 2.8599, + "step": 3766 + }, + { + "epoch": 0.11170417815734068, + "grad_norm": 0.13600173592567444, + "learning_rate": 0.0009742101191017365, + "loss": 2.9274, + "step": 3767 + }, + { + "epoch": 0.11173383150965216, + "grad_norm": 0.1470690667629242, + "learning_rate": 0.0009741952011016224, + "loss": 2.8798, + "step": 3768 + }, + { + "epoch": 0.11176348486196365, + "grad_norm": 0.16225554049015045, + "learning_rate": 0.0009741802789024286, + "loss": 2.9108, + "step": 3769 + }, + { + "epoch": 0.11179313821427513, + "grad_norm": 0.1821356862783432, + "learning_rate": 0.000974165352504287, + "loss": 2.8695, + "step": 3770 + }, + { + "epoch": 0.1118227915665866, + "grad_norm": 0.16710259020328522, + "learning_rate": 0.0009741504219073297, + "loss": 2.9117, + "step": 3771 + }, + { + "epoch": 0.11185244491889808, + "grad_norm": 0.16662666201591492, + "learning_rate": 0.0009741354871116892, + "loss": 2.8724, + "step": 3772 + }, + { + "epoch": 0.11188209827120955, + "grad_norm": 0.16717968881130219, + "learning_rate": 0.0009741205481174974, + "loss": 2.8973, + "step": 3773 + }, + { + "epoch": 0.11191175162352104, + "grad_norm": 0.14733991026878357, + "learning_rate": 0.0009741056049248868, + "loss": 2.8793, + "step": 3774 + }, + { + "epoch": 0.11194140497583252, + "grad_norm": 0.1560240238904953, + "learning_rate": 0.0009740906575339898, + "loss": 2.8573, + "step": 3775 + }, + { + "epoch": 0.111971058328144, + "grad_norm": 0.17526216804981232, + "learning_rate": 0.0009740757059449386, + "loss": 2.9238, + "step": 3776 + }, + { + "epoch": 0.11200071168045547, + "grad_norm": 0.1921984702348709, + "learning_rate": 0.0009740607501578655, + "loss": 2.8844, + "step": 3777 + }, + { + "epoch": 0.11203036503276695, + "grad_norm": 0.21888013184070587, + "learning_rate": 0.0009740457901729033, + "loss": 2.8948, + "step": 3778 + }, + { + "epoch": 0.11206001838507844, + "grad_norm": 0.21838565170764923, + "learning_rate": 0.0009740308259901842, + "loss": 2.9174, + "step": 3779 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 0.2055254727602005, + "learning_rate": 0.0009740158576098407, + "loss": 2.8747, + "step": 3780 + }, + { + "epoch": 0.11211932508970139, + "grad_norm": 0.2045716494321823, + "learning_rate": 0.0009740008850320054, + "loss": 2.9118, + "step": 3781 + }, + { + "epoch": 0.11214897844201287, + "grad_norm": 0.1845403015613556, + "learning_rate": 0.000973985908256811, + "loss": 2.8849, + "step": 3782 + }, + { + "epoch": 0.11217863179432434, + "grad_norm": 0.1776576191186905, + "learning_rate": 0.00097397092728439, + "loss": 2.9087, + "step": 3783 + }, + { + "epoch": 0.11220828514663583, + "grad_norm": 0.17438288033008575, + "learning_rate": 0.000973955942114875, + "loss": 2.8946, + "step": 3784 + }, + { + "epoch": 0.11223793849894731, + "grad_norm": 0.19725364446640015, + "learning_rate": 0.0009739409527483989, + "loss": 2.9135, + "step": 3785 + }, + { + "epoch": 0.11226759185125879, + "grad_norm": 0.19738605618476868, + "learning_rate": 0.0009739259591850941, + "loss": 2.9161, + "step": 3786 + }, + { + "epoch": 0.11229724520357026, + "grad_norm": 0.15907147526741028, + "learning_rate": 0.0009739109614250939, + "loss": 2.8815, + "step": 3787 + }, + { + "epoch": 0.11232689855588174, + "grad_norm": 0.17121267318725586, + "learning_rate": 0.0009738959594685306, + "loss": 2.8766, + "step": 3788 + }, + { + "epoch": 0.11235655190819323, + "grad_norm": 0.1865416020154953, + "learning_rate": 0.0009738809533155373, + "loss": 2.9285, + "step": 3789 + }, + { + "epoch": 0.1123862052605047, + "grad_norm": 0.15381181240081787, + "learning_rate": 0.0009738659429662468, + "loss": 2.8609, + "step": 3790 + }, + { + "epoch": 0.11241585861281618, + "grad_norm": 0.14586994051933289, + "learning_rate": 0.0009738509284207919, + "loss": 2.8907, + "step": 3791 + }, + { + "epoch": 0.11244551196512766, + "grad_norm": 0.1449396163225174, + "learning_rate": 0.0009738359096793059, + "loss": 2.9013, + "step": 3792 + }, + { + "epoch": 0.11247516531743913, + "grad_norm": 0.1615595817565918, + "learning_rate": 0.0009738208867419216, + "loss": 2.8865, + "step": 3793 + }, + { + "epoch": 0.11250481866975061, + "grad_norm": 0.15393999218940735, + "learning_rate": 0.0009738058596087718, + "loss": 2.8772, + "step": 3794 + }, + { + "epoch": 0.1125344720220621, + "grad_norm": 0.1530304104089737, + "learning_rate": 0.0009737908282799898, + "loss": 2.8996, + "step": 3795 + }, + { + "epoch": 0.11256412537437357, + "grad_norm": 0.16351598501205444, + "learning_rate": 0.0009737757927557089, + "loss": 2.919, + "step": 3796 + }, + { + "epoch": 0.11259377872668505, + "grad_norm": 0.15973873436450958, + "learning_rate": 0.000973760753036062, + "loss": 2.886, + "step": 3797 + }, + { + "epoch": 0.11262343207899653, + "grad_norm": 0.157292902469635, + "learning_rate": 0.0009737457091211822, + "loss": 2.9379, + "step": 3798 + }, + { + "epoch": 0.112653085431308, + "grad_norm": 0.177865132689476, + "learning_rate": 0.0009737306610112029, + "loss": 2.9078, + "step": 3799 + }, + { + "epoch": 0.1126827387836195, + "grad_norm": 0.1946439892053604, + "learning_rate": 0.0009737156087062573, + "loss": 2.9049, + "step": 3800 + }, + { + "epoch": 0.11271239213593097, + "grad_norm": 0.20009280741214752, + "learning_rate": 0.0009737005522064785, + "loss": 2.9267, + "step": 3801 + }, + { + "epoch": 0.11274204548824245, + "grad_norm": 0.20897400379180908, + "learning_rate": 0.0009736854915120001, + "loss": 2.8868, + "step": 3802 + }, + { + "epoch": 0.11277169884055392, + "grad_norm": 0.19855757057666779, + "learning_rate": 0.0009736704266229554, + "loss": 2.9042, + "step": 3803 + }, + { + "epoch": 0.1128013521928654, + "grad_norm": 0.16432130336761475, + "learning_rate": 0.0009736553575394778, + "loss": 2.8885, + "step": 3804 + }, + { + "epoch": 0.11283100554517689, + "grad_norm": 0.17374321818351746, + "learning_rate": 0.0009736402842617007, + "loss": 2.9001, + "step": 3805 + }, + { + "epoch": 0.11286065889748836, + "grad_norm": 0.16895896196365356, + "learning_rate": 0.0009736252067897575, + "loss": 2.8541, + "step": 3806 + }, + { + "epoch": 0.11289031224979984, + "grad_norm": 0.1464489996433258, + "learning_rate": 0.0009736101251237819, + "loss": 2.8965, + "step": 3807 + }, + { + "epoch": 0.11291996560211132, + "grad_norm": 0.15851342678070068, + "learning_rate": 0.0009735950392639073, + "loss": 2.8978, + "step": 3808 + }, + { + "epoch": 0.11294961895442279, + "grad_norm": 0.1855105757713318, + "learning_rate": 0.0009735799492102673, + "loss": 2.8617, + "step": 3809 + }, + { + "epoch": 0.11297927230673428, + "grad_norm": 0.15293097496032715, + "learning_rate": 0.0009735648549629956, + "loss": 2.9451, + "step": 3810 + }, + { + "epoch": 0.11300892565904576, + "grad_norm": 0.15539617836475372, + "learning_rate": 0.0009735497565222258, + "loss": 2.9075, + "step": 3811 + }, + { + "epoch": 0.11303857901135723, + "grad_norm": 0.15773774683475494, + "learning_rate": 0.0009735346538880916, + "loss": 2.9238, + "step": 3812 + }, + { + "epoch": 0.11306823236366871, + "grad_norm": 0.1638406366109848, + "learning_rate": 0.0009735195470607269, + "loss": 2.9139, + "step": 3813 + }, + { + "epoch": 0.11309788571598019, + "grad_norm": 0.16614675521850586, + "learning_rate": 0.0009735044360402651, + "loss": 2.8539, + "step": 3814 + }, + { + "epoch": 0.11312753906829168, + "grad_norm": 0.17215180397033691, + "learning_rate": 0.0009734893208268405, + "loss": 2.9186, + "step": 3815 + }, + { + "epoch": 0.11315719242060315, + "grad_norm": 0.1665201336145401, + "learning_rate": 0.0009734742014205865, + "loss": 2.8746, + "step": 3816 + }, + { + "epoch": 0.11318684577291463, + "grad_norm": 0.18466150760650635, + "learning_rate": 0.0009734590778216372, + "loss": 2.8755, + "step": 3817 + }, + { + "epoch": 0.1132164991252261, + "grad_norm": 0.17801369726657867, + "learning_rate": 0.0009734439500301267, + "loss": 2.8541, + "step": 3818 + }, + { + "epoch": 0.11324615247753758, + "grad_norm": 0.16543050110340118, + "learning_rate": 0.0009734288180461885, + "loss": 2.869, + "step": 3819 + }, + { + "epoch": 0.11327580582984906, + "grad_norm": 0.1537112146615982, + "learning_rate": 0.0009734136818699569, + "loss": 2.9102, + "step": 3820 + }, + { + "epoch": 0.11330545918216055, + "grad_norm": 0.15783493220806122, + "learning_rate": 0.0009733985415015659, + "loss": 2.8862, + "step": 3821 + }, + { + "epoch": 0.11333511253447202, + "grad_norm": 0.16767333447933197, + "learning_rate": 0.0009733833969411496, + "loss": 2.9144, + "step": 3822 + }, + { + "epoch": 0.1133647658867835, + "grad_norm": 0.1500309258699417, + "learning_rate": 0.000973368248188842, + "loss": 2.9036, + "step": 3823 + }, + { + "epoch": 0.11339441923909498, + "grad_norm": 0.14181391894817352, + "learning_rate": 0.0009733530952447775, + "loss": 2.8613, + "step": 3824 + }, + { + "epoch": 0.11342407259140645, + "grad_norm": 0.16090381145477295, + "learning_rate": 0.0009733379381090899, + "loss": 2.8996, + "step": 3825 + }, + { + "epoch": 0.11345372594371794, + "grad_norm": 0.18407149612903595, + "learning_rate": 0.0009733227767819137, + "loss": 2.9166, + "step": 3826 + }, + { + "epoch": 0.11348337929602942, + "grad_norm": 0.16978420317173004, + "learning_rate": 0.000973307611263383, + "loss": 2.8712, + "step": 3827 + }, + { + "epoch": 0.1135130326483409, + "grad_norm": 0.1605474352836609, + "learning_rate": 0.0009732924415536322, + "loss": 2.868, + "step": 3828 + }, + { + "epoch": 0.11354268600065237, + "grad_norm": 0.1552859991788864, + "learning_rate": 0.0009732772676527956, + "loss": 2.8624, + "step": 3829 + }, + { + "epoch": 0.11357233935296385, + "grad_norm": 0.16137558221817017, + "learning_rate": 0.0009732620895610075, + "loss": 2.8426, + "step": 3830 + }, + { + "epoch": 0.11360199270527534, + "grad_norm": 0.18301494419574738, + "learning_rate": 0.0009732469072784023, + "loss": 2.8987, + "step": 3831 + }, + { + "epoch": 0.11363164605758681, + "grad_norm": 0.1928037852048874, + "learning_rate": 0.0009732317208051147, + "loss": 2.8759, + "step": 3832 + }, + { + "epoch": 0.11366129940989829, + "grad_norm": 0.15866224467754364, + "learning_rate": 0.000973216530141279, + "loss": 2.879, + "step": 3833 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 0.14867758750915527, + "learning_rate": 0.0009732013352870295, + "loss": 2.9229, + "step": 3834 + }, + { + "epoch": 0.11372060611452124, + "grad_norm": 0.19289256632328033, + "learning_rate": 0.0009731861362425009, + "loss": 2.8687, + "step": 3835 + }, + { + "epoch": 0.11375025946683273, + "grad_norm": 0.21693244576454163, + "learning_rate": 0.0009731709330078281, + "loss": 2.9134, + "step": 3836 + }, + { + "epoch": 0.11377991281914421, + "grad_norm": 0.2342207282781601, + "learning_rate": 0.0009731557255831454, + "loss": 2.9331, + "step": 3837 + }, + { + "epoch": 0.11380956617145568, + "grad_norm": 0.19490160048007965, + "learning_rate": 0.0009731405139685874, + "loss": 2.9224, + "step": 3838 + }, + { + "epoch": 0.11383921952376716, + "grad_norm": 0.1811864823102951, + "learning_rate": 0.0009731252981642891, + "loss": 2.8928, + "step": 3839 + }, + { + "epoch": 0.11386887287607864, + "grad_norm": 0.2150079756975174, + "learning_rate": 0.000973110078170385, + "loss": 2.8685, + "step": 3840 + }, + { + "epoch": 0.11389852622839013, + "grad_norm": 0.18457837402820587, + "learning_rate": 0.0009730948539870099, + "loss": 2.8902, + "step": 3841 + }, + { + "epoch": 0.1139281795807016, + "grad_norm": 0.16684560477733612, + "learning_rate": 0.0009730796256142986, + "loss": 2.8799, + "step": 3842 + }, + { + "epoch": 0.11395783293301308, + "grad_norm": 0.16402314603328705, + "learning_rate": 0.0009730643930523863, + "loss": 2.8719, + "step": 3843 + }, + { + "epoch": 0.11398748628532455, + "grad_norm": 0.1528938114643097, + "learning_rate": 0.0009730491563014073, + "loss": 2.9205, + "step": 3844 + }, + { + "epoch": 0.11401713963763603, + "grad_norm": 0.1913820207118988, + "learning_rate": 0.000973033915361497, + "loss": 2.9053, + "step": 3845 + }, + { + "epoch": 0.1140467929899475, + "grad_norm": 0.20948466658592224, + "learning_rate": 0.0009730186702327901, + "loss": 2.8991, + "step": 3846 + }, + { + "epoch": 0.114076446342259, + "grad_norm": 0.20475679636001587, + "learning_rate": 0.0009730034209154217, + "loss": 2.8838, + "step": 3847 + }, + { + "epoch": 0.11410609969457047, + "grad_norm": 0.17373225092887878, + "learning_rate": 0.0009729881674095269, + "loss": 2.9012, + "step": 3848 + }, + { + "epoch": 0.11413575304688195, + "grad_norm": 0.14274540543556213, + "learning_rate": 0.0009729729097152405, + "loss": 2.9043, + "step": 3849 + }, + { + "epoch": 0.11416540639919343, + "grad_norm": 0.14964184165000916, + "learning_rate": 0.0009729576478326981, + "loss": 2.9014, + "step": 3850 + }, + { + "epoch": 0.1141950597515049, + "grad_norm": 0.14809705317020416, + "learning_rate": 0.0009729423817620342, + "loss": 2.9068, + "step": 3851 + }, + { + "epoch": 0.11422471310381639, + "grad_norm": 0.13728764653205872, + "learning_rate": 0.0009729271115033845, + "loss": 2.9277, + "step": 3852 + }, + { + "epoch": 0.11425436645612787, + "grad_norm": 0.14187969267368317, + "learning_rate": 0.0009729118370568841, + "loss": 2.8842, + "step": 3853 + }, + { + "epoch": 0.11428401980843934, + "grad_norm": 0.15641897916793823, + "learning_rate": 0.0009728965584226681, + "loss": 2.9181, + "step": 3854 + }, + { + "epoch": 0.11431367316075082, + "grad_norm": 0.1868876814842224, + "learning_rate": 0.0009728812756008721, + "loss": 2.864, + "step": 3855 + }, + { + "epoch": 0.1143433265130623, + "grad_norm": 0.19330495595932007, + "learning_rate": 0.000972865988591631, + "loss": 2.8757, + "step": 3856 + }, + { + "epoch": 0.11437297986537379, + "grad_norm": 0.1740424484014511, + "learning_rate": 0.0009728506973950805, + "loss": 2.8807, + "step": 3857 + }, + { + "epoch": 0.11440263321768526, + "grad_norm": 0.15961338579654694, + "learning_rate": 0.0009728354020113559, + "loss": 2.8987, + "step": 3858 + }, + { + "epoch": 0.11443228656999674, + "grad_norm": 0.15510882437229156, + "learning_rate": 0.0009728201024405927, + "loss": 2.9134, + "step": 3859 + }, + { + "epoch": 0.11446193992230821, + "grad_norm": 0.15456737577915192, + "learning_rate": 0.0009728047986829263, + "loss": 2.8931, + "step": 3860 + }, + { + "epoch": 0.11449159327461969, + "grad_norm": 0.1569613367319107, + "learning_rate": 0.0009727894907384922, + "loss": 2.8899, + "step": 3861 + }, + { + "epoch": 0.11452124662693118, + "grad_norm": 0.18485945463180542, + "learning_rate": 0.000972774178607426, + "loss": 2.8809, + "step": 3862 + }, + { + "epoch": 0.11455089997924266, + "grad_norm": 0.16819670796394348, + "learning_rate": 0.0009727588622898633, + "loss": 2.8894, + "step": 3863 + }, + { + "epoch": 0.11458055333155413, + "grad_norm": 0.16647419333457947, + "learning_rate": 0.0009727435417859399, + "loss": 2.9162, + "step": 3864 + }, + { + "epoch": 0.11461020668386561, + "grad_norm": 0.16014066338539124, + "learning_rate": 0.0009727282170957912, + "loss": 2.8854, + "step": 3865 + }, + { + "epoch": 0.11463986003617709, + "grad_norm": 0.17422041296958923, + "learning_rate": 0.0009727128882195528, + "loss": 2.8856, + "step": 3866 + }, + { + "epoch": 0.11466951338848858, + "grad_norm": 0.19968779385089874, + "learning_rate": 0.000972697555157361, + "loss": 2.8852, + "step": 3867 + }, + { + "epoch": 0.11469916674080005, + "grad_norm": 0.1882133036851883, + "learning_rate": 0.0009726822179093508, + "loss": 2.9157, + "step": 3868 + }, + { + "epoch": 0.11472882009311153, + "grad_norm": 0.13885533809661865, + "learning_rate": 0.0009726668764756587, + "loss": 2.9213, + "step": 3869 + }, + { + "epoch": 0.114758473445423, + "grad_norm": 0.15092752873897552, + "learning_rate": 0.0009726515308564202, + "loss": 2.9188, + "step": 3870 + }, + { + "epoch": 0.11478812679773448, + "grad_norm": 0.18703630566596985, + "learning_rate": 0.0009726361810517714, + "loss": 2.8789, + "step": 3871 + }, + { + "epoch": 0.11481778015004596, + "grad_norm": 0.17769411206245422, + "learning_rate": 0.0009726208270618479, + "loss": 2.8592, + "step": 3872 + }, + { + "epoch": 0.11484743350235745, + "grad_norm": 0.20475687086582184, + "learning_rate": 0.0009726054688867859, + "loss": 2.9121, + "step": 3873 + }, + { + "epoch": 0.11487708685466892, + "grad_norm": 0.2232515662908554, + "learning_rate": 0.0009725901065267213, + "loss": 2.9142, + "step": 3874 + }, + { + "epoch": 0.1149067402069804, + "grad_norm": 0.22641721367835999, + "learning_rate": 0.0009725747399817904, + "loss": 2.8925, + "step": 3875 + }, + { + "epoch": 0.11493639355929187, + "grad_norm": 0.21349003911018372, + "learning_rate": 0.0009725593692521289, + "loss": 2.8774, + "step": 3876 + }, + { + "epoch": 0.11496604691160335, + "grad_norm": 0.20571070909500122, + "learning_rate": 0.0009725439943378731, + "loss": 2.8592, + "step": 3877 + }, + { + "epoch": 0.11499570026391484, + "grad_norm": 0.1901409924030304, + "learning_rate": 0.000972528615239159, + "loss": 2.8788, + "step": 3878 + }, + { + "epoch": 0.11502535361622632, + "grad_norm": 0.18865367770195007, + "learning_rate": 0.0009725132319561231, + "loss": 2.882, + "step": 3879 + }, + { + "epoch": 0.11505500696853779, + "grad_norm": 0.16812674701213837, + "learning_rate": 0.0009724978444889012, + "loss": 2.8941, + "step": 3880 + }, + { + "epoch": 0.11508466032084927, + "grad_norm": 0.16127657890319824, + "learning_rate": 0.00097248245283763, + "loss": 2.8822, + "step": 3881 + }, + { + "epoch": 0.11511431367316075, + "grad_norm": 0.1718166321516037, + "learning_rate": 0.0009724670570024455, + "loss": 2.8824, + "step": 3882 + }, + { + "epoch": 0.11514396702547224, + "grad_norm": 0.17966681718826294, + "learning_rate": 0.0009724516569834842, + "loss": 2.8951, + "step": 3883 + }, + { + "epoch": 0.11517362037778371, + "grad_norm": 0.17164872586727142, + "learning_rate": 0.0009724362527808822, + "loss": 2.911, + "step": 3884 + }, + { + "epoch": 0.11520327373009519, + "grad_norm": 0.15831869840621948, + "learning_rate": 0.0009724208443947762, + "loss": 2.8813, + "step": 3885 + }, + { + "epoch": 0.11523292708240666, + "grad_norm": 0.16291853785514832, + "learning_rate": 0.0009724054318253026, + "loss": 2.8985, + "step": 3886 + }, + { + "epoch": 0.11526258043471814, + "grad_norm": 0.1470281481742859, + "learning_rate": 0.0009723900150725977, + "loss": 2.8916, + "step": 3887 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 0.13523465394973755, + "learning_rate": 0.000972374594136798, + "loss": 2.8841, + "step": 3888 + }, + { + "epoch": 0.1153218871393411, + "grad_norm": 0.16040533781051636, + "learning_rate": 0.0009723591690180405, + "loss": 2.9425, + "step": 3889 + }, + { + "epoch": 0.11535154049165258, + "grad_norm": 0.1724710315465927, + "learning_rate": 0.0009723437397164612, + "loss": 2.9012, + "step": 3890 + }, + { + "epoch": 0.11538119384396406, + "grad_norm": 0.17100559175014496, + "learning_rate": 0.0009723283062321972, + "loss": 2.9044, + "step": 3891 + }, + { + "epoch": 0.11541084719627553, + "grad_norm": 0.17974340915679932, + "learning_rate": 0.000972312868565385, + "loss": 2.9132, + "step": 3892 + }, + { + "epoch": 0.11544050054858702, + "grad_norm": 0.1817082017660141, + "learning_rate": 0.0009722974267161612, + "loss": 2.8546, + "step": 3893 + }, + { + "epoch": 0.1154701539008985, + "grad_norm": 0.16715538501739502, + "learning_rate": 0.0009722819806846626, + "loss": 2.8912, + "step": 3894 + }, + { + "epoch": 0.11549980725320998, + "grad_norm": 0.17576855421066284, + "learning_rate": 0.000972266530471026, + "loss": 2.9234, + "step": 3895 + }, + { + "epoch": 0.11552946060552145, + "grad_norm": 0.1662280112504959, + "learning_rate": 0.0009722510760753882, + "loss": 2.8824, + "step": 3896 + }, + { + "epoch": 0.11555911395783293, + "grad_norm": 0.22402478754520416, + "learning_rate": 0.000972235617497886, + "loss": 2.8939, + "step": 3897 + }, + { + "epoch": 0.1155887673101444, + "grad_norm": 0.2472948282957077, + "learning_rate": 0.0009722201547386564, + "loss": 2.909, + "step": 3898 + }, + { + "epoch": 0.1156184206624559, + "grad_norm": 0.1874609738588333, + "learning_rate": 0.0009722046877978363, + "loss": 2.8751, + "step": 3899 + }, + { + "epoch": 0.11564807401476737, + "grad_norm": 0.15117263793945312, + "learning_rate": 0.0009721892166755627, + "loss": 2.8926, + "step": 3900 + }, + { + "epoch": 0.11567772736707885, + "grad_norm": 0.15927770733833313, + "learning_rate": 0.0009721737413719725, + "loss": 2.8527, + "step": 3901 + }, + { + "epoch": 0.11570738071939032, + "grad_norm": 0.1446276754140854, + "learning_rate": 0.0009721582618872027, + "loss": 2.8843, + "step": 3902 + }, + { + "epoch": 0.1157370340717018, + "grad_norm": 0.1543104648590088, + "learning_rate": 0.0009721427782213905, + "loss": 2.8561, + "step": 3903 + }, + { + "epoch": 0.11576668742401329, + "grad_norm": 0.16268469393253326, + "learning_rate": 0.0009721272903746729, + "loss": 2.8821, + "step": 3904 + }, + { + "epoch": 0.11579634077632477, + "grad_norm": 0.16511927545070648, + "learning_rate": 0.0009721117983471872, + "loss": 2.9009, + "step": 3905 + }, + { + "epoch": 0.11582599412863624, + "grad_norm": 0.15535207092761993, + "learning_rate": 0.0009720963021390704, + "loss": 2.8867, + "step": 3906 + }, + { + "epoch": 0.11585564748094772, + "grad_norm": 0.13524791598320007, + "learning_rate": 0.0009720808017504599, + "loss": 2.8863, + "step": 3907 + }, + { + "epoch": 0.1158853008332592, + "grad_norm": 0.14902953803539276, + "learning_rate": 0.0009720652971814928, + "loss": 2.8866, + "step": 3908 + }, + { + "epoch": 0.11591495418557068, + "grad_norm": 0.136860653758049, + "learning_rate": 0.0009720497884323064, + "loss": 2.8713, + "step": 3909 + }, + { + "epoch": 0.11594460753788216, + "grad_norm": 0.13857412338256836, + "learning_rate": 0.0009720342755030382, + "loss": 2.8944, + "step": 3910 + }, + { + "epoch": 0.11597426089019364, + "grad_norm": 0.156696617603302, + "learning_rate": 0.0009720187583938254, + "loss": 2.883, + "step": 3911 + }, + { + "epoch": 0.11600391424250511, + "grad_norm": 0.16099900007247925, + "learning_rate": 0.0009720032371048056, + "loss": 2.8764, + "step": 3912 + }, + { + "epoch": 0.11603356759481659, + "grad_norm": 0.1999591439962387, + "learning_rate": 0.000971987711636116, + "loss": 2.8771, + "step": 3913 + }, + { + "epoch": 0.11606322094712808, + "grad_norm": 0.19587989151477814, + "learning_rate": 0.0009719721819878941, + "loss": 2.869, + "step": 3914 + }, + { + "epoch": 0.11609287429943956, + "grad_norm": 0.16263064742088318, + "learning_rate": 0.0009719566481602778, + "loss": 2.8413, + "step": 3915 + }, + { + "epoch": 0.11612252765175103, + "grad_norm": 0.16061709821224213, + "learning_rate": 0.0009719411101534041, + "loss": 2.9133, + "step": 3916 + }, + { + "epoch": 0.11615218100406251, + "grad_norm": 0.16359323263168335, + "learning_rate": 0.0009719255679674111, + "loss": 2.8628, + "step": 3917 + }, + { + "epoch": 0.11618183435637398, + "grad_norm": 0.18515901267528534, + "learning_rate": 0.000971910021602436, + "loss": 2.9044, + "step": 3918 + }, + { + "epoch": 0.11621148770868547, + "grad_norm": 0.19170024991035461, + "learning_rate": 0.0009718944710586169, + "loss": 2.8636, + "step": 3919 + }, + { + "epoch": 0.11624114106099695, + "grad_norm": 0.16610266268253326, + "learning_rate": 0.0009718789163360909, + "loss": 2.8931, + "step": 3920 + }, + { + "epoch": 0.11627079441330843, + "grad_norm": 0.1612638533115387, + "learning_rate": 0.0009718633574349963, + "loss": 2.8768, + "step": 3921 + }, + { + "epoch": 0.1163004477656199, + "grad_norm": 0.1636081337928772, + "learning_rate": 0.0009718477943554707, + "loss": 2.8497, + "step": 3922 + }, + { + "epoch": 0.11633010111793138, + "grad_norm": 0.18251681327819824, + "learning_rate": 0.0009718322270976518, + "loss": 2.8935, + "step": 3923 + }, + { + "epoch": 0.11635975447024285, + "grad_norm": 0.16811200976371765, + "learning_rate": 0.0009718166556616776, + "loss": 2.8644, + "step": 3924 + }, + { + "epoch": 0.11638940782255434, + "grad_norm": 0.18861836194992065, + "learning_rate": 0.0009718010800476859, + "loss": 2.8682, + "step": 3925 + }, + { + "epoch": 0.11641906117486582, + "grad_norm": 0.19353003799915314, + "learning_rate": 0.0009717855002558147, + "loss": 2.8942, + "step": 3926 + }, + { + "epoch": 0.1164487145271773, + "grad_norm": 0.1798715442419052, + "learning_rate": 0.0009717699162862019, + "loss": 2.8911, + "step": 3927 + }, + { + "epoch": 0.11647836787948877, + "grad_norm": 0.20354293286800385, + "learning_rate": 0.0009717543281389855, + "loss": 2.9099, + "step": 3928 + }, + { + "epoch": 0.11650802123180025, + "grad_norm": 0.21031714975833893, + "learning_rate": 0.0009717387358143035, + "loss": 2.9151, + "step": 3929 + }, + { + "epoch": 0.11653767458411174, + "grad_norm": 0.2185419648885727, + "learning_rate": 0.0009717231393122941, + "loss": 2.8811, + "step": 3930 + }, + { + "epoch": 0.11656732793642322, + "grad_norm": 0.1762865036725998, + "learning_rate": 0.0009717075386330953, + "loss": 2.8765, + "step": 3931 + }, + { + "epoch": 0.11659698128873469, + "grad_norm": 0.1470019668340683, + "learning_rate": 0.0009716919337768452, + "loss": 2.8988, + "step": 3932 + }, + { + "epoch": 0.11662663464104617, + "grad_norm": 0.17534881830215454, + "learning_rate": 0.0009716763247436821, + "loss": 2.8901, + "step": 3933 + }, + { + "epoch": 0.11665628799335764, + "grad_norm": 0.17208832502365112, + "learning_rate": 0.0009716607115337443, + "loss": 2.8949, + "step": 3934 + }, + { + "epoch": 0.11668594134566913, + "grad_norm": 0.15241572260856628, + "learning_rate": 0.0009716450941471699, + "loss": 2.8914, + "step": 3935 + }, + { + "epoch": 0.11671559469798061, + "grad_norm": 0.17280058562755585, + "learning_rate": 0.0009716294725840972, + "loss": 2.8863, + "step": 3936 + }, + { + "epoch": 0.11674524805029209, + "grad_norm": 0.13811980187892914, + "learning_rate": 0.0009716138468446646, + "loss": 2.8652, + "step": 3937 + }, + { + "epoch": 0.11677490140260356, + "grad_norm": 0.14612844586372375, + "learning_rate": 0.0009715982169290103, + "loss": 2.8819, + "step": 3938 + }, + { + "epoch": 0.11680455475491504, + "grad_norm": 0.14901185035705566, + "learning_rate": 0.0009715825828372729, + "loss": 2.8133, + "step": 3939 + }, + { + "epoch": 0.11683420810722653, + "grad_norm": 0.15897952020168304, + "learning_rate": 0.0009715669445695909, + "loss": 2.9018, + "step": 3940 + }, + { + "epoch": 0.116863861459538, + "grad_norm": 0.15659794211387634, + "learning_rate": 0.0009715513021261026, + "loss": 2.8747, + "step": 3941 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 0.1678168922662735, + "learning_rate": 0.0009715356555069465, + "loss": 2.9067, + "step": 3942 + }, + { + "epoch": 0.11692316816416096, + "grad_norm": 0.19459865987300873, + "learning_rate": 0.0009715200047122613, + "loss": 2.8779, + "step": 3943 + }, + { + "epoch": 0.11695282151647243, + "grad_norm": 0.20860059559345245, + "learning_rate": 0.0009715043497421856, + "loss": 2.9017, + "step": 3944 + }, + { + "epoch": 0.11698247486878392, + "grad_norm": 0.224748432636261, + "learning_rate": 0.0009714886905968579, + "loss": 2.8745, + "step": 3945 + }, + { + "epoch": 0.1170121282210954, + "grad_norm": 0.2106458842754364, + "learning_rate": 0.0009714730272764167, + "loss": 2.8598, + "step": 3946 + }, + { + "epoch": 0.11704178157340688, + "grad_norm": 0.15959231555461884, + "learning_rate": 0.0009714573597810012, + "loss": 2.873, + "step": 3947 + }, + { + "epoch": 0.11707143492571835, + "grad_norm": 0.16793377697467804, + "learning_rate": 0.0009714416881107498, + "loss": 2.889, + "step": 3948 + }, + { + "epoch": 0.11710108827802983, + "grad_norm": 0.16492603719234467, + "learning_rate": 0.0009714260122658012, + "loss": 2.9161, + "step": 3949 + }, + { + "epoch": 0.1171307416303413, + "grad_norm": 0.15233376622200012, + "learning_rate": 0.0009714103322462944, + "loss": 2.8919, + "step": 3950 + }, + { + "epoch": 0.1171603949826528, + "grad_norm": 0.17030827701091766, + "learning_rate": 0.0009713946480523684, + "loss": 2.858, + "step": 3951 + }, + { + "epoch": 0.11719004833496427, + "grad_norm": 0.17152366042137146, + "learning_rate": 0.0009713789596841615, + "loss": 2.8841, + "step": 3952 + }, + { + "epoch": 0.11721970168727575, + "grad_norm": 0.18019893765449524, + "learning_rate": 0.0009713632671418133, + "loss": 2.8982, + "step": 3953 + }, + { + "epoch": 0.11724935503958722, + "grad_norm": 0.16391269862651825, + "learning_rate": 0.0009713475704254623, + "loss": 2.8727, + "step": 3954 + }, + { + "epoch": 0.1172790083918987, + "grad_norm": 0.15762430429458618, + "learning_rate": 0.0009713318695352478, + "loss": 2.8997, + "step": 3955 + }, + { + "epoch": 0.11730866174421019, + "grad_norm": 0.14847376942634583, + "learning_rate": 0.0009713161644713085, + "loss": 2.9016, + "step": 3956 + }, + { + "epoch": 0.11733831509652166, + "grad_norm": 0.13795574009418488, + "learning_rate": 0.0009713004552337839, + "loss": 2.9285, + "step": 3957 + }, + { + "epoch": 0.11736796844883314, + "grad_norm": 0.13848358392715454, + "learning_rate": 0.0009712847418228126, + "loss": 2.8747, + "step": 3958 + }, + { + "epoch": 0.11739762180114462, + "grad_norm": 0.14468176662921906, + "learning_rate": 0.0009712690242385342, + "loss": 2.8603, + "step": 3959 + }, + { + "epoch": 0.11742727515345609, + "grad_norm": 0.1697995960712433, + "learning_rate": 0.0009712533024810876, + "loss": 2.9338, + "step": 3960 + }, + { + "epoch": 0.11745692850576758, + "grad_norm": 0.19477425515651703, + "learning_rate": 0.0009712375765506122, + "loss": 2.8846, + "step": 3961 + }, + { + "epoch": 0.11748658185807906, + "grad_norm": 0.20126059651374817, + "learning_rate": 0.0009712218464472471, + "loss": 2.8695, + "step": 3962 + }, + { + "epoch": 0.11751623521039053, + "grad_norm": 0.1839759349822998, + "learning_rate": 0.0009712061121711317, + "loss": 2.904, + "step": 3963 + }, + { + "epoch": 0.11754588856270201, + "grad_norm": 0.14838124811649323, + "learning_rate": 0.0009711903737224054, + "loss": 2.8806, + "step": 3964 + }, + { + "epoch": 0.11757554191501349, + "grad_norm": 0.16292889416217804, + "learning_rate": 0.0009711746311012073, + "loss": 2.8758, + "step": 3965 + }, + { + "epoch": 0.11760519526732498, + "grad_norm": 0.19267114996910095, + "learning_rate": 0.0009711588843076771, + "loss": 2.8833, + "step": 3966 + }, + { + "epoch": 0.11763484861963645, + "grad_norm": 0.18914037942886353, + "learning_rate": 0.0009711431333419541, + "loss": 2.9269, + "step": 3967 + }, + { + "epoch": 0.11766450197194793, + "grad_norm": 0.185395285487175, + "learning_rate": 0.0009711273782041776, + "loss": 2.8828, + "step": 3968 + }, + { + "epoch": 0.1176941553242594, + "grad_norm": 0.18135517835617065, + "learning_rate": 0.0009711116188944874, + "loss": 2.9116, + "step": 3969 + }, + { + "epoch": 0.11772380867657088, + "grad_norm": 0.18066638708114624, + "learning_rate": 0.0009710958554130229, + "loss": 2.8949, + "step": 3970 + }, + { + "epoch": 0.11775346202888237, + "grad_norm": 0.18393933773040771, + "learning_rate": 0.0009710800877599239, + "loss": 2.9035, + "step": 3971 + }, + { + "epoch": 0.11778311538119385, + "grad_norm": 0.16591578722000122, + "learning_rate": 0.0009710643159353299, + "loss": 2.8493, + "step": 3972 + }, + { + "epoch": 0.11781276873350532, + "grad_norm": 0.1624690741300583, + "learning_rate": 0.0009710485399393803, + "loss": 2.8785, + "step": 3973 + }, + { + "epoch": 0.1178424220858168, + "grad_norm": 0.18464034795761108, + "learning_rate": 0.000971032759772215, + "loss": 2.8809, + "step": 3974 + }, + { + "epoch": 0.11787207543812828, + "grad_norm": 0.1720595806837082, + "learning_rate": 0.0009710169754339739, + "loss": 2.8883, + "step": 3975 + }, + { + "epoch": 0.11790172879043975, + "grad_norm": 0.17630285024642944, + "learning_rate": 0.0009710011869247967, + "loss": 2.9214, + "step": 3976 + }, + { + "epoch": 0.11793138214275124, + "grad_norm": 0.19508075714111328, + "learning_rate": 0.000970985394244823, + "loss": 2.8497, + "step": 3977 + }, + { + "epoch": 0.11796103549506272, + "grad_norm": 0.17477481067180634, + "learning_rate": 0.0009709695973941928, + "loss": 2.89, + "step": 3978 + }, + { + "epoch": 0.1179906888473742, + "grad_norm": 0.1673010289669037, + "learning_rate": 0.0009709537963730462, + "loss": 2.876, + "step": 3979 + }, + { + "epoch": 0.11802034219968567, + "grad_norm": 0.14140674471855164, + "learning_rate": 0.0009709379911815226, + "loss": 2.8772, + "step": 3980 + }, + { + "epoch": 0.11804999555199715, + "grad_norm": 0.14346247911453247, + "learning_rate": 0.0009709221818197624, + "loss": 2.8661, + "step": 3981 + }, + { + "epoch": 0.11807964890430864, + "grad_norm": 0.13466933369636536, + "learning_rate": 0.0009709063682879054, + "loss": 2.9169, + "step": 3982 + }, + { + "epoch": 0.11810930225662011, + "grad_norm": 0.13030871748924255, + "learning_rate": 0.0009708905505860917, + "loss": 2.8631, + "step": 3983 + }, + { + "epoch": 0.11813895560893159, + "grad_norm": 0.14996536076068878, + "learning_rate": 0.0009708747287144612, + "loss": 2.9191, + "step": 3984 + }, + { + "epoch": 0.11816860896124307, + "grad_norm": 0.16229961812496185, + "learning_rate": 0.0009708589026731544, + "loss": 2.9023, + "step": 3985 + }, + { + "epoch": 0.11819826231355454, + "grad_norm": 0.18735617399215698, + "learning_rate": 0.0009708430724623112, + "loss": 2.8967, + "step": 3986 + }, + { + "epoch": 0.11822791566586603, + "grad_norm": 0.2316695898771286, + "learning_rate": 0.0009708272380820715, + "loss": 2.8892, + "step": 3987 + }, + { + "epoch": 0.11825756901817751, + "grad_norm": 0.2265024334192276, + "learning_rate": 0.000970811399532576, + "loss": 2.89, + "step": 3988 + }, + { + "epoch": 0.11828722237048898, + "grad_norm": 0.18133623898029327, + "learning_rate": 0.0009707955568139647, + "loss": 2.873, + "step": 3989 + }, + { + "epoch": 0.11831687572280046, + "grad_norm": 0.1698227971792221, + "learning_rate": 0.000970779709926378, + "loss": 2.8744, + "step": 3990 + }, + { + "epoch": 0.11834652907511194, + "grad_norm": 0.1878935694694519, + "learning_rate": 0.0009707638588699561, + "loss": 2.8882, + "step": 3991 + }, + { + "epoch": 0.11837618242742343, + "grad_norm": 0.18035563826560974, + "learning_rate": 0.0009707480036448393, + "loss": 2.8853, + "step": 3992 + }, + { + "epoch": 0.1184058357797349, + "grad_norm": 0.18052849173545837, + "learning_rate": 0.0009707321442511683, + "loss": 2.8773, + "step": 3993 + }, + { + "epoch": 0.11843548913204638, + "grad_norm": 0.1616443395614624, + "learning_rate": 0.0009707162806890832, + "loss": 2.8562, + "step": 3994 + }, + { + "epoch": 0.11846514248435785, + "grad_norm": 0.14428049325942993, + "learning_rate": 0.000970700412958725, + "loss": 2.8612, + "step": 3995 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 0.14996293187141418, + "learning_rate": 0.0009706845410602334, + "loss": 2.869, + "step": 3996 + }, + { + "epoch": 0.11852444918898082, + "grad_norm": 0.15694274008274078, + "learning_rate": 0.0009706686649937496, + "loss": 2.9051, + "step": 3997 + }, + { + "epoch": 0.1185541025412923, + "grad_norm": 0.15316544473171234, + "learning_rate": 0.0009706527847594139, + "loss": 2.8929, + "step": 3998 + }, + { + "epoch": 0.11858375589360377, + "grad_norm": 0.14088857173919678, + "learning_rate": 0.0009706369003573672, + "loss": 2.8841, + "step": 3999 + }, + { + "epoch": 0.11861340924591525, + "grad_norm": 0.14285223186016083, + "learning_rate": 0.0009706210117877498, + "loss": 2.887, + "step": 4000 + }, + { + "epoch": 0.11864306259822673, + "grad_norm": 0.15981502830982208, + "learning_rate": 0.0009706051190507026, + "loss": 2.9103, + "step": 4001 + }, + { + "epoch": 0.1186727159505382, + "grad_norm": 0.14582380652427673, + "learning_rate": 0.0009705892221463663, + "loss": 2.8804, + "step": 4002 + }, + { + "epoch": 0.11870236930284969, + "grad_norm": 0.1384301334619522, + "learning_rate": 0.0009705733210748816, + "loss": 2.8828, + "step": 4003 + }, + { + "epoch": 0.11873202265516117, + "grad_norm": 0.13488183915615082, + "learning_rate": 0.0009705574158363894, + "loss": 2.908, + "step": 4004 + }, + { + "epoch": 0.11876167600747264, + "grad_norm": 0.14713533222675323, + "learning_rate": 0.0009705415064310306, + "loss": 2.875, + "step": 4005 + }, + { + "epoch": 0.11879132935978412, + "grad_norm": 0.14775368571281433, + "learning_rate": 0.0009705255928589458, + "loss": 2.8684, + "step": 4006 + }, + { + "epoch": 0.1188209827120956, + "grad_norm": 0.15237994492053986, + "learning_rate": 0.0009705096751202763, + "loss": 2.8919, + "step": 4007 + }, + { + "epoch": 0.11885063606440709, + "grad_norm": 0.13762855529785156, + "learning_rate": 0.0009704937532151628, + "loss": 2.9015, + "step": 4008 + }, + { + "epoch": 0.11888028941671856, + "grad_norm": 0.15679989755153656, + "learning_rate": 0.0009704778271437465, + "loss": 2.8969, + "step": 4009 + }, + { + "epoch": 0.11890994276903004, + "grad_norm": 0.17789201438426971, + "learning_rate": 0.0009704618969061681, + "loss": 2.8692, + "step": 4010 + }, + { + "epoch": 0.11893959612134151, + "grad_norm": 0.19905616343021393, + "learning_rate": 0.0009704459625025688, + "loss": 2.8836, + "step": 4011 + }, + { + "epoch": 0.11896924947365299, + "grad_norm": 0.19908662140369415, + "learning_rate": 0.0009704300239330899, + "loss": 2.9121, + "step": 4012 + }, + { + "epoch": 0.11899890282596448, + "grad_norm": 0.22430653870105743, + "learning_rate": 0.0009704140811978724, + "loss": 2.8666, + "step": 4013 + }, + { + "epoch": 0.11902855617827596, + "grad_norm": 0.2653953731060028, + "learning_rate": 0.0009703981342970572, + "loss": 2.8792, + "step": 4014 + }, + { + "epoch": 0.11905820953058743, + "grad_norm": 0.23393797874450684, + "learning_rate": 0.000970382183230786, + "loss": 2.8987, + "step": 4015 + }, + { + "epoch": 0.11908786288289891, + "grad_norm": 0.18018509447574615, + "learning_rate": 0.0009703662279991998, + "loss": 2.8648, + "step": 4016 + }, + { + "epoch": 0.11911751623521039, + "grad_norm": 0.19579434394836426, + "learning_rate": 0.0009703502686024399, + "loss": 2.8755, + "step": 4017 + }, + { + "epoch": 0.11914716958752188, + "grad_norm": 0.15788474678993225, + "learning_rate": 0.0009703343050406477, + "loss": 2.882, + "step": 4018 + }, + { + "epoch": 0.11917682293983335, + "grad_norm": 0.17830580472946167, + "learning_rate": 0.0009703183373139645, + "loss": 2.8882, + "step": 4019 + }, + { + "epoch": 0.11920647629214483, + "grad_norm": 0.18334901332855225, + "learning_rate": 0.0009703023654225316, + "loss": 2.8641, + "step": 4020 + }, + { + "epoch": 0.1192361296444563, + "grad_norm": 0.16016560792922974, + "learning_rate": 0.0009702863893664907, + "loss": 2.8591, + "step": 4021 + }, + { + "epoch": 0.11926578299676778, + "grad_norm": 0.13970522582530975, + "learning_rate": 0.0009702704091459829, + "loss": 2.9049, + "step": 4022 + }, + { + "epoch": 0.11929543634907927, + "grad_norm": 0.14684276282787323, + "learning_rate": 0.0009702544247611499, + "loss": 2.8867, + "step": 4023 + }, + { + "epoch": 0.11932508970139075, + "grad_norm": 0.138013556599617, + "learning_rate": 0.0009702384362121333, + "loss": 2.8728, + "step": 4024 + }, + { + "epoch": 0.11935474305370222, + "grad_norm": 0.14020881056785583, + "learning_rate": 0.0009702224434990748, + "loss": 2.8726, + "step": 4025 + }, + { + "epoch": 0.1193843964060137, + "grad_norm": 0.14758209884166718, + "learning_rate": 0.0009702064466221156, + "loss": 2.8734, + "step": 4026 + }, + { + "epoch": 0.11941404975832517, + "grad_norm": 0.1660165637731552, + "learning_rate": 0.0009701904455813976, + "loss": 2.8976, + "step": 4027 + }, + { + "epoch": 0.11944370311063665, + "grad_norm": 0.20344623923301697, + "learning_rate": 0.0009701744403770627, + "loss": 2.8794, + "step": 4028 + }, + { + "epoch": 0.11947335646294814, + "grad_norm": 0.21394525468349457, + "learning_rate": 0.0009701584310092524, + "loss": 2.8943, + "step": 4029 + }, + { + "epoch": 0.11950300981525962, + "grad_norm": 0.1958407461643219, + "learning_rate": 0.0009701424174781084, + "loss": 2.9135, + "step": 4030 + }, + { + "epoch": 0.1195326631675711, + "grad_norm": 0.173900306224823, + "learning_rate": 0.0009701263997837726, + "loss": 2.8591, + "step": 4031 + }, + { + "epoch": 0.11956231651988257, + "grad_norm": 0.1480322927236557, + "learning_rate": 0.0009701103779263868, + "loss": 2.8893, + "step": 4032 + }, + { + "epoch": 0.11959196987219405, + "grad_norm": 0.1843489408493042, + "learning_rate": 0.0009700943519060929, + "loss": 2.8284, + "step": 4033 + }, + { + "epoch": 0.11962162322450554, + "grad_norm": 0.1736505627632141, + "learning_rate": 0.000970078321723033, + "loss": 2.8592, + "step": 4034 + }, + { + "epoch": 0.11965127657681701, + "grad_norm": 0.1610173135995865, + "learning_rate": 0.0009700622873773489, + "loss": 2.8822, + "step": 4035 + }, + { + "epoch": 0.11968092992912849, + "grad_norm": 0.15388113260269165, + "learning_rate": 0.0009700462488691823, + "loss": 2.8612, + "step": 4036 + }, + { + "epoch": 0.11971058328143996, + "grad_norm": 0.17087028920650482, + "learning_rate": 0.0009700302061986756, + "loss": 2.9311, + "step": 4037 + }, + { + "epoch": 0.11974023663375144, + "grad_norm": 0.16310684382915497, + "learning_rate": 0.0009700141593659708, + "loss": 2.8984, + "step": 4038 + }, + { + "epoch": 0.11976988998606293, + "grad_norm": 0.17171750962734222, + "learning_rate": 0.0009699981083712098, + "loss": 2.8533, + "step": 4039 + }, + { + "epoch": 0.1197995433383744, + "grad_norm": 0.17798876762390137, + "learning_rate": 0.0009699820532145351, + "loss": 2.9028, + "step": 4040 + }, + { + "epoch": 0.11982919669068588, + "grad_norm": 0.1765255481004715, + "learning_rate": 0.0009699659938960884, + "loss": 2.8859, + "step": 4041 + }, + { + "epoch": 0.11985885004299736, + "grad_norm": 0.16491465270519257, + "learning_rate": 0.0009699499304160124, + "loss": 2.8606, + "step": 4042 + }, + { + "epoch": 0.11988850339530883, + "grad_norm": 0.1573878824710846, + "learning_rate": 0.000969933862774449, + "loss": 2.8767, + "step": 4043 + }, + { + "epoch": 0.11991815674762032, + "grad_norm": 0.18018567562103271, + "learning_rate": 0.0009699177909715404, + "loss": 2.885, + "step": 4044 + }, + { + "epoch": 0.1199478100999318, + "grad_norm": 0.15234991908073425, + "learning_rate": 0.0009699017150074293, + "loss": 2.8877, + "step": 4045 + }, + { + "epoch": 0.11997746345224328, + "grad_norm": 0.1570829302072525, + "learning_rate": 0.0009698856348822577, + "loss": 2.9068, + "step": 4046 + }, + { + "epoch": 0.12000711680455475, + "grad_norm": 0.14891856908798218, + "learning_rate": 0.0009698695505961683, + "loss": 2.8666, + "step": 4047 + }, + { + "epoch": 0.12003677015686623, + "grad_norm": 0.1456296145915985, + "learning_rate": 0.0009698534621493033, + "loss": 2.8965, + "step": 4048 + }, + { + "epoch": 0.12006642350917772, + "grad_norm": 0.18283942341804504, + "learning_rate": 0.0009698373695418054, + "loss": 2.8819, + "step": 4049 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 0.1851814240217209, + "learning_rate": 0.0009698212727738168, + "loss": 2.9311, + "step": 4050 + }, + { + "epoch": 0.12012573021380067, + "grad_norm": 0.19869057834148407, + "learning_rate": 0.0009698051718454802, + "loss": 2.8802, + "step": 4051 + }, + { + "epoch": 0.12015538356611215, + "grad_norm": 0.18953362107276917, + "learning_rate": 0.0009697890667569383, + "loss": 2.8549, + "step": 4052 + }, + { + "epoch": 0.12018503691842362, + "grad_norm": 0.15315496921539307, + "learning_rate": 0.0009697729575083334, + "loss": 2.8503, + "step": 4053 + }, + { + "epoch": 0.1202146902707351, + "grad_norm": 0.19071410596370697, + "learning_rate": 0.0009697568440998084, + "loss": 2.8673, + "step": 4054 + }, + { + "epoch": 0.12024434362304659, + "grad_norm": 0.20493771135807037, + "learning_rate": 0.0009697407265315058, + "loss": 2.9231, + "step": 4055 + }, + { + "epoch": 0.12027399697535807, + "grad_norm": 0.1499268114566803, + "learning_rate": 0.0009697246048035686, + "loss": 2.8657, + "step": 4056 + }, + { + "epoch": 0.12030365032766954, + "grad_norm": 0.15914396941661835, + "learning_rate": 0.0009697084789161392, + "loss": 2.9244, + "step": 4057 + }, + { + "epoch": 0.12033330367998102, + "grad_norm": 0.15379908680915833, + "learning_rate": 0.0009696923488693608, + "loss": 2.8469, + "step": 4058 + }, + { + "epoch": 0.1203629570322925, + "grad_norm": 0.1481287032365799, + "learning_rate": 0.000969676214663376, + "loss": 2.8767, + "step": 4059 + }, + { + "epoch": 0.12039261038460398, + "grad_norm": 0.15288148820400238, + "learning_rate": 0.0009696600762983277, + "loss": 2.883, + "step": 4060 + }, + { + "epoch": 0.12042226373691546, + "grad_norm": 0.16412505507469177, + "learning_rate": 0.0009696439337743586, + "loss": 2.9003, + "step": 4061 + }, + { + "epoch": 0.12045191708922694, + "grad_norm": 0.1599489003419876, + "learning_rate": 0.0009696277870916121, + "loss": 2.87, + "step": 4062 + }, + { + "epoch": 0.12048157044153841, + "grad_norm": 0.15067367255687714, + "learning_rate": 0.0009696116362502308, + "loss": 2.9198, + "step": 4063 + }, + { + "epoch": 0.12051122379384989, + "grad_norm": 0.15061230957508087, + "learning_rate": 0.0009695954812503578, + "loss": 2.8756, + "step": 4064 + }, + { + "epoch": 0.12054087714616138, + "grad_norm": 0.1637287735939026, + "learning_rate": 0.0009695793220921364, + "loss": 2.8734, + "step": 4065 + }, + { + "epoch": 0.12057053049847286, + "grad_norm": 0.18958011269569397, + "learning_rate": 0.0009695631587757095, + "loss": 2.8695, + "step": 4066 + }, + { + "epoch": 0.12060018385078433, + "grad_norm": 0.19851702451705933, + "learning_rate": 0.00096954699130122, + "loss": 2.8743, + "step": 4067 + }, + { + "epoch": 0.12062983720309581, + "grad_norm": 0.1973983496427536, + "learning_rate": 0.0009695308196688115, + "loss": 2.9034, + "step": 4068 + }, + { + "epoch": 0.12065949055540728, + "grad_norm": 0.21072328090667725, + "learning_rate": 0.0009695146438786268, + "loss": 2.8905, + "step": 4069 + }, + { + "epoch": 0.12068914390771877, + "grad_norm": 0.2240707278251648, + "learning_rate": 0.0009694984639308095, + "loss": 2.8766, + "step": 4070 + }, + { + "epoch": 0.12071879726003025, + "grad_norm": 0.17150601744651794, + "learning_rate": 0.0009694822798255027, + "loss": 2.8673, + "step": 4071 + }, + { + "epoch": 0.12074845061234173, + "grad_norm": 0.1566307693719864, + "learning_rate": 0.0009694660915628497, + "loss": 2.8936, + "step": 4072 + }, + { + "epoch": 0.1207781039646532, + "grad_norm": 0.18505644798278809, + "learning_rate": 0.0009694498991429938, + "loss": 2.9112, + "step": 4073 + }, + { + "epoch": 0.12080775731696468, + "grad_norm": 0.1899765431880951, + "learning_rate": 0.0009694337025660787, + "loss": 2.897, + "step": 4074 + }, + { + "epoch": 0.12083741066927617, + "grad_norm": 0.1949891299009323, + "learning_rate": 0.0009694175018322473, + "loss": 2.8749, + "step": 4075 + }, + { + "epoch": 0.12086706402158764, + "grad_norm": 0.19049647450447083, + "learning_rate": 0.0009694012969416436, + "loss": 2.8659, + "step": 4076 + }, + { + "epoch": 0.12089671737389912, + "grad_norm": 0.16255338490009308, + "learning_rate": 0.0009693850878944106, + "loss": 2.8659, + "step": 4077 + }, + { + "epoch": 0.1209263707262106, + "grad_norm": 0.15206433832645416, + "learning_rate": 0.0009693688746906923, + "loss": 2.8945, + "step": 4078 + }, + { + "epoch": 0.12095602407852207, + "grad_norm": 0.15709517896175385, + "learning_rate": 0.0009693526573306319, + "loss": 2.9084, + "step": 4079 + }, + { + "epoch": 0.12098567743083355, + "grad_norm": 0.14524441957473755, + "learning_rate": 0.0009693364358143732, + "loss": 2.905, + "step": 4080 + }, + { + "epoch": 0.12101533078314504, + "grad_norm": 0.1417893022298813, + "learning_rate": 0.0009693202101420599, + "loss": 2.887, + "step": 4081 + }, + { + "epoch": 0.12104498413545652, + "grad_norm": 0.14415428042411804, + "learning_rate": 0.0009693039803138355, + "loss": 2.8513, + "step": 4082 + }, + { + "epoch": 0.12107463748776799, + "grad_norm": 0.14102669060230255, + "learning_rate": 0.0009692877463298437, + "loss": 2.8806, + "step": 4083 + }, + { + "epoch": 0.12110429084007947, + "grad_norm": 0.14891387522220612, + "learning_rate": 0.0009692715081902285, + "loss": 2.8666, + "step": 4084 + }, + { + "epoch": 0.12113394419239094, + "grad_norm": 0.13906678557395935, + "learning_rate": 0.0009692552658951334, + "loss": 2.8465, + "step": 4085 + }, + { + "epoch": 0.12116359754470243, + "grad_norm": 0.1474997103214264, + "learning_rate": 0.0009692390194447025, + "loss": 2.8401, + "step": 4086 + }, + { + "epoch": 0.12119325089701391, + "grad_norm": 0.1620621234178543, + "learning_rate": 0.0009692227688390796, + "loss": 2.8918, + "step": 4087 + }, + { + "epoch": 0.12122290424932539, + "grad_norm": 0.15423187613487244, + "learning_rate": 0.0009692065140784083, + "loss": 2.8643, + "step": 4088 + }, + { + "epoch": 0.12125255760163686, + "grad_norm": 0.1676342636346817, + "learning_rate": 0.0009691902551628329, + "loss": 2.9313, + "step": 4089 + }, + { + "epoch": 0.12128221095394834, + "grad_norm": 0.19896864891052246, + "learning_rate": 0.0009691739920924974, + "loss": 2.8751, + "step": 4090 + }, + { + "epoch": 0.12131186430625983, + "grad_norm": 0.21163451671600342, + "learning_rate": 0.0009691577248675454, + "loss": 2.9005, + "step": 4091 + }, + { + "epoch": 0.1213415176585713, + "grad_norm": 0.1866862177848816, + "learning_rate": 0.0009691414534881215, + "loss": 2.9, + "step": 4092 + }, + { + "epoch": 0.12137117101088278, + "grad_norm": 0.17373524606227875, + "learning_rate": 0.0009691251779543693, + "loss": 2.8526, + "step": 4093 + }, + { + "epoch": 0.12140082436319426, + "grad_norm": 0.14796654880046844, + "learning_rate": 0.0009691088982664331, + "loss": 2.8725, + "step": 4094 + }, + { + "epoch": 0.12143047771550573, + "grad_norm": 0.14772291481494904, + "learning_rate": 0.0009690926144244571, + "loss": 2.8819, + "step": 4095 + }, + { + "epoch": 0.12146013106781722, + "grad_norm": 0.1454543173313141, + "learning_rate": 0.0009690763264285857, + "loss": 2.8498, + "step": 4096 + }, + { + "epoch": 0.1214897844201287, + "grad_norm": 0.1480015516281128, + "learning_rate": 0.0009690600342789627, + "loss": 2.8789, + "step": 4097 + }, + { + "epoch": 0.12151943777244018, + "grad_norm": 0.15102311968803406, + "learning_rate": 0.0009690437379757327, + "loss": 2.8711, + "step": 4098 + }, + { + "epoch": 0.12154909112475165, + "grad_norm": 0.1615646630525589, + "learning_rate": 0.0009690274375190398, + "loss": 2.8949, + "step": 4099 + }, + { + "epoch": 0.12157874447706313, + "grad_norm": 0.15635299682617188, + "learning_rate": 0.0009690111329090285, + "loss": 2.8818, + "step": 4100 + }, + { + "epoch": 0.12160839782937462, + "grad_norm": 0.13961005210876465, + "learning_rate": 0.0009689948241458431, + "loss": 2.812, + "step": 4101 + }, + { + "epoch": 0.1216380511816861, + "grad_norm": 0.14947283267974854, + "learning_rate": 0.000968978511229628, + "loss": 2.8824, + "step": 4102 + }, + { + "epoch": 0.12166770453399757, + "grad_norm": 0.15287505090236664, + "learning_rate": 0.0009689621941605278, + "loss": 2.8788, + "step": 4103 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 0.17179572582244873, + "learning_rate": 0.0009689458729386866, + "loss": 2.9048, + "step": 4104 + }, + { + "epoch": 0.12172701123862052, + "grad_norm": 0.16436293721199036, + "learning_rate": 0.0009689295475642495, + "loss": 2.8921, + "step": 4105 + }, + { + "epoch": 0.121756664590932, + "grad_norm": 0.17768071591854095, + "learning_rate": 0.0009689132180373606, + "loss": 2.8748, + "step": 4106 + }, + { + "epoch": 0.12178631794324349, + "grad_norm": 0.2041126787662506, + "learning_rate": 0.0009688968843581647, + "loss": 2.8253, + "step": 4107 + }, + { + "epoch": 0.12181597129555496, + "grad_norm": 0.1931096911430359, + "learning_rate": 0.0009688805465268064, + "loss": 2.8699, + "step": 4108 + }, + { + "epoch": 0.12184562464786644, + "grad_norm": 0.18173423409461975, + "learning_rate": 0.0009688642045434304, + "loss": 2.8838, + "step": 4109 + }, + { + "epoch": 0.12187527800017792, + "grad_norm": 0.18858882784843445, + "learning_rate": 0.0009688478584081813, + "loss": 2.8744, + "step": 4110 + }, + { + "epoch": 0.12190493135248939, + "grad_norm": 0.21136420965194702, + "learning_rate": 0.000968831508121204, + "loss": 2.8693, + "step": 4111 + }, + { + "epoch": 0.12193458470480088, + "grad_norm": 0.19945010542869568, + "learning_rate": 0.0009688151536826433, + "loss": 2.8349, + "step": 4112 + }, + { + "epoch": 0.12196423805711236, + "grad_norm": 0.20533084869384766, + "learning_rate": 0.000968798795092644, + "loss": 2.8777, + "step": 4113 + }, + { + "epoch": 0.12199389140942384, + "grad_norm": 0.19414246082305908, + "learning_rate": 0.0009687824323513506, + "loss": 2.8944, + "step": 4114 + }, + { + "epoch": 0.12202354476173531, + "grad_norm": 0.16992507874965668, + "learning_rate": 0.0009687660654589085, + "loss": 2.9031, + "step": 4115 + }, + { + "epoch": 0.12205319811404679, + "grad_norm": 0.18510238826274872, + "learning_rate": 0.0009687496944154625, + "loss": 2.9034, + "step": 4116 + }, + { + "epoch": 0.12208285146635828, + "grad_norm": 0.18146024644374847, + "learning_rate": 0.0009687333192211574, + "loss": 2.9033, + "step": 4117 + }, + { + "epoch": 0.12211250481866975, + "grad_norm": 0.18917672336101532, + "learning_rate": 0.0009687169398761382, + "loss": 2.8829, + "step": 4118 + }, + { + "epoch": 0.12214215817098123, + "grad_norm": 0.19787709414958954, + "learning_rate": 0.0009687005563805502, + "loss": 2.8953, + "step": 4119 + }, + { + "epoch": 0.1221718115232927, + "grad_norm": 0.17194864153862, + "learning_rate": 0.0009686841687345382, + "loss": 2.8929, + "step": 4120 + }, + { + "epoch": 0.12220146487560418, + "grad_norm": 0.170098215341568, + "learning_rate": 0.0009686677769382474, + "loss": 2.8818, + "step": 4121 + }, + { + "epoch": 0.12223111822791567, + "grad_norm": 0.17484115064144135, + "learning_rate": 0.0009686513809918232, + "loss": 2.9082, + "step": 4122 + }, + { + "epoch": 0.12226077158022715, + "grad_norm": 0.19256213307380676, + "learning_rate": 0.0009686349808954105, + "loss": 2.8779, + "step": 4123 + }, + { + "epoch": 0.12229042493253862, + "grad_norm": 0.18113042414188385, + "learning_rate": 0.0009686185766491546, + "loss": 2.8677, + "step": 4124 + }, + { + "epoch": 0.1223200782848501, + "grad_norm": 0.15717782080173492, + "learning_rate": 0.0009686021682532007, + "loss": 2.8796, + "step": 4125 + }, + { + "epoch": 0.12234973163716158, + "grad_norm": 0.18361006677150726, + "learning_rate": 0.0009685857557076942, + "loss": 2.8995, + "step": 4126 + }, + { + "epoch": 0.12237938498947307, + "grad_norm": 0.1704261302947998, + "learning_rate": 0.0009685693390127805, + "loss": 2.852, + "step": 4127 + }, + { + "epoch": 0.12240903834178454, + "grad_norm": 0.1702761948108673, + "learning_rate": 0.0009685529181686048, + "loss": 2.8683, + "step": 4128 + }, + { + "epoch": 0.12243869169409602, + "grad_norm": 0.17351949214935303, + "learning_rate": 0.0009685364931753124, + "loss": 2.8864, + "step": 4129 + }, + { + "epoch": 0.1224683450464075, + "grad_norm": 0.1706933081150055, + "learning_rate": 0.0009685200640330491, + "loss": 2.9083, + "step": 4130 + }, + { + "epoch": 0.12249799839871897, + "grad_norm": 0.17000718414783478, + "learning_rate": 0.0009685036307419604, + "loss": 2.8908, + "step": 4131 + }, + { + "epoch": 0.12252765175103045, + "grad_norm": 0.16904380917549133, + "learning_rate": 0.0009684871933021913, + "loss": 2.873, + "step": 4132 + }, + { + "epoch": 0.12255730510334194, + "grad_norm": 0.1562974750995636, + "learning_rate": 0.0009684707517138879, + "loss": 2.8671, + "step": 4133 + }, + { + "epoch": 0.12258695845565341, + "grad_norm": 0.1661602258682251, + "learning_rate": 0.0009684543059771955, + "loss": 2.8744, + "step": 4134 + }, + { + "epoch": 0.12261661180796489, + "grad_norm": 0.16764691472053528, + "learning_rate": 0.0009684378560922597, + "loss": 2.8909, + "step": 4135 + }, + { + "epoch": 0.12264626516027637, + "grad_norm": 0.1733025163412094, + "learning_rate": 0.0009684214020592265, + "loss": 2.9148, + "step": 4136 + }, + { + "epoch": 0.12267591851258784, + "grad_norm": 0.1959434449672699, + "learning_rate": 0.0009684049438782413, + "loss": 2.877, + "step": 4137 + }, + { + "epoch": 0.12270557186489933, + "grad_norm": 0.21314266324043274, + "learning_rate": 0.0009683884815494499, + "loss": 2.8867, + "step": 4138 + }, + { + "epoch": 0.12273522521721081, + "grad_norm": 0.1751747876405716, + "learning_rate": 0.0009683720150729981, + "loss": 2.9072, + "step": 4139 + }, + { + "epoch": 0.12276487856952228, + "grad_norm": 0.13946951925754547, + "learning_rate": 0.0009683555444490317, + "loss": 2.8811, + "step": 4140 + }, + { + "epoch": 0.12279453192183376, + "grad_norm": 0.17021261155605316, + "learning_rate": 0.0009683390696776966, + "loss": 2.8956, + "step": 4141 + }, + { + "epoch": 0.12282418527414524, + "grad_norm": 0.18278662860393524, + "learning_rate": 0.0009683225907591386, + "loss": 2.8916, + "step": 4142 + }, + { + "epoch": 0.12285383862645673, + "grad_norm": 0.1783309429883957, + "learning_rate": 0.0009683061076935037, + "loss": 2.8487, + "step": 4143 + }, + { + "epoch": 0.1228834919787682, + "grad_norm": 0.17163735628128052, + "learning_rate": 0.0009682896204809378, + "loss": 2.8847, + "step": 4144 + }, + { + "epoch": 0.12291314533107968, + "grad_norm": 0.16250847280025482, + "learning_rate": 0.000968273129121587, + "loss": 2.8513, + "step": 4145 + }, + { + "epoch": 0.12294279868339116, + "grad_norm": 0.1421467810869217, + "learning_rate": 0.0009682566336155971, + "loss": 2.9, + "step": 4146 + }, + { + "epoch": 0.12297245203570263, + "grad_norm": 0.15496842563152313, + "learning_rate": 0.0009682401339631146, + "loss": 2.9064, + "step": 4147 + }, + { + "epoch": 0.12300210538801412, + "grad_norm": 0.14169910550117493, + "learning_rate": 0.0009682236301642852, + "loss": 2.9276, + "step": 4148 + }, + { + "epoch": 0.1230317587403256, + "grad_norm": 0.14446136355400085, + "learning_rate": 0.0009682071222192551, + "loss": 2.8717, + "step": 4149 + }, + { + "epoch": 0.12306141209263707, + "grad_norm": 0.12850956618785858, + "learning_rate": 0.0009681906101281707, + "loss": 2.9111, + "step": 4150 + }, + { + "epoch": 0.12309106544494855, + "grad_norm": 0.13808086514472961, + "learning_rate": 0.0009681740938911781, + "loss": 2.8636, + "step": 4151 + }, + { + "epoch": 0.12312071879726003, + "grad_norm": 0.12835252285003662, + "learning_rate": 0.0009681575735084233, + "loss": 2.85, + "step": 4152 + }, + { + "epoch": 0.12315037214957152, + "grad_norm": 0.13267406821250916, + "learning_rate": 0.000968141048980053, + "loss": 2.8862, + "step": 4153 + }, + { + "epoch": 0.12318002550188299, + "grad_norm": 0.14840175211429596, + "learning_rate": 0.0009681245203062135, + "loss": 2.8586, + "step": 4154 + }, + { + "epoch": 0.12320967885419447, + "grad_norm": 0.1531345248222351, + "learning_rate": 0.0009681079874870508, + "loss": 2.8882, + "step": 4155 + }, + { + "epoch": 0.12323933220650594, + "grad_norm": 0.17797784507274628, + "learning_rate": 0.0009680914505227116, + "loss": 2.9121, + "step": 4156 + }, + { + "epoch": 0.12326898555881742, + "grad_norm": 0.21566829085350037, + "learning_rate": 0.0009680749094133423, + "loss": 2.8667, + "step": 4157 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 0.2141442745923996, + "learning_rate": 0.0009680583641590892, + "loss": 2.8985, + "step": 4158 + }, + { + "epoch": 0.12332829226344039, + "grad_norm": 0.18761904537677765, + "learning_rate": 0.0009680418147600991, + "loss": 2.8991, + "step": 4159 + }, + { + "epoch": 0.12335794561575186, + "grad_norm": 0.1865551769733429, + "learning_rate": 0.0009680252612165183, + "loss": 2.8946, + "step": 4160 + }, + { + "epoch": 0.12338759896806334, + "grad_norm": 0.19173215329647064, + "learning_rate": 0.0009680087035284935, + "loss": 2.8884, + "step": 4161 + }, + { + "epoch": 0.12341725232037482, + "grad_norm": 0.1959259957075119, + "learning_rate": 0.0009679921416961713, + "loss": 2.8557, + "step": 4162 + }, + { + "epoch": 0.12344690567268629, + "grad_norm": 0.20646506547927856, + "learning_rate": 0.0009679755757196984, + "loss": 2.8766, + "step": 4163 + }, + { + "epoch": 0.12347655902499778, + "grad_norm": 0.2058059126138687, + "learning_rate": 0.0009679590055992213, + "loss": 2.8713, + "step": 4164 + }, + { + "epoch": 0.12350621237730926, + "grad_norm": 0.2066449671983719, + "learning_rate": 0.000967942431334887, + "loss": 2.8703, + "step": 4165 + }, + { + "epoch": 0.12353586572962073, + "grad_norm": 0.19435031712055206, + "learning_rate": 0.0009679258529268422, + "loss": 2.9188, + "step": 4166 + }, + { + "epoch": 0.12356551908193221, + "grad_norm": 0.18189671635627747, + "learning_rate": 0.0009679092703752336, + "loss": 2.8689, + "step": 4167 + }, + { + "epoch": 0.12359517243424369, + "grad_norm": 0.1515432447195053, + "learning_rate": 0.000967892683680208, + "loss": 2.9138, + "step": 4168 + }, + { + "epoch": 0.12362482578655518, + "grad_norm": 0.1701812744140625, + "learning_rate": 0.0009678760928419124, + "loss": 2.847, + "step": 4169 + }, + { + "epoch": 0.12365447913886665, + "grad_norm": 0.16717128455638885, + "learning_rate": 0.0009678594978604938, + "loss": 2.8536, + "step": 4170 + }, + { + "epoch": 0.12368413249117813, + "grad_norm": 0.15394391119480133, + "learning_rate": 0.000967842898736099, + "loss": 2.8685, + "step": 4171 + }, + { + "epoch": 0.1237137858434896, + "grad_norm": 0.1648608297109604, + "learning_rate": 0.000967826295468875, + "loss": 2.8991, + "step": 4172 + }, + { + "epoch": 0.12374343919580108, + "grad_norm": 0.18637874722480774, + "learning_rate": 0.0009678096880589689, + "loss": 2.8535, + "step": 4173 + }, + { + "epoch": 0.12377309254811257, + "grad_norm": 0.17630186676979065, + "learning_rate": 0.0009677930765065277, + "loss": 2.887, + "step": 4174 + }, + { + "epoch": 0.12380274590042405, + "grad_norm": 0.2087106704711914, + "learning_rate": 0.0009677764608116984, + "loss": 2.8687, + "step": 4175 + }, + { + "epoch": 0.12383239925273552, + "grad_norm": 0.21712280809879303, + "learning_rate": 0.0009677598409746285, + "loss": 2.8812, + "step": 4176 + }, + { + "epoch": 0.123862052605047, + "grad_norm": 0.18687480688095093, + "learning_rate": 0.0009677432169954646, + "loss": 2.8942, + "step": 4177 + }, + { + "epoch": 0.12389170595735848, + "grad_norm": 0.18536527454853058, + "learning_rate": 0.0009677265888743545, + "loss": 2.886, + "step": 4178 + }, + { + "epoch": 0.12392135930966997, + "grad_norm": 0.17359597980976105, + "learning_rate": 0.0009677099566114449, + "loss": 2.8661, + "step": 4179 + }, + { + "epoch": 0.12395101266198144, + "grad_norm": 0.14906297624111176, + "learning_rate": 0.0009676933202068836, + "loss": 2.8901, + "step": 4180 + }, + { + "epoch": 0.12398066601429292, + "grad_norm": 0.14652115106582642, + "learning_rate": 0.0009676766796608175, + "loss": 2.8699, + "step": 4181 + }, + { + "epoch": 0.1240103193666044, + "grad_norm": 0.15271110832691193, + "learning_rate": 0.0009676600349733942, + "loss": 2.8229, + "step": 4182 + }, + { + "epoch": 0.12403997271891587, + "grad_norm": 0.14219355583190918, + "learning_rate": 0.0009676433861447612, + "loss": 2.875, + "step": 4183 + }, + { + "epoch": 0.12406962607122735, + "grad_norm": 0.14902739226818085, + "learning_rate": 0.0009676267331750654, + "loss": 2.9341, + "step": 4184 + }, + { + "epoch": 0.12409927942353884, + "grad_norm": 0.1681215465068817, + "learning_rate": 0.0009676100760644548, + "loss": 2.8915, + "step": 4185 + }, + { + "epoch": 0.12412893277585031, + "grad_norm": 0.1666153073310852, + "learning_rate": 0.0009675934148130767, + "loss": 2.8804, + "step": 4186 + }, + { + "epoch": 0.12415858612816179, + "grad_norm": 0.16258187592029572, + "learning_rate": 0.0009675767494210785, + "loss": 2.857, + "step": 4187 + }, + { + "epoch": 0.12418823948047326, + "grad_norm": 0.14736708998680115, + "learning_rate": 0.0009675600798886082, + "loss": 2.8244, + "step": 4188 + }, + { + "epoch": 0.12421789283278474, + "grad_norm": 0.14105194807052612, + "learning_rate": 0.000967543406215813, + "loss": 2.8605, + "step": 4189 + }, + { + "epoch": 0.12424754618509623, + "grad_norm": 0.145621195435524, + "learning_rate": 0.0009675267284028407, + "loss": 2.8523, + "step": 4190 + }, + { + "epoch": 0.1242771995374077, + "grad_norm": 0.13603775203227997, + "learning_rate": 0.000967510046449839, + "loss": 2.8573, + "step": 4191 + }, + { + "epoch": 0.12430685288971918, + "grad_norm": 0.1412336379289627, + "learning_rate": 0.0009674933603569555, + "loss": 2.8594, + "step": 4192 + }, + { + "epoch": 0.12433650624203066, + "grad_norm": 0.15513883531093597, + "learning_rate": 0.0009674766701243381, + "loss": 2.8642, + "step": 4193 + }, + { + "epoch": 0.12436615959434213, + "grad_norm": 0.16201984882354736, + "learning_rate": 0.0009674599757521345, + "loss": 2.8411, + "step": 4194 + }, + { + "epoch": 0.12439581294665362, + "grad_norm": 0.14715497195720673, + "learning_rate": 0.0009674432772404926, + "loss": 2.8642, + "step": 4195 + }, + { + "epoch": 0.1244254662989651, + "grad_norm": 0.13286156952381134, + "learning_rate": 0.0009674265745895602, + "loss": 2.8413, + "step": 4196 + }, + { + "epoch": 0.12445511965127658, + "grad_norm": 0.1472068428993225, + "learning_rate": 0.0009674098677994854, + "loss": 2.8586, + "step": 4197 + }, + { + "epoch": 0.12448477300358805, + "grad_norm": 0.1713421493768692, + "learning_rate": 0.000967393156870416, + "loss": 2.8954, + "step": 4198 + }, + { + "epoch": 0.12451442635589953, + "grad_norm": 0.1880786120891571, + "learning_rate": 0.0009673764418024997, + "loss": 2.8294, + "step": 4199 + }, + { + "epoch": 0.12454407970821102, + "grad_norm": 0.19478203356266022, + "learning_rate": 0.000967359722595885, + "loss": 2.8825, + "step": 4200 + }, + { + "epoch": 0.1245737330605225, + "grad_norm": 0.18851281702518463, + "learning_rate": 0.0009673429992507197, + "loss": 2.8898, + "step": 4201 + }, + { + "epoch": 0.12460338641283397, + "grad_norm": 0.1463925987482071, + "learning_rate": 0.000967326271767152, + "loss": 2.8966, + "step": 4202 + }, + { + "epoch": 0.12463303976514545, + "grad_norm": 0.13764899969100952, + "learning_rate": 0.0009673095401453298, + "loss": 2.8553, + "step": 4203 + }, + { + "epoch": 0.12466269311745692, + "grad_norm": 0.16051776707172394, + "learning_rate": 0.0009672928043854014, + "loss": 2.8853, + "step": 4204 + }, + { + "epoch": 0.12469234646976841, + "grad_norm": 0.18544068932533264, + "learning_rate": 0.0009672760644875151, + "loss": 2.8858, + "step": 4205 + }, + { + "epoch": 0.12472199982207989, + "grad_norm": 0.1933782994747162, + "learning_rate": 0.000967259320451819, + "loss": 2.8551, + "step": 4206 + }, + { + "epoch": 0.12475165317439137, + "grad_norm": 0.16797511279582977, + "learning_rate": 0.0009672425722784615, + "loss": 2.88, + "step": 4207 + }, + { + "epoch": 0.12478130652670284, + "grad_norm": 0.16751953959465027, + "learning_rate": 0.0009672258199675907, + "loss": 2.9299, + "step": 4208 + }, + { + "epoch": 0.12481095987901432, + "grad_norm": 0.16264797747135162, + "learning_rate": 0.0009672090635193552, + "loss": 2.924, + "step": 4209 + }, + { + "epoch": 0.1248406132313258, + "grad_norm": 0.1711617112159729, + "learning_rate": 0.0009671923029339032, + "loss": 2.8729, + "step": 4210 + }, + { + "epoch": 0.12487026658363728, + "grad_norm": 0.18748049437999725, + "learning_rate": 0.000967175538211383, + "loss": 2.8972, + "step": 4211 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 0.18255898356437683, + "learning_rate": 0.0009671587693519435, + "loss": 2.8634, + "step": 4212 + }, + { + "epoch": 0.12492957328826024, + "grad_norm": 0.21889452636241913, + "learning_rate": 0.0009671419963557326, + "loss": 2.898, + "step": 4213 + }, + { + "epoch": 0.12495922664057171, + "grad_norm": 0.21384021639823914, + "learning_rate": 0.0009671252192228994, + "loss": 2.8726, + "step": 4214 + }, + { + "epoch": 0.12498887999288319, + "grad_norm": 0.17723125219345093, + "learning_rate": 0.0009671084379535922, + "loss": 2.8634, + "step": 4215 + }, + { + "epoch": 0.12501853334519467, + "grad_norm": 0.1769624501466751, + "learning_rate": 0.0009670916525479594, + "loss": 2.8963, + "step": 4216 + }, + { + "epoch": 0.12504818669750614, + "grad_norm": 0.16578109562397003, + "learning_rate": 0.0009670748630061499, + "loss": 2.8854, + "step": 4217 + }, + { + "epoch": 0.12507784004981765, + "grad_norm": 0.1502094268798828, + "learning_rate": 0.0009670580693283124, + "loss": 2.868, + "step": 4218 + }, + { + "epoch": 0.12510749340212912, + "grad_norm": 0.15551233291625977, + "learning_rate": 0.0009670412715145955, + "loss": 2.8859, + "step": 4219 + }, + { + "epoch": 0.1251371467544406, + "grad_norm": 0.17587275803089142, + "learning_rate": 0.000967024469565148, + "loss": 2.8662, + "step": 4220 + }, + { + "epoch": 0.12516680010675207, + "grad_norm": 0.18663737177848816, + "learning_rate": 0.0009670076634801186, + "loss": 2.8665, + "step": 4221 + }, + { + "epoch": 0.12519645345906355, + "grad_norm": 0.2019195407629013, + "learning_rate": 0.0009669908532596562, + "loss": 2.8833, + "step": 4222 + }, + { + "epoch": 0.12522610681137503, + "grad_norm": 0.18881620466709137, + "learning_rate": 0.0009669740389039097, + "loss": 2.8753, + "step": 4223 + }, + { + "epoch": 0.1252557601636865, + "grad_norm": 0.17017923295497894, + "learning_rate": 0.0009669572204130278, + "loss": 2.8989, + "step": 4224 + }, + { + "epoch": 0.12528541351599798, + "grad_norm": 0.15792149305343628, + "learning_rate": 0.0009669403977871596, + "loss": 2.8529, + "step": 4225 + }, + { + "epoch": 0.12531506686830945, + "grad_norm": 0.133897066116333, + "learning_rate": 0.0009669235710264542, + "loss": 2.8559, + "step": 4226 + }, + { + "epoch": 0.12534472022062093, + "grad_norm": 0.13425078988075256, + "learning_rate": 0.0009669067401310602, + "loss": 2.9008, + "step": 4227 + }, + { + "epoch": 0.1253743735729324, + "grad_norm": 0.14684131741523743, + "learning_rate": 0.0009668899051011269, + "loss": 2.8537, + "step": 4228 + }, + { + "epoch": 0.1254040269252439, + "grad_norm": 0.13652049005031586, + "learning_rate": 0.0009668730659368035, + "loss": 2.8613, + "step": 4229 + }, + { + "epoch": 0.1254336802775554, + "grad_norm": 0.1353001892566681, + "learning_rate": 0.0009668562226382388, + "loss": 2.8606, + "step": 4230 + }, + { + "epoch": 0.12546333362986686, + "grad_norm": 0.14152204990386963, + "learning_rate": 0.0009668393752055821, + "loss": 2.8917, + "step": 4231 + }, + { + "epoch": 0.12549298698217834, + "grad_norm": 0.15096142888069153, + "learning_rate": 0.0009668225236389829, + "loss": 2.8676, + "step": 4232 + }, + { + "epoch": 0.12552264033448982, + "grad_norm": 0.1850546896457672, + "learning_rate": 0.0009668056679385898, + "loss": 2.8993, + "step": 4233 + }, + { + "epoch": 0.1255522936868013, + "grad_norm": 0.20421157777309418, + "learning_rate": 0.0009667888081045525, + "loss": 2.8861, + "step": 4234 + }, + { + "epoch": 0.12558194703911277, + "grad_norm": 0.2065596878528595, + "learning_rate": 0.0009667719441370201, + "loss": 2.8803, + "step": 4235 + }, + { + "epoch": 0.12561160039142424, + "grad_norm": 0.2129167765378952, + "learning_rate": 0.0009667550760361422, + "loss": 2.9056, + "step": 4236 + }, + { + "epoch": 0.12564125374373572, + "grad_norm": 0.1933096945285797, + "learning_rate": 0.0009667382038020679, + "loss": 2.8903, + "step": 4237 + }, + { + "epoch": 0.1256709070960472, + "grad_norm": 0.17726249992847443, + "learning_rate": 0.0009667213274349467, + "loss": 2.8598, + "step": 4238 + }, + { + "epoch": 0.1257005604483587, + "grad_norm": 0.16122117638587952, + "learning_rate": 0.000966704446934928, + "loss": 2.8684, + "step": 4239 + }, + { + "epoch": 0.12573021380067018, + "grad_norm": 0.15556767582893372, + "learning_rate": 0.0009666875623021613, + "loss": 2.8842, + "step": 4240 + }, + { + "epoch": 0.12575986715298165, + "grad_norm": 0.1868819147348404, + "learning_rate": 0.0009666706735367961, + "loss": 2.891, + "step": 4241 + }, + { + "epoch": 0.12578952050529313, + "grad_norm": 0.19803984463214874, + "learning_rate": 0.000966653780638982, + "loss": 2.8792, + "step": 4242 + }, + { + "epoch": 0.1258191738576046, + "grad_norm": 0.2170540690422058, + "learning_rate": 0.0009666368836088686, + "loss": 2.8862, + "step": 4243 + }, + { + "epoch": 0.12584882720991608, + "grad_norm": 0.2022748589515686, + "learning_rate": 0.0009666199824466056, + "loss": 2.8941, + "step": 4244 + }, + { + "epoch": 0.12587848056222756, + "grad_norm": 0.16326020658016205, + "learning_rate": 0.0009666030771523424, + "loss": 2.8732, + "step": 4245 + }, + { + "epoch": 0.12590813391453903, + "grad_norm": 0.16243097186088562, + "learning_rate": 0.0009665861677262289, + "loss": 2.8303, + "step": 4246 + }, + { + "epoch": 0.1259377872668505, + "grad_norm": 0.159882590174675, + "learning_rate": 0.0009665692541684147, + "loss": 2.8494, + "step": 4247 + }, + { + "epoch": 0.12596744061916199, + "grad_norm": 0.18856754899024963, + "learning_rate": 0.0009665523364790499, + "loss": 2.8953, + "step": 4248 + }, + { + "epoch": 0.12599709397147346, + "grad_norm": 0.16330382227897644, + "learning_rate": 0.000966535414658284, + "loss": 2.896, + "step": 4249 + }, + { + "epoch": 0.12602674732378497, + "grad_norm": 0.15060730278491974, + "learning_rate": 0.000966518488706267, + "loss": 2.8542, + "step": 4250 + }, + { + "epoch": 0.12605640067609644, + "grad_norm": 0.160318523645401, + "learning_rate": 0.0009665015586231485, + "loss": 2.8592, + "step": 4251 + }, + { + "epoch": 0.12608605402840792, + "grad_norm": 0.19948022067546844, + "learning_rate": 0.0009664846244090787, + "loss": 2.927, + "step": 4252 + }, + { + "epoch": 0.1261157073807194, + "grad_norm": 0.1826813817024231, + "learning_rate": 0.0009664676860642074, + "loss": 2.8829, + "step": 4253 + }, + { + "epoch": 0.12614536073303087, + "grad_norm": 0.1436227262020111, + "learning_rate": 0.0009664507435886849, + "loss": 2.8556, + "step": 4254 + }, + { + "epoch": 0.12617501408534235, + "grad_norm": 0.1416121870279312, + "learning_rate": 0.0009664337969826609, + "loss": 2.8941, + "step": 4255 + }, + { + "epoch": 0.12620466743765382, + "grad_norm": 0.1422046273946762, + "learning_rate": 0.0009664168462462855, + "loss": 2.8549, + "step": 4256 + }, + { + "epoch": 0.1262343207899653, + "grad_norm": 0.14849208295345306, + "learning_rate": 0.0009663998913797089, + "loss": 2.8557, + "step": 4257 + }, + { + "epoch": 0.12626397414227677, + "grad_norm": 0.15798692405223846, + "learning_rate": 0.0009663829323830811, + "loss": 2.8865, + "step": 4258 + }, + { + "epoch": 0.12629362749458825, + "grad_norm": 0.16495050489902496, + "learning_rate": 0.0009663659692565525, + "loss": 2.8402, + "step": 4259 + }, + { + "epoch": 0.12632328084689975, + "grad_norm": 0.14657914638519287, + "learning_rate": 0.0009663490020002732, + "loss": 2.8709, + "step": 4260 + }, + { + "epoch": 0.12635293419921123, + "grad_norm": 0.1438087373971939, + "learning_rate": 0.0009663320306143935, + "loss": 2.8715, + "step": 4261 + }, + { + "epoch": 0.1263825875515227, + "grad_norm": 0.1380489468574524, + "learning_rate": 0.0009663150550990636, + "loss": 2.8499, + "step": 4262 + }, + { + "epoch": 0.12641224090383418, + "grad_norm": 0.12169024348258972, + "learning_rate": 0.0009662980754544337, + "loss": 2.8741, + "step": 4263 + }, + { + "epoch": 0.12644189425614566, + "grad_norm": 0.1204184740781784, + "learning_rate": 0.0009662810916806543, + "loss": 2.892, + "step": 4264 + }, + { + "epoch": 0.12647154760845714, + "grad_norm": 0.13284270465373993, + "learning_rate": 0.000966264103777876, + "loss": 2.893, + "step": 4265 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 0.14939995110034943, + "learning_rate": 0.0009662471117462489, + "loss": 2.8565, + "step": 4266 + }, + { + "epoch": 0.1265308543130801, + "grad_norm": 0.1763327270746231, + "learning_rate": 0.0009662301155859236, + "loss": 2.8765, + "step": 4267 + }, + { + "epoch": 0.12656050766539156, + "grad_norm": 0.20627468824386597, + "learning_rate": 0.0009662131152970506, + "loss": 2.872, + "step": 4268 + }, + { + "epoch": 0.12659016101770304, + "grad_norm": 0.1948653906583786, + "learning_rate": 0.0009661961108797805, + "loss": 2.8312, + "step": 4269 + }, + { + "epoch": 0.12661981437001454, + "grad_norm": 0.17450135946273804, + "learning_rate": 0.0009661791023342637, + "loss": 2.8757, + "step": 4270 + }, + { + "epoch": 0.12664946772232602, + "grad_norm": 0.17189176380634308, + "learning_rate": 0.0009661620896606511, + "loss": 2.9228, + "step": 4271 + }, + { + "epoch": 0.1266791210746375, + "grad_norm": 0.19485078752040863, + "learning_rate": 0.0009661450728590931, + "loss": 2.8717, + "step": 4272 + }, + { + "epoch": 0.12670877442694897, + "grad_norm": 0.16849878430366516, + "learning_rate": 0.0009661280519297404, + "loss": 2.8954, + "step": 4273 + }, + { + "epoch": 0.12673842777926045, + "grad_norm": 0.2059759497642517, + "learning_rate": 0.0009661110268727438, + "loss": 2.9049, + "step": 4274 + }, + { + "epoch": 0.12676808113157192, + "grad_norm": 0.19705556333065033, + "learning_rate": 0.0009660939976882541, + "loss": 2.8909, + "step": 4275 + }, + { + "epoch": 0.1267977344838834, + "grad_norm": 0.18278931081295013, + "learning_rate": 0.000966076964376422, + "loss": 2.8918, + "step": 4276 + }, + { + "epoch": 0.12682738783619488, + "grad_norm": 0.2057195007801056, + "learning_rate": 0.0009660599269373984, + "loss": 2.906, + "step": 4277 + }, + { + "epoch": 0.12685704118850635, + "grad_norm": 0.19961638748645782, + "learning_rate": 0.0009660428853713343, + "loss": 2.8836, + "step": 4278 + }, + { + "epoch": 0.12688669454081783, + "grad_norm": 0.15262345969676971, + "learning_rate": 0.0009660258396783802, + "loss": 2.8949, + "step": 4279 + }, + { + "epoch": 0.1269163478931293, + "grad_norm": 0.16117031872272491, + "learning_rate": 0.0009660087898586874, + "loss": 2.8952, + "step": 4280 + }, + { + "epoch": 0.1269460012454408, + "grad_norm": 0.16098704934120178, + "learning_rate": 0.0009659917359124069, + "loss": 2.8543, + "step": 4281 + }, + { + "epoch": 0.12697565459775229, + "grad_norm": 0.14081215858459473, + "learning_rate": 0.0009659746778396894, + "loss": 2.8902, + "step": 4282 + }, + { + "epoch": 0.12700530795006376, + "grad_norm": 0.1611699014902115, + "learning_rate": 0.0009659576156406861, + "loss": 2.8797, + "step": 4283 + }, + { + "epoch": 0.12703496130237524, + "grad_norm": 0.17292727530002594, + "learning_rate": 0.0009659405493155484, + "loss": 2.8979, + "step": 4284 + }, + { + "epoch": 0.12706461465468671, + "grad_norm": 0.15401071310043335, + "learning_rate": 0.000965923478864427, + "loss": 2.9001, + "step": 4285 + }, + { + "epoch": 0.1270942680069982, + "grad_norm": 0.15931448340415955, + "learning_rate": 0.0009659064042874733, + "loss": 2.8656, + "step": 4286 + }, + { + "epoch": 0.12712392135930967, + "grad_norm": 0.19322527945041656, + "learning_rate": 0.0009658893255848382, + "loss": 2.8876, + "step": 4287 + }, + { + "epoch": 0.12715357471162114, + "grad_norm": 0.19531448185443878, + "learning_rate": 0.0009658722427566734, + "loss": 2.8797, + "step": 4288 + }, + { + "epoch": 0.12718322806393262, + "grad_norm": 0.19162799417972565, + "learning_rate": 0.0009658551558031299, + "loss": 2.875, + "step": 4289 + }, + { + "epoch": 0.1272128814162441, + "grad_norm": 0.14479565620422363, + "learning_rate": 0.0009658380647243589, + "loss": 2.8563, + "step": 4290 + }, + { + "epoch": 0.1272425347685556, + "grad_norm": 0.15491360425949097, + "learning_rate": 0.0009658209695205119, + "loss": 2.8625, + "step": 4291 + }, + { + "epoch": 0.12727218812086707, + "grad_norm": 0.17600922286510468, + "learning_rate": 0.0009658038701917403, + "loss": 2.8443, + "step": 4292 + }, + { + "epoch": 0.12730184147317855, + "grad_norm": 0.18469850718975067, + "learning_rate": 0.0009657867667381954, + "loss": 2.8622, + "step": 4293 + }, + { + "epoch": 0.12733149482549003, + "grad_norm": 0.19298450648784637, + "learning_rate": 0.0009657696591600289, + "loss": 2.8778, + "step": 4294 + }, + { + "epoch": 0.1273611481778015, + "grad_norm": 0.16677050292491913, + "learning_rate": 0.0009657525474573921, + "loss": 2.8819, + "step": 4295 + }, + { + "epoch": 0.12739080153011298, + "grad_norm": 0.17257094383239746, + "learning_rate": 0.0009657354316304364, + "loss": 2.8704, + "step": 4296 + }, + { + "epoch": 0.12742045488242446, + "grad_norm": 0.16681653261184692, + "learning_rate": 0.0009657183116793136, + "loss": 2.8532, + "step": 4297 + }, + { + "epoch": 0.12745010823473593, + "grad_norm": 0.1649530977010727, + "learning_rate": 0.0009657011876041753, + "loss": 2.8779, + "step": 4298 + }, + { + "epoch": 0.1274797615870474, + "grad_norm": 0.16989761590957642, + "learning_rate": 0.0009656840594051728, + "loss": 2.8509, + "step": 4299 + }, + { + "epoch": 0.12750941493935888, + "grad_norm": 0.1717999279499054, + "learning_rate": 0.0009656669270824582, + "loss": 2.8588, + "step": 4300 + }, + { + "epoch": 0.12753906829167036, + "grad_norm": 0.16138200461864471, + "learning_rate": 0.000965649790636183, + "loss": 2.8615, + "step": 4301 + }, + { + "epoch": 0.12756872164398186, + "grad_norm": 0.15606968104839325, + "learning_rate": 0.0009656326500664989, + "loss": 2.859, + "step": 4302 + }, + { + "epoch": 0.12759837499629334, + "grad_norm": 0.14027419686317444, + "learning_rate": 0.0009656155053735579, + "loss": 2.8997, + "step": 4303 + }, + { + "epoch": 0.12762802834860482, + "grad_norm": 0.1386970579624176, + "learning_rate": 0.0009655983565575115, + "loss": 2.83, + "step": 4304 + }, + { + "epoch": 0.1276576817009163, + "grad_norm": 0.1439884454011917, + "learning_rate": 0.0009655812036185119, + "loss": 2.8868, + "step": 4305 + }, + { + "epoch": 0.12768733505322777, + "grad_norm": 0.14089593291282654, + "learning_rate": 0.0009655640465567108, + "loss": 2.9179, + "step": 4306 + }, + { + "epoch": 0.12771698840553924, + "grad_norm": 0.13071592152118683, + "learning_rate": 0.00096554688537226, + "loss": 2.9068, + "step": 4307 + }, + { + "epoch": 0.12774664175785072, + "grad_norm": 0.1475938856601715, + "learning_rate": 0.0009655297200653119, + "loss": 2.8584, + "step": 4308 + }, + { + "epoch": 0.1277762951101622, + "grad_norm": 0.17949114739894867, + "learning_rate": 0.000965512550636018, + "loss": 2.8837, + "step": 4309 + }, + { + "epoch": 0.12780594846247367, + "grad_norm": 0.16293466091156006, + "learning_rate": 0.0009654953770845305, + "loss": 2.8457, + "step": 4310 + }, + { + "epoch": 0.12783560181478515, + "grad_norm": 0.12674348056316376, + "learning_rate": 0.0009654781994110018, + "loss": 2.8785, + "step": 4311 + }, + { + "epoch": 0.12786525516709665, + "grad_norm": 0.14640718698501587, + "learning_rate": 0.0009654610176155836, + "loss": 2.8598, + "step": 4312 + }, + { + "epoch": 0.12789490851940813, + "grad_norm": 0.1422143280506134, + "learning_rate": 0.0009654438316984281, + "loss": 2.8847, + "step": 4313 + }, + { + "epoch": 0.1279245618717196, + "grad_norm": 0.13555963337421417, + "learning_rate": 0.0009654266416596878, + "loss": 2.84, + "step": 4314 + }, + { + "epoch": 0.12795421522403108, + "grad_norm": 0.16106489300727844, + "learning_rate": 0.0009654094474995144, + "loss": 2.867, + "step": 4315 + }, + { + "epoch": 0.12798386857634256, + "grad_norm": 0.16465085744857788, + "learning_rate": 0.0009653922492180607, + "loss": 2.8529, + "step": 4316 + }, + { + "epoch": 0.12801352192865403, + "grad_norm": 0.1522303819656372, + "learning_rate": 0.0009653750468154788, + "loss": 2.8611, + "step": 4317 + }, + { + "epoch": 0.1280431752809655, + "grad_norm": 0.17950591444969177, + "learning_rate": 0.0009653578402919207, + "loss": 2.8778, + "step": 4318 + }, + { + "epoch": 0.12807282863327699, + "grad_norm": 0.22316467761993408, + "learning_rate": 0.0009653406296475393, + "loss": 2.8809, + "step": 4319 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 0.2403753101825714, + "learning_rate": 0.0009653234148824866, + "loss": 2.8913, + "step": 4320 + }, + { + "epoch": 0.12813213533789994, + "grad_norm": 0.20702871680259705, + "learning_rate": 0.0009653061959969152, + "loss": 2.8818, + "step": 4321 + }, + { + "epoch": 0.12816178869021144, + "grad_norm": 0.19356289505958557, + "learning_rate": 0.0009652889729909776, + "loss": 2.875, + "step": 4322 + }, + { + "epoch": 0.12819144204252292, + "grad_norm": 0.19248224794864655, + "learning_rate": 0.0009652717458648264, + "loss": 2.8796, + "step": 4323 + }, + { + "epoch": 0.1282210953948344, + "grad_norm": 0.19419081509113312, + "learning_rate": 0.0009652545146186138, + "loss": 2.8683, + "step": 4324 + }, + { + "epoch": 0.12825074874714587, + "grad_norm": 0.19526036083698273, + "learning_rate": 0.0009652372792524927, + "loss": 2.8735, + "step": 4325 + }, + { + "epoch": 0.12828040209945735, + "grad_norm": 0.19520030915737152, + "learning_rate": 0.0009652200397666157, + "loss": 2.8791, + "step": 4326 + }, + { + "epoch": 0.12831005545176882, + "grad_norm": 0.20660561323165894, + "learning_rate": 0.0009652027961611354, + "loss": 2.8562, + "step": 4327 + }, + { + "epoch": 0.1283397088040803, + "grad_norm": 0.1575862616300583, + "learning_rate": 0.0009651855484362045, + "loss": 2.9061, + "step": 4328 + }, + { + "epoch": 0.12836936215639178, + "grad_norm": 0.15205718576908112, + "learning_rate": 0.0009651682965919755, + "loss": 2.8782, + "step": 4329 + }, + { + "epoch": 0.12839901550870325, + "grad_norm": 0.1510539948940277, + "learning_rate": 0.0009651510406286016, + "loss": 2.859, + "step": 4330 + }, + { + "epoch": 0.12842866886101473, + "grad_norm": 0.13880440592765808, + "learning_rate": 0.0009651337805462354, + "loss": 2.8536, + "step": 4331 + }, + { + "epoch": 0.1284583222133262, + "grad_norm": 0.14534632861614227, + "learning_rate": 0.0009651165163450296, + "loss": 2.8731, + "step": 4332 + }, + { + "epoch": 0.1284879755656377, + "grad_norm": 0.14065352082252502, + "learning_rate": 0.0009650992480251373, + "loss": 2.8741, + "step": 4333 + }, + { + "epoch": 0.12851762891794918, + "grad_norm": 0.13632334768772125, + "learning_rate": 0.0009650819755867113, + "loss": 2.864, + "step": 4334 + }, + { + "epoch": 0.12854728227026066, + "grad_norm": 0.16904005408287048, + "learning_rate": 0.0009650646990299046, + "loss": 2.8571, + "step": 4335 + }, + { + "epoch": 0.12857693562257214, + "grad_norm": 0.20457644760608673, + "learning_rate": 0.0009650474183548701, + "loss": 2.8671, + "step": 4336 + }, + { + "epoch": 0.1286065889748836, + "grad_norm": 0.21302834153175354, + "learning_rate": 0.000965030133561761, + "loss": 2.8663, + "step": 4337 + }, + { + "epoch": 0.1286362423271951, + "grad_norm": 0.1830681413412094, + "learning_rate": 0.0009650128446507302, + "loss": 2.8848, + "step": 4338 + }, + { + "epoch": 0.12866589567950656, + "grad_norm": 0.1390032172203064, + "learning_rate": 0.0009649955516219308, + "loss": 2.8338, + "step": 4339 + }, + { + "epoch": 0.12869554903181804, + "grad_norm": 0.16196033358573914, + "learning_rate": 0.000964978254475516, + "loss": 2.8967, + "step": 4340 + }, + { + "epoch": 0.12872520238412952, + "grad_norm": 0.18636253476142883, + "learning_rate": 0.000964960953211639, + "loss": 2.8865, + "step": 4341 + }, + { + "epoch": 0.128754855736441, + "grad_norm": 0.18911822140216827, + "learning_rate": 0.0009649436478304528, + "loss": 2.881, + "step": 4342 + }, + { + "epoch": 0.1287845090887525, + "grad_norm": 0.18434637784957886, + "learning_rate": 0.000964926338332111, + "loss": 2.8769, + "step": 4343 + }, + { + "epoch": 0.12881416244106397, + "grad_norm": 0.1841350644826889, + "learning_rate": 0.0009649090247167664, + "loss": 2.866, + "step": 4344 + }, + { + "epoch": 0.12884381579337545, + "grad_norm": 0.17460161447525024, + "learning_rate": 0.0009648917069845728, + "loss": 2.8858, + "step": 4345 + }, + { + "epoch": 0.12887346914568693, + "grad_norm": 0.19570252299308777, + "learning_rate": 0.0009648743851356833, + "loss": 2.8529, + "step": 4346 + }, + { + "epoch": 0.1289031224979984, + "grad_norm": 0.19419991970062256, + "learning_rate": 0.0009648570591702513, + "loss": 2.9159, + "step": 4347 + }, + { + "epoch": 0.12893277585030988, + "grad_norm": 0.17606529593467712, + "learning_rate": 0.0009648397290884304, + "loss": 2.8439, + "step": 4348 + }, + { + "epoch": 0.12896242920262135, + "grad_norm": 0.16166310012340546, + "learning_rate": 0.0009648223948903736, + "loss": 2.8476, + "step": 4349 + }, + { + "epoch": 0.12899208255493283, + "grad_norm": 0.1576535701751709, + "learning_rate": 0.0009648050565762349, + "loss": 2.8586, + "step": 4350 + }, + { + "epoch": 0.1290217359072443, + "grad_norm": 0.1748097687959671, + "learning_rate": 0.0009647877141461676, + "loss": 2.8571, + "step": 4351 + }, + { + "epoch": 0.12905138925955578, + "grad_norm": 0.16930434107780457, + "learning_rate": 0.0009647703676003254, + "loss": 2.8239, + "step": 4352 + }, + { + "epoch": 0.12908104261186726, + "grad_norm": 0.14963571727275848, + "learning_rate": 0.0009647530169388617, + "loss": 2.8482, + "step": 4353 + }, + { + "epoch": 0.12911069596417876, + "grad_norm": 0.15801838040351868, + "learning_rate": 0.0009647356621619303, + "loss": 2.8921, + "step": 4354 + }, + { + "epoch": 0.12914034931649024, + "grad_norm": 0.16179563105106354, + "learning_rate": 0.0009647183032696849, + "loss": 2.8389, + "step": 4355 + }, + { + "epoch": 0.12917000266880171, + "grad_norm": 0.1691284477710724, + "learning_rate": 0.000964700940262279, + "loss": 2.862, + "step": 4356 + }, + { + "epoch": 0.1291996560211132, + "grad_norm": 0.17723435163497925, + "learning_rate": 0.0009646835731398667, + "loss": 2.8858, + "step": 4357 + }, + { + "epoch": 0.12922930937342467, + "grad_norm": 0.15811310708522797, + "learning_rate": 0.0009646662019026016, + "loss": 2.865, + "step": 4358 + }, + { + "epoch": 0.12925896272573614, + "grad_norm": 0.18517358601093292, + "learning_rate": 0.0009646488265506373, + "loss": 2.8577, + "step": 4359 + }, + { + "epoch": 0.12928861607804762, + "grad_norm": 0.17370454967021942, + "learning_rate": 0.000964631447084128, + "loss": 2.8995, + "step": 4360 + }, + { + "epoch": 0.1293182694303591, + "grad_norm": 0.14604637026786804, + "learning_rate": 0.0009646140635032277, + "loss": 2.8739, + "step": 4361 + }, + { + "epoch": 0.12934792278267057, + "grad_norm": 0.1753235161304474, + "learning_rate": 0.0009645966758080898, + "loss": 2.884, + "step": 4362 + }, + { + "epoch": 0.12937757613498205, + "grad_norm": 0.1460968554019928, + "learning_rate": 0.0009645792839988687, + "loss": 2.8617, + "step": 4363 + }, + { + "epoch": 0.12940722948729355, + "grad_norm": 0.15439701080322266, + "learning_rate": 0.0009645618880757183, + "loss": 2.8532, + "step": 4364 + }, + { + "epoch": 0.12943688283960503, + "grad_norm": 0.1633371263742447, + "learning_rate": 0.0009645444880387927, + "loss": 2.8428, + "step": 4365 + }, + { + "epoch": 0.1294665361919165, + "grad_norm": 0.17023995518684387, + "learning_rate": 0.0009645270838882458, + "loss": 2.8865, + "step": 4366 + }, + { + "epoch": 0.12949618954422798, + "grad_norm": 0.1838754564523697, + "learning_rate": 0.000964509675624232, + "loss": 2.8778, + "step": 4367 + }, + { + "epoch": 0.12952584289653946, + "grad_norm": 0.184464231133461, + "learning_rate": 0.0009644922632469051, + "loss": 2.8726, + "step": 4368 + }, + { + "epoch": 0.12955549624885093, + "grad_norm": 0.1763373613357544, + "learning_rate": 0.0009644748467564196, + "loss": 2.8255, + "step": 4369 + }, + { + "epoch": 0.1295851496011624, + "grad_norm": 0.18334124982357025, + "learning_rate": 0.0009644574261529295, + "loss": 2.8637, + "step": 4370 + }, + { + "epoch": 0.12961480295347388, + "grad_norm": 0.21306174993515015, + "learning_rate": 0.0009644400014365892, + "loss": 2.8824, + "step": 4371 + }, + { + "epoch": 0.12964445630578536, + "grad_norm": 0.19542574882507324, + "learning_rate": 0.0009644225726075531, + "loss": 2.9161, + "step": 4372 + }, + { + "epoch": 0.12967410965809684, + "grad_norm": 0.18321265280246735, + "learning_rate": 0.0009644051396659754, + "loss": 2.8699, + "step": 4373 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 0.206406831741333, + "learning_rate": 0.0009643877026120104, + "loss": 2.8645, + "step": 4374 + }, + { + "epoch": 0.12973341636271982, + "grad_norm": 0.18366600573062897, + "learning_rate": 0.0009643702614458126, + "loss": 2.8421, + "step": 4375 + }, + { + "epoch": 0.1297630697150313, + "grad_norm": 0.14803838729858398, + "learning_rate": 0.0009643528161675364, + "loss": 2.8591, + "step": 4376 + }, + { + "epoch": 0.12979272306734277, + "grad_norm": 0.1471179872751236, + "learning_rate": 0.0009643353667773362, + "loss": 2.8251, + "step": 4377 + }, + { + "epoch": 0.12982237641965425, + "grad_norm": 0.1347707062959671, + "learning_rate": 0.0009643179132753668, + "loss": 2.8931, + "step": 4378 + }, + { + "epoch": 0.12985202977196572, + "grad_norm": 0.14184455573558807, + "learning_rate": 0.0009643004556617825, + "loss": 2.8647, + "step": 4379 + }, + { + "epoch": 0.1298816831242772, + "grad_norm": 0.14225482940673828, + "learning_rate": 0.0009642829939367379, + "loss": 2.8927, + "step": 4380 + }, + { + "epoch": 0.12991133647658867, + "grad_norm": 0.14811639487743378, + "learning_rate": 0.0009642655281003878, + "loss": 2.8292, + "step": 4381 + }, + { + "epoch": 0.12994098982890015, + "grad_norm": 0.19387920200824738, + "learning_rate": 0.0009642480581528867, + "loss": 2.8688, + "step": 4382 + }, + { + "epoch": 0.12997064318121163, + "grad_norm": 0.22719550132751465, + "learning_rate": 0.0009642305840943894, + "loss": 2.8774, + "step": 4383 + }, + { + "epoch": 0.1300002965335231, + "grad_norm": 0.23514871299266815, + "learning_rate": 0.0009642131059250506, + "loss": 2.8934, + "step": 4384 + }, + { + "epoch": 0.1300299498858346, + "grad_norm": 0.18098655343055725, + "learning_rate": 0.0009641956236450251, + "loss": 2.8701, + "step": 4385 + }, + { + "epoch": 0.13005960323814608, + "grad_norm": 0.15619629621505737, + "learning_rate": 0.0009641781372544676, + "loss": 2.8625, + "step": 4386 + }, + { + "epoch": 0.13008925659045756, + "grad_norm": 0.17679187655448914, + "learning_rate": 0.0009641606467535331, + "loss": 2.8933, + "step": 4387 + }, + { + "epoch": 0.13011890994276903, + "grad_norm": 0.16555625200271606, + "learning_rate": 0.0009641431521423763, + "loss": 2.8814, + "step": 4388 + }, + { + "epoch": 0.1301485632950805, + "grad_norm": 0.1659911721944809, + "learning_rate": 0.0009641256534211522, + "loss": 2.8871, + "step": 4389 + }, + { + "epoch": 0.130178216647392, + "grad_norm": 0.1550644487142563, + "learning_rate": 0.000964108150590016, + "loss": 2.9153, + "step": 4390 + }, + { + "epoch": 0.13020786999970346, + "grad_norm": 0.15971040725708008, + "learning_rate": 0.0009640906436491222, + "loss": 2.8781, + "step": 4391 + }, + { + "epoch": 0.13023752335201494, + "grad_norm": 0.15819919109344482, + "learning_rate": 0.0009640731325986263, + "loss": 2.8616, + "step": 4392 + }, + { + "epoch": 0.13026717670432642, + "grad_norm": 0.15966132283210754, + "learning_rate": 0.000964055617438683, + "loss": 2.8848, + "step": 4393 + }, + { + "epoch": 0.1302968300566379, + "grad_norm": 0.16465002298355103, + "learning_rate": 0.0009640380981694476, + "loss": 2.8646, + "step": 4394 + }, + { + "epoch": 0.1303264834089494, + "grad_norm": 0.1464013159275055, + "learning_rate": 0.0009640205747910751, + "loss": 2.8675, + "step": 4395 + }, + { + "epoch": 0.13035613676126087, + "grad_norm": 0.14640671014785767, + "learning_rate": 0.0009640030473037209, + "loss": 2.8393, + "step": 4396 + }, + { + "epoch": 0.13038579011357235, + "grad_norm": 0.17078512907028198, + "learning_rate": 0.00096398551570754, + "loss": 2.8727, + "step": 4397 + }, + { + "epoch": 0.13041544346588382, + "grad_norm": 0.17229583859443665, + "learning_rate": 0.0009639679800026877, + "loss": 2.867, + "step": 4398 + }, + { + "epoch": 0.1304450968181953, + "grad_norm": 0.16977430880069733, + "learning_rate": 0.0009639504401893193, + "loss": 2.8471, + "step": 4399 + }, + { + "epoch": 0.13047475017050678, + "grad_norm": 0.17271961271762848, + "learning_rate": 0.0009639328962675902, + "loss": 2.8677, + "step": 4400 + }, + { + "epoch": 0.13050440352281825, + "grad_norm": 0.165573388338089, + "learning_rate": 0.0009639153482376557, + "loss": 2.8842, + "step": 4401 + }, + { + "epoch": 0.13053405687512973, + "grad_norm": 0.15924207866191864, + "learning_rate": 0.0009638977960996711, + "loss": 2.8714, + "step": 4402 + }, + { + "epoch": 0.1305637102274412, + "grad_norm": 0.13720636069774628, + "learning_rate": 0.0009638802398537919, + "loss": 2.9026, + "step": 4403 + }, + { + "epoch": 0.13059336357975268, + "grad_norm": 0.1533677577972412, + "learning_rate": 0.0009638626795001735, + "loss": 2.8891, + "step": 4404 + }, + { + "epoch": 0.13062301693206416, + "grad_norm": 0.15434373915195465, + "learning_rate": 0.0009638451150389715, + "loss": 2.8533, + "step": 4405 + }, + { + "epoch": 0.13065267028437566, + "grad_norm": 0.15947449207305908, + "learning_rate": 0.0009638275464703413, + "loss": 2.8378, + "step": 4406 + }, + { + "epoch": 0.13068232363668714, + "grad_norm": 0.18229681253433228, + "learning_rate": 0.0009638099737944386, + "loss": 2.8372, + "step": 4407 + }, + { + "epoch": 0.1307119769889986, + "grad_norm": 0.16388046741485596, + "learning_rate": 0.0009637923970114191, + "loss": 2.8647, + "step": 4408 + }, + { + "epoch": 0.1307416303413101, + "grad_norm": 0.1495353728532791, + "learning_rate": 0.0009637748161214381, + "loss": 2.8875, + "step": 4409 + }, + { + "epoch": 0.13077128369362157, + "grad_norm": 0.16584017872810364, + "learning_rate": 0.0009637572311246516, + "loss": 2.8754, + "step": 4410 + }, + { + "epoch": 0.13080093704593304, + "grad_norm": 0.14337031543254852, + "learning_rate": 0.0009637396420212152, + "loss": 2.8278, + "step": 4411 + }, + { + "epoch": 0.13083059039824452, + "grad_norm": 0.14361946284770966, + "learning_rate": 0.0009637220488112847, + "loss": 2.8828, + "step": 4412 + }, + { + "epoch": 0.130860243750556, + "grad_norm": 0.17735038697719574, + "learning_rate": 0.000963704451495016, + "loss": 2.8278, + "step": 4413 + }, + { + "epoch": 0.13088989710286747, + "grad_norm": 0.1689116209745407, + "learning_rate": 0.0009636868500725646, + "loss": 2.8987, + "step": 4414 + }, + { + "epoch": 0.13091955045517895, + "grad_norm": 0.17147889733314514, + "learning_rate": 0.0009636692445440866, + "loss": 2.8681, + "step": 4415 + }, + { + "epoch": 0.13094920380749045, + "grad_norm": 0.17875264585018158, + "learning_rate": 0.0009636516349097377, + "loss": 2.8368, + "step": 4416 + }, + { + "epoch": 0.13097885715980193, + "grad_norm": 0.16584254801273346, + "learning_rate": 0.0009636340211696743, + "loss": 2.8512, + "step": 4417 + }, + { + "epoch": 0.1310085105121134, + "grad_norm": 0.19141355156898499, + "learning_rate": 0.0009636164033240517, + "loss": 2.8866, + "step": 4418 + }, + { + "epoch": 0.13103816386442488, + "grad_norm": 0.20827458798885345, + "learning_rate": 0.0009635987813730266, + "loss": 2.8785, + "step": 4419 + }, + { + "epoch": 0.13106781721673635, + "grad_norm": 0.20822148025035858, + "learning_rate": 0.0009635811553167546, + "loss": 2.8677, + "step": 4420 + }, + { + "epoch": 0.13109747056904783, + "grad_norm": 0.19372247159481049, + "learning_rate": 0.0009635635251553918, + "loss": 2.8178, + "step": 4421 + }, + { + "epoch": 0.1311271239213593, + "grad_norm": 0.15750358998775482, + "learning_rate": 0.0009635458908890946, + "loss": 2.8725, + "step": 4422 + }, + { + "epoch": 0.13115677727367078, + "grad_norm": 0.16666613519191742, + "learning_rate": 0.0009635282525180189, + "loss": 2.8677, + "step": 4423 + }, + { + "epoch": 0.13118643062598226, + "grad_norm": 0.1645805835723877, + "learning_rate": 0.0009635106100423209, + "loss": 2.8591, + "step": 4424 + }, + { + "epoch": 0.13121608397829373, + "grad_norm": 0.15850642323493958, + "learning_rate": 0.0009634929634621569, + "loss": 2.8466, + "step": 4425 + }, + { + "epoch": 0.1312457373306052, + "grad_norm": 0.14941157400608063, + "learning_rate": 0.0009634753127776832, + "loss": 2.8673, + "step": 4426 + }, + { + "epoch": 0.13127539068291671, + "grad_norm": 0.14755846560001373, + "learning_rate": 0.000963457657989056, + "loss": 2.8308, + "step": 4427 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 0.15732041001319885, + "learning_rate": 0.0009634399990964316, + "loss": 2.877, + "step": 4428 + }, + { + "epoch": 0.13133469738753967, + "grad_norm": 0.14698509871959686, + "learning_rate": 0.0009634223360999666, + "loss": 2.8101, + "step": 4429 + }, + { + "epoch": 0.13136435073985114, + "grad_norm": 0.20264926552772522, + "learning_rate": 0.0009634046689998173, + "loss": 2.8846, + "step": 4430 + }, + { + "epoch": 0.13139400409216262, + "grad_norm": 0.1376863270998001, + "learning_rate": 0.00096338699779614, + "loss": 2.8331, + "step": 4431 + }, + { + "epoch": 0.1314236574444741, + "grad_norm": 0.13747401535511017, + "learning_rate": 0.0009633693224890914, + "loss": 2.8626, + "step": 4432 + }, + { + "epoch": 0.13145331079678557, + "grad_norm": 0.13303369283676147, + "learning_rate": 0.0009633516430788278, + "loss": 2.8527, + "step": 4433 + }, + { + "epoch": 0.13148296414909705, + "grad_norm": 0.13918983936309814, + "learning_rate": 0.0009633339595655059, + "loss": 2.8588, + "step": 4434 + }, + { + "epoch": 0.13151261750140852, + "grad_norm": 0.1519002765417099, + "learning_rate": 0.0009633162719492823, + "loss": 2.8422, + "step": 4435 + }, + { + "epoch": 0.13154227085372, + "grad_norm": 0.1591346710920334, + "learning_rate": 0.0009632985802303136, + "loss": 2.8619, + "step": 4436 + }, + { + "epoch": 0.1315719242060315, + "grad_norm": 0.1766563057899475, + "learning_rate": 0.0009632808844087564, + "loss": 2.8694, + "step": 4437 + }, + { + "epoch": 0.13160157755834298, + "grad_norm": 0.17281776666641235, + "learning_rate": 0.0009632631844847673, + "loss": 2.8413, + "step": 4438 + }, + { + "epoch": 0.13163123091065446, + "grad_norm": 0.17207659780979156, + "learning_rate": 0.0009632454804585033, + "loss": 2.8646, + "step": 4439 + }, + { + "epoch": 0.13166088426296593, + "grad_norm": 0.1404251903295517, + "learning_rate": 0.0009632277723301213, + "loss": 2.8743, + "step": 4440 + }, + { + "epoch": 0.1316905376152774, + "grad_norm": 0.13960471749305725, + "learning_rate": 0.0009632100600997775, + "loss": 2.8613, + "step": 4441 + }, + { + "epoch": 0.13172019096758888, + "grad_norm": 0.14520208537578583, + "learning_rate": 0.0009631923437676294, + "loss": 2.8769, + "step": 4442 + }, + { + "epoch": 0.13174984431990036, + "grad_norm": 0.15223348140716553, + "learning_rate": 0.0009631746233338335, + "loss": 2.8661, + "step": 4443 + }, + { + "epoch": 0.13177949767221184, + "grad_norm": 0.1562366485595703, + "learning_rate": 0.0009631568987985466, + "loss": 2.8837, + "step": 4444 + }, + { + "epoch": 0.1318091510245233, + "grad_norm": 0.15884344279766083, + "learning_rate": 0.0009631391701619261, + "loss": 2.8645, + "step": 4445 + }, + { + "epoch": 0.1318388043768348, + "grad_norm": 0.17921586334705353, + "learning_rate": 0.0009631214374241287, + "loss": 2.8269, + "step": 4446 + }, + { + "epoch": 0.1318684577291463, + "grad_norm": 0.18844226002693176, + "learning_rate": 0.0009631037005853114, + "loss": 2.8658, + "step": 4447 + }, + { + "epoch": 0.13189811108145777, + "grad_norm": 0.19993087649345398, + "learning_rate": 0.0009630859596456314, + "loss": 2.8603, + "step": 4448 + }, + { + "epoch": 0.13192776443376925, + "grad_norm": 0.18708349764347076, + "learning_rate": 0.0009630682146052458, + "loss": 2.8286, + "step": 4449 + }, + { + "epoch": 0.13195741778608072, + "grad_norm": 0.180325448513031, + "learning_rate": 0.0009630504654643115, + "loss": 2.8455, + "step": 4450 + }, + { + "epoch": 0.1319870711383922, + "grad_norm": 0.1843925565481186, + "learning_rate": 0.000963032712222986, + "loss": 2.7865, + "step": 4451 + }, + { + "epoch": 0.13201672449070367, + "grad_norm": 0.17170265316963196, + "learning_rate": 0.0009630149548814263, + "loss": 2.8791, + "step": 4452 + }, + { + "epoch": 0.13204637784301515, + "grad_norm": 0.175213024020195, + "learning_rate": 0.0009629971934397897, + "loss": 2.8515, + "step": 4453 + }, + { + "epoch": 0.13207603119532663, + "grad_norm": 0.15893316268920898, + "learning_rate": 0.0009629794278982335, + "loss": 2.8861, + "step": 4454 + }, + { + "epoch": 0.1321056845476381, + "grad_norm": 0.15176518261432648, + "learning_rate": 0.0009629616582569149, + "loss": 2.8931, + "step": 4455 + }, + { + "epoch": 0.13213533789994958, + "grad_norm": 0.15142807364463806, + "learning_rate": 0.0009629438845159914, + "loss": 2.8835, + "step": 4456 + }, + { + "epoch": 0.13216499125226105, + "grad_norm": 0.18294331431388855, + "learning_rate": 0.0009629261066756205, + "loss": 2.8616, + "step": 4457 + }, + { + "epoch": 0.13219464460457256, + "grad_norm": 0.19888417422771454, + "learning_rate": 0.0009629083247359593, + "loss": 2.8378, + "step": 4458 + }, + { + "epoch": 0.13222429795688403, + "grad_norm": 0.23091155290603638, + "learning_rate": 0.0009628905386971655, + "loss": 2.8872, + "step": 4459 + }, + { + "epoch": 0.1322539513091955, + "grad_norm": 0.19127483665943146, + "learning_rate": 0.0009628727485593965, + "loss": 2.857, + "step": 4460 + }, + { + "epoch": 0.132283604661507, + "grad_norm": 0.1568809449672699, + "learning_rate": 0.0009628549543228098, + "loss": 2.8435, + "step": 4461 + }, + { + "epoch": 0.13231325801381846, + "grad_norm": 0.17632077634334564, + "learning_rate": 0.0009628371559875632, + "loss": 2.8416, + "step": 4462 + }, + { + "epoch": 0.13234291136612994, + "grad_norm": 0.17146670818328857, + "learning_rate": 0.0009628193535538139, + "loss": 2.866, + "step": 4463 + }, + { + "epoch": 0.13237256471844142, + "grad_norm": 0.16559924185276031, + "learning_rate": 0.0009628015470217199, + "loss": 2.8846, + "step": 4464 + }, + { + "epoch": 0.1324022180707529, + "grad_norm": 0.15685956180095673, + "learning_rate": 0.0009627837363914389, + "loss": 2.851, + "step": 4465 + }, + { + "epoch": 0.13243187142306437, + "grad_norm": 0.15663187205791473, + "learning_rate": 0.0009627659216631284, + "loss": 2.8435, + "step": 4466 + }, + { + "epoch": 0.13246152477537584, + "grad_norm": 0.15787464380264282, + "learning_rate": 0.000962748102836946, + "loss": 2.8873, + "step": 4467 + }, + { + "epoch": 0.13249117812768735, + "grad_norm": 0.16973848640918732, + "learning_rate": 0.00096273027991305, + "loss": 2.886, + "step": 4468 + }, + { + "epoch": 0.13252083147999882, + "grad_norm": 0.1743224561214447, + "learning_rate": 0.0009627124528915978, + "loss": 2.8492, + "step": 4469 + }, + { + "epoch": 0.1325504848323103, + "grad_norm": 0.1852840632200241, + "learning_rate": 0.0009626946217727475, + "loss": 2.8595, + "step": 4470 + }, + { + "epoch": 0.13258013818462178, + "grad_norm": 0.186560720205307, + "learning_rate": 0.0009626767865566568, + "loss": 2.8309, + "step": 4471 + }, + { + "epoch": 0.13260979153693325, + "grad_norm": 0.19597460329532623, + "learning_rate": 0.0009626589472434838, + "loss": 2.8266, + "step": 4472 + }, + { + "epoch": 0.13263944488924473, + "grad_norm": 0.17213207483291626, + "learning_rate": 0.0009626411038333864, + "loss": 2.8872, + "step": 4473 + }, + { + "epoch": 0.1326690982415562, + "grad_norm": 0.16467510163784027, + "learning_rate": 0.0009626232563265227, + "loss": 2.828, + "step": 4474 + }, + { + "epoch": 0.13269875159386768, + "grad_norm": 0.1603081375360489, + "learning_rate": 0.0009626054047230506, + "loss": 2.8806, + "step": 4475 + }, + { + "epoch": 0.13272840494617916, + "grad_norm": 0.1602339744567871, + "learning_rate": 0.0009625875490231282, + "loss": 2.842, + "step": 4476 + }, + { + "epoch": 0.13275805829849063, + "grad_norm": 0.1395491510629654, + "learning_rate": 0.0009625696892269135, + "loss": 2.8495, + "step": 4477 + }, + { + "epoch": 0.1327877116508021, + "grad_norm": 0.15602506697177887, + "learning_rate": 0.0009625518253345651, + "loss": 2.8292, + "step": 4478 + }, + { + "epoch": 0.1328173650031136, + "grad_norm": 0.16670772433280945, + "learning_rate": 0.0009625339573462406, + "loss": 2.8759, + "step": 4479 + }, + { + "epoch": 0.1328470183554251, + "grad_norm": 0.15995562076568604, + "learning_rate": 0.0009625160852620987, + "loss": 2.8534, + "step": 4480 + }, + { + "epoch": 0.13287667170773657, + "grad_norm": 0.1567016839981079, + "learning_rate": 0.0009624982090822975, + "loss": 2.8929, + "step": 4481 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 0.14698490500450134, + "learning_rate": 0.0009624803288069952, + "loss": 2.849, + "step": 4482 + }, + { + "epoch": 0.13293597841235952, + "grad_norm": 0.13350003957748413, + "learning_rate": 0.0009624624444363502, + "loss": 2.8371, + "step": 4483 + }, + { + "epoch": 0.132965631764671, + "grad_norm": 0.133753702044487, + "learning_rate": 0.0009624445559705208, + "loss": 2.8486, + "step": 4484 + }, + { + "epoch": 0.13299528511698247, + "grad_norm": 0.1487128883600235, + "learning_rate": 0.0009624266634096657, + "loss": 2.9065, + "step": 4485 + }, + { + "epoch": 0.13302493846929395, + "grad_norm": 0.16638372838497162, + "learning_rate": 0.000962408766753943, + "loss": 2.8757, + "step": 4486 + }, + { + "epoch": 0.13305459182160542, + "grad_norm": 0.18193651735782623, + "learning_rate": 0.0009623908660035112, + "loss": 2.8789, + "step": 4487 + }, + { + "epoch": 0.1330842451739169, + "grad_norm": 0.1764557659626007, + "learning_rate": 0.000962372961158529, + "loss": 2.8717, + "step": 4488 + }, + { + "epoch": 0.1331138985262284, + "grad_norm": 0.15939560532569885, + "learning_rate": 0.0009623550522191549, + "loss": 2.8594, + "step": 4489 + }, + { + "epoch": 0.13314355187853988, + "grad_norm": 0.16853831708431244, + "learning_rate": 0.0009623371391855475, + "loss": 2.8969, + "step": 4490 + }, + { + "epoch": 0.13317320523085135, + "grad_norm": 0.1717609018087387, + "learning_rate": 0.0009623192220578652, + "loss": 2.8617, + "step": 4491 + }, + { + "epoch": 0.13320285858316283, + "grad_norm": 0.16273729503154755, + "learning_rate": 0.0009623013008362669, + "loss": 2.8329, + "step": 4492 + }, + { + "epoch": 0.1332325119354743, + "grad_norm": 0.15724343061447144, + "learning_rate": 0.0009622833755209113, + "loss": 2.8535, + "step": 4493 + }, + { + "epoch": 0.13326216528778578, + "grad_norm": 0.16486157476902008, + "learning_rate": 0.000962265446111957, + "loss": 2.8683, + "step": 4494 + }, + { + "epoch": 0.13329181864009726, + "grad_norm": 0.1826598197221756, + "learning_rate": 0.0009622475126095629, + "loss": 2.8681, + "step": 4495 + }, + { + "epoch": 0.13332147199240874, + "grad_norm": 0.17792126536369324, + "learning_rate": 0.0009622295750138876, + "loss": 2.879, + "step": 4496 + }, + { + "epoch": 0.1333511253447202, + "grad_norm": 0.14449360966682434, + "learning_rate": 0.0009622116333250901, + "loss": 2.8468, + "step": 4497 + }, + { + "epoch": 0.1333807786970317, + "grad_norm": 0.14247702062129974, + "learning_rate": 0.0009621936875433293, + "loss": 2.8663, + "step": 4498 + }, + { + "epoch": 0.1334104320493432, + "grad_norm": 0.2019369900226593, + "learning_rate": 0.0009621757376687641, + "loss": 2.8387, + "step": 4499 + }, + { + "epoch": 0.13344008540165467, + "grad_norm": 0.16966012120246887, + "learning_rate": 0.0009621577837015534, + "loss": 2.8387, + "step": 4500 + }, + { + "epoch": 0.13346973875396614, + "grad_norm": 0.16511037945747375, + "learning_rate": 0.0009621398256418561, + "loss": 2.8629, + "step": 4501 + }, + { + "epoch": 0.13349939210627762, + "grad_norm": 0.15509265661239624, + "learning_rate": 0.0009621218634898314, + "loss": 2.8948, + "step": 4502 + }, + { + "epoch": 0.1335290454585891, + "grad_norm": 0.1606379598379135, + "learning_rate": 0.0009621038972456383, + "loss": 2.8635, + "step": 4503 + }, + { + "epoch": 0.13355869881090057, + "grad_norm": 0.14956291019916534, + "learning_rate": 0.0009620859269094357, + "loss": 2.8471, + "step": 4504 + }, + { + "epoch": 0.13358835216321205, + "grad_norm": 0.1365787535905838, + "learning_rate": 0.0009620679524813831, + "loss": 2.8301, + "step": 4505 + }, + { + "epoch": 0.13361800551552352, + "grad_norm": 0.1592217981815338, + "learning_rate": 0.0009620499739616395, + "loss": 2.8746, + "step": 4506 + }, + { + "epoch": 0.133647658867835, + "grad_norm": 0.15895028412342072, + "learning_rate": 0.0009620319913503639, + "loss": 2.8467, + "step": 4507 + }, + { + "epoch": 0.13367731222014648, + "grad_norm": 0.14271213114261627, + "learning_rate": 0.0009620140046477157, + "loss": 2.8524, + "step": 4508 + }, + { + "epoch": 0.13370696557245795, + "grad_norm": 0.1645759642124176, + "learning_rate": 0.0009619960138538544, + "loss": 2.8645, + "step": 4509 + }, + { + "epoch": 0.13373661892476946, + "grad_norm": 0.17397668957710266, + "learning_rate": 0.0009619780189689389, + "loss": 2.8978, + "step": 4510 + }, + { + "epoch": 0.13376627227708093, + "grad_norm": 0.19583620131015778, + "learning_rate": 0.0009619600199931289, + "loss": 2.8951, + "step": 4511 + }, + { + "epoch": 0.1337959256293924, + "grad_norm": 0.19507580995559692, + "learning_rate": 0.0009619420169265834, + "loss": 2.8874, + "step": 4512 + }, + { + "epoch": 0.13382557898170389, + "grad_norm": 0.1784968376159668, + "learning_rate": 0.0009619240097694622, + "loss": 2.8516, + "step": 4513 + }, + { + "epoch": 0.13385523233401536, + "grad_norm": 0.16342748701572418, + "learning_rate": 0.0009619059985219246, + "loss": 2.8484, + "step": 4514 + }, + { + "epoch": 0.13388488568632684, + "grad_norm": 0.20111314952373505, + "learning_rate": 0.0009618879831841301, + "loss": 2.8928, + "step": 4515 + }, + { + "epoch": 0.1339145390386383, + "grad_norm": 0.22720558941364288, + "learning_rate": 0.0009618699637562383, + "loss": 2.8873, + "step": 4516 + }, + { + "epoch": 0.1339441923909498, + "grad_norm": 0.28454673290252686, + "learning_rate": 0.0009618519402384085, + "loss": 2.8468, + "step": 4517 + }, + { + "epoch": 0.13397384574326127, + "grad_norm": 0.2898685038089752, + "learning_rate": 0.0009618339126308006, + "loss": 2.9187, + "step": 4518 + }, + { + "epoch": 0.13400349909557274, + "grad_norm": 0.23444505035877228, + "learning_rate": 0.0009618158809335742, + "loss": 2.862, + "step": 4519 + }, + { + "epoch": 0.13403315244788425, + "grad_norm": 0.2582055926322937, + "learning_rate": 0.0009617978451468887, + "loss": 2.8428, + "step": 4520 + }, + { + "epoch": 0.13406280580019572, + "grad_norm": 0.23387566208839417, + "learning_rate": 0.0009617798052709043, + "loss": 2.8436, + "step": 4521 + }, + { + "epoch": 0.1340924591525072, + "grad_norm": 0.16431653499603271, + "learning_rate": 0.0009617617613057803, + "loss": 2.8658, + "step": 4522 + }, + { + "epoch": 0.13412211250481867, + "grad_norm": 0.1789362132549286, + "learning_rate": 0.0009617437132516766, + "loss": 2.8328, + "step": 4523 + }, + { + "epoch": 0.13415176585713015, + "grad_norm": 0.13528360426425934, + "learning_rate": 0.000961725661108753, + "loss": 2.8731, + "step": 4524 + }, + { + "epoch": 0.13418141920944163, + "grad_norm": 0.13235242664813995, + "learning_rate": 0.0009617076048771695, + "loss": 2.8608, + "step": 4525 + }, + { + "epoch": 0.1342110725617531, + "grad_norm": 0.12630847096443176, + "learning_rate": 0.0009616895445570861, + "loss": 2.8174, + "step": 4526 + }, + { + "epoch": 0.13424072591406458, + "grad_norm": 0.11797840148210526, + "learning_rate": 0.0009616714801486624, + "loss": 2.8535, + "step": 4527 + }, + { + "epoch": 0.13427037926637606, + "grad_norm": 0.11903761327266693, + "learning_rate": 0.0009616534116520584, + "loss": 2.8319, + "step": 4528 + }, + { + "epoch": 0.13430003261868753, + "grad_norm": 0.12810465693473816, + "learning_rate": 0.0009616353390674344, + "loss": 2.8765, + "step": 4529 + }, + { + "epoch": 0.134329685970999, + "grad_norm": 0.15702474117279053, + "learning_rate": 0.00096161726239495, + "loss": 2.8388, + "step": 4530 + }, + { + "epoch": 0.1343593393233105, + "grad_norm": 0.14333750307559967, + "learning_rate": 0.0009615991816347655, + "loss": 2.883, + "step": 4531 + }, + { + "epoch": 0.134388992675622, + "grad_norm": 0.15015994012355804, + "learning_rate": 0.0009615810967870414, + "loss": 2.808, + "step": 4532 + }, + { + "epoch": 0.13441864602793346, + "grad_norm": 0.16565178334712982, + "learning_rate": 0.0009615630078519371, + "loss": 2.9044, + "step": 4533 + }, + { + "epoch": 0.13444829938024494, + "grad_norm": 0.15414072573184967, + "learning_rate": 0.0009615449148296132, + "loss": 2.8473, + "step": 4534 + }, + { + "epoch": 0.13447795273255642, + "grad_norm": 0.1588192582130432, + "learning_rate": 0.0009615268177202298, + "loss": 2.9016, + "step": 4535 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 0.14631691575050354, + "learning_rate": 0.0009615087165239473, + "loss": 2.8866, + "step": 4536 + }, + { + "epoch": 0.13453725943717937, + "grad_norm": 0.1168816015124321, + "learning_rate": 0.0009614906112409259, + "loss": 2.8495, + "step": 4537 + }, + { + "epoch": 0.13456691278949084, + "grad_norm": 0.13391394913196564, + "learning_rate": 0.000961472501871326, + "loss": 2.8855, + "step": 4538 + }, + { + "epoch": 0.13459656614180232, + "grad_norm": 0.14942578971385956, + "learning_rate": 0.0009614543884153078, + "loss": 2.8575, + "step": 4539 + }, + { + "epoch": 0.1346262194941138, + "grad_norm": 0.14189963042736053, + "learning_rate": 0.0009614362708730318, + "loss": 2.8643, + "step": 4540 + }, + { + "epoch": 0.1346558728464253, + "grad_norm": 0.15405601263046265, + "learning_rate": 0.0009614181492446583, + "loss": 2.8788, + "step": 4541 + }, + { + "epoch": 0.13468552619873678, + "grad_norm": 0.16902029514312744, + "learning_rate": 0.000961400023530348, + "loss": 2.8312, + "step": 4542 + }, + { + "epoch": 0.13471517955104825, + "grad_norm": 0.18920084834098816, + "learning_rate": 0.0009613818937302612, + "loss": 2.8533, + "step": 4543 + }, + { + "epoch": 0.13474483290335973, + "grad_norm": 0.21411307156085968, + "learning_rate": 0.0009613637598445586, + "loss": 2.8453, + "step": 4544 + }, + { + "epoch": 0.1347744862556712, + "grad_norm": 0.23133626580238342, + "learning_rate": 0.0009613456218734008, + "loss": 2.848, + "step": 4545 + }, + { + "epoch": 0.13480413960798268, + "grad_norm": 0.2014360874891281, + "learning_rate": 0.0009613274798169482, + "loss": 2.8505, + "step": 4546 + }, + { + "epoch": 0.13483379296029416, + "grad_norm": 0.17072956264019012, + "learning_rate": 0.0009613093336753617, + "loss": 2.8783, + "step": 4547 + }, + { + "epoch": 0.13486344631260563, + "grad_norm": 0.1565936654806137, + "learning_rate": 0.0009612911834488018, + "loss": 2.8761, + "step": 4548 + }, + { + "epoch": 0.1348930996649171, + "grad_norm": 0.1478196084499359, + "learning_rate": 0.0009612730291374292, + "loss": 2.8501, + "step": 4549 + }, + { + "epoch": 0.13492275301722859, + "grad_norm": 0.16122375428676605, + "learning_rate": 0.0009612548707414048, + "loss": 2.8672, + "step": 4550 + }, + { + "epoch": 0.1349524063695401, + "grad_norm": 0.1570226103067398, + "learning_rate": 0.0009612367082608895, + "loss": 2.8412, + "step": 4551 + }, + { + "epoch": 0.13498205972185157, + "grad_norm": 0.1513441950082779, + "learning_rate": 0.0009612185416960439, + "loss": 2.8413, + "step": 4552 + }, + { + "epoch": 0.13501171307416304, + "grad_norm": 0.14752547442913055, + "learning_rate": 0.0009612003710470289, + "loss": 2.8333, + "step": 4553 + }, + { + "epoch": 0.13504136642647452, + "grad_norm": 0.14670594036579132, + "learning_rate": 0.0009611821963140055, + "loss": 2.8468, + "step": 4554 + }, + { + "epoch": 0.135071019778786, + "grad_norm": 0.15487201511859894, + "learning_rate": 0.0009611640174971345, + "loss": 2.8914, + "step": 4555 + }, + { + "epoch": 0.13510067313109747, + "grad_norm": 0.1361704021692276, + "learning_rate": 0.000961145834596577, + "loss": 2.8636, + "step": 4556 + }, + { + "epoch": 0.13513032648340895, + "grad_norm": 0.14258375763893127, + "learning_rate": 0.0009611276476124939, + "loss": 2.8391, + "step": 4557 + }, + { + "epoch": 0.13515997983572042, + "grad_norm": 0.1313389241695404, + "learning_rate": 0.0009611094565450466, + "loss": 2.8471, + "step": 4558 + }, + { + "epoch": 0.1351896331880319, + "grad_norm": 0.14653602242469788, + "learning_rate": 0.0009610912613943957, + "loss": 2.8946, + "step": 4559 + }, + { + "epoch": 0.13521928654034338, + "grad_norm": 0.17126929759979248, + "learning_rate": 0.0009610730621607026, + "loss": 2.8591, + "step": 4560 + }, + { + "epoch": 0.13524893989265485, + "grad_norm": 0.17794832587242126, + "learning_rate": 0.0009610548588441283, + "loss": 2.8479, + "step": 4561 + }, + { + "epoch": 0.13527859324496636, + "grad_norm": 0.18159599602222443, + "learning_rate": 0.0009610366514448342, + "loss": 2.8604, + "step": 4562 + }, + { + "epoch": 0.13530824659727783, + "grad_norm": 0.1734713762998581, + "learning_rate": 0.0009610184399629813, + "loss": 2.8601, + "step": 4563 + }, + { + "epoch": 0.1353378999495893, + "grad_norm": 0.18241746723651886, + "learning_rate": 0.0009610002243987311, + "loss": 2.8375, + "step": 4564 + }, + { + "epoch": 0.13536755330190078, + "grad_norm": 0.19568178057670593, + "learning_rate": 0.0009609820047522448, + "loss": 2.865, + "step": 4565 + }, + { + "epoch": 0.13539720665421226, + "grad_norm": 0.17072390019893646, + "learning_rate": 0.0009609637810236837, + "loss": 2.8539, + "step": 4566 + }, + { + "epoch": 0.13542686000652374, + "grad_norm": 0.16429808735847473, + "learning_rate": 0.0009609455532132091, + "loss": 2.8607, + "step": 4567 + }, + { + "epoch": 0.1354565133588352, + "grad_norm": 0.14739546179771423, + "learning_rate": 0.0009609273213209826, + "loss": 2.8584, + "step": 4568 + }, + { + "epoch": 0.1354861667111467, + "grad_norm": 0.1482408195734024, + "learning_rate": 0.0009609090853471654, + "loss": 2.8567, + "step": 4569 + }, + { + "epoch": 0.13551582006345816, + "grad_norm": 0.1507638543844223, + "learning_rate": 0.0009608908452919194, + "loss": 2.8659, + "step": 4570 + }, + { + "epoch": 0.13554547341576964, + "grad_norm": 0.15342020988464355, + "learning_rate": 0.0009608726011554056, + "loss": 2.8923, + "step": 4571 + }, + { + "epoch": 0.13557512676808114, + "grad_norm": 0.15760180354118347, + "learning_rate": 0.000960854352937786, + "loss": 2.8631, + "step": 4572 + }, + { + "epoch": 0.13560478012039262, + "grad_norm": 0.1457705795764923, + "learning_rate": 0.0009608361006392219, + "loss": 2.8581, + "step": 4573 + }, + { + "epoch": 0.1356344334727041, + "grad_norm": 0.15108667314052582, + "learning_rate": 0.000960817844259875, + "loss": 2.8504, + "step": 4574 + }, + { + "epoch": 0.13566408682501557, + "grad_norm": 0.167293518781662, + "learning_rate": 0.0009607995837999071, + "loss": 2.8221, + "step": 4575 + }, + { + "epoch": 0.13569374017732705, + "grad_norm": 0.1698719710111618, + "learning_rate": 0.0009607813192594796, + "loss": 2.8562, + "step": 4576 + }, + { + "epoch": 0.13572339352963853, + "grad_norm": 0.1768784523010254, + "learning_rate": 0.0009607630506387546, + "loss": 2.8534, + "step": 4577 + }, + { + "epoch": 0.13575304688195, + "grad_norm": 0.18110312521457672, + "learning_rate": 0.0009607447779378937, + "loss": 2.8872, + "step": 4578 + }, + { + "epoch": 0.13578270023426148, + "grad_norm": 0.21300873160362244, + "learning_rate": 0.0009607265011570585, + "loss": 2.8617, + "step": 4579 + }, + { + "epoch": 0.13581235358657295, + "grad_norm": 0.2238638550043106, + "learning_rate": 0.0009607082202964112, + "loss": 2.8583, + "step": 4580 + }, + { + "epoch": 0.13584200693888443, + "grad_norm": 0.22209849953651428, + "learning_rate": 0.0009606899353561136, + "loss": 2.8511, + "step": 4581 + }, + { + "epoch": 0.1358716602911959, + "grad_norm": 0.22093507647514343, + "learning_rate": 0.0009606716463363274, + "loss": 2.8712, + "step": 4582 + }, + { + "epoch": 0.1359013136435074, + "grad_norm": 0.22236129641532898, + "learning_rate": 0.0009606533532372148, + "loss": 2.8422, + "step": 4583 + }, + { + "epoch": 0.13593096699581889, + "grad_norm": 0.1752251237630844, + "learning_rate": 0.0009606350560589377, + "loss": 2.8466, + "step": 4584 + }, + { + "epoch": 0.13596062034813036, + "grad_norm": 0.17875906825065613, + "learning_rate": 0.0009606167548016581, + "loss": 2.8697, + "step": 4585 + }, + { + "epoch": 0.13599027370044184, + "grad_norm": 0.24491703510284424, + "learning_rate": 0.0009605984494655379, + "loss": 2.8689, + "step": 4586 + }, + { + "epoch": 0.13601992705275331, + "grad_norm": 0.15481267869472504, + "learning_rate": 0.0009605801400507397, + "loss": 2.8327, + "step": 4587 + }, + { + "epoch": 0.1360495804050648, + "grad_norm": 0.1495916098356247, + "learning_rate": 0.0009605618265574251, + "loss": 2.8508, + "step": 4588 + }, + { + "epoch": 0.13607923375737627, + "grad_norm": 0.14165304601192474, + "learning_rate": 0.0009605435089857564, + "loss": 2.8521, + "step": 4589 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 0.13864098489284515, + "learning_rate": 0.000960525187335896, + "loss": 2.856, + "step": 4590 + }, + { + "epoch": 0.13613854046199922, + "grad_norm": 0.12862512469291687, + "learning_rate": 0.000960506861608006, + "loss": 2.8561, + "step": 4591 + }, + { + "epoch": 0.1361681938143107, + "grad_norm": 0.13301128149032593, + "learning_rate": 0.0009604885318022487, + "loss": 2.8765, + "step": 4592 + }, + { + "epoch": 0.1361978471666222, + "grad_norm": 0.1288823038339615, + "learning_rate": 0.0009604701979187864, + "loss": 2.8734, + "step": 4593 + }, + { + "epoch": 0.13622750051893368, + "grad_norm": 0.12822549045085907, + "learning_rate": 0.0009604518599577814, + "loss": 2.8798, + "step": 4594 + }, + { + "epoch": 0.13625715387124515, + "grad_norm": 0.12463908642530441, + "learning_rate": 0.0009604335179193962, + "loss": 2.874, + "step": 4595 + }, + { + "epoch": 0.13628680722355663, + "grad_norm": 0.15013650059700012, + "learning_rate": 0.0009604151718037933, + "loss": 2.8621, + "step": 4596 + }, + { + "epoch": 0.1363164605758681, + "grad_norm": 0.1440981924533844, + "learning_rate": 0.0009603968216111348, + "loss": 2.8487, + "step": 4597 + }, + { + "epoch": 0.13634611392817958, + "grad_norm": 0.13839471340179443, + "learning_rate": 0.0009603784673415834, + "loss": 2.8527, + "step": 4598 + }, + { + "epoch": 0.13637576728049106, + "grad_norm": 0.17080551385879517, + "learning_rate": 0.0009603601089953018, + "loss": 2.892, + "step": 4599 + }, + { + "epoch": 0.13640542063280253, + "grad_norm": 0.18967005610466003, + "learning_rate": 0.0009603417465724525, + "loss": 2.8506, + "step": 4600 + }, + { + "epoch": 0.136435073985114, + "grad_norm": 0.20792733132839203, + "learning_rate": 0.0009603233800731978, + "loss": 2.8649, + "step": 4601 + }, + { + "epoch": 0.13646472733742548, + "grad_norm": 0.22185863554477692, + "learning_rate": 0.0009603050094977006, + "loss": 2.8888, + "step": 4602 + }, + { + "epoch": 0.136494380689737, + "grad_norm": 0.16754736006259918, + "learning_rate": 0.0009602866348461236, + "loss": 2.876, + "step": 4603 + }, + { + "epoch": 0.13652403404204846, + "grad_norm": 0.17254306375980377, + "learning_rate": 0.0009602682561186294, + "loss": 2.8601, + "step": 4604 + }, + { + "epoch": 0.13655368739435994, + "grad_norm": 0.18111008405685425, + "learning_rate": 0.0009602498733153809, + "loss": 2.8634, + "step": 4605 + }, + { + "epoch": 0.13658334074667142, + "grad_norm": 0.14892511069774628, + "learning_rate": 0.0009602314864365404, + "loss": 2.84, + "step": 4606 + }, + { + "epoch": 0.1366129940989829, + "grad_norm": 0.14771819114685059, + "learning_rate": 0.0009602130954822714, + "loss": 2.8534, + "step": 4607 + }, + { + "epoch": 0.13664264745129437, + "grad_norm": 0.1579388678073883, + "learning_rate": 0.0009601947004527364, + "loss": 2.8378, + "step": 4608 + }, + { + "epoch": 0.13667230080360585, + "grad_norm": 0.13639304041862488, + "learning_rate": 0.0009601763013480984, + "loss": 2.8813, + "step": 4609 + }, + { + "epoch": 0.13670195415591732, + "grad_norm": 0.1431570202112198, + "learning_rate": 0.0009601578981685201, + "loss": 2.8789, + "step": 4610 + }, + { + "epoch": 0.1367316075082288, + "grad_norm": 0.15553708374500275, + "learning_rate": 0.0009601394909141648, + "loss": 2.8203, + "step": 4611 + }, + { + "epoch": 0.13676126086054027, + "grad_norm": 0.1865958273410797, + "learning_rate": 0.0009601210795851953, + "loss": 2.8406, + "step": 4612 + }, + { + "epoch": 0.13679091421285175, + "grad_norm": 0.19788818061351776, + "learning_rate": 0.0009601026641817747, + "loss": 2.8511, + "step": 4613 + }, + { + "epoch": 0.13682056756516325, + "grad_norm": 0.17052999138832092, + "learning_rate": 0.0009600842447040659, + "loss": 2.8525, + "step": 4614 + }, + { + "epoch": 0.13685022091747473, + "grad_norm": 0.155185729265213, + "learning_rate": 0.0009600658211522322, + "loss": 2.8663, + "step": 4615 + }, + { + "epoch": 0.1368798742697862, + "grad_norm": 0.1573803424835205, + "learning_rate": 0.0009600473935264367, + "loss": 2.8263, + "step": 4616 + }, + { + "epoch": 0.13690952762209768, + "grad_norm": 0.15831857919692993, + "learning_rate": 0.0009600289618268425, + "loss": 2.8425, + "step": 4617 + }, + { + "epoch": 0.13693918097440916, + "grad_norm": 0.16604521870613098, + "learning_rate": 0.000960010526053613, + "loss": 2.8863, + "step": 4618 + }, + { + "epoch": 0.13696883432672063, + "grad_norm": 0.1732517033815384, + "learning_rate": 0.0009599920862069112, + "loss": 2.8661, + "step": 4619 + }, + { + "epoch": 0.1369984876790321, + "grad_norm": 0.17432419955730438, + "learning_rate": 0.0009599736422869006, + "loss": 2.8732, + "step": 4620 + }, + { + "epoch": 0.1370281410313436, + "grad_norm": 0.179005965590477, + "learning_rate": 0.0009599551942937444, + "loss": 2.8897, + "step": 4621 + }, + { + "epoch": 0.13705779438365506, + "grad_norm": 0.17724847793579102, + "learning_rate": 0.000959936742227606, + "loss": 2.8408, + "step": 4622 + }, + { + "epoch": 0.13708744773596654, + "grad_norm": 0.17355017364025116, + "learning_rate": 0.0009599182860886488, + "loss": 2.8326, + "step": 4623 + }, + { + "epoch": 0.13711710108827804, + "grad_norm": 0.15981532633304596, + "learning_rate": 0.0009598998258770362, + "loss": 2.8857, + "step": 4624 + }, + { + "epoch": 0.13714675444058952, + "grad_norm": 0.1430581957101822, + "learning_rate": 0.0009598813615929318, + "loss": 2.8371, + "step": 4625 + }, + { + "epoch": 0.137176407792901, + "grad_norm": 0.1379367858171463, + "learning_rate": 0.000959862893236499, + "loss": 2.8643, + "step": 4626 + }, + { + "epoch": 0.13720606114521247, + "grad_norm": 0.15437431633472443, + "learning_rate": 0.0009598444208079013, + "loss": 2.8461, + "step": 4627 + }, + { + "epoch": 0.13723571449752395, + "grad_norm": 0.16920024156570435, + "learning_rate": 0.0009598259443073023, + "loss": 2.8935, + "step": 4628 + }, + { + "epoch": 0.13726536784983542, + "grad_norm": 0.20526961982250214, + "learning_rate": 0.0009598074637348657, + "loss": 2.8549, + "step": 4629 + }, + { + "epoch": 0.1372950212021469, + "grad_norm": 0.2309492826461792, + "learning_rate": 0.000959788979090755, + "loss": 2.8519, + "step": 4630 + }, + { + "epoch": 0.13732467455445838, + "grad_norm": 0.2266542613506317, + "learning_rate": 0.000959770490375134, + "loss": 2.8809, + "step": 4631 + }, + { + "epoch": 0.13735432790676985, + "grad_norm": 0.18747277557849884, + "learning_rate": 0.0009597519975881665, + "loss": 2.8551, + "step": 4632 + }, + { + "epoch": 0.13738398125908133, + "grad_norm": 0.19229751825332642, + "learning_rate": 0.000959733500730016, + "loss": 2.8532, + "step": 4633 + }, + { + "epoch": 0.1374136346113928, + "grad_norm": 0.20898562669754028, + "learning_rate": 0.0009597149998008466, + "loss": 2.8387, + "step": 4634 + }, + { + "epoch": 0.1374432879637043, + "grad_norm": 0.17866893112659454, + "learning_rate": 0.0009596964948008217, + "loss": 2.8854, + "step": 4635 + }, + { + "epoch": 0.13747294131601578, + "grad_norm": 0.17776115238666534, + "learning_rate": 0.0009596779857301056, + "loss": 2.847, + "step": 4636 + }, + { + "epoch": 0.13750259466832726, + "grad_norm": 0.18024003505706787, + "learning_rate": 0.0009596594725888621, + "loss": 2.8569, + "step": 4637 + }, + { + "epoch": 0.13753224802063874, + "grad_norm": 0.14075452089309692, + "learning_rate": 0.000959640955377255, + "loss": 2.864, + "step": 4638 + }, + { + "epoch": 0.1375619013729502, + "grad_norm": 0.15202559530735016, + "learning_rate": 0.0009596224340954482, + "loss": 2.8723, + "step": 4639 + }, + { + "epoch": 0.1375915547252617, + "grad_norm": 0.19111333787441254, + "learning_rate": 0.000959603908743606, + "loss": 2.8701, + "step": 4640 + }, + { + "epoch": 0.13762120807757316, + "grad_norm": 0.20098397135734558, + "learning_rate": 0.0009595853793218923, + "loss": 2.8722, + "step": 4641 + }, + { + "epoch": 0.13765086142988464, + "grad_norm": 0.16433323919773102, + "learning_rate": 0.0009595668458304711, + "loss": 2.7987, + "step": 4642 + }, + { + "epoch": 0.13768051478219612, + "grad_norm": 0.15206217765808105, + "learning_rate": 0.0009595483082695068, + "loss": 2.8822, + "step": 4643 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 0.13399595022201538, + "learning_rate": 0.0009595297666391632, + "loss": 2.8773, + "step": 4644 + }, + { + "epoch": 0.1377398214868191, + "grad_norm": 0.15647612512111664, + "learning_rate": 0.0009595112209396046, + "loss": 2.848, + "step": 4645 + }, + { + "epoch": 0.13776947483913057, + "grad_norm": 0.15639124810695648, + "learning_rate": 0.0009594926711709953, + "loss": 2.8566, + "step": 4646 + }, + { + "epoch": 0.13779912819144205, + "grad_norm": 0.1531158685684204, + "learning_rate": 0.0009594741173334996, + "loss": 2.8394, + "step": 4647 + }, + { + "epoch": 0.13782878154375353, + "grad_norm": 0.14960601925849915, + "learning_rate": 0.0009594555594272816, + "loss": 2.8443, + "step": 4648 + }, + { + "epoch": 0.137858434896065, + "grad_norm": 0.1481797844171524, + "learning_rate": 0.0009594369974525059, + "loss": 2.8675, + "step": 4649 + }, + { + "epoch": 0.13788808824837648, + "grad_norm": 0.16267584264278412, + "learning_rate": 0.0009594184314093366, + "loss": 2.8727, + "step": 4650 + }, + { + "epoch": 0.13791774160068795, + "grad_norm": 0.17027276754379272, + "learning_rate": 0.0009593998612979383, + "loss": 2.8903, + "step": 4651 + }, + { + "epoch": 0.13794739495299943, + "grad_norm": 0.16683200001716614, + "learning_rate": 0.0009593812871184754, + "loss": 2.8265, + "step": 4652 + }, + { + "epoch": 0.1379770483053109, + "grad_norm": 0.20165708661079407, + "learning_rate": 0.0009593627088711124, + "loss": 2.8835, + "step": 4653 + }, + { + "epoch": 0.13800670165762238, + "grad_norm": 0.21222035586833954, + "learning_rate": 0.0009593441265560136, + "loss": 2.8882, + "step": 4654 + }, + { + "epoch": 0.1380363550099339, + "grad_norm": 0.2237689048051834, + "learning_rate": 0.0009593255401733437, + "loss": 2.8643, + "step": 4655 + }, + { + "epoch": 0.13806600836224536, + "grad_norm": 0.19109809398651123, + "learning_rate": 0.0009593069497232674, + "loss": 2.8695, + "step": 4656 + }, + { + "epoch": 0.13809566171455684, + "grad_norm": 0.19332453608512878, + "learning_rate": 0.0009592883552059493, + "loss": 2.8804, + "step": 4657 + }, + { + "epoch": 0.13812531506686831, + "grad_norm": 0.19847160577774048, + "learning_rate": 0.0009592697566215538, + "loss": 2.875, + "step": 4658 + }, + { + "epoch": 0.1381549684191798, + "grad_norm": 0.1531039923429489, + "learning_rate": 0.0009592511539702459, + "loss": 2.8637, + "step": 4659 + }, + { + "epoch": 0.13818462177149127, + "grad_norm": 0.15361814200878143, + "learning_rate": 0.0009592325472521901, + "loss": 2.8654, + "step": 4660 + }, + { + "epoch": 0.13821427512380274, + "grad_norm": 0.17242848873138428, + "learning_rate": 0.0009592139364675514, + "loss": 2.829, + "step": 4661 + }, + { + "epoch": 0.13824392847611422, + "grad_norm": 0.15811291337013245, + "learning_rate": 0.0009591953216164943, + "loss": 2.844, + "step": 4662 + }, + { + "epoch": 0.1382735818284257, + "grad_norm": 0.14131247997283936, + "learning_rate": 0.000959176702699184, + "loss": 2.8484, + "step": 4663 + }, + { + "epoch": 0.13830323518073717, + "grad_norm": 0.14984767138957977, + "learning_rate": 0.0009591580797157851, + "loss": 2.8553, + "step": 4664 + }, + { + "epoch": 0.13833288853304865, + "grad_norm": 0.15543712675571442, + "learning_rate": 0.0009591394526664625, + "loss": 2.881, + "step": 4665 + }, + { + "epoch": 0.13836254188536015, + "grad_norm": 0.13518226146697998, + "learning_rate": 0.0009591208215513813, + "loss": 2.8303, + "step": 4666 + }, + { + "epoch": 0.13839219523767163, + "grad_norm": 0.15797999501228333, + "learning_rate": 0.0009591021863707065, + "loss": 2.8419, + "step": 4667 + }, + { + "epoch": 0.1384218485899831, + "grad_norm": 0.178233802318573, + "learning_rate": 0.0009590835471246029, + "loss": 2.8492, + "step": 4668 + }, + { + "epoch": 0.13845150194229458, + "grad_norm": 0.18504071235656738, + "learning_rate": 0.0009590649038132358, + "loss": 2.8809, + "step": 4669 + }, + { + "epoch": 0.13848115529460606, + "grad_norm": 0.2062065601348877, + "learning_rate": 0.0009590462564367701, + "loss": 2.8234, + "step": 4670 + }, + { + "epoch": 0.13851080864691753, + "grad_norm": 0.20742741227149963, + "learning_rate": 0.000959027604995371, + "loss": 2.8515, + "step": 4671 + }, + { + "epoch": 0.138540461999229, + "grad_norm": 0.17801563441753387, + "learning_rate": 0.0009590089494892039, + "loss": 2.8303, + "step": 4672 + }, + { + "epoch": 0.13857011535154048, + "grad_norm": 0.15651766955852509, + "learning_rate": 0.0009589902899184334, + "loss": 2.8486, + "step": 4673 + }, + { + "epoch": 0.13859976870385196, + "grad_norm": 0.1615440994501114, + "learning_rate": 0.0009589716262832253, + "loss": 2.868, + "step": 4674 + }, + { + "epoch": 0.13862942205616344, + "grad_norm": 0.1481926292181015, + "learning_rate": 0.0009589529585837446, + "loss": 2.8273, + "step": 4675 + }, + { + "epoch": 0.13865907540847494, + "grad_norm": 0.15109729766845703, + "learning_rate": 0.0009589342868201568, + "loss": 2.8905, + "step": 4676 + }, + { + "epoch": 0.13868872876078642, + "grad_norm": 0.1567247211933136, + "learning_rate": 0.0009589156109926269, + "loss": 2.8582, + "step": 4677 + }, + { + "epoch": 0.1387183821130979, + "grad_norm": 0.1573452204465866, + "learning_rate": 0.0009588969311013207, + "loss": 2.8565, + "step": 4678 + }, + { + "epoch": 0.13874803546540937, + "grad_norm": 0.16213496029376984, + "learning_rate": 0.0009588782471464033, + "loss": 2.8554, + "step": 4679 + }, + { + "epoch": 0.13877768881772085, + "grad_norm": 0.16413861513137817, + "learning_rate": 0.0009588595591280403, + "loss": 2.8585, + "step": 4680 + }, + { + "epoch": 0.13880734217003232, + "grad_norm": 0.1720077246427536, + "learning_rate": 0.0009588408670463971, + "loss": 2.8856, + "step": 4681 + }, + { + "epoch": 0.1388369955223438, + "grad_norm": 0.16059499979019165, + "learning_rate": 0.0009588221709016392, + "loss": 2.8728, + "step": 4682 + }, + { + "epoch": 0.13886664887465527, + "grad_norm": 0.16495686769485474, + "learning_rate": 0.0009588034706939323, + "loss": 2.834, + "step": 4683 + }, + { + "epoch": 0.13889630222696675, + "grad_norm": 0.1692160815000534, + "learning_rate": 0.0009587847664234419, + "loss": 2.8279, + "step": 4684 + }, + { + "epoch": 0.13892595557927823, + "grad_norm": 0.17207583785057068, + "learning_rate": 0.0009587660580903338, + "loss": 2.8425, + "step": 4685 + }, + { + "epoch": 0.1389556089315897, + "grad_norm": 0.18784672021865845, + "learning_rate": 0.0009587473456947733, + "loss": 2.8192, + "step": 4686 + }, + { + "epoch": 0.1389852622839012, + "grad_norm": 0.1685241460800171, + "learning_rate": 0.0009587286292369264, + "loss": 2.848, + "step": 4687 + }, + { + "epoch": 0.13901491563621268, + "grad_norm": 0.15274064242839813, + "learning_rate": 0.0009587099087169587, + "loss": 2.8467, + "step": 4688 + }, + { + "epoch": 0.13904456898852416, + "grad_norm": 0.16050156950950623, + "learning_rate": 0.0009586911841350361, + "loss": 2.8329, + "step": 4689 + }, + { + "epoch": 0.13907422234083563, + "grad_norm": 0.15996809303760529, + "learning_rate": 0.0009586724554913243, + "loss": 2.8489, + "step": 4690 + }, + { + "epoch": 0.1391038756931471, + "grad_norm": 0.13536924123764038, + "learning_rate": 0.0009586537227859892, + "loss": 2.8169, + "step": 4691 + }, + { + "epoch": 0.1391335290454586, + "grad_norm": 0.13866648077964783, + "learning_rate": 0.0009586349860191965, + "loss": 2.8712, + "step": 4692 + }, + { + "epoch": 0.13916318239777006, + "grad_norm": 0.1458006650209427, + "learning_rate": 0.0009586162451911124, + "loss": 2.8413, + "step": 4693 + }, + { + "epoch": 0.13919283575008154, + "grad_norm": 0.16291311383247375, + "learning_rate": 0.0009585975003019027, + "loss": 2.8688, + "step": 4694 + }, + { + "epoch": 0.13922248910239302, + "grad_norm": 0.1836056262254715, + "learning_rate": 0.0009585787513517334, + "loss": 2.8798, + "step": 4695 + }, + { + "epoch": 0.1392521424547045, + "grad_norm": 0.22061730921268463, + "learning_rate": 0.0009585599983407707, + "loss": 2.8791, + "step": 4696 + }, + { + "epoch": 0.139281795807016, + "grad_norm": 0.20423679053783417, + "learning_rate": 0.0009585412412691805, + "loss": 2.8449, + "step": 4697 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 0.17016535997390747, + "learning_rate": 0.0009585224801371286, + "loss": 2.8376, + "step": 4698 + }, + { + "epoch": 0.13934110251163895, + "grad_norm": 0.1791103631258011, + "learning_rate": 0.0009585037149447817, + "loss": 2.8337, + "step": 4699 + }, + { + "epoch": 0.13937075586395042, + "grad_norm": 0.1725824773311615, + "learning_rate": 0.0009584849456923057, + "loss": 2.8382, + "step": 4700 + }, + { + "epoch": 0.1394004092162619, + "grad_norm": 0.16602104902267456, + "learning_rate": 0.0009584661723798666, + "loss": 2.8428, + "step": 4701 + }, + { + "epoch": 0.13943006256857338, + "grad_norm": 0.17114271223545074, + "learning_rate": 0.0009584473950076312, + "loss": 2.8442, + "step": 4702 + }, + { + "epoch": 0.13945971592088485, + "grad_norm": 0.1822984367609024, + "learning_rate": 0.0009584286135757651, + "loss": 2.8364, + "step": 4703 + }, + { + "epoch": 0.13948936927319633, + "grad_norm": 0.16353049874305725, + "learning_rate": 0.000958409828084435, + "loss": 2.8676, + "step": 4704 + }, + { + "epoch": 0.1395190226255078, + "grad_norm": 0.1678495854139328, + "learning_rate": 0.0009583910385338073, + "loss": 2.8411, + "step": 4705 + }, + { + "epoch": 0.13954867597781928, + "grad_norm": 0.16468271613121033, + "learning_rate": 0.0009583722449240481, + "loss": 2.8696, + "step": 4706 + }, + { + "epoch": 0.13957832933013078, + "grad_norm": 0.1869918256998062, + "learning_rate": 0.0009583534472553241, + "loss": 2.8037, + "step": 4707 + }, + { + "epoch": 0.13960798268244226, + "grad_norm": 0.19605456292629242, + "learning_rate": 0.0009583346455278017, + "loss": 2.8762, + "step": 4708 + }, + { + "epoch": 0.13963763603475374, + "grad_norm": 0.1839001476764679, + "learning_rate": 0.0009583158397416473, + "loss": 2.7987, + "step": 4709 + }, + { + "epoch": 0.1396672893870652, + "grad_norm": 0.17625372111797333, + "learning_rate": 0.0009582970298970274, + "loss": 2.8289, + "step": 4710 + }, + { + "epoch": 0.1396969427393767, + "grad_norm": 0.16388019919395447, + "learning_rate": 0.0009582782159941088, + "loss": 2.8766, + "step": 4711 + }, + { + "epoch": 0.13972659609168817, + "grad_norm": 0.14923416078090668, + "learning_rate": 0.0009582593980330578, + "loss": 2.8708, + "step": 4712 + }, + { + "epoch": 0.13975624944399964, + "grad_norm": 0.14671097695827484, + "learning_rate": 0.0009582405760140411, + "loss": 2.8772, + "step": 4713 + }, + { + "epoch": 0.13978590279631112, + "grad_norm": 0.14864006638526917, + "learning_rate": 0.0009582217499372257, + "loss": 2.8371, + "step": 4714 + }, + { + "epoch": 0.1398155561486226, + "grad_norm": 0.16780583560466766, + "learning_rate": 0.0009582029198027778, + "loss": 2.8424, + "step": 4715 + }, + { + "epoch": 0.13984520950093407, + "grad_norm": 0.1548002064228058, + "learning_rate": 0.0009581840856108646, + "loss": 2.8451, + "step": 4716 + }, + { + "epoch": 0.13987486285324555, + "grad_norm": 0.14150263369083405, + "learning_rate": 0.0009581652473616524, + "loss": 2.8829, + "step": 4717 + }, + { + "epoch": 0.13990451620555705, + "grad_norm": 0.14866399765014648, + "learning_rate": 0.0009581464050553086, + "loss": 2.9042, + "step": 4718 + }, + { + "epoch": 0.13993416955786853, + "grad_norm": 0.14807720482349396, + "learning_rate": 0.0009581275586919995, + "loss": 2.8235, + "step": 4719 + }, + { + "epoch": 0.13996382291018, + "grad_norm": 0.15222910046577454, + "learning_rate": 0.0009581087082718924, + "loss": 2.8344, + "step": 4720 + }, + { + "epoch": 0.13999347626249148, + "grad_norm": 0.1694420874118805, + "learning_rate": 0.0009580898537951539, + "loss": 2.8351, + "step": 4721 + }, + { + "epoch": 0.14002312961480295, + "grad_norm": 0.16980168223381042, + "learning_rate": 0.0009580709952619513, + "loss": 2.8417, + "step": 4722 + }, + { + "epoch": 0.14005278296711443, + "grad_norm": 0.1394941806793213, + "learning_rate": 0.0009580521326724513, + "loss": 2.8669, + "step": 4723 + }, + { + "epoch": 0.1400824363194259, + "grad_norm": 0.1625019609928131, + "learning_rate": 0.000958033266026821, + "loss": 2.8882, + "step": 4724 + }, + { + "epoch": 0.14011208967173738, + "grad_norm": 0.18701650202274323, + "learning_rate": 0.0009580143953252276, + "loss": 2.9144, + "step": 4725 + }, + { + "epoch": 0.14014174302404886, + "grad_norm": 0.18990114331245422, + "learning_rate": 0.0009579955205678381, + "loss": 2.8674, + "step": 4726 + }, + { + "epoch": 0.14017139637636034, + "grad_norm": 0.17928791046142578, + "learning_rate": 0.0009579766417548196, + "loss": 2.8299, + "step": 4727 + }, + { + "epoch": 0.14020104972867184, + "grad_norm": 0.18731480836868286, + "learning_rate": 0.0009579577588863392, + "loss": 2.8722, + "step": 4728 + }, + { + "epoch": 0.14023070308098332, + "grad_norm": 0.17759425938129425, + "learning_rate": 0.0009579388719625645, + "loss": 2.8465, + "step": 4729 + }, + { + "epoch": 0.1402603564332948, + "grad_norm": 0.19355212152004242, + "learning_rate": 0.0009579199809836624, + "loss": 2.8443, + "step": 4730 + }, + { + "epoch": 0.14029000978560627, + "grad_norm": 0.1939537674188614, + "learning_rate": 0.0009579010859498003, + "loss": 2.9097, + "step": 4731 + }, + { + "epoch": 0.14031966313791774, + "grad_norm": 0.1509189009666443, + "learning_rate": 0.0009578821868611453, + "loss": 2.8673, + "step": 4732 + }, + { + "epoch": 0.14034931649022922, + "grad_norm": 0.15129032731056213, + "learning_rate": 0.0009578632837178652, + "loss": 2.8422, + "step": 4733 + }, + { + "epoch": 0.1403789698425407, + "grad_norm": 0.14302004873752594, + "learning_rate": 0.000957844376520127, + "loss": 2.8559, + "step": 4734 + }, + { + "epoch": 0.14040862319485217, + "grad_norm": 0.1333545446395874, + "learning_rate": 0.0009578254652680982, + "loss": 2.8461, + "step": 4735 + }, + { + "epoch": 0.14043827654716365, + "grad_norm": 0.166955828666687, + "learning_rate": 0.0009578065499619464, + "loss": 2.8409, + "step": 4736 + }, + { + "epoch": 0.14046792989947512, + "grad_norm": 0.17620526254177094, + "learning_rate": 0.000957787630601839, + "loss": 2.8555, + "step": 4737 + }, + { + "epoch": 0.1404975832517866, + "grad_norm": 0.156056210398674, + "learning_rate": 0.0009577687071879435, + "loss": 2.8445, + "step": 4738 + }, + { + "epoch": 0.1405272366040981, + "grad_norm": 0.15387539565563202, + "learning_rate": 0.0009577497797204276, + "loss": 2.8685, + "step": 4739 + }, + { + "epoch": 0.14055688995640958, + "grad_norm": 0.1858622282743454, + "learning_rate": 0.0009577308481994589, + "loss": 2.8543, + "step": 4740 + }, + { + "epoch": 0.14058654330872106, + "grad_norm": 0.1889190673828125, + "learning_rate": 0.0009577119126252048, + "loss": 2.8439, + "step": 4741 + }, + { + "epoch": 0.14061619666103253, + "grad_norm": 0.20826934278011322, + "learning_rate": 0.0009576929729978332, + "loss": 2.8517, + "step": 4742 + }, + { + "epoch": 0.140645850013344, + "grad_norm": 0.23604373633861542, + "learning_rate": 0.0009576740293175118, + "loss": 2.8512, + "step": 4743 + }, + { + "epoch": 0.14067550336565549, + "grad_norm": 0.21140095591545105, + "learning_rate": 0.0009576550815844082, + "loss": 2.8456, + "step": 4744 + }, + { + "epoch": 0.14070515671796696, + "grad_norm": 0.20375114679336548, + "learning_rate": 0.0009576361297986904, + "loss": 2.8436, + "step": 4745 + }, + { + "epoch": 0.14073481007027844, + "grad_norm": 0.21014589071273804, + "learning_rate": 0.0009576171739605261, + "loss": 2.8663, + "step": 4746 + }, + { + "epoch": 0.1407644634225899, + "grad_norm": 0.19828712940216064, + "learning_rate": 0.0009575982140700833, + "loss": 2.8537, + "step": 4747 + }, + { + "epoch": 0.1407941167749014, + "grad_norm": 0.20431974530220032, + "learning_rate": 0.0009575792501275295, + "loss": 2.8793, + "step": 4748 + }, + { + "epoch": 0.1408237701272129, + "grad_norm": 0.17619316279888153, + "learning_rate": 0.0009575602821330332, + "loss": 2.8582, + "step": 4749 + }, + { + "epoch": 0.14085342347952437, + "grad_norm": 0.18459805846214294, + "learning_rate": 0.0009575413100867619, + "loss": 2.8409, + "step": 4750 + }, + { + "epoch": 0.14088307683183585, + "grad_norm": 0.16207943856716156, + "learning_rate": 0.0009575223339888838, + "loss": 2.8192, + "step": 4751 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 0.13571710884571075, + "learning_rate": 0.0009575033538395669, + "loss": 2.8793, + "step": 4752 + }, + { + "epoch": 0.1409423835364588, + "grad_norm": 0.1633492112159729, + "learning_rate": 0.0009574843696389792, + "loss": 2.8319, + "step": 4753 + }, + { + "epoch": 0.14097203688877027, + "grad_norm": 0.158145934343338, + "learning_rate": 0.0009574653813872888, + "loss": 2.8636, + "step": 4754 + }, + { + "epoch": 0.14100169024108175, + "grad_norm": 0.14385181665420532, + "learning_rate": 0.0009574463890846642, + "loss": 2.8261, + "step": 4755 + }, + { + "epoch": 0.14103134359339323, + "grad_norm": 0.13861891627311707, + "learning_rate": 0.0009574273927312731, + "loss": 2.8151, + "step": 4756 + }, + { + "epoch": 0.1410609969457047, + "grad_norm": 0.11622875183820724, + "learning_rate": 0.000957408392327284, + "loss": 2.8575, + "step": 4757 + }, + { + "epoch": 0.14109065029801618, + "grad_norm": 0.15259653329849243, + "learning_rate": 0.0009573893878728651, + "loss": 2.8416, + "step": 4758 + }, + { + "epoch": 0.14112030365032768, + "grad_norm": 0.13602131605148315, + "learning_rate": 0.0009573703793681846, + "loss": 2.821, + "step": 4759 + }, + { + "epoch": 0.14114995700263916, + "grad_norm": 0.11591412872076035, + "learning_rate": 0.0009573513668134109, + "loss": 2.8538, + "step": 4760 + }, + { + "epoch": 0.14117961035495064, + "grad_norm": 0.1271556317806244, + "learning_rate": 0.0009573323502087124, + "loss": 2.8546, + "step": 4761 + }, + { + "epoch": 0.1412092637072621, + "grad_norm": 0.1550300568342209, + "learning_rate": 0.0009573133295542574, + "loss": 2.8759, + "step": 4762 + }, + { + "epoch": 0.1412389170595736, + "grad_norm": 0.1598680317401886, + "learning_rate": 0.0009572943048502143, + "loss": 2.8401, + "step": 4763 + }, + { + "epoch": 0.14126857041188506, + "grad_norm": 0.17238101363182068, + "learning_rate": 0.0009572752760967517, + "loss": 2.8372, + "step": 4764 + }, + { + "epoch": 0.14129822376419654, + "grad_norm": 0.18803872168064117, + "learning_rate": 0.000957256243294038, + "loss": 2.8443, + "step": 4765 + }, + { + "epoch": 0.14132787711650802, + "grad_norm": 0.17781966924667358, + "learning_rate": 0.0009572372064422419, + "loss": 2.84, + "step": 4766 + }, + { + "epoch": 0.1413575304688195, + "grad_norm": 0.1776231974363327, + "learning_rate": 0.0009572181655415318, + "loss": 2.8325, + "step": 4767 + }, + { + "epoch": 0.14138718382113097, + "grad_norm": 0.15721292793750763, + "learning_rate": 0.0009571991205920763, + "loss": 2.8477, + "step": 4768 + }, + { + "epoch": 0.14141683717344244, + "grad_norm": 0.16466523706912994, + "learning_rate": 0.000957180071594044, + "loss": 2.8242, + "step": 4769 + }, + { + "epoch": 0.14144649052575395, + "grad_norm": 0.1850043684244156, + "learning_rate": 0.0009571610185476039, + "loss": 2.8402, + "step": 4770 + }, + { + "epoch": 0.14147614387806542, + "grad_norm": 0.1538064032793045, + "learning_rate": 0.0009571419614529244, + "loss": 2.8304, + "step": 4771 + }, + { + "epoch": 0.1415057972303769, + "grad_norm": 0.16264136135578156, + "learning_rate": 0.0009571229003101744, + "loss": 2.8412, + "step": 4772 + }, + { + "epoch": 0.14153545058268838, + "grad_norm": 0.17269635200500488, + "learning_rate": 0.0009571038351195227, + "loss": 2.8949, + "step": 4773 + }, + { + "epoch": 0.14156510393499985, + "grad_norm": 0.1678270697593689, + "learning_rate": 0.000957084765881138, + "loss": 2.8744, + "step": 4774 + }, + { + "epoch": 0.14159475728731133, + "grad_norm": 0.18228547275066376, + "learning_rate": 0.0009570656925951893, + "loss": 2.8418, + "step": 4775 + }, + { + "epoch": 0.1416244106396228, + "grad_norm": 0.16739404201507568, + "learning_rate": 0.0009570466152618453, + "loss": 2.8473, + "step": 4776 + }, + { + "epoch": 0.14165406399193428, + "grad_norm": 0.1510169506072998, + "learning_rate": 0.0009570275338812753, + "loss": 2.8625, + "step": 4777 + }, + { + "epoch": 0.14168371734424576, + "grad_norm": 0.14486180245876312, + "learning_rate": 0.0009570084484536479, + "loss": 2.858, + "step": 4778 + }, + { + "epoch": 0.14171337069655723, + "grad_norm": 0.1457810252904892, + "learning_rate": 0.0009569893589791323, + "loss": 2.8568, + "step": 4779 + }, + { + "epoch": 0.14174302404886874, + "grad_norm": 0.13503536581993103, + "learning_rate": 0.0009569702654578974, + "loss": 2.8319, + "step": 4780 + }, + { + "epoch": 0.1417726774011802, + "grad_norm": 0.13317134976387024, + "learning_rate": 0.0009569511678901123, + "loss": 2.8563, + "step": 4781 + }, + { + "epoch": 0.1418023307534917, + "grad_norm": 0.14828643202781677, + "learning_rate": 0.0009569320662759464, + "loss": 2.8329, + "step": 4782 + }, + { + "epoch": 0.14183198410580317, + "grad_norm": 0.14662018418312073, + "learning_rate": 0.0009569129606155685, + "loss": 2.8363, + "step": 4783 + }, + { + "epoch": 0.14186163745811464, + "grad_norm": 0.12715958058834076, + "learning_rate": 0.0009568938509091479, + "loss": 2.8673, + "step": 4784 + }, + { + "epoch": 0.14189129081042612, + "grad_norm": 0.13676570355892181, + "learning_rate": 0.0009568747371568539, + "loss": 2.8369, + "step": 4785 + }, + { + "epoch": 0.1419209441627376, + "grad_norm": 0.12900196015834808, + "learning_rate": 0.0009568556193588556, + "loss": 2.846, + "step": 4786 + }, + { + "epoch": 0.14195059751504907, + "grad_norm": 0.14092478156089783, + "learning_rate": 0.0009568364975153224, + "loss": 2.9014, + "step": 4787 + }, + { + "epoch": 0.14198025086736055, + "grad_norm": 0.14636895060539246, + "learning_rate": 0.0009568173716264235, + "loss": 2.8744, + "step": 4788 + }, + { + "epoch": 0.14200990421967202, + "grad_norm": 0.16266216337680817, + "learning_rate": 0.0009567982416923285, + "loss": 2.846, + "step": 4789 + }, + { + "epoch": 0.1420395575719835, + "grad_norm": 0.185074582695961, + "learning_rate": 0.0009567791077132067, + "loss": 2.8197, + "step": 4790 + }, + { + "epoch": 0.142069210924295, + "grad_norm": 0.1981067657470703, + "learning_rate": 0.0009567599696892274, + "loss": 2.8446, + "step": 4791 + }, + { + "epoch": 0.14209886427660648, + "grad_norm": 0.20128712058067322, + "learning_rate": 0.0009567408276205602, + "loss": 2.859, + "step": 4792 + }, + { + "epoch": 0.14212851762891796, + "grad_norm": 0.21245825290679932, + "learning_rate": 0.0009567216815073745, + "loss": 2.8505, + "step": 4793 + }, + { + "epoch": 0.14215817098122943, + "grad_norm": 0.2046656757593155, + "learning_rate": 0.00095670253134984, + "loss": 2.8321, + "step": 4794 + }, + { + "epoch": 0.1421878243335409, + "grad_norm": 0.20241841673851013, + "learning_rate": 0.0009566833771481262, + "loss": 2.8628, + "step": 4795 + }, + { + "epoch": 0.14221747768585238, + "grad_norm": 0.19220571219921112, + "learning_rate": 0.0009566642189024026, + "loss": 2.8934, + "step": 4796 + }, + { + "epoch": 0.14224713103816386, + "grad_norm": 0.16688330471515656, + "learning_rate": 0.0009566450566128391, + "loss": 2.8834, + "step": 4797 + }, + { + "epoch": 0.14227678439047534, + "grad_norm": 0.1766139715909958, + "learning_rate": 0.0009566258902796051, + "loss": 2.8362, + "step": 4798 + }, + { + "epoch": 0.1423064377427868, + "grad_norm": 0.17050758004188538, + "learning_rate": 0.0009566067199028705, + "loss": 2.8447, + "step": 4799 + }, + { + "epoch": 0.1423360910950983, + "grad_norm": 0.13685230910778046, + "learning_rate": 0.000956587545482805, + "loss": 2.8387, + "step": 4800 + }, + { + "epoch": 0.1423657444474098, + "grad_norm": 0.1522243171930313, + "learning_rate": 0.0009565683670195787, + "loss": 2.8609, + "step": 4801 + }, + { + "epoch": 0.14239539779972127, + "grad_norm": 0.1489870697259903, + "learning_rate": 0.0009565491845133607, + "loss": 2.8195, + "step": 4802 + }, + { + "epoch": 0.14242505115203274, + "grad_norm": 0.15472926199436188, + "learning_rate": 0.0009565299979643217, + "loss": 2.8574, + "step": 4803 + }, + { + "epoch": 0.14245470450434422, + "grad_norm": 0.15301819145679474, + "learning_rate": 0.0009565108073726308, + "loss": 2.8408, + "step": 4804 + }, + { + "epoch": 0.1424843578566557, + "grad_norm": 0.13634827733039856, + "learning_rate": 0.0009564916127384587, + "loss": 2.8624, + "step": 4805 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 0.13137304782867432, + "learning_rate": 0.0009564724140619747, + "loss": 2.8862, + "step": 4806 + }, + { + "epoch": 0.14254366456127865, + "grad_norm": 0.14667417109012604, + "learning_rate": 0.0009564532113433493, + "loss": 2.8481, + "step": 4807 + }, + { + "epoch": 0.14257331791359013, + "grad_norm": 0.18558987975120544, + "learning_rate": 0.0009564340045827524, + "loss": 2.854, + "step": 4808 + }, + { + "epoch": 0.1426029712659016, + "grad_norm": 0.2377573698759079, + "learning_rate": 0.000956414793780354, + "loss": 2.86, + "step": 4809 + }, + { + "epoch": 0.14263262461821308, + "grad_norm": 0.23708313703536987, + "learning_rate": 0.0009563955789363243, + "loss": 2.8578, + "step": 4810 + }, + { + "epoch": 0.14266227797052458, + "grad_norm": 0.18369466066360474, + "learning_rate": 0.0009563763600508333, + "loss": 2.8464, + "step": 4811 + }, + { + "epoch": 0.14269193132283606, + "grad_norm": 0.1863843947649002, + "learning_rate": 0.0009563571371240514, + "loss": 2.8373, + "step": 4812 + }, + { + "epoch": 0.14272158467514753, + "grad_norm": 0.2026182860136032, + "learning_rate": 0.0009563379101561487, + "loss": 2.8471, + "step": 4813 + }, + { + "epoch": 0.142751238027459, + "grad_norm": 0.15302182734012604, + "learning_rate": 0.0009563186791472954, + "loss": 2.893, + "step": 4814 + }, + { + "epoch": 0.14278089137977049, + "grad_norm": 0.17426139116287231, + "learning_rate": 0.000956299444097662, + "loss": 2.8645, + "step": 4815 + }, + { + "epoch": 0.14281054473208196, + "grad_norm": 0.1911999136209488, + "learning_rate": 0.0009562802050074186, + "loss": 2.8823, + "step": 4816 + }, + { + "epoch": 0.14284019808439344, + "grad_norm": 0.20927506685256958, + "learning_rate": 0.0009562609618767357, + "loss": 2.8739, + "step": 4817 + }, + { + "epoch": 0.14286985143670491, + "grad_norm": 0.2169525921344757, + "learning_rate": 0.0009562417147057836, + "loss": 2.8507, + "step": 4818 + }, + { + "epoch": 0.1428995047890164, + "grad_norm": 0.1760651171207428, + "learning_rate": 0.0009562224634947329, + "loss": 2.8267, + "step": 4819 + }, + { + "epoch": 0.14292915814132787, + "grad_norm": 0.14831684529781342, + "learning_rate": 0.0009562032082437539, + "loss": 2.8811, + "step": 4820 + }, + { + "epoch": 0.14295881149363934, + "grad_norm": 0.1464914083480835, + "learning_rate": 0.0009561839489530173, + "loss": 2.8754, + "step": 4821 + }, + { + "epoch": 0.14298846484595085, + "grad_norm": 0.14946068823337555, + "learning_rate": 0.0009561646856226933, + "loss": 2.8751, + "step": 4822 + }, + { + "epoch": 0.14301811819826232, + "grad_norm": 0.13336893916130066, + "learning_rate": 0.0009561454182529529, + "loss": 2.8547, + "step": 4823 + }, + { + "epoch": 0.1430477715505738, + "grad_norm": 0.1330002248287201, + "learning_rate": 0.0009561261468439666, + "loss": 2.8905, + "step": 4824 + }, + { + "epoch": 0.14307742490288528, + "grad_norm": 0.12508296966552734, + "learning_rate": 0.0009561068713959048, + "loss": 2.8541, + "step": 4825 + }, + { + "epoch": 0.14310707825519675, + "grad_norm": 0.15577059984207153, + "learning_rate": 0.0009560875919089384, + "loss": 2.8251, + "step": 4826 + }, + { + "epoch": 0.14313673160750823, + "grad_norm": 0.16649915277957916, + "learning_rate": 0.0009560683083832381, + "loss": 2.8382, + "step": 4827 + }, + { + "epoch": 0.1431663849598197, + "grad_norm": 0.149896040558815, + "learning_rate": 0.0009560490208189747, + "loss": 2.8759, + "step": 4828 + }, + { + "epoch": 0.14319603831213118, + "grad_norm": 0.1228126659989357, + "learning_rate": 0.0009560297292163189, + "loss": 2.8479, + "step": 4829 + }, + { + "epoch": 0.14322569166444266, + "grad_norm": 0.12356320768594742, + "learning_rate": 0.0009560104335754416, + "loss": 2.8411, + "step": 4830 + }, + { + "epoch": 0.14325534501675413, + "grad_norm": 0.13680832087993622, + "learning_rate": 0.0009559911338965135, + "loss": 2.8555, + "step": 4831 + }, + { + "epoch": 0.14328499836906564, + "grad_norm": 0.14971068501472473, + "learning_rate": 0.0009559718301797058, + "loss": 2.8586, + "step": 4832 + }, + { + "epoch": 0.1433146517213771, + "grad_norm": 0.1476059854030609, + "learning_rate": 0.0009559525224251893, + "loss": 2.8337, + "step": 4833 + }, + { + "epoch": 0.1433443050736886, + "grad_norm": 0.17830303311347961, + "learning_rate": 0.0009559332106331348, + "loss": 2.8466, + "step": 4834 + }, + { + "epoch": 0.14337395842600006, + "grad_norm": 0.21158334612846375, + "learning_rate": 0.0009559138948037136, + "loss": 2.8308, + "step": 4835 + }, + { + "epoch": 0.14340361177831154, + "grad_norm": 0.22022143006324768, + "learning_rate": 0.0009558945749370964, + "loss": 2.8514, + "step": 4836 + }, + { + "epoch": 0.14343326513062302, + "grad_norm": 0.21244138479232788, + "learning_rate": 0.0009558752510334548, + "loss": 2.8683, + "step": 4837 + }, + { + "epoch": 0.1434629184829345, + "grad_norm": 0.2332642525434494, + "learning_rate": 0.0009558559230929593, + "loss": 2.887, + "step": 4838 + }, + { + "epoch": 0.14349257183524597, + "grad_norm": 0.202005997300148, + "learning_rate": 0.0009558365911157815, + "loss": 2.8225, + "step": 4839 + }, + { + "epoch": 0.14352222518755745, + "grad_norm": 0.18494252860546112, + "learning_rate": 0.0009558172551020925, + "loss": 2.8734, + "step": 4840 + }, + { + "epoch": 0.14355187853986892, + "grad_norm": 0.18195432424545288, + "learning_rate": 0.0009557979150520633, + "loss": 2.847, + "step": 4841 + }, + { + "epoch": 0.1435815318921804, + "grad_norm": 0.17218686640262604, + "learning_rate": 0.0009557785709658654, + "loss": 2.8045, + "step": 4842 + }, + { + "epoch": 0.1436111852444919, + "grad_norm": 0.1687728762626648, + "learning_rate": 0.00095575922284367, + "loss": 2.8168, + "step": 4843 + }, + { + "epoch": 0.14364083859680338, + "grad_norm": 0.15338680148124695, + "learning_rate": 0.0009557398706856486, + "loss": 2.8569, + "step": 4844 + }, + { + "epoch": 0.14367049194911485, + "grad_norm": 0.17157875001430511, + "learning_rate": 0.0009557205144919723, + "loss": 2.8799, + "step": 4845 + }, + { + "epoch": 0.14370014530142633, + "grad_norm": 0.1732209026813507, + "learning_rate": 0.0009557011542628126, + "loss": 2.8191, + "step": 4846 + }, + { + "epoch": 0.1437297986537378, + "grad_norm": 0.17814496159553528, + "learning_rate": 0.0009556817899983409, + "loss": 2.8488, + "step": 4847 + }, + { + "epoch": 0.14375945200604928, + "grad_norm": 0.1765468716621399, + "learning_rate": 0.0009556624216987288, + "loss": 2.868, + "step": 4848 + }, + { + "epoch": 0.14378910535836076, + "grad_norm": 0.16063189506530762, + "learning_rate": 0.0009556430493641479, + "loss": 2.8363, + "step": 4849 + }, + { + "epoch": 0.14381875871067223, + "grad_norm": 0.15956728160381317, + "learning_rate": 0.0009556236729947694, + "loss": 2.816, + "step": 4850 + }, + { + "epoch": 0.1438484120629837, + "grad_norm": 0.1652260571718216, + "learning_rate": 0.0009556042925907651, + "loss": 2.8318, + "step": 4851 + }, + { + "epoch": 0.1438780654152952, + "grad_norm": 0.14473369717597961, + "learning_rate": 0.0009555849081523066, + "loss": 2.8558, + "step": 4852 + }, + { + "epoch": 0.1439077187676067, + "grad_norm": 0.14571093022823334, + "learning_rate": 0.0009555655196795657, + "loss": 2.8629, + "step": 4853 + }, + { + "epoch": 0.14393737211991817, + "grad_norm": 0.16391994059085846, + "learning_rate": 0.0009555461271727136, + "loss": 2.8615, + "step": 4854 + }, + { + "epoch": 0.14396702547222964, + "grad_norm": 0.17830699682235718, + "learning_rate": 0.0009555267306319225, + "loss": 2.8507, + "step": 4855 + }, + { + "epoch": 0.14399667882454112, + "grad_norm": 0.16854748129844666, + "learning_rate": 0.000955507330057364, + "loss": 2.8261, + "step": 4856 + }, + { + "epoch": 0.1440263321768526, + "grad_norm": 0.13817910850048065, + "learning_rate": 0.00095548792544921, + "loss": 2.7835, + "step": 4857 + }, + { + "epoch": 0.14405598552916407, + "grad_norm": 0.1679200381040573, + "learning_rate": 0.0009554685168076323, + "loss": 2.8502, + "step": 4858 + }, + { + "epoch": 0.14408563888147555, + "grad_norm": 0.18367458879947662, + "learning_rate": 0.0009554491041328023, + "loss": 2.8373, + "step": 4859 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 0.1649985909461975, + "learning_rate": 0.0009554296874248927, + "loss": 2.8666, + "step": 4860 + }, + { + "epoch": 0.1441449455860985, + "grad_norm": 0.16517603397369385, + "learning_rate": 0.000955410266684075, + "loss": 2.8145, + "step": 4861 + }, + { + "epoch": 0.14417459893840998, + "grad_norm": 0.17515741288661957, + "learning_rate": 0.0009553908419105211, + "loss": 2.8179, + "step": 4862 + }, + { + "epoch": 0.14420425229072148, + "grad_norm": 0.16342760622501373, + "learning_rate": 0.0009553714131044031, + "loss": 2.8523, + "step": 4863 + }, + { + "epoch": 0.14423390564303296, + "grad_norm": 0.15759974718093872, + "learning_rate": 0.0009553519802658932, + "loss": 2.819, + "step": 4864 + }, + { + "epoch": 0.14426355899534443, + "grad_norm": 0.1636471003293991, + "learning_rate": 0.0009553325433951633, + "loss": 2.8414, + "step": 4865 + }, + { + "epoch": 0.1442932123476559, + "grad_norm": 0.15778030455112457, + "learning_rate": 0.0009553131024923855, + "loss": 2.8702, + "step": 4866 + }, + { + "epoch": 0.14432286569996738, + "grad_norm": 0.14474090933799744, + "learning_rate": 0.0009552936575577322, + "loss": 2.857, + "step": 4867 + }, + { + "epoch": 0.14435251905227886, + "grad_norm": 0.14486558735370636, + "learning_rate": 0.0009552742085913753, + "loss": 2.8406, + "step": 4868 + }, + { + "epoch": 0.14438217240459034, + "grad_norm": 0.15457196533679962, + "learning_rate": 0.0009552547555934872, + "loss": 2.8421, + "step": 4869 + }, + { + "epoch": 0.1444118257569018, + "grad_norm": 0.14072299003601074, + "learning_rate": 0.00095523529856424, + "loss": 2.8204, + "step": 4870 + }, + { + "epoch": 0.1444414791092133, + "grad_norm": 0.16638432443141937, + "learning_rate": 0.000955215837503806, + "loss": 2.8688, + "step": 4871 + }, + { + "epoch": 0.14447113246152476, + "grad_norm": 0.18458181619644165, + "learning_rate": 0.0009551963724123577, + "loss": 2.8644, + "step": 4872 + }, + { + "epoch": 0.14450078581383624, + "grad_norm": 0.19854916632175446, + "learning_rate": 0.0009551769032900676, + "loss": 2.8382, + "step": 4873 + }, + { + "epoch": 0.14453043916614775, + "grad_norm": 0.16466008126735687, + "learning_rate": 0.0009551574301371078, + "loss": 2.8415, + "step": 4874 + }, + { + "epoch": 0.14456009251845922, + "grad_norm": 0.14979569613933563, + "learning_rate": 0.0009551379529536507, + "loss": 2.8785, + "step": 4875 + }, + { + "epoch": 0.1445897458707707, + "grad_norm": 0.19293171167373657, + "learning_rate": 0.0009551184717398689, + "loss": 2.7994, + "step": 4876 + }, + { + "epoch": 0.14461939922308217, + "grad_norm": 0.1895996481180191, + "learning_rate": 0.000955098986495935, + "loss": 2.8369, + "step": 4877 + }, + { + "epoch": 0.14464905257539365, + "grad_norm": 0.17099390923976898, + "learning_rate": 0.0009550794972220213, + "loss": 2.8763, + "step": 4878 + }, + { + "epoch": 0.14467870592770513, + "grad_norm": 0.1936415433883667, + "learning_rate": 0.0009550600039183009, + "loss": 2.8684, + "step": 4879 + }, + { + "epoch": 0.1447083592800166, + "grad_norm": 0.21431395411491394, + "learning_rate": 0.0009550405065849456, + "loss": 2.8278, + "step": 4880 + }, + { + "epoch": 0.14473801263232808, + "grad_norm": 0.23485638201236725, + "learning_rate": 0.0009550210052221288, + "loss": 2.8565, + "step": 4881 + }, + { + "epoch": 0.14476766598463955, + "grad_norm": 0.20170089602470398, + "learning_rate": 0.0009550014998300229, + "loss": 2.8862, + "step": 4882 + }, + { + "epoch": 0.14479731933695103, + "grad_norm": 0.14584152400493622, + "learning_rate": 0.0009549819904088006, + "loss": 2.8363, + "step": 4883 + }, + { + "epoch": 0.14482697268926253, + "grad_norm": 0.16606231033802032, + "learning_rate": 0.0009549624769586345, + "loss": 2.8376, + "step": 4884 + }, + { + "epoch": 0.144856626041574, + "grad_norm": 0.1553979367017746, + "learning_rate": 0.0009549429594796978, + "loss": 2.8454, + "step": 4885 + }, + { + "epoch": 0.1448862793938855, + "grad_norm": 0.14159227907657623, + "learning_rate": 0.000954923437972163, + "loss": 2.888, + "step": 4886 + }, + { + "epoch": 0.14491593274619696, + "grad_norm": 0.15808451175689697, + "learning_rate": 0.0009549039124362031, + "loss": 2.8404, + "step": 4887 + }, + { + "epoch": 0.14494558609850844, + "grad_norm": 0.14431019127368927, + "learning_rate": 0.0009548843828719909, + "loss": 2.8663, + "step": 4888 + }, + { + "epoch": 0.14497523945081991, + "grad_norm": 0.14851830899715424, + "learning_rate": 0.0009548648492796994, + "loss": 2.8807, + "step": 4889 + }, + { + "epoch": 0.1450048928031314, + "grad_norm": 0.1658877283334732, + "learning_rate": 0.0009548453116595018, + "loss": 2.8466, + "step": 4890 + }, + { + "epoch": 0.14503454615544287, + "grad_norm": 0.15431080758571625, + "learning_rate": 0.0009548257700115706, + "loss": 2.8562, + "step": 4891 + }, + { + "epoch": 0.14506419950775434, + "grad_norm": 0.15575715899467468, + "learning_rate": 0.0009548062243360793, + "loss": 2.8202, + "step": 4892 + }, + { + "epoch": 0.14509385286006582, + "grad_norm": 0.14939211308956146, + "learning_rate": 0.0009547866746332008, + "loss": 2.8428, + "step": 4893 + }, + { + "epoch": 0.1451235062123773, + "grad_norm": 0.1399289220571518, + "learning_rate": 0.0009547671209031082, + "loss": 2.8338, + "step": 4894 + }, + { + "epoch": 0.1451531595646888, + "grad_norm": 0.1397542804479599, + "learning_rate": 0.0009547475631459748, + "loss": 2.8537, + "step": 4895 + }, + { + "epoch": 0.14518281291700028, + "grad_norm": 0.1551046520471573, + "learning_rate": 0.0009547280013619734, + "loss": 2.8701, + "step": 4896 + }, + { + "epoch": 0.14521246626931175, + "grad_norm": 0.16550013422966003, + "learning_rate": 0.0009547084355512778, + "loss": 2.8604, + "step": 4897 + }, + { + "epoch": 0.14524211962162323, + "grad_norm": 0.1587887704372406, + "learning_rate": 0.0009546888657140609, + "loss": 2.8995, + "step": 4898 + }, + { + "epoch": 0.1452717729739347, + "grad_norm": 0.1528800129890442, + "learning_rate": 0.0009546692918504959, + "loss": 2.8512, + "step": 4899 + }, + { + "epoch": 0.14530142632624618, + "grad_norm": 0.13927161693572998, + "learning_rate": 0.0009546497139607564, + "loss": 2.8173, + "step": 4900 + }, + { + "epoch": 0.14533107967855766, + "grad_norm": 0.1346932202577591, + "learning_rate": 0.0009546301320450155, + "loss": 2.8447, + "step": 4901 + }, + { + "epoch": 0.14536073303086913, + "grad_norm": 0.1487630009651184, + "learning_rate": 0.0009546105461034469, + "loss": 2.8489, + "step": 4902 + }, + { + "epoch": 0.1453903863831806, + "grad_norm": 0.15050622820854187, + "learning_rate": 0.0009545909561362239, + "loss": 2.851, + "step": 4903 + }, + { + "epoch": 0.14542003973549208, + "grad_norm": 0.15843600034713745, + "learning_rate": 0.0009545713621435197, + "loss": 2.8455, + "step": 4904 + }, + { + "epoch": 0.1454496930878036, + "grad_norm": 0.16421887278556824, + "learning_rate": 0.0009545517641255083, + "loss": 2.829, + "step": 4905 + }, + { + "epoch": 0.14547934644011506, + "grad_norm": 0.1398434042930603, + "learning_rate": 0.000954532162082363, + "loss": 2.8406, + "step": 4906 + }, + { + "epoch": 0.14550899979242654, + "grad_norm": 0.2004764974117279, + "learning_rate": 0.0009545125560142573, + "loss": 2.8916, + "step": 4907 + }, + { + "epoch": 0.14553865314473802, + "grad_norm": 0.2246624380350113, + "learning_rate": 0.0009544929459213649, + "loss": 2.8585, + "step": 4908 + }, + { + "epoch": 0.1455683064970495, + "grad_norm": 0.22727401554584503, + "learning_rate": 0.0009544733318038594, + "loss": 2.856, + "step": 4909 + }, + { + "epoch": 0.14559795984936097, + "grad_norm": 0.1943638175725937, + "learning_rate": 0.0009544537136619147, + "loss": 2.8203, + "step": 4910 + }, + { + "epoch": 0.14562761320167245, + "grad_norm": 0.1643177717924118, + "learning_rate": 0.0009544340914957042, + "loss": 2.8378, + "step": 4911 + }, + { + "epoch": 0.14565726655398392, + "grad_norm": 0.18512234091758728, + "learning_rate": 0.0009544144653054018, + "loss": 2.8761, + "step": 4912 + }, + { + "epoch": 0.1456869199062954, + "grad_norm": 0.20832133293151855, + "learning_rate": 0.0009543948350911815, + "loss": 2.8616, + "step": 4913 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 0.19471359252929688, + "learning_rate": 0.0009543752008532169, + "loss": 2.8703, + "step": 4914 + }, + { + "epoch": 0.14574622661091838, + "grad_norm": 0.20051495730876923, + "learning_rate": 0.0009543555625916817, + "loss": 2.8371, + "step": 4915 + }, + { + "epoch": 0.14577587996322985, + "grad_norm": 0.19965727627277374, + "learning_rate": 0.0009543359203067504, + "loss": 2.8599, + "step": 4916 + }, + { + "epoch": 0.14580553331554133, + "grad_norm": 0.17166195809841156, + "learning_rate": 0.0009543162739985962, + "loss": 2.8883, + "step": 4917 + }, + { + "epoch": 0.1458351866678528, + "grad_norm": 0.17446115612983704, + "learning_rate": 0.0009542966236673935, + "loss": 2.817, + "step": 4918 + }, + { + "epoch": 0.14586484002016428, + "grad_norm": 0.1494232714176178, + "learning_rate": 0.0009542769693133162, + "loss": 2.8702, + "step": 4919 + }, + { + "epoch": 0.14589449337247576, + "grad_norm": 0.1414320021867752, + "learning_rate": 0.0009542573109365385, + "loss": 2.844, + "step": 4920 + }, + { + "epoch": 0.14592414672478723, + "grad_norm": 0.13504569232463837, + "learning_rate": 0.0009542376485372341, + "loss": 2.8623, + "step": 4921 + }, + { + "epoch": 0.1459538000770987, + "grad_norm": 0.14393730461597443, + "learning_rate": 0.0009542179821155774, + "loss": 2.8597, + "step": 4922 + }, + { + "epoch": 0.1459834534294102, + "grad_norm": 0.15401577949523926, + "learning_rate": 0.0009541983116717426, + "loss": 2.8564, + "step": 4923 + }, + { + "epoch": 0.14601310678172166, + "grad_norm": 0.1831844449043274, + "learning_rate": 0.0009541786372059038, + "loss": 2.8679, + "step": 4924 + }, + { + "epoch": 0.14604276013403314, + "grad_norm": 0.21924427151679993, + "learning_rate": 0.0009541589587182352, + "loss": 2.8722, + "step": 4925 + }, + { + "epoch": 0.14607241348634464, + "grad_norm": 0.18033891916275024, + "learning_rate": 0.000954139276208911, + "loss": 2.8498, + "step": 4926 + }, + { + "epoch": 0.14610206683865612, + "grad_norm": 0.16445910930633545, + "learning_rate": 0.0009541195896781056, + "loss": 2.8594, + "step": 4927 + }, + { + "epoch": 0.1461317201909676, + "grad_norm": 0.14931835234165192, + "learning_rate": 0.0009540998991259933, + "loss": 2.8437, + "step": 4928 + }, + { + "epoch": 0.14616137354327907, + "grad_norm": 0.14874988794326782, + "learning_rate": 0.0009540802045527485, + "loss": 2.8294, + "step": 4929 + }, + { + "epoch": 0.14619102689559055, + "grad_norm": 0.1765814870595932, + "learning_rate": 0.0009540605059585454, + "loss": 2.8638, + "step": 4930 + }, + { + "epoch": 0.14622068024790202, + "grad_norm": 0.17793188989162445, + "learning_rate": 0.0009540408033435587, + "loss": 2.832, + "step": 4931 + }, + { + "epoch": 0.1462503336002135, + "grad_norm": 0.16164572536945343, + "learning_rate": 0.0009540210967079627, + "loss": 2.8688, + "step": 4932 + }, + { + "epoch": 0.14627998695252498, + "grad_norm": 0.1374644786119461, + "learning_rate": 0.000954001386051932, + "loss": 2.8527, + "step": 4933 + }, + { + "epoch": 0.14630964030483645, + "grad_norm": 0.15574248135089874, + "learning_rate": 0.0009539816713756411, + "loss": 2.8428, + "step": 4934 + }, + { + "epoch": 0.14633929365714793, + "grad_norm": 0.1552976369857788, + "learning_rate": 0.0009539619526792645, + "loss": 2.8479, + "step": 4935 + }, + { + "epoch": 0.14636894700945943, + "grad_norm": 0.15711478888988495, + "learning_rate": 0.0009539422299629769, + "loss": 2.8341, + "step": 4936 + }, + { + "epoch": 0.1463986003617709, + "grad_norm": 0.16621148586273193, + "learning_rate": 0.000953922503226953, + "loss": 2.8647, + "step": 4937 + }, + { + "epoch": 0.14642825371408238, + "grad_norm": 0.18830788135528564, + "learning_rate": 0.0009539027724713673, + "loss": 2.8361, + "step": 4938 + }, + { + "epoch": 0.14645790706639386, + "grad_norm": 0.1671103835105896, + "learning_rate": 0.0009538830376963947, + "loss": 2.88, + "step": 4939 + }, + { + "epoch": 0.14648756041870534, + "grad_norm": 0.14830553531646729, + "learning_rate": 0.0009538632989022101, + "loss": 2.8279, + "step": 4940 + }, + { + "epoch": 0.1465172137710168, + "grad_norm": 0.14385990798473358, + "learning_rate": 0.0009538435560889878, + "loss": 2.844, + "step": 4941 + }, + { + "epoch": 0.1465468671233283, + "grad_norm": 0.1454424113035202, + "learning_rate": 0.0009538238092569029, + "loss": 2.8202, + "step": 4942 + }, + { + "epoch": 0.14657652047563977, + "grad_norm": 0.1415282040834427, + "learning_rate": 0.0009538040584061305, + "loss": 2.8361, + "step": 4943 + }, + { + "epoch": 0.14660617382795124, + "grad_norm": 0.15637078881263733, + "learning_rate": 0.0009537843035368451, + "loss": 2.8625, + "step": 4944 + }, + { + "epoch": 0.14663582718026272, + "grad_norm": 0.182911217212677, + "learning_rate": 0.0009537645446492218, + "loss": 2.8594, + "step": 4945 + }, + { + "epoch": 0.1466654805325742, + "grad_norm": 0.18290215730667114, + "learning_rate": 0.0009537447817434357, + "loss": 2.8318, + "step": 4946 + }, + { + "epoch": 0.1466951338848857, + "grad_norm": 0.1747276335954666, + "learning_rate": 0.0009537250148196614, + "loss": 2.85, + "step": 4947 + }, + { + "epoch": 0.14672478723719717, + "grad_norm": 0.15141209959983826, + "learning_rate": 0.0009537052438780744, + "loss": 2.8307, + "step": 4948 + }, + { + "epoch": 0.14675444058950865, + "grad_norm": 0.1776638627052307, + "learning_rate": 0.0009536854689188496, + "loss": 2.8286, + "step": 4949 + }, + { + "epoch": 0.14678409394182013, + "grad_norm": 0.1590137481689453, + "learning_rate": 0.0009536656899421623, + "loss": 2.8549, + "step": 4950 + }, + { + "epoch": 0.1468137472941316, + "grad_norm": 0.16948290169239044, + "learning_rate": 0.000953645906948187, + "loss": 2.8369, + "step": 4951 + }, + { + "epoch": 0.14684340064644308, + "grad_norm": 0.16895033419132233, + "learning_rate": 0.0009536261199370997, + "loss": 2.8055, + "step": 4952 + }, + { + "epoch": 0.14687305399875455, + "grad_norm": 0.14414608478546143, + "learning_rate": 0.0009536063289090751, + "loss": 2.8582, + "step": 4953 + }, + { + "epoch": 0.14690270735106603, + "grad_norm": 0.1578403264284134, + "learning_rate": 0.0009535865338642886, + "loss": 2.8783, + "step": 4954 + }, + { + "epoch": 0.1469323607033775, + "grad_norm": 0.14851146936416626, + "learning_rate": 0.0009535667348029155, + "loss": 2.8465, + "step": 4955 + }, + { + "epoch": 0.14696201405568898, + "grad_norm": 0.1547597497701645, + "learning_rate": 0.0009535469317251311, + "loss": 2.8694, + "step": 4956 + }, + { + "epoch": 0.1469916674080005, + "grad_norm": 0.15795837342739105, + "learning_rate": 0.0009535271246311108, + "loss": 2.8398, + "step": 4957 + }, + { + "epoch": 0.14702132076031196, + "grad_norm": 0.17152442038059235, + "learning_rate": 0.0009535073135210299, + "loss": 2.8456, + "step": 4958 + }, + { + "epoch": 0.14705097411262344, + "grad_norm": 0.21700350940227509, + "learning_rate": 0.0009534874983950639, + "loss": 2.8543, + "step": 4959 + }, + { + "epoch": 0.14708062746493492, + "grad_norm": 0.21733492612838745, + "learning_rate": 0.0009534676792533883, + "loss": 2.8495, + "step": 4960 + }, + { + "epoch": 0.1471102808172464, + "grad_norm": 0.1858302354812622, + "learning_rate": 0.0009534478560961786, + "loss": 2.8482, + "step": 4961 + }, + { + "epoch": 0.14713993416955787, + "grad_norm": 0.16897989809513092, + "learning_rate": 0.0009534280289236101, + "loss": 2.8556, + "step": 4962 + }, + { + "epoch": 0.14716958752186934, + "grad_norm": 0.19358892738819122, + "learning_rate": 0.0009534081977358588, + "loss": 2.8491, + "step": 4963 + }, + { + "epoch": 0.14719924087418082, + "grad_norm": 0.1867181956768036, + "learning_rate": 0.0009533883625331, + "loss": 2.8487, + "step": 4964 + }, + { + "epoch": 0.1472288942264923, + "grad_norm": 0.2030467391014099, + "learning_rate": 0.0009533685233155094, + "loss": 2.8266, + "step": 4965 + }, + { + "epoch": 0.14725854757880377, + "grad_norm": 0.19188818335533142, + "learning_rate": 0.0009533486800832628, + "loss": 2.8265, + "step": 4966 + }, + { + "epoch": 0.14728820093111528, + "grad_norm": 0.1754862368106842, + "learning_rate": 0.0009533288328365357, + "loss": 2.8305, + "step": 4967 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 0.1768118143081665, + "learning_rate": 0.0009533089815755041, + "loss": 2.8487, + "step": 4968 + }, + { + "epoch": 0.14734750763573823, + "grad_norm": 0.18036721646785736, + "learning_rate": 0.0009532891263003436, + "loss": 2.8028, + "step": 4969 + }, + { + "epoch": 0.1473771609880497, + "grad_norm": 0.1477033495903015, + "learning_rate": 0.00095326926701123, + "loss": 2.8507, + "step": 4970 + }, + { + "epoch": 0.14740681434036118, + "grad_norm": 0.16023939847946167, + "learning_rate": 0.0009532494037083394, + "loss": 2.874, + "step": 4971 + }, + { + "epoch": 0.14743646769267266, + "grad_norm": 0.14996449649333954, + "learning_rate": 0.0009532295363918474, + "loss": 2.8362, + "step": 4972 + }, + { + "epoch": 0.14746612104498413, + "grad_norm": 0.15248416364192963, + "learning_rate": 0.0009532096650619302, + "loss": 2.8624, + "step": 4973 + }, + { + "epoch": 0.1474957743972956, + "grad_norm": 0.14747914671897888, + "learning_rate": 0.0009531897897187635, + "loss": 2.8671, + "step": 4974 + }, + { + "epoch": 0.14752542774960709, + "grad_norm": 0.16353847086429596, + "learning_rate": 0.0009531699103625235, + "loss": 2.8489, + "step": 4975 + }, + { + "epoch": 0.14755508110191856, + "grad_norm": 0.1663210541009903, + "learning_rate": 0.000953150026993386, + "loss": 2.8529, + "step": 4976 + }, + { + "epoch": 0.14758473445423004, + "grad_norm": 0.17431044578552246, + "learning_rate": 0.0009531301396115273, + "loss": 2.8684, + "step": 4977 + }, + { + "epoch": 0.14761438780654154, + "grad_norm": 0.1814369410276413, + "learning_rate": 0.0009531102482171235, + "loss": 2.8666, + "step": 4978 + }, + { + "epoch": 0.14764404115885302, + "grad_norm": 0.17998036742210388, + "learning_rate": 0.0009530903528103507, + "loss": 2.8641, + "step": 4979 + }, + { + "epoch": 0.1476736945111645, + "grad_norm": 0.18259677290916443, + "learning_rate": 0.000953070453391385, + "loss": 2.8371, + "step": 4980 + }, + { + "epoch": 0.14770334786347597, + "grad_norm": 0.15575909614562988, + "learning_rate": 0.0009530505499604026, + "loss": 2.8384, + "step": 4981 + }, + { + "epoch": 0.14773300121578745, + "grad_norm": 0.15074920654296875, + "learning_rate": 0.0009530306425175798, + "loss": 2.8204, + "step": 4982 + }, + { + "epoch": 0.14776265456809892, + "grad_norm": 0.17584218084812164, + "learning_rate": 0.0009530107310630931, + "loss": 2.86, + "step": 4983 + }, + { + "epoch": 0.1477923079204104, + "grad_norm": 0.19048933684825897, + "learning_rate": 0.0009529908155971185, + "loss": 2.8637, + "step": 4984 + }, + { + "epoch": 0.14782196127272187, + "grad_norm": 0.1908639818429947, + "learning_rate": 0.0009529708961198325, + "loss": 2.7988, + "step": 4985 + }, + { + "epoch": 0.14785161462503335, + "grad_norm": 0.1610022634267807, + "learning_rate": 0.0009529509726314114, + "loss": 2.8676, + "step": 4986 + }, + { + "epoch": 0.14788126797734483, + "grad_norm": 0.1586686223745346, + "learning_rate": 0.0009529310451320316, + "loss": 2.8511, + "step": 4987 + }, + { + "epoch": 0.14791092132965633, + "grad_norm": 0.14994016289710999, + "learning_rate": 0.0009529111136218699, + "loss": 2.8283, + "step": 4988 + }, + { + "epoch": 0.1479405746819678, + "grad_norm": 0.14840391278266907, + "learning_rate": 0.0009528911781011025, + "loss": 2.821, + "step": 4989 + }, + { + "epoch": 0.14797022803427928, + "grad_norm": 0.172011137008667, + "learning_rate": 0.0009528712385699059, + "loss": 2.8518, + "step": 4990 + }, + { + "epoch": 0.14799988138659076, + "grad_norm": 0.18863223493099213, + "learning_rate": 0.0009528512950284566, + "loss": 2.8199, + "step": 4991 + }, + { + "epoch": 0.14802953473890224, + "grad_norm": 0.17007145285606384, + "learning_rate": 0.0009528313474769316, + "loss": 2.8544, + "step": 4992 + }, + { + "epoch": 0.1480591880912137, + "grad_norm": 0.1595618575811386, + "learning_rate": 0.0009528113959155071, + "loss": 2.8543, + "step": 4993 + }, + { + "epoch": 0.1480888414435252, + "grad_norm": 0.17012354731559753, + "learning_rate": 0.0009527914403443602, + "loss": 2.8436, + "step": 4994 + }, + { + "epoch": 0.14811849479583666, + "grad_norm": 0.17296186089515686, + "learning_rate": 0.0009527714807636672, + "loss": 2.8598, + "step": 4995 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.16047072410583496, + "learning_rate": 0.000952751517173605, + "loss": 2.8478, + "step": 4996 + }, + { + "epoch": 0.14817780150045962, + "grad_norm": 0.1577221155166626, + "learning_rate": 0.0009527315495743505, + "loss": 2.8463, + "step": 4997 + }, + { + "epoch": 0.1482074548527711, + "grad_norm": 0.14574679732322693, + "learning_rate": 0.0009527115779660805, + "loss": 2.8218, + "step": 4998 + }, + { + "epoch": 0.1482371082050826, + "grad_norm": 0.14790496230125427, + "learning_rate": 0.0009526916023489716, + "loss": 2.8347, + "step": 4999 + }, + { + "epoch": 0.14826676155739407, + "grad_norm": 0.13629552721977234, + "learning_rate": 0.0009526716227232009, + "loss": 2.8369, + "step": 5000 + }, + { + "epoch": 0.14829641490970555, + "grad_norm": 0.1361587792634964, + "learning_rate": 0.0009526516390889453, + "loss": 2.8401, + "step": 5001 + }, + { + "epoch": 0.14832606826201702, + "grad_norm": 0.1341271549463272, + "learning_rate": 0.0009526316514463815, + "loss": 2.8476, + "step": 5002 + }, + { + "epoch": 0.1483557216143285, + "grad_norm": 0.127971351146698, + "learning_rate": 0.0009526116597956871, + "loss": 2.8742, + "step": 5003 + }, + { + "epoch": 0.14838537496663998, + "grad_norm": 0.15358294546604156, + "learning_rate": 0.0009525916641370386, + "loss": 2.8597, + "step": 5004 + }, + { + "epoch": 0.14841502831895145, + "grad_norm": 0.14823207259178162, + "learning_rate": 0.0009525716644706132, + "loss": 2.838, + "step": 5005 + }, + { + "epoch": 0.14844468167126293, + "grad_norm": 0.15932753682136536, + "learning_rate": 0.0009525516607965881, + "loss": 2.8525, + "step": 5006 + }, + { + "epoch": 0.1484743350235744, + "grad_norm": 0.160318061709404, + "learning_rate": 0.0009525316531151404, + "loss": 2.8489, + "step": 5007 + }, + { + "epoch": 0.14850398837588588, + "grad_norm": 0.16685554385185242, + "learning_rate": 0.0009525116414264472, + "loss": 2.8048, + "step": 5008 + }, + { + "epoch": 0.14853364172819739, + "grad_norm": 0.15265800058841705, + "learning_rate": 0.0009524916257306857, + "loss": 2.8477, + "step": 5009 + }, + { + "epoch": 0.14856329508050886, + "grad_norm": 0.1383524239063263, + "learning_rate": 0.0009524716060280332, + "loss": 2.8331, + "step": 5010 + }, + { + "epoch": 0.14859294843282034, + "grad_norm": 0.14395004510879517, + "learning_rate": 0.0009524515823186669, + "loss": 2.8733, + "step": 5011 + }, + { + "epoch": 0.1486226017851318, + "grad_norm": 0.1600712090730667, + "learning_rate": 0.0009524315546027642, + "loss": 2.826, + "step": 5012 + }, + { + "epoch": 0.1486522551374433, + "grad_norm": 0.17183376848697662, + "learning_rate": 0.0009524115228805026, + "loss": 2.8259, + "step": 5013 + }, + { + "epoch": 0.14868190848975477, + "grad_norm": 0.1973290890455246, + "learning_rate": 0.0009523914871520592, + "loss": 2.8315, + "step": 5014 + }, + { + "epoch": 0.14871156184206624, + "grad_norm": 0.2175690233707428, + "learning_rate": 0.0009523714474176113, + "loss": 2.874, + "step": 5015 + }, + { + "epoch": 0.14874121519437772, + "grad_norm": 0.22675541043281555, + "learning_rate": 0.0009523514036773368, + "loss": 2.8423, + "step": 5016 + }, + { + "epoch": 0.1487708685466892, + "grad_norm": 0.24132126569747925, + "learning_rate": 0.0009523313559314131, + "loss": 2.8912, + "step": 5017 + }, + { + "epoch": 0.14880052189900067, + "grad_norm": 0.20298779010772705, + "learning_rate": 0.0009523113041800174, + "loss": 2.8743, + "step": 5018 + }, + { + "epoch": 0.14883017525131217, + "grad_norm": 0.20163995027542114, + "learning_rate": 0.0009522912484233274, + "loss": 2.8158, + "step": 5019 + }, + { + "epoch": 0.14885982860362365, + "grad_norm": 0.2027563899755478, + "learning_rate": 0.000952271188661521, + "loss": 2.8337, + "step": 5020 + }, + { + "epoch": 0.14888948195593513, + "grad_norm": 0.14729146659374237, + "learning_rate": 0.0009522511248947755, + "loss": 2.8647, + "step": 5021 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 0.146169513463974, + "learning_rate": 0.0009522310571232687, + "loss": 2.8428, + "step": 5022 + }, + { + "epoch": 0.14894878866055808, + "grad_norm": 0.17419397830963135, + "learning_rate": 0.0009522109853471781, + "loss": 2.8142, + "step": 5023 + }, + { + "epoch": 0.14897844201286956, + "grad_norm": 0.15094487369060516, + "learning_rate": 0.0009521909095666818, + "loss": 2.8325, + "step": 5024 + }, + { + "epoch": 0.14900809536518103, + "grad_norm": 0.1507689207792282, + "learning_rate": 0.0009521708297819572, + "loss": 2.8096, + "step": 5025 + }, + { + "epoch": 0.1490377487174925, + "grad_norm": 0.14844414591789246, + "learning_rate": 0.0009521507459931824, + "loss": 2.8399, + "step": 5026 + }, + { + "epoch": 0.14906740206980398, + "grad_norm": 0.17186908423900604, + "learning_rate": 0.0009521306582005351, + "loss": 2.8489, + "step": 5027 + }, + { + "epoch": 0.14909705542211546, + "grad_norm": 0.20997461676597595, + "learning_rate": 0.0009521105664041933, + "loss": 2.829, + "step": 5028 + }, + { + "epoch": 0.14912670877442694, + "grad_norm": 0.19212037324905396, + "learning_rate": 0.0009520904706043347, + "loss": 2.8563, + "step": 5029 + }, + { + "epoch": 0.14915636212673844, + "grad_norm": 0.17959903180599213, + "learning_rate": 0.0009520703708011376, + "loss": 2.821, + "step": 5030 + }, + { + "epoch": 0.14918601547904992, + "grad_norm": 0.13494661450386047, + "learning_rate": 0.0009520502669947794, + "loss": 2.8259, + "step": 5031 + }, + { + "epoch": 0.1492156688313614, + "grad_norm": 0.15116888284683228, + "learning_rate": 0.0009520301591854388, + "loss": 2.8667, + "step": 5032 + }, + { + "epoch": 0.14924532218367287, + "grad_norm": 0.14687295258045197, + "learning_rate": 0.0009520100473732934, + "loss": 2.8363, + "step": 5033 + }, + { + "epoch": 0.14927497553598434, + "grad_norm": 0.16189652681350708, + "learning_rate": 0.0009519899315585215, + "loss": 2.87, + "step": 5034 + }, + { + "epoch": 0.14930462888829582, + "grad_norm": 0.17749899625778198, + "learning_rate": 0.0009519698117413011, + "loss": 2.8479, + "step": 5035 + }, + { + "epoch": 0.1493342822406073, + "grad_norm": 0.16037921607494354, + "learning_rate": 0.0009519496879218106, + "loss": 2.8495, + "step": 5036 + }, + { + "epoch": 0.14936393559291877, + "grad_norm": 0.18384385108947754, + "learning_rate": 0.000951929560100228, + "loss": 2.8692, + "step": 5037 + }, + { + "epoch": 0.14939358894523025, + "grad_norm": 0.18213848769664764, + "learning_rate": 0.0009519094282767316, + "loss": 2.829, + "step": 5038 + }, + { + "epoch": 0.14942324229754173, + "grad_norm": 0.1545446664094925, + "learning_rate": 0.0009518892924514995, + "loss": 2.8227, + "step": 5039 + }, + { + "epoch": 0.14945289564985323, + "grad_norm": 0.14152760803699493, + "learning_rate": 0.0009518691526247102, + "loss": 2.8572, + "step": 5040 + }, + { + "epoch": 0.1494825490021647, + "grad_norm": 0.14152391254901886, + "learning_rate": 0.0009518490087965419, + "loss": 2.8376, + "step": 5041 + }, + { + "epoch": 0.14951220235447618, + "grad_norm": 0.14126573503017426, + "learning_rate": 0.0009518288609671733, + "loss": 2.8284, + "step": 5042 + }, + { + "epoch": 0.14954185570678766, + "grad_norm": 0.14893876016139984, + "learning_rate": 0.0009518087091367824, + "loss": 2.8115, + "step": 5043 + }, + { + "epoch": 0.14957150905909913, + "grad_norm": 0.1679196059703827, + "learning_rate": 0.0009517885533055479, + "loss": 2.8267, + "step": 5044 + }, + { + "epoch": 0.1496011624114106, + "grad_norm": 0.19611571729183197, + "learning_rate": 0.0009517683934736481, + "loss": 2.8436, + "step": 5045 + }, + { + "epoch": 0.14963081576372209, + "grad_norm": 0.20570752024650574, + "learning_rate": 0.0009517482296412618, + "loss": 2.8593, + "step": 5046 + }, + { + "epoch": 0.14966046911603356, + "grad_norm": 0.19504016637802124, + "learning_rate": 0.0009517280618085673, + "loss": 2.8126, + "step": 5047 + }, + { + "epoch": 0.14969012246834504, + "grad_norm": 0.16641081869602203, + "learning_rate": 0.0009517078899757432, + "loss": 2.8166, + "step": 5048 + }, + { + "epoch": 0.14971977582065651, + "grad_norm": 0.1542063057422638, + "learning_rate": 0.0009516877141429683, + "loss": 2.86, + "step": 5049 + }, + { + "epoch": 0.149749429172968, + "grad_norm": 0.1295912265777588, + "learning_rate": 0.0009516675343104212, + "loss": 2.847, + "step": 5050 + }, + { + "epoch": 0.1497790825252795, + "grad_norm": 0.1316225826740265, + "learning_rate": 0.0009516473504782805, + "loss": 2.8439, + "step": 5051 + }, + { + "epoch": 0.14980873587759097, + "grad_norm": 0.13341926038265228, + "learning_rate": 0.000951627162646725, + "loss": 2.8387, + "step": 5052 + }, + { + "epoch": 0.14983838922990245, + "grad_norm": 0.1251831203699112, + "learning_rate": 0.0009516069708159334, + "loss": 2.8253, + "step": 5053 + }, + { + "epoch": 0.14986804258221392, + "grad_norm": 0.13884760439395905, + "learning_rate": 0.0009515867749860846, + "loss": 2.8301, + "step": 5054 + }, + { + "epoch": 0.1498976959345254, + "grad_norm": 0.13515150547027588, + "learning_rate": 0.0009515665751573574, + "loss": 2.8606, + "step": 5055 + }, + { + "epoch": 0.14992734928683688, + "grad_norm": 0.15225718915462494, + "learning_rate": 0.0009515463713299306, + "loss": 2.8197, + "step": 5056 + }, + { + "epoch": 0.14995700263914835, + "grad_norm": 0.160109743475914, + "learning_rate": 0.0009515261635039832, + "loss": 2.8496, + "step": 5057 + }, + { + "epoch": 0.14998665599145983, + "grad_norm": 0.18134112656116486, + "learning_rate": 0.0009515059516796942, + "loss": 2.8332, + "step": 5058 + }, + { + "epoch": 0.1500163093437713, + "grad_norm": 0.21752949059009552, + "learning_rate": 0.0009514857358572424, + "loss": 2.8241, + "step": 5059 + }, + { + "epoch": 0.15004596269608278, + "grad_norm": 0.239064559340477, + "learning_rate": 0.0009514655160368071, + "loss": 2.8399, + "step": 5060 + }, + { + "epoch": 0.15007561604839428, + "grad_norm": 0.19913098216056824, + "learning_rate": 0.000951445292218567, + "loss": 2.7945, + "step": 5061 + }, + { + "epoch": 0.15010526940070576, + "grad_norm": 0.1413005143404007, + "learning_rate": 0.0009514250644027014, + "loss": 2.8114, + "step": 5062 + }, + { + "epoch": 0.15013492275301724, + "grad_norm": 0.17278791964054108, + "learning_rate": 0.0009514048325893892, + "loss": 2.8705, + "step": 5063 + }, + { + "epoch": 0.1501645761053287, + "grad_norm": 0.14350615441799164, + "learning_rate": 0.00095138459677881, + "loss": 2.8106, + "step": 5064 + }, + { + "epoch": 0.1501942294576402, + "grad_norm": 0.14231452345848083, + "learning_rate": 0.0009513643569711425, + "loss": 2.8303, + "step": 5065 + }, + { + "epoch": 0.15022388280995166, + "grad_norm": 0.15258651971817017, + "learning_rate": 0.0009513441131665662, + "loss": 2.8327, + "step": 5066 + }, + { + "epoch": 0.15025353616226314, + "grad_norm": 0.14287860691547394, + "learning_rate": 0.0009513238653652603, + "loss": 2.8454, + "step": 5067 + }, + { + "epoch": 0.15028318951457462, + "grad_norm": 0.135431170463562, + "learning_rate": 0.0009513036135674043, + "loss": 2.8392, + "step": 5068 + }, + { + "epoch": 0.1503128428668861, + "grad_norm": 0.12425236403942108, + "learning_rate": 0.000951283357773177, + "loss": 2.8517, + "step": 5069 + }, + { + "epoch": 0.15034249621919757, + "grad_norm": 0.14499162137508392, + "learning_rate": 0.0009512630979827583, + "loss": 2.8416, + "step": 5070 + }, + { + "epoch": 0.15037214957150907, + "grad_norm": 0.15424597263336182, + "learning_rate": 0.0009512428341963274, + "loss": 2.8344, + "step": 5071 + }, + { + "epoch": 0.15040180292382055, + "grad_norm": 0.15242458879947662, + "learning_rate": 0.0009512225664140637, + "loss": 2.854, + "step": 5072 + }, + { + "epoch": 0.15043145627613203, + "grad_norm": 0.16624492406845093, + "learning_rate": 0.0009512022946361467, + "loss": 2.796, + "step": 5073 + }, + { + "epoch": 0.1504611096284435, + "grad_norm": 0.1839648187160492, + "learning_rate": 0.000951182018862756, + "loss": 2.8843, + "step": 5074 + }, + { + "epoch": 0.15049076298075498, + "grad_norm": 0.16682018339633942, + "learning_rate": 0.0009511617390940711, + "loss": 2.8557, + "step": 5075 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 0.1280127614736557, + "learning_rate": 0.0009511414553302715, + "loss": 2.8759, + "step": 5076 + }, + { + "epoch": 0.15055006968537793, + "grad_norm": 0.14269804954528809, + "learning_rate": 0.000951121167571537, + "loss": 2.8537, + "step": 5077 + }, + { + "epoch": 0.1505797230376894, + "grad_norm": 0.15454809367656708, + "learning_rate": 0.0009511008758180471, + "loss": 2.8577, + "step": 5078 + }, + { + "epoch": 0.15060937639000088, + "grad_norm": 0.1581047624349594, + "learning_rate": 0.0009510805800699813, + "loss": 2.8196, + "step": 5079 + }, + { + "epoch": 0.15063902974231236, + "grad_norm": 0.16666366159915924, + "learning_rate": 0.0009510602803275197, + "loss": 2.8706, + "step": 5080 + }, + { + "epoch": 0.15066868309462383, + "grad_norm": 0.18230849504470825, + "learning_rate": 0.0009510399765908418, + "loss": 2.8631, + "step": 5081 + }, + { + "epoch": 0.15069833644693534, + "grad_norm": 0.17769339680671692, + "learning_rate": 0.0009510196688601275, + "loss": 2.851, + "step": 5082 + }, + { + "epoch": 0.15072798979924681, + "grad_norm": 0.16059277951717377, + "learning_rate": 0.0009509993571355568, + "loss": 2.8205, + "step": 5083 + }, + { + "epoch": 0.1507576431515583, + "grad_norm": 0.1834987848997116, + "learning_rate": 0.0009509790414173092, + "loss": 2.7971, + "step": 5084 + }, + { + "epoch": 0.15078729650386977, + "grad_norm": 0.20182788372039795, + "learning_rate": 0.0009509587217055649, + "loss": 2.8282, + "step": 5085 + }, + { + "epoch": 0.15081694985618124, + "grad_norm": 0.17826926708221436, + "learning_rate": 0.0009509383980005037, + "loss": 2.864, + "step": 5086 + }, + { + "epoch": 0.15084660320849272, + "grad_norm": 0.1926063448190689, + "learning_rate": 0.0009509180703023055, + "loss": 2.8225, + "step": 5087 + }, + { + "epoch": 0.1508762565608042, + "grad_norm": 0.20091135799884796, + "learning_rate": 0.0009508977386111504, + "loss": 2.8417, + "step": 5088 + }, + { + "epoch": 0.15090590991311567, + "grad_norm": 0.20404204726219177, + "learning_rate": 0.0009508774029272184, + "loss": 2.8527, + "step": 5089 + }, + { + "epoch": 0.15093556326542715, + "grad_norm": 0.19024288654327393, + "learning_rate": 0.0009508570632506897, + "loss": 2.8772, + "step": 5090 + }, + { + "epoch": 0.15096521661773862, + "grad_norm": 0.16379432380199432, + "learning_rate": 0.0009508367195817443, + "loss": 2.8337, + "step": 5091 + }, + { + "epoch": 0.15099486997005013, + "grad_norm": 0.17193278670310974, + "learning_rate": 0.0009508163719205622, + "loss": 2.8087, + "step": 5092 + }, + { + "epoch": 0.1510245233223616, + "grad_norm": 0.14322443306446075, + "learning_rate": 0.0009507960202673239, + "loss": 2.8064, + "step": 5093 + }, + { + "epoch": 0.15105417667467308, + "grad_norm": 0.13520827889442444, + "learning_rate": 0.0009507756646222094, + "loss": 2.8313, + "step": 5094 + }, + { + "epoch": 0.15108383002698456, + "grad_norm": 0.13312257826328278, + "learning_rate": 0.0009507553049853992, + "loss": 2.8572, + "step": 5095 + }, + { + "epoch": 0.15111348337929603, + "grad_norm": 0.14052118360996246, + "learning_rate": 0.0009507349413570732, + "loss": 2.8577, + "step": 5096 + }, + { + "epoch": 0.1511431367316075, + "grad_norm": 0.14195634424686432, + "learning_rate": 0.000950714573737412, + "loss": 2.7988, + "step": 5097 + }, + { + "epoch": 0.15117279008391898, + "grad_norm": 0.14985501766204834, + "learning_rate": 0.0009506942021265958, + "loss": 2.8267, + "step": 5098 + }, + { + "epoch": 0.15120244343623046, + "grad_norm": 0.15100203454494476, + "learning_rate": 0.0009506738265248052, + "loss": 2.8219, + "step": 5099 + }, + { + "epoch": 0.15123209678854194, + "grad_norm": 0.14937607944011688, + "learning_rate": 0.0009506534469322205, + "loss": 2.8457, + "step": 5100 + }, + { + "epoch": 0.1512617501408534, + "grad_norm": 0.14513948559761047, + "learning_rate": 0.0009506330633490221, + "loss": 2.8447, + "step": 5101 + }, + { + "epoch": 0.1512914034931649, + "grad_norm": 0.15426068007946014, + "learning_rate": 0.0009506126757753906, + "loss": 2.8649, + "step": 5102 + }, + { + "epoch": 0.1513210568454764, + "grad_norm": 0.16450776159763336, + "learning_rate": 0.0009505922842115064, + "loss": 2.8368, + "step": 5103 + }, + { + "epoch": 0.15135071019778787, + "grad_norm": 0.15693451464176178, + "learning_rate": 0.0009505718886575504, + "loss": 2.8268, + "step": 5104 + }, + { + "epoch": 0.15138036355009934, + "grad_norm": 0.16556446254253387, + "learning_rate": 0.0009505514891137028, + "loss": 2.8426, + "step": 5105 + }, + { + "epoch": 0.15141001690241082, + "grad_norm": 0.15208107233047485, + "learning_rate": 0.0009505310855801445, + "loss": 2.8573, + "step": 5106 + }, + { + "epoch": 0.1514396702547223, + "grad_norm": 0.15588729083538055, + "learning_rate": 0.0009505106780570563, + "loss": 2.8497, + "step": 5107 + }, + { + "epoch": 0.15146932360703377, + "grad_norm": 0.16582264006137848, + "learning_rate": 0.0009504902665446184, + "loss": 2.8345, + "step": 5108 + }, + { + "epoch": 0.15149897695934525, + "grad_norm": 0.1528942734003067, + "learning_rate": 0.0009504698510430121, + "loss": 2.8095, + "step": 5109 + }, + { + "epoch": 0.15152863031165673, + "grad_norm": 0.15717607736587524, + "learning_rate": 0.0009504494315524177, + "loss": 2.8572, + "step": 5110 + }, + { + "epoch": 0.1515582836639682, + "grad_norm": 0.17621400952339172, + "learning_rate": 0.0009504290080730165, + "loss": 2.8268, + "step": 5111 + }, + { + "epoch": 0.15158793701627968, + "grad_norm": 0.1585792452096939, + "learning_rate": 0.0009504085806049889, + "loss": 2.87, + "step": 5112 + }, + { + "epoch": 0.15161759036859118, + "grad_norm": 0.1764635443687439, + "learning_rate": 0.0009503881491485163, + "loss": 2.8221, + "step": 5113 + }, + { + "epoch": 0.15164724372090266, + "grad_norm": 0.1866534948348999, + "learning_rate": 0.0009503677137037792, + "loss": 2.8632, + "step": 5114 + }, + { + "epoch": 0.15167689707321413, + "grad_norm": 0.17722970247268677, + "learning_rate": 0.0009503472742709585, + "loss": 2.8643, + "step": 5115 + }, + { + "epoch": 0.1517065504255256, + "grad_norm": 0.15977205336093903, + "learning_rate": 0.0009503268308502355, + "loss": 2.8747, + "step": 5116 + }, + { + "epoch": 0.1517362037778371, + "grad_norm": 0.1650567650794983, + "learning_rate": 0.0009503063834417913, + "loss": 2.8424, + "step": 5117 + }, + { + "epoch": 0.15176585713014856, + "grad_norm": 0.16790281236171722, + "learning_rate": 0.0009502859320458066, + "loss": 2.8451, + "step": 5118 + }, + { + "epoch": 0.15179551048246004, + "grad_norm": 0.1736203134059906, + "learning_rate": 0.0009502654766624627, + "loss": 2.8486, + "step": 5119 + }, + { + "epoch": 0.15182516383477151, + "grad_norm": 0.19665972888469696, + "learning_rate": 0.0009502450172919408, + "loss": 2.8554, + "step": 5120 + }, + { + "epoch": 0.151854817187083, + "grad_norm": 0.20113496482372284, + "learning_rate": 0.0009502245539344218, + "loss": 2.8403, + "step": 5121 + }, + { + "epoch": 0.15188447053939447, + "grad_norm": 0.18713313341140747, + "learning_rate": 0.0009502040865900873, + "loss": 2.8846, + "step": 5122 + }, + { + "epoch": 0.15191412389170597, + "grad_norm": 0.17601226270198822, + "learning_rate": 0.0009501836152591182, + "loss": 2.8182, + "step": 5123 + }, + { + "epoch": 0.15194377724401745, + "grad_norm": 0.15203531086444855, + "learning_rate": 0.000950163139941696, + "loss": 2.8152, + "step": 5124 + }, + { + "epoch": 0.15197343059632892, + "grad_norm": 0.1692347526550293, + "learning_rate": 0.0009501426606380019, + "loss": 2.8628, + "step": 5125 + }, + { + "epoch": 0.1520030839486404, + "grad_norm": 0.18695376813411713, + "learning_rate": 0.0009501221773482171, + "loss": 2.8566, + "step": 5126 + }, + { + "epoch": 0.15203273730095188, + "grad_norm": 0.17417284846305847, + "learning_rate": 0.0009501016900725235, + "loss": 2.8498, + "step": 5127 + }, + { + "epoch": 0.15206239065326335, + "grad_norm": 0.1689380556344986, + "learning_rate": 0.000950081198811102, + "loss": 2.855, + "step": 5128 + }, + { + "epoch": 0.15209204400557483, + "grad_norm": 0.1868065446615219, + "learning_rate": 0.0009500607035641342, + "loss": 2.8596, + "step": 5129 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 0.16927498579025269, + "learning_rate": 0.0009500402043318017, + "loss": 2.8824, + "step": 5130 + }, + { + "epoch": 0.15215135071019778, + "grad_norm": 0.17803871631622314, + "learning_rate": 0.0009500197011142858, + "loss": 2.8504, + "step": 5131 + }, + { + "epoch": 0.15218100406250926, + "grad_norm": 0.1511431634426117, + "learning_rate": 0.0009499991939117682, + "loss": 2.8708, + "step": 5132 + }, + { + "epoch": 0.15221065741482073, + "grad_norm": 0.15569300949573517, + "learning_rate": 0.0009499786827244308, + "loss": 2.8381, + "step": 5133 + }, + { + "epoch": 0.15224031076713224, + "grad_norm": 0.14281699061393738, + "learning_rate": 0.0009499581675524546, + "loss": 2.8203, + "step": 5134 + }, + { + "epoch": 0.1522699641194437, + "grad_norm": 0.14469534158706665, + "learning_rate": 0.0009499376483960218, + "loss": 2.8431, + "step": 5135 + }, + { + "epoch": 0.1522996174717552, + "grad_norm": 0.1777649223804474, + "learning_rate": 0.0009499171252553137, + "loss": 2.8331, + "step": 5136 + }, + { + "epoch": 0.15232927082406666, + "grad_norm": 0.14312143623828888, + "learning_rate": 0.0009498965981305123, + "loss": 2.7918, + "step": 5137 + }, + { + "epoch": 0.15235892417637814, + "grad_norm": 0.16364949941635132, + "learning_rate": 0.0009498760670217995, + "loss": 2.8085, + "step": 5138 + }, + { + "epoch": 0.15238857752868962, + "grad_norm": 0.16510559618473053, + "learning_rate": 0.0009498555319293566, + "loss": 2.7985, + "step": 5139 + }, + { + "epoch": 0.1524182308810011, + "grad_norm": 0.14530964195728302, + "learning_rate": 0.0009498349928533658, + "loss": 2.827, + "step": 5140 + }, + { + "epoch": 0.15244788423331257, + "grad_norm": 0.17494027316570282, + "learning_rate": 0.0009498144497940091, + "loss": 2.8193, + "step": 5141 + }, + { + "epoch": 0.15247753758562405, + "grad_norm": 0.19152098894119263, + "learning_rate": 0.0009497939027514682, + "loss": 2.8257, + "step": 5142 + }, + { + "epoch": 0.15250719093793552, + "grad_norm": 0.16271594166755676, + "learning_rate": 0.000949773351725925, + "loss": 2.8141, + "step": 5143 + }, + { + "epoch": 0.15253684429024703, + "grad_norm": 0.16584712266921997, + "learning_rate": 0.0009497527967175615, + "loss": 2.822, + "step": 5144 + }, + { + "epoch": 0.1525664976425585, + "grad_norm": 0.1576049029827118, + "learning_rate": 0.0009497322377265599, + "loss": 2.847, + "step": 5145 + }, + { + "epoch": 0.15259615099486998, + "grad_norm": 0.1535133272409439, + "learning_rate": 0.0009497116747531021, + "loss": 2.8442, + "step": 5146 + }, + { + "epoch": 0.15262580434718145, + "grad_norm": 0.1503755748271942, + "learning_rate": 0.0009496911077973703, + "loss": 2.8092, + "step": 5147 + }, + { + "epoch": 0.15265545769949293, + "grad_norm": 0.14493724703788757, + "learning_rate": 0.0009496705368595465, + "loss": 2.799, + "step": 5148 + }, + { + "epoch": 0.1526851110518044, + "grad_norm": 0.14134438335895538, + "learning_rate": 0.0009496499619398129, + "loss": 2.8242, + "step": 5149 + }, + { + "epoch": 0.15271476440411588, + "grad_norm": 0.1555985063314438, + "learning_rate": 0.0009496293830383518, + "loss": 2.8403, + "step": 5150 + }, + { + "epoch": 0.15274441775642736, + "grad_norm": 0.16012060642242432, + "learning_rate": 0.0009496088001553451, + "loss": 2.812, + "step": 5151 + }, + { + "epoch": 0.15277407110873883, + "grad_norm": 0.15983855724334717, + "learning_rate": 0.0009495882132909756, + "loss": 2.8609, + "step": 5152 + }, + { + "epoch": 0.1528037244610503, + "grad_norm": 0.16073207557201385, + "learning_rate": 0.0009495676224454251, + "loss": 2.8409, + "step": 5153 + }, + { + "epoch": 0.1528333778133618, + "grad_norm": 0.15955795347690582, + "learning_rate": 0.0009495470276188763, + "loss": 2.8306, + "step": 5154 + }, + { + "epoch": 0.1528630311656733, + "grad_norm": 0.15662722289562225, + "learning_rate": 0.0009495264288115112, + "loss": 2.8258, + "step": 5155 + }, + { + "epoch": 0.15289268451798477, + "grad_norm": 0.1531725972890854, + "learning_rate": 0.0009495058260235125, + "loss": 2.8399, + "step": 5156 + }, + { + "epoch": 0.15292233787029624, + "grad_norm": 0.1672927439212799, + "learning_rate": 0.0009494852192550626, + "loss": 2.7939, + "step": 5157 + }, + { + "epoch": 0.15295199122260772, + "grad_norm": 0.17185479402542114, + "learning_rate": 0.0009494646085063439, + "loss": 2.8475, + "step": 5158 + }, + { + "epoch": 0.1529816445749192, + "grad_norm": 0.18539147078990936, + "learning_rate": 0.000949443993777539, + "loss": 2.8045, + "step": 5159 + }, + { + "epoch": 0.15301129792723067, + "grad_norm": 0.2032376080751419, + "learning_rate": 0.0009494233750688303, + "loss": 2.8268, + "step": 5160 + }, + { + "epoch": 0.15304095127954215, + "grad_norm": 0.19384980201721191, + "learning_rate": 0.0009494027523804004, + "loss": 2.8318, + "step": 5161 + }, + { + "epoch": 0.15307060463185362, + "grad_norm": 0.16149643063545227, + "learning_rate": 0.0009493821257124321, + "loss": 2.8169, + "step": 5162 + }, + { + "epoch": 0.1531002579841651, + "grad_norm": 0.1664421707391739, + "learning_rate": 0.000949361495065108, + "loss": 2.8302, + "step": 5163 + }, + { + "epoch": 0.15312991133647658, + "grad_norm": 0.17043852806091309, + "learning_rate": 0.0009493408604386106, + "loss": 2.8798, + "step": 5164 + }, + { + "epoch": 0.15315956468878808, + "grad_norm": 0.1338750720024109, + "learning_rate": 0.0009493202218331228, + "loss": 2.8422, + "step": 5165 + }, + { + "epoch": 0.15318921804109956, + "grad_norm": 0.1706274300813675, + "learning_rate": 0.0009492995792488273, + "loss": 2.8218, + "step": 5166 + }, + { + "epoch": 0.15321887139341103, + "grad_norm": 0.17632487416267395, + "learning_rate": 0.0009492789326859069, + "loss": 2.7932, + "step": 5167 + }, + { + "epoch": 0.1532485247457225, + "grad_norm": 0.14046688377857208, + "learning_rate": 0.0009492582821445445, + "loss": 2.8206, + "step": 5168 + }, + { + "epoch": 0.15327817809803398, + "grad_norm": 0.15810930728912354, + "learning_rate": 0.0009492376276249229, + "loss": 2.8151, + "step": 5169 + }, + { + "epoch": 0.15330783145034546, + "grad_norm": 0.13873690366744995, + "learning_rate": 0.0009492169691272249, + "loss": 2.8597, + "step": 5170 + }, + { + "epoch": 0.15333748480265694, + "grad_norm": 0.14742164313793182, + "learning_rate": 0.0009491963066516336, + "loss": 2.8065, + "step": 5171 + }, + { + "epoch": 0.1533671381549684, + "grad_norm": 0.1496409922838211, + "learning_rate": 0.0009491756401983318, + "loss": 2.82, + "step": 5172 + }, + { + "epoch": 0.1533967915072799, + "grad_norm": 0.1508975476026535, + "learning_rate": 0.0009491549697675029, + "loss": 2.8296, + "step": 5173 + }, + { + "epoch": 0.15342644485959137, + "grad_norm": 0.13857285678386688, + "learning_rate": 0.0009491342953593292, + "loss": 2.8709, + "step": 5174 + }, + { + "epoch": 0.15345609821190287, + "grad_norm": 0.1522265523672104, + "learning_rate": 0.0009491136169739946, + "loss": 2.853, + "step": 5175 + }, + { + "epoch": 0.15348575156421435, + "grad_norm": 0.16853411495685577, + "learning_rate": 0.0009490929346116817, + "loss": 2.8353, + "step": 5176 + }, + { + "epoch": 0.15351540491652582, + "grad_norm": 0.16326338052749634, + "learning_rate": 0.0009490722482725737, + "loss": 2.8631, + "step": 5177 + }, + { + "epoch": 0.1535450582688373, + "grad_norm": 0.17867855727672577, + "learning_rate": 0.0009490515579568539, + "loss": 2.838, + "step": 5178 + }, + { + "epoch": 0.15357471162114877, + "grad_norm": 0.21567025780677795, + "learning_rate": 0.0009490308636647056, + "loss": 2.8504, + "step": 5179 + }, + { + "epoch": 0.15360436497346025, + "grad_norm": 0.23036806285381317, + "learning_rate": 0.0009490101653963117, + "loss": 2.8628, + "step": 5180 + }, + { + "epoch": 0.15363401832577173, + "grad_norm": 0.2241465449333191, + "learning_rate": 0.0009489894631518559, + "loss": 2.8437, + "step": 5181 + }, + { + "epoch": 0.1536636716780832, + "grad_norm": 0.21738439798355103, + "learning_rate": 0.0009489687569315213, + "loss": 2.8656, + "step": 5182 + }, + { + "epoch": 0.15369332503039468, + "grad_norm": 0.2235150784254074, + "learning_rate": 0.0009489480467354912, + "loss": 2.8282, + "step": 5183 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 0.18728865683078766, + "learning_rate": 0.0009489273325639492, + "loss": 2.827, + "step": 5184 + }, + { + "epoch": 0.15375263173501763, + "grad_norm": 0.15329541265964508, + "learning_rate": 0.0009489066144170786, + "loss": 2.8326, + "step": 5185 + }, + { + "epoch": 0.15378228508732913, + "grad_norm": 0.14878007769584656, + "learning_rate": 0.0009488858922950628, + "loss": 2.8265, + "step": 5186 + }, + { + "epoch": 0.1538119384396406, + "grad_norm": 0.1403166651725769, + "learning_rate": 0.0009488651661980854, + "loss": 2.8122, + "step": 5187 + }, + { + "epoch": 0.1538415917919521, + "grad_norm": 0.15281996130943298, + "learning_rate": 0.0009488444361263299, + "loss": 2.8489, + "step": 5188 + }, + { + "epoch": 0.15387124514426356, + "grad_norm": 0.1323651671409607, + "learning_rate": 0.0009488237020799799, + "loss": 2.803, + "step": 5189 + }, + { + "epoch": 0.15390089849657504, + "grad_norm": 0.13447079062461853, + "learning_rate": 0.0009488029640592191, + "loss": 2.8613, + "step": 5190 + }, + { + "epoch": 0.15393055184888652, + "grad_norm": 0.15542592108249664, + "learning_rate": 0.0009487822220642308, + "loss": 2.8391, + "step": 5191 + }, + { + "epoch": 0.153960205201198, + "grad_norm": 0.1862933486700058, + "learning_rate": 0.0009487614760951991, + "loss": 2.8711, + "step": 5192 + }, + { + "epoch": 0.15398985855350947, + "grad_norm": 0.18379132449626923, + "learning_rate": 0.0009487407261523073, + "loss": 2.8241, + "step": 5193 + }, + { + "epoch": 0.15401951190582094, + "grad_norm": 0.15348021686077118, + "learning_rate": 0.0009487199722357395, + "loss": 2.8586, + "step": 5194 + }, + { + "epoch": 0.15404916525813242, + "grad_norm": 0.15861041843891144, + "learning_rate": 0.0009486992143456792, + "loss": 2.8168, + "step": 5195 + }, + { + "epoch": 0.15407881861044392, + "grad_norm": 0.16531185805797577, + "learning_rate": 0.0009486784524823104, + "loss": 2.8219, + "step": 5196 + }, + { + "epoch": 0.1541084719627554, + "grad_norm": 0.16833972930908203, + "learning_rate": 0.000948657686645817, + "loss": 2.8589, + "step": 5197 + }, + { + "epoch": 0.15413812531506688, + "grad_norm": 0.15561191737651825, + "learning_rate": 0.0009486369168363825, + "loss": 2.8021, + "step": 5198 + }, + { + "epoch": 0.15416777866737835, + "grad_norm": 0.1582641750574112, + "learning_rate": 0.0009486161430541913, + "loss": 2.8546, + "step": 5199 + }, + { + "epoch": 0.15419743201968983, + "grad_norm": 0.17199382185935974, + "learning_rate": 0.0009485953652994271, + "loss": 2.8177, + "step": 5200 + }, + { + "epoch": 0.1542270853720013, + "grad_norm": 0.16905024647712708, + "learning_rate": 0.0009485745835722739, + "loss": 2.8543, + "step": 5201 + }, + { + "epoch": 0.15425673872431278, + "grad_norm": 0.16306458413600922, + "learning_rate": 0.0009485537978729158, + "loss": 2.8515, + "step": 5202 + }, + { + "epoch": 0.15428639207662426, + "grad_norm": 0.16841694712638855, + "learning_rate": 0.0009485330082015369, + "loss": 2.8201, + "step": 5203 + }, + { + "epoch": 0.15431604542893573, + "grad_norm": 0.1733875572681427, + "learning_rate": 0.0009485122145583212, + "loss": 2.807, + "step": 5204 + }, + { + "epoch": 0.1543456987812472, + "grad_norm": 0.16137754917144775, + "learning_rate": 0.0009484914169434527, + "loss": 2.8305, + "step": 5205 + }, + { + "epoch": 0.15437535213355869, + "grad_norm": 0.1451495885848999, + "learning_rate": 0.0009484706153571158, + "loss": 2.8254, + "step": 5206 + }, + { + "epoch": 0.1544050054858702, + "grad_norm": 0.1316973716020584, + "learning_rate": 0.0009484498097994947, + "loss": 2.8714, + "step": 5207 + }, + { + "epoch": 0.15443465883818167, + "grad_norm": 0.15427684783935547, + "learning_rate": 0.0009484290002707736, + "loss": 2.8329, + "step": 5208 + }, + { + "epoch": 0.15446431219049314, + "grad_norm": 0.14855854213237762, + "learning_rate": 0.0009484081867711366, + "loss": 2.8336, + "step": 5209 + }, + { + "epoch": 0.15449396554280462, + "grad_norm": 0.14075197279453278, + "learning_rate": 0.0009483873693007682, + "loss": 2.8239, + "step": 5210 + }, + { + "epoch": 0.1545236188951161, + "grad_norm": 0.15305277705192566, + "learning_rate": 0.0009483665478598526, + "loss": 2.8485, + "step": 5211 + }, + { + "epoch": 0.15455327224742757, + "grad_norm": 0.1679825782775879, + "learning_rate": 0.0009483457224485743, + "loss": 2.8653, + "step": 5212 + }, + { + "epoch": 0.15458292559973905, + "grad_norm": 0.21560466289520264, + "learning_rate": 0.0009483248930671178, + "loss": 2.8171, + "step": 5213 + }, + { + "epoch": 0.15461257895205052, + "grad_norm": 0.23124736547470093, + "learning_rate": 0.0009483040597156673, + "loss": 2.848, + "step": 5214 + }, + { + "epoch": 0.154642232304362, + "grad_norm": 0.1801699697971344, + "learning_rate": 0.0009482832223944073, + "loss": 2.7988, + "step": 5215 + }, + { + "epoch": 0.15467188565667347, + "grad_norm": 0.14428231120109558, + "learning_rate": 0.0009482623811035226, + "loss": 2.8377, + "step": 5216 + }, + { + "epoch": 0.15470153900898498, + "grad_norm": 0.16240282356739044, + "learning_rate": 0.0009482415358431975, + "loss": 2.8263, + "step": 5217 + }, + { + "epoch": 0.15473119236129645, + "grad_norm": 0.1343526393175125, + "learning_rate": 0.0009482206866136167, + "loss": 2.8635, + "step": 5218 + }, + { + "epoch": 0.15476084571360793, + "grad_norm": 0.14474409818649292, + "learning_rate": 0.0009481998334149646, + "loss": 2.8374, + "step": 5219 + }, + { + "epoch": 0.1547904990659194, + "grad_norm": 0.15378409624099731, + "learning_rate": 0.0009481789762474263, + "loss": 2.8051, + "step": 5220 + }, + { + "epoch": 0.15482015241823088, + "grad_norm": 0.1384103149175644, + "learning_rate": 0.0009481581151111862, + "loss": 2.8606, + "step": 5221 + }, + { + "epoch": 0.15484980577054236, + "grad_norm": 0.13992904126644135, + "learning_rate": 0.0009481372500064289, + "loss": 2.8105, + "step": 5222 + }, + { + "epoch": 0.15487945912285384, + "grad_norm": 0.12846407294273376, + "learning_rate": 0.0009481163809333394, + "loss": 2.826, + "step": 5223 + }, + { + "epoch": 0.1549091124751653, + "grad_norm": 0.13756567239761353, + "learning_rate": 0.0009480955078921024, + "loss": 2.8384, + "step": 5224 + }, + { + "epoch": 0.1549387658274768, + "grad_norm": 0.1707654744386673, + "learning_rate": 0.0009480746308829028, + "loss": 2.863, + "step": 5225 + }, + { + "epoch": 0.15496841917978826, + "grad_norm": 0.19712384045124054, + "learning_rate": 0.0009480537499059254, + "loss": 2.8176, + "step": 5226 + }, + { + "epoch": 0.15499807253209977, + "grad_norm": 0.19909042119979858, + "learning_rate": 0.0009480328649613552, + "loss": 2.8469, + "step": 5227 + }, + { + "epoch": 0.15502772588441124, + "grad_norm": 0.2101011872291565, + "learning_rate": 0.0009480119760493771, + "loss": 2.8424, + "step": 5228 + }, + { + "epoch": 0.15505737923672272, + "grad_norm": 0.19821512699127197, + "learning_rate": 0.0009479910831701761, + "loss": 2.8439, + "step": 5229 + }, + { + "epoch": 0.1550870325890342, + "grad_norm": 0.18748323619365692, + "learning_rate": 0.000947970186323937, + "loss": 2.8168, + "step": 5230 + }, + { + "epoch": 0.15511668594134567, + "grad_norm": 0.21328724920749664, + "learning_rate": 0.000947949285510845, + "loss": 2.8407, + "step": 5231 + }, + { + "epoch": 0.15514633929365715, + "grad_norm": 0.2555605173110962, + "learning_rate": 0.0009479283807310852, + "loss": 2.8404, + "step": 5232 + }, + { + "epoch": 0.15517599264596862, + "grad_norm": 0.23637469112873077, + "learning_rate": 0.0009479074719848428, + "loss": 2.8654, + "step": 5233 + }, + { + "epoch": 0.1552056459982801, + "grad_norm": 0.18269099295139313, + "learning_rate": 0.000947886559272303, + "loss": 2.847, + "step": 5234 + }, + { + "epoch": 0.15523529935059158, + "grad_norm": 0.18351992964744568, + "learning_rate": 0.0009478656425936505, + "loss": 2.8352, + "step": 5235 + }, + { + "epoch": 0.15526495270290305, + "grad_norm": 0.16126851737499237, + "learning_rate": 0.0009478447219490712, + "loss": 2.8457, + "step": 5236 + }, + { + "epoch": 0.15529460605521453, + "grad_norm": 0.1639038622379303, + "learning_rate": 0.0009478237973387497, + "loss": 2.8405, + "step": 5237 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 0.14907266199588776, + "learning_rate": 0.0009478028687628717, + "loss": 2.8383, + "step": 5238 + }, + { + "epoch": 0.1553539127598375, + "grad_norm": 0.14378862082958221, + "learning_rate": 0.0009477819362216224, + "loss": 2.8637, + "step": 5239 + }, + { + "epoch": 0.15538356611214899, + "grad_norm": 0.14739471673965454, + "learning_rate": 0.0009477609997151873, + "loss": 2.8418, + "step": 5240 + }, + { + "epoch": 0.15541321946446046, + "grad_norm": 0.12490671128034592, + "learning_rate": 0.0009477400592437517, + "loss": 2.8177, + "step": 5241 + }, + { + "epoch": 0.15544287281677194, + "grad_norm": 0.12670190632343292, + "learning_rate": 0.000947719114807501, + "loss": 2.8599, + "step": 5242 + }, + { + "epoch": 0.1554725261690834, + "grad_norm": 0.1269298493862152, + "learning_rate": 0.0009476981664066207, + "loss": 2.8371, + "step": 5243 + }, + { + "epoch": 0.1555021795213949, + "grad_norm": 0.12981313467025757, + "learning_rate": 0.0009476772140412963, + "loss": 2.8462, + "step": 5244 + }, + { + "epoch": 0.15553183287370637, + "grad_norm": 0.10908237844705582, + "learning_rate": 0.0009476562577117131, + "loss": 2.8148, + "step": 5245 + }, + { + "epoch": 0.15556148622601784, + "grad_norm": 0.12915369868278503, + "learning_rate": 0.0009476352974180573, + "loss": 2.7934, + "step": 5246 + }, + { + "epoch": 0.15559113957832932, + "grad_norm": 0.12617813050746918, + "learning_rate": 0.0009476143331605138, + "loss": 2.7756, + "step": 5247 + }, + { + "epoch": 0.15562079293064082, + "grad_norm": 0.15579113364219666, + "learning_rate": 0.0009475933649392686, + "loss": 2.8384, + "step": 5248 + }, + { + "epoch": 0.1556504462829523, + "grad_norm": 0.1654445379972458, + "learning_rate": 0.0009475723927545075, + "loss": 2.8345, + "step": 5249 + }, + { + "epoch": 0.15568009963526377, + "grad_norm": 0.16274228692054749, + "learning_rate": 0.0009475514166064157, + "loss": 2.8265, + "step": 5250 + }, + { + "epoch": 0.15570975298757525, + "grad_norm": 0.1568826287984848, + "learning_rate": 0.0009475304364951795, + "loss": 2.8287, + "step": 5251 + }, + { + "epoch": 0.15573940633988673, + "grad_norm": 0.15711934864521027, + "learning_rate": 0.0009475094524209845, + "loss": 2.8418, + "step": 5252 + }, + { + "epoch": 0.1557690596921982, + "grad_norm": 0.13903513550758362, + "learning_rate": 0.0009474884643840165, + "loss": 2.8171, + "step": 5253 + }, + { + "epoch": 0.15579871304450968, + "grad_norm": 0.16246584057807922, + "learning_rate": 0.0009474674723844612, + "loss": 2.8463, + "step": 5254 + }, + { + "epoch": 0.15582836639682116, + "grad_norm": 0.1759728640317917, + "learning_rate": 0.0009474464764225046, + "loss": 2.8049, + "step": 5255 + }, + { + "epoch": 0.15585801974913263, + "grad_norm": 0.2071412056684494, + "learning_rate": 0.0009474254764983329, + "loss": 2.8395, + "step": 5256 + }, + { + "epoch": 0.1558876731014441, + "grad_norm": 0.22586743533611298, + "learning_rate": 0.0009474044726121316, + "loss": 2.8524, + "step": 5257 + }, + { + "epoch": 0.15591732645375558, + "grad_norm": 0.17988815903663635, + "learning_rate": 0.0009473834647640869, + "loss": 2.8252, + "step": 5258 + }, + { + "epoch": 0.1559469798060671, + "grad_norm": 0.1665555089712143, + "learning_rate": 0.0009473624529543849, + "loss": 2.8592, + "step": 5259 + }, + { + "epoch": 0.15597663315837856, + "grad_norm": 0.16110797226428986, + "learning_rate": 0.0009473414371832116, + "loss": 2.8313, + "step": 5260 + }, + { + "epoch": 0.15600628651069004, + "grad_norm": 0.16014569997787476, + "learning_rate": 0.0009473204174507531, + "loss": 2.8281, + "step": 5261 + }, + { + "epoch": 0.15603593986300152, + "grad_norm": 0.17394492030143738, + "learning_rate": 0.0009472993937571954, + "loss": 2.8414, + "step": 5262 + }, + { + "epoch": 0.156065593215313, + "grad_norm": 0.16263949871063232, + "learning_rate": 0.0009472783661027249, + "loss": 2.8509, + "step": 5263 + }, + { + "epoch": 0.15609524656762447, + "grad_norm": 0.14320901036262512, + "learning_rate": 0.0009472573344875277, + "loss": 2.8334, + "step": 5264 + }, + { + "epoch": 0.15612489991993594, + "grad_norm": 0.13940760493278503, + "learning_rate": 0.00094723629891179, + "loss": 2.8507, + "step": 5265 + }, + { + "epoch": 0.15615455327224742, + "grad_norm": 0.1498376429080963, + "learning_rate": 0.0009472152593756981, + "loss": 2.8157, + "step": 5266 + }, + { + "epoch": 0.1561842066245589, + "grad_norm": 0.14971376955509186, + "learning_rate": 0.0009471942158794382, + "loss": 2.817, + "step": 5267 + }, + { + "epoch": 0.15621385997687037, + "grad_norm": 0.1559964269399643, + "learning_rate": 0.000947173168423197, + "loss": 2.8305, + "step": 5268 + }, + { + "epoch": 0.15624351332918188, + "grad_norm": 0.1473454236984253, + "learning_rate": 0.0009471521170071604, + "loss": 2.8447, + "step": 5269 + }, + { + "epoch": 0.15627316668149335, + "grad_norm": 0.18117643892765045, + "learning_rate": 0.0009471310616315151, + "loss": 2.8373, + "step": 5270 + }, + { + "epoch": 0.15630282003380483, + "grad_norm": 0.19151540100574493, + "learning_rate": 0.0009471100022964476, + "loss": 2.8566, + "step": 5271 + }, + { + "epoch": 0.1563324733861163, + "grad_norm": 0.16128680109977722, + "learning_rate": 0.0009470889390021443, + "loss": 2.8363, + "step": 5272 + }, + { + "epoch": 0.15636212673842778, + "grad_norm": 0.17667344212532043, + "learning_rate": 0.0009470678717487918, + "loss": 2.8088, + "step": 5273 + }, + { + "epoch": 0.15639178009073926, + "grad_norm": 0.19797077775001526, + "learning_rate": 0.0009470468005365763, + "loss": 2.804, + "step": 5274 + }, + { + "epoch": 0.15642143344305073, + "grad_norm": 0.19533658027648926, + "learning_rate": 0.0009470257253656847, + "loss": 2.8172, + "step": 5275 + }, + { + "epoch": 0.1564510867953622, + "grad_norm": 0.1753208190202713, + "learning_rate": 0.0009470046462363037, + "loss": 2.8523, + "step": 5276 + }, + { + "epoch": 0.15648074014767369, + "grad_norm": 0.15241843461990356, + "learning_rate": 0.0009469835631486196, + "loss": 2.7966, + "step": 5277 + }, + { + "epoch": 0.15651039349998516, + "grad_norm": 0.1626966893672943, + "learning_rate": 0.0009469624761028196, + "loss": 2.8506, + "step": 5278 + }, + { + "epoch": 0.15654004685229667, + "grad_norm": 0.15378662943840027, + "learning_rate": 0.00094694138509909, + "loss": 2.8386, + "step": 5279 + }, + { + "epoch": 0.15656970020460814, + "grad_norm": 0.15979863703250885, + "learning_rate": 0.0009469202901376177, + "loss": 2.8105, + "step": 5280 + }, + { + "epoch": 0.15659935355691962, + "grad_norm": 0.2007821500301361, + "learning_rate": 0.0009468991912185895, + "loss": 2.7895, + "step": 5281 + }, + { + "epoch": 0.1566290069092311, + "grad_norm": 0.20765942335128784, + "learning_rate": 0.0009468780883421926, + "loss": 2.8409, + "step": 5282 + }, + { + "epoch": 0.15665866026154257, + "grad_norm": 0.22457383573055267, + "learning_rate": 0.0009468569815086133, + "loss": 2.8499, + "step": 5283 + }, + { + "epoch": 0.15668831361385405, + "grad_norm": 0.1882844865322113, + "learning_rate": 0.0009468358707180384, + "loss": 2.8363, + "step": 5284 + }, + { + "epoch": 0.15671796696616552, + "grad_norm": 0.17694754898548126, + "learning_rate": 0.0009468147559706555, + "loss": 2.8056, + "step": 5285 + }, + { + "epoch": 0.156747620318477, + "grad_norm": 0.18593038618564606, + "learning_rate": 0.0009467936372666513, + "loss": 2.821, + "step": 5286 + }, + { + "epoch": 0.15677727367078848, + "grad_norm": 0.18083487451076508, + "learning_rate": 0.0009467725146062125, + "loss": 2.8674, + "step": 5287 + }, + { + "epoch": 0.15680692702309995, + "grad_norm": 0.1789059191942215, + "learning_rate": 0.0009467513879895267, + "loss": 2.8281, + "step": 5288 + }, + { + "epoch": 0.15683658037541143, + "grad_norm": 0.16229559481143951, + "learning_rate": 0.0009467302574167804, + "loss": 2.822, + "step": 5289 + }, + { + "epoch": 0.15686623372772293, + "grad_norm": 0.16249240934848785, + "learning_rate": 0.000946709122888161, + "loss": 2.8198, + "step": 5290 + }, + { + "epoch": 0.1568958870800344, + "grad_norm": 0.17170852422714233, + "learning_rate": 0.0009466879844038558, + "loss": 2.8555, + "step": 5291 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 0.15822145342826843, + "learning_rate": 0.0009466668419640518, + "loss": 2.8227, + "step": 5292 + }, + { + "epoch": 0.15695519378465736, + "grad_norm": 0.18394093215465546, + "learning_rate": 0.0009466456955689362, + "loss": 2.8362, + "step": 5293 + }, + { + "epoch": 0.15698484713696884, + "grad_norm": 0.17719072103500366, + "learning_rate": 0.0009466245452186963, + "loss": 2.8165, + "step": 5294 + }, + { + "epoch": 0.1570145004892803, + "grad_norm": 0.1438613086938858, + "learning_rate": 0.0009466033909135194, + "loss": 2.8153, + "step": 5295 + }, + { + "epoch": 0.1570441538415918, + "grad_norm": 0.14103616774082184, + "learning_rate": 0.0009465822326535927, + "loss": 2.8085, + "step": 5296 + }, + { + "epoch": 0.15707380719390326, + "grad_norm": 0.1630852371454239, + "learning_rate": 0.0009465610704391039, + "loss": 2.8384, + "step": 5297 + }, + { + "epoch": 0.15710346054621474, + "grad_norm": 0.15414147078990936, + "learning_rate": 0.0009465399042702401, + "loss": 2.839, + "step": 5298 + }, + { + "epoch": 0.15713311389852622, + "grad_norm": 0.13633258640766144, + "learning_rate": 0.0009465187341471886, + "loss": 2.8457, + "step": 5299 + }, + { + "epoch": 0.15716276725083772, + "grad_norm": 0.12828566133975983, + "learning_rate": 0.0009464975600701373, + "loss": 2.8376, + "step": 5300 + }, + { + "epoch": 0.1571924206031492, + "grad_norm": 0.13183633983135223, + "learning_rate": 0.0009464763820392734, + "loss": 2.8253, + "step": 5301 + }, + { + "epoch": 0.15722207395546067, + "grad_norm": 0.12084310501813889, + "learning_rate": 0.0009464552000547844, + "loss": 2.8617, + "step": 5302 + }, + { + "epoch": 0.15725172730777215, + "grad_norm": 0.13268856704235077, + "learning_rate": 0.0009464340141168582, + "loss": 2.8329, + "step": 5303 + }, + { + "epoch": 0.15728138066008363, + "grad_norm": 0.10903532803058624, + "learning_rate": 0.000946412824225682, + "loss": 2.8072, + "step": 5304 + }, + { + "epoch": 0.1573110340123951, + "grad_norm": 0.12143020331859589, + "learning_rate": 0.0009463916303814436, + "loss": 2.8084, + "step": 5305 + }, + { + "epoch": 0.15734068736470658, + "grad_norm": 0.13536696135997772, + "learning_rate": 0.0009463704325843307, + "loss": 2.8394, + "step": 5306 + }, + { + "epoch": 0.15737034071701805, + "grad_norm": 0.1408849060535431, + "learning_rate": 0.000946349230834531, + "loss": 2.8523, + "step": 5307 + }, + { + "epoch": 0.15739999406932953, + "grad_norm": 0.1517326682806015, + "learning_rate": 0.0009463280251322324, + "loss": 2.7895, + "step": 5308 + }, + { + "epoch": 0.157429647421641, + "grad_norm": 0.16636279225349426, + "learning_rate": 0.0009463068154776223, + "loss": 2.8154, + "step": 5309 + }, + { + "epoch": 0.15745930077395248, + "grad_norm": 0.17031200230121613, + "learning_rate": 0.000946285601870889, + "loss": 2.8214, + "step": 5310 + }, + { + "epoch": 0.15748895412626399, + "grad_norm": 0.17448435723781586, + "learning_rate": 0.0009462643843122198, + "loss": 2.8127, + "step": 5311 + }, + { + "epoch": 0.15751860747857546, + "grad_norm": 0.18849250674247742, + "learning_rate": 0.0009462431628018031, + "loss": 2.8292, + "step": 5312 + }, + { + "epoch": 0.15754826083088694, + "grad_norm": 0.1858321726322174, + "learning_rate": 0.0009462219373398264, + "loss": 2.8063, + "step": 5313 + }, + { + "epoch": 0.15757791418319841, + "grad_norm": 0.15076762437820435, + "learning_rate": 0.0009462007079264781, + "loss": 2.838, + "step": 5314 + }, + { + "epoch": 0.1576075675355099, + "grad_norm": 0.15120315551757812, + "learning_rate": 0.0009461794745619457, + "loss": 2.8141, + "step": 5315 + }, + { + "epoch": 0.15763722088782137, + "grad_norm": 0.15653839707374573, + "learning_rate": 0.0009461582372464176, + "loss": 2.8473, + "step": 5316 + }, + { + "epoch": 0.15766687424013284, + "grad_norm": 0.16233529150485992, + "learning_rate": 0.0009461369959800817, + "loss": 2.8334, + "step": 5317 + }, + { + "epoch": 0.15769652759244432, + "grad_norm": 0.20266969501972198, + "learning_rate": 0.0009461157507631261, + "loss": 2.8253, + "step": 5318 + }, + { + "epoch": 0.1577261809447558, + "grad_norm": 0.235992431640625, + "learning_rate": 0.000946094501595739, + "loss": 2.8324, + "step": 5319 + }, + { + "epoch": 0.15775583429706727, + "grad_norm": 0.20918917655944824, + "learning_rate": 0.0009460732484781085, + "loss": 2.8243, + "step": 5320 + }, + { + "epoch": 0.15778548764937878, + "grad_norm": 0.19658425450325012, + "learning_rate": 0.0009460519914104229, + "loss": 2.8224, + "step": 5321 + }, + { + "epoch": 0.15781514100169025, + "grad_norm": 0.23116739094257355, + "learning_rate": 0.0009460307303928701, + "loss": 2.8867, + "step": 5322 + }, + { + "epoch": 0.15784479435400173, + "grad_norm": 0.22446772456169128, + "learning_rate": 0.0009460094654256388, + "loss": 2.819, + "step": 5323 + }, + { + "epoch": 0.1578744477063132, + "grad_norm": 0.19871853291988373, + "learning_rate": 0.0009459881965089172, + "loss": 2.8311, + "step": 5324 + }, + { + "epoch": 0.15790410105862468, + "grad_norm": 0.17680443823337555, + "learning_rate": 0.0009459669236428935, + "loss": 2.8278, + "step": 5325 + }, + { + "epoch": 0.15793375441093616, + "grad_norm": 0.16440583765506744, + "learning_rate": 0.0009459456468277561, + "loss": 2.8448, + "step": 5326 + }, + { + "epoch": 0.15796340776324763, + "grad_norm": 0.179426372051239, + "learning_rate": 0.0009459243660636935, + "loss": 2.8131, + "step": 5327 + }, + { + "epoch": 0.1579930611155591, + "grad_norm": 0.13795197010040283, + "learning_rate": 0.000945903081350894, + "loss": 2.8243, + "step": 5328 + }, + { + "epoch": 0.15802271446787058, + "grad_norm": 0.1488598883152008, + "learning_rate": 0.0009458817926895463, + "loss": 2.822, + "step": 5329 + }, + { + "epoch": 0.15805236782018206, + "grad_norm": 0.12339950352907181, + "learning_rate": 0.0009458605000798387, + "loss": 2.8539, + "step": 5330 + }, + { + "epoch": 0.15808202117249356, + "grad_norm": 0.13879412412643433, + "learning_rate": 0.0009458392035219599, + "loss": 2.8239, + "step": 5331 + }, + { + "epoch": 0.15811167452480504, + "grad_norm": 0.11827627569437027, + "learning_rate": 0.0009458179030160985, + "loss": 2.8189, + "step": 5332 + }, + { + "epoch": 0.15814132787711652, + "grad_norm": 0.12350762635469437, + "learning_rate": 0.0009457965985624428, + "loss": 2.7984, + "step": 5333 + }, + { + "epoch": 0.158170981229428, + "grad_norm": 0.13933555781841278, + "learning_rate": 0.0009457752901611819, + "loss": 2.8353, + "step": 5334 + }, + { + "epoch": 0.15820063458173947, + "grad_norm": 0.15023230016231537, + "learning_rate": 0.0009457539778125042, + "loss": 2.8175, + "step": 5335 + }, + { + "epoch": 0.15823028793405094, + "grad_norm": 0.162234827876091, + "learning_rate": 0.0009457326615165987, + "loss": 2.8712, + "step": 5336 + }, + { + "epoch": 0.15825994128636242, + "grad_norm": 0.180975541472435, + "learning_rate": 0.0009457113412736538, + "loss": 2.7955, + "step": 5337 + }, + { + "epoch": 0.1582895946386739, + "grad_norm": 0.1886504590511322, + "learning_rate": 0.0009456900170838585, + "loss": 2.8565, + "step": 5338 + }, + { + "epoch": 0.15831924799098537, + "grad_norm": 0.17028838396072388, + "learning_rate": 0.0009456686889474015, + "loss": 2.8373, + "step": 5339 + }, + { + "epoch": 0.15834890134329685, + "grad_norm": 0.16597531735897064, + "learning_rate": 0.0009456473568644719, + "loss": 2.8316, + "step": 5340 + }, + { + "epoch": 0.15837855469560833, + "grad_norm": 0.17496486008167267, + "learning_rate": 0.0009456260208352584, + "loss": 2.8221, + "step": 5341 + }, + { + "epoch": 0.15840820804791983, + "grad_norm": 0.18257470428943634, + "learning_rate": 0.00094560468085995, + "loss": 2.8218, + "step": 5342 + }, + { + "epoch": 0.1584378614002313, + "grad_norm": 0.17821004986763, + "learning_rate": 0.0009455833369387356, + "loss": 2.8152, + "step": 5343 + }, + { + "epoch": 0.15846751475254278, + "grad_norm": 0.1935640424489975, + "learning_rate": 0.0009455619890718043, + "loss": 2.8458, + "step": 5344 + }, + { + "epoch": 0.15849716810485426, + "grad_norm": 0.2007095068693161, + "learning_rate": 0.0009455406372593453, + "loss": 2.8376, + "step": 5345 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 0.1834350824356079, + "learning_rate": 0.0009455192815015472, + "loss": 2.86, + "step": 5346 + }, + { + "epoch": 0.1585564748094772, + "grad_norm": 0.17139308154582977, + "learning_rate": 0.0009454979217985996, + "loss": 2.8412, + "step": 5347 + }, + { + "epoch": 0.1585861281617887, + "grad_norm": 0.17343458533287048, + "learning_rate": 0.0009454765581506914, + "loss": 2.863, + "step": 5348 + }, + { + "epoch": 0.15861578151410016, + "grad_norm": 0.18141169846057892, + "learning_rate": 0.0009454551905580117, + "loss": 2.8392, + "step": 5349 + }, + { + "epoch": 0.15864543486641164, + "grad_norm": 0.16944175958633423, + "learning_rate": 0.0009454338190207498, + "loss": 2.8578, + "step": 5350 + }, + { + "epoch": 0.15867508821872311, + "grad_norm": 0.15391744673252106, + "learning_rate": 0.0009454124435390952, + "loss": 2.85, + "step": 5351 + }, + { + "epoch": 0.15870474157103462, + "grad_norm": 0.1470925509929657, + "learning_rate": 0.0009453910641132368, + "loss": 2.8456, + "step": 5352 + }, + { + "epoch": 0.1587343949233461, + "grad_norm": 0.1634809821844101, + "learning_rate": 0.0009453696807433641, + "loss": 2.8064, + "step": 5353 + }, + { + "epoch": 0.15876404827565757, + "grad_norm": 0.18826255202293396, + "learning_rate": 0.0009453482934296665, + "loss": 2.8174, + "step": 5354 + }, + { + "epoch": 0.15879370162796905, + "grad_norm": 0.18433552980422974, + "learning_rate": 0.0009453269021723332, + "loss": 2.8656, + "step": 5355 + }, + { + "epoch": 0.15882335498028052, + "grad_norm": 0.16190297901630402, + "learning_rate": 0.0009453055069715537, + "loss": 2.8464, + "step": 5356 + }, + { + "epoch": 0.158853008332592, + "grad_norm": 0.15549637377262115, + "learning_rate": 0.0009452841078275177, + "loss": 2.8204, + "step": 5357 + }, + { + "epoch": 0.15888266168490348, + "grad_norm": 0.18243390321731567, + "learning_rate": 0.0009452627047404143, + "loss": 2.8681, + "step": 5358 + }, + { + "epoch": 0.15891231503721495, + "grad_norm": 0.14531324803829193, + "learning_rate": 0.0009452412977104333, + "loss": 2.8473, + "step": 5359 + }, + { + "epoch": 0.15894196838952643, + "grad_norm": 0.11358331888914108, + "learning_rate": 0.0009452198867377641, + "loss": 2.8092, + "step": 5360 + }, + { + "epoch": 0.1589716217418379, + "grad_norm": 0.12887026369571686, + "learning_rate": 0.0009451984718225966, + "loss": 2.8355, + "step": 5361 + }, + { + "epoch": 0.15900127509414938, + "grad_norm": 0.14049625396728516, + "learning_rate": 0.00094517705296512, + "loss": 2.8269, + "step": 5362 + }, + { + "epoch": 0.15903092844646088, + "grad_norm": 0.12852028012275696, + "learning_rate": 0.0009451556301655244, + "loss": 2.8089, + "step": 5363 + }, + { + "epoch": 0.15906058179877236, + "grad_norm": 0.11577257513999939, + "learning_rate": 0.0009451342034239991, + "loss": 2.8123, + "step": 5364 + }, + { + "epoch": 0.15909023515108384, + "grad_norm": 0.12655898928642273, + "learning_rate": 0.000945112772740734, + "loss": 2.8301, + "step": 5365 + }, + { + "epoch": 0.1591198885033953, + "grad_norm": 0.12414932996034622, + "learning_rate": 0.0009450913381159191, + "loss": 2.8226, + "step": 5366 + }, + { + "epoch": 0.1591495418557068, + "grad_norm": 0.13514967262744904, + "learning_rate": 0.0009450698995497437, + "loss": 2.8079, + "step": 5367 + }, + { + "epoch": 0.15917919520801826, + "grad_norm": 0.16596026718616486, + "learning_rate": 0.0009450484570423983, + "loss": 2.7868, + "step": 5368 + }, + { + "epoch": 0.15920884856032974, + "grad_norm": 0.20800048112869263, + "learning_rate": 0.0009450270105940721, + "loss": 2.8368, + "step": 5369 + }, + { + "epoch": 0.15923850191264122, + "grad_norm": 0.21772044897079468, + "learning_rate": 0.0009450055602049555, + "loss": 2.7998, + "step": 5370 + }, + { + "epoch": 0.1592681552649527, + "grad_norm": 0.18227310478687286, + "learning_rate": 0.0009449841058752383, + "loss": 2.856, + "step": 5371 + }, + { + "epoch": 0.15929780861726417, + "grad_norm": 0.20005974173545837, + "learning_rate": 0.0009449626476051104, + "loss": 2.8523, + "step": 5372 + }, + { + "epoch": 0.15932746196957567, + "grad_norm": 0.18572165071964264, + "learning_rate": 0.000944941185394762, + "loss": 2.8467, + "step": 5373 + }, + { + "epoch": 0.15935711532188715, + "grad_norm": 0.15612244606018066, + "learning_rate": 0.0009449197192443828, + "loss": 2.8337, + "step": 5374 + }, + { + "epoch": 0.15938676867419863, + "grad_norm": 0.17733131349086761, + "learning_rate": 0.0009448982491541633, + "loss": 2.8187, + "step": 5375 + }, + { + "epoch": 0.1594164220265101, + "grad_norm": 0.14873771369457245, + "learning_rate": 0.0009448767751242934, + "loss": 2.8217, + "step": 5376 + }, + { + "epoch": 0.15944607537882158, + "grad_norm": 0.1399042010307312, + "learning_rate": 0.0009448552971549632, + "loss": 2.789, + "step": 5377 + }, + { + "epoch": 0.15947572873113305, + "grad_norm": 0.1418837308883667, + "learning_rate": 0.0009448338152463631, + "loss": 2.8353, + "step": 5378 + }, + { + "epoch": 0.15950538208344453, + "grad_norm": 0.17398296296596527, + "learning_rate": 0.0009448123293986832, + "loss": 2.8537, + "step": 5379 + }, + { + "epoch": 0.159535035435756, + "grad_norm": 0.20034977793693542, + "learning_rate": 0.0009447908396121136, + "loss": 2.8171, + "step": 5380 + }, + { + "epoch": 0.15956468878806748, + "grad_norm": 0.24086478352546692, + "learning_rate": 0.0009447693458868449, + "loss": 2.8468, + "step": 5381 + }, + { + "epoch": 0.15959434214037896, + "grad_norm": 0.24550481140613556, + "learning_rate": 0.0009447478482230673, + "loss": 2.8337, + "step": 5382 + }, + { + "epoch": 0.15962399549269046, + "grad_norm": 0.16941270232200623, + "learning_rate": 0.0009447263466209712, + "loss": 2.838, + "step": 5383 + }, + { + "epoch": 0.15965364884500194, + "grad_norm": 0.15607787668704987, + "learning_rate": 0.000944704841080747, + "loss": 2.8263, + "step": 5384 + }, + { + "epoch": 0.15968330219731341, + "grad_norm": 0.1722852885723114, + "learning_rate": 0.000944683331602585, + "loss": 2.8416, + "step": 5385 + }, + { + "epoch": 0.1597129555496249, + "grad_norm": 0.14106225967407227, + "learning_rate": 0.0009446618181866758, + "loss": 2.7835, + "step": 5386 + }, + { + "epoch": 0.15974260890193637, + "grad_norm": 0.14648273587226868, + "learning_rate": 0.0009446403008332099, + "loss": 2.843, + "step": 5387 + }, + { + "epoch": 0.15977226225424784, + "grad_norm": 0.14359325170516968, + "learning_rate": 0.0009446187795423777, + "loss": 2.8139, + "step": 5388 + }, + { + "epoch": 0.15980191560655932, + "grad_norm": 0.1464177519083023, + "learning_rate": 0.0009445972543143702, + "loss": 2.8012, + "step": 5389 + }, + { + "epoch": 0.1598315689588708, + "grad_norm": 0.16757197678089142, + "learning_rate": 0.0009445757251493774, + "loss": 2.8391, + "step": 5390 + }, + { + "epoch": 0.15986122231118227, + "grad_norm": 0.1731555312871933, + "learning_rate": 0.0009445541920475903, + "loss": 2.8091, + "step": 5391 + }, + { + "epoch": 0.15989087566349375, + "grad_norm": 0.15453431010246277, + "learning_rate": 0.0009445326550091995, + "loss": 2.8137, + "step": 5392 + }, + { + "epoch": 0.15992052901580522, + "grad_norm": 0.14507301151752472, + "learning_rate": 0.0009445111140343958, + "loss": 2.82, + "step": 5393 + }, + { + "epoch": 0.15995018236811673, + "grad_norm": 0.13228118419647217, + "learning_rate": 0.0009444895691233699, + "loss": 2.8422, + "step": 5394 + }, + { + "epoch": 0.1599798357204282, + "grad_norm": 0.12844118475914001, + "learning_rate": 0.0009444680202763125, + "loss": 2.8316, + "step": 5395 + }, + { + "epoch": 0.16000948907273968, + "grad_norm": 0.1314898431301117, + "learning_rate": 0.0009444464674934146, + "loss": 2.8202, + "step": 5396 + }, + { + "epoch": 0.16003914242505116, + "grad_norm": 0.13215433061122894, + "learning_rate": 0.0009444249107748668, + "loss": 2.8032, + "step": 5397 + }, + { + "epoch": 0.16006879577736263, + "grad_norm": 0.1350368857383728, + "learning_rate": 0.0009444033501208602, + "loss": 2.8331, + "step": 5398 + }, + { + "epoch": 0.1600984491296741, + "grad_norm": 0.14393331110477448, + "learning_rate": 0.0009443817855315857, + "loss": 2.8256, + "step": 5399 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 0.16997990012168884, + "learning_rate": 0.0009443602170072342, + "loss": 2.8566, + "step": 5400 + }, + { + "epoch": 0.16015775583429706, + "grad_norm": 0.17654700577259064, + "learning_rate": 0.0009443386445479967, + "loss": 2.8489, + "step": 5401 + }, + { + "epoch": 0.16018740918660854, + "grad_norm": 0.18417374789714813, + "learning_rate": 0.0009443170681540642, + "loss": 2.7952, + "step": 5402 + }, + { + "epoch": 0.16021706253892, + "grad_norm": 0.1884671002626419, + "learning_rate": 0.0009442954878256278, + "loss": 2.8087, + "step": 5403 + }, + { + "epoch": 0.16024671589123152, + "grad_norm": 0.18453019857406616, + "learning_rate": 0.0009442739035628784, + "loss": 2.8242, + "step": 5404 + }, + { + "epoch": 0.160276369243543, + "grad_norm": 0.1771809160709381, + "learning_rate": 0.0009442523153660076, + "loss": 2.8669, + "step": 5405 + }, + { + "epoch": 0.16030602259585447, + "grad_norm": 0.18617990612983704, + "learning_rate": 0.0009442307232352063, + "loss": 2.7945, + "step": 5406 + }, + { + "epoch": 0.16033567594816595, + "grad_norm": 0.19177861511707306, + "learning_rate": 0.0009442091271706656, + "loss": 2.8135, + "step": 5407 + }, + { + "epoch": 0.16036532930047742, + "grad_norm": 0.2092478722333908, + "learning_rate": 0.0009441875271725768, + "loss": 2.8191, + "step": 5408 + }, + { + "epoch": 0.1603949826527889, + "grad_norm": 0.18463817238807678, + "learning_rate": 0.0009441659232411313, + "loss": 2.7996, + "step": 5409 + }, + { + "epoch": 0.16042463600510037, + "grad_norm": 0.16564102470874786, + "learning_rate": 0.0009441443153765201, + "loss": 2.8345, + "step": 5410 + }, + { + "epoch": 0.16045428935741185, + "grad_norm": 0.15090680122375488, + "learning_rate": 0.0009441227035789351, + "loss": 2.8447, + "step": 5411 + }, + { + "epoch": 0.16048394270972333, + "grad_norm": 0.15426991879940033, + "learning_rate": 0.000944101087848567, + "loss": 2.8129, + "step": 5412 + }, + { + "epoch": 0.1605135960620348, + "grad_norm": 0.1566489040851593, + "learning_rate": 0.0009440794681856077, + "loss": 2.8195, + "step": 5413 + }, + { + "epoch": 0.16054324941434628, + "grad_norm": 0.14802487194538116, + "learning_rate": 0.0009440578445902484, + "loss": 2.827, + "step": 5414 + }, + { + "epoch": 0.16057290276665778, + "grad_norm": 0.14903855323791504, + "learning_rate": 0.0009440362170626809, + "loss": 2.8673, + "step": 5415 + }, + { + "epoch": 0.16060255611896926, + "grad_norm": 0.15011999011039734, + "learning_rate": 0.0009440145856030961, + "loss": 2.8231, + "step": 5416 + }, + { + "epoch": 0.16063220947128073, + "grad_norm": 0.1705608367919922, + "learning_rate": 0.0009439929502116862, + "loss": 2.7775, + "step": 5417 + }, + { + "epoch": 0.1606618628235922, + "grad_norm": 0.18425734341144562, + "learning_rate": 0.0009439713108886425, + "loss": 2.8361, + "step": 5418 + }, + { + "epoch": 0.1606915161759037, + "grad_norm": 0.15982383489608765, + "learning_rate": 0.0009439496676341565, + "loss": 2.8136, + "step": 5419 + }, + { + "epoch": 0.16072116952821516, + "grad_norm": 0.13058197498321533, + "learning_rate": 0.0009439280204484201, + "loss": 2.8076, + "step": 5420 + }, + { + "epoch": 0.16075082288052664, + "grad_norm": 0.14118698239326477, + "learning_rate": 0.0009439063693316247, + "loss": 2.8347, + "step": 5421 + }, + { + "epoch": 0.16078047623283812, + "grad_norm": 0.15847384929656982, + "learning_rate": 0.0009438847142839624, + "loss": 2.8389, + "step": 5422 + }, + { + "epoch": 0.1608101295851496, + "grad_norm": 0.14312061667442322, + "learning_rate": 0.0009438630553056247, + "loss": 2.792, + "step": 5423 + }, + { + "epoch": 0.16083978293746107, + "grad_norm": 0.13371051847934723, + "learning_rate": 0.0009438413923968036, + "loss": 2.8304, + "step": 5424 + }, + { + "epoch": 0.16086943628977257, + "grad_norm": 0.15017126500606537, + "learning_rate": 0.0009438197255576906, + "loss": 2.8258, + "step": 5425 + }, + { + "epoch": 0.16089908964208405, + "grad_norm": 0.13990944623947144, + "learning_rate": 0.0009437980547884778, + "loss": 2.8367, + "step": 5426 + }, + { + "epoch": 0.16092874299439552, + "grad_norm": 0.12753289937973022, + "learning_rate": 0.000943776380089357, + "loss": 2.8081, + "step": 5427 + }, + { + "epoch": 0.160958396346707, + "grad_norm": 0.14776663482189178, + "learning_rate": 0.0009437547014605203, + "loss": 2.8573, + "step": 5428 + }, + { + "epoch": 0.16098804969901848, + "grad_norm": 0.1387883722782135, + "learning_rate": 0.0009437330189021594, + "loss": 2.7813, + "step": 5429 + }, + { + "epoch": 0.16101770305132995, + "grad_norm": 0.1483982503414154, + "learning_rate": 0.0009437113324144666, + "loss": 2.8049, + "step": 5430 + }, + { + "epoch": 0.16104735640364143, + "grad_norm": 0.1533138006925583, + "learning_rate": 0.0009436896419976337, + "loss": 2.7805, + "step": 5431 + }, + { + "epoch": 0.1610770097559529, + "grad_norm": 0.16802842915058136, + "learning_rate": 0.000943667947651853, + "loss": 2.7974, + "step": 5432 + }, + { + "epoch": 0.16110666310826438, + "grad_norm": 0.18247228860855103, + "learning_rate": 0.0009436462493773163, + "loss": 2.8127, + "step": 5433 + }, + { + "epoch": 0.16113631646057586, + "grad_norm": 0.1647634506225586, + "learning_rate": 0.000943624547174216, + "loss": 2.8304, + "step": 5434 + }, + { + "epoch": 0.16116596981288736, + "grad_norm": 0.14378240704536438, + "learning_rate": 0.0009436028410427441, + "loss": 2.8218, + "step": 5435 + }, + { + "epoch": 0.16119562316519884, + "grad_norm": 0.1493784785270691, + "learning_rate": 0.000943581130983093, + "loss": 2.8346, + "step": 5436 + }, + { + "epoch": 0.1612252765175103, + "grad_norm": 0.177723690867424, + "learning_rate": 0.0009435594169954548, + "loss": 2.8341, + "step": 5437 + }, + { + "epoch": 0.1612549298698218, + "grad_norm": 0.17854763567447662, + "learning_rate": 0.0009435376990800218, + "loss": 2.8162, + "step": 5438 + }, + { + "epoch": 0.16128458322213327, + "grad_norm": 0.16153208911418915, + "learning_rate": 0.0009435159772369863, + "loss": 2.8069, + "step": 5439 + }, + { + "epoch": 0.16131423657444474, + "grad_norm": 0.16654932498931885, + "learning_rate": 0.0009434942514665407, + "loss": 2.8441, + "step": 5440 + }, + { + "epoch": 0.16134388992675622, + "grad_norm": 0.17777661979198456, + "learning_rate": 0.0009434725217688776, + "loss": 2.811, + "step": 5441 + }, + { + "epoch": 0.1613735432790677, + "grad_norm": 0.14554418623447418, + "learning_rate": 0.000943450788144189, + "loss": 2.8365, + "step": 5442 + }, + { + "epoch": 0.16140319663137917, + "grad_norm": 0.1523052304983139, + "learning_rate": 0.0009434290505926676, + "loss": 2.8245, + "step": 5443 + }, + { + "epoch": 0.16143284998369065, + "grad_norm": 0.18106429278850555, + "learning_rate": 0.0009434073091145059, + "loss": 2.8558, + "step": 5444 + }, + { + "epoch": 0.16146250333600212, + "grad_norm": 0.19723358750343323, + "learning_rate": 0.0009433855637098963, + "loss": 2.829, + "step": 5445 + }, + { + "epoch": 0.16149215668831363, + "grad_norm": 0.18967126309871674, + "learning_rate": 0.0009433638143790313, + "loss": 2.8143, + "step": 5446 + }, + { + "epoch": 0.1615218100406251, + "grad_norm": 0.1744391769170761, + "learning_rate": 0.0009433420611221037, + "loss": 2.8278, + "step": 5447 + }, + { + "epoch": 0.16155146339293658, + "grad_norm": 0.15805356204509735, + "learning_rate": 0.0009433203039393061, + "loss": 2.8223, + "step": 5448 + }, + { + "epoch": 0.16158111674524805, + "grad_norm": 0.17519497871398926, + "learning_rate": 0.0009432985428308311, + "loss": 2.8427, + "step": 5449 + }, + { + "epoch": 0.16161077009755953, + "grad_norm": 0.14361219108104706, + "learning_rate": 0.0009432767777968716, + "loss": 2.8156, + "step": 5450 + }, + { + "epoch": 0.161640423449871, + "grad_norm": 0.14207448065280914, + "learning_rate": 0.0009432550088376199, + "loss": 2.8501, + "step": 5451 + }, + { + "epoch": 0.16167007680218248, + "grad_norm": 0.15358003973960876, + "learning_rate": 0.0009432332359532691, + "loss": 2.8035, + "step": 5452 + }, + { + "epoch": 0.16169973015449396, + "grad_norm": 0.15819446742534637, + "learning_rate": 0.000943211459144012, + "loss": 2.8244, + "step": 5453 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 0.14478518068790436, + "learning_rate": 0.0009431896784100411, + "loss": 2.8561, + "step": 5454 + }, + { + "epoch": 0.1617590368591169, + "grad_norm": 0.14546380937099457, + "learning_rate": 0.0009431678937515497, + "loss": 2.7994, + "step": 5455 + }, + { + "epoch": 0.16178869021142842, + "grad_norm": 0.1474495232105255, + "learning_rate": 0.0009431461051687306, + "loss": 2.8378, + "step": 5456 + }, + { + "epoch": 0.1618183435637399, + "grad_norm": 0.18218161165714264, + "learning_rate": 0.0009431243126617766, + "loss": 2.819, + "step": 5457 + }, + { + "epoch": 0.16184799691605137, + "grad_norm": 0.20588836073875427, + "learning_rate": 0.0009431025162308807, + "loss": 2.8391, + "step": 5458 + }, + { + "epoch": 0.16187765026836284, + "grad_norm": 0.18078698217868805, + "learning_rate": 0.000943080715876236, + "loss": 2.8006, + "step": 5459 + }, + { + "epoch": 0.16190730362067432, + "grad_norm": 0.16443952918052673, + "learning_rate": 0.0009430589115980354, + "loss": 2.8293, + "step": 5460 + }, + { + "epoch": 0.1619369569729858, + "grad_norm": 0.18648642301559448, + "learning_rate": 0.0009430371033964722, + "loss": 2.8278, + "step": 5461 + }, + { + "epoch": 0.16196661032529727, + "grad_norm": 0.17264175415039062, + "learning_rate": 0.0009430152912717393, + "loss": 2.844, + "step": 5462 + }, + { + "epoch": 0.16199626367760875, + "grad_norm": 0.18257980048656464, + "learning_rate": 0.0009429934752240301, + "loss": 2.8432, + "step": 5463 + }, + { + "epoch": 0.16202591702992022, + "grad_norm": 0.18532291054725647, + "learning_rate": 0.0009429716552535376, + "loss": 2.8305, + "step": 5464 + }, + { + "epoch": 0.1620555703822317, + "grad_norm": 0.18304459750652313, + "learning_rate": 0.0009429498313604551, + "loss": 2.8302, + "step": 5465 + }, + { + "epoch": 0.16208522373454318, + "grad_norm": 0.1770014762878418, + "learning_rate": 0.0009429280035449757, + "loss": 2.8483, + "step": 5466 + }, + { + "epoch": 0.16211487708685468, + "grad_norm": 0.16261129081249237, + "learning_rate": 0.0009429061718072929, + "loss": 2.8115, + "step": 5467 + }, + { + "epoch": 0.16214453043916616, + "grad_norm": 0.15705761313438416, + "learning_rate": 0.0009428843361475998, + "loss": 2.8111, + "step": 5468 + }, + { + "epoch": 0.16217418379147763, + "grad_norm": 0.17309623956680298, + "learning_rate": 0.0009428624965660902, + "loss": 2.82, + "step": 5469 + }, + { + "epoch": 0.1622038371437891, + "grad_norm": 0.1856236457824707, + "learning_rate": 0.0009428406530629567, + "loss": 2.8036, + "step": 5470 + }, + { + "epoch": 0.16223349049610059, + "grad_norm": 0.22876746952533722, + "learning_rate": 0.0009428188056383936, + "loss": 2.8454, + "step": 5471 + }, + { + "epoch": 0.16226314384841206, + "grad_norm": 0.21827735006809235, + "learning_rate": 0.0009427969542925938, + "loss": 2.8503, + "step": 5472 + }, + { + "epoch": 0.16229279720072354, + "grad_norm": 0.2084054946899414, + "learning_rate": 0.0009427750990257509, + "loss": 2.8651, + "step": 5473 + }, + { + "epoch": 0.162322450553035, + "grad_norm": 0.16671469807624817, + "learning_rate": 0.0009427532398380587, + "loss": 2.8175, + "step": 5474 + }, + { + "epoch": 0.1623521039053465, + "grad_norm": 0.17702804505825043, + "learning_rate": 0.0009427313767297103, + "loss": 2.8141, + "step": 5475 + }, + { + "epoch": 0.16238175725765797, + "grad_norm": 0.1813988983631134, + "learning_rate": 0.0009427095097008998, + "loss": 2.8425, + "step": 5476 + }, + { + "epoch": 0.16241141060996947, + "grad_norm": 0.14148005843162537, + "learning_rate": 0.0009426876387518204, + "loss": 2.8213, + "step": 5477 + }, + { + "epoch": 0.16244106396228095, + "grad_norm": 0.15041643381118774, + "learning_rate": 0.0009426657638826661, + "loss": 2.8246, + "step": 5478 + }, + { + "epoch": 0.16247071731459242, + "grad_norm": 0.13806258141994476, + "learning_rate": 0.0009426438850936305, + "loss": 2.7908, + "step": 5479 + }, + { + "epoch": 0.1625003706669039, + "grad_norm": 0.1488897055387497, + "learning_rate": 0.0009426220023849072, + "loss": 2.846, + "step": 5480 + }, + { + "epoch": 0.16253002401921537, + "grad_norm": 0.15815047919750214, + "learning_rate": 0.0009426001157566903, + "loss": 2.8337, + "step": 5481 + }, + { + "epoch": 0.16255967737152685, + "grad_norm": 0.17508716881275177, + "learning_rate": 0.0009425782252091733, + "loss": 2.8188, + "step": 5482 + }, + { + "epoch": 0.16258933072383833, + "grad_norm": 0.17711707949638367, + "learning_rate": 0.00094255633074255, + "loss": 2.8193, + "step": 5483 + }, + { + "epoch": 0.1626189840761498, + "grad_norm": 0.17281188070774078, + "learning_rate": 0.0009425344323570145, + "loss": 2.802, + "step": 5484 + }, + { + "epoch": 0.16264863742846128, + "grad_norm": 0.14147230982780457, + "learning_rate": 0.0009425125300527609, + "loss": 2.8303, + "step": 5485 + }, + { + "epoch": 0.16267829078077276, + "grad_norm": 0.12868140637874603, + "learning_rate": 0.0009424906238299825, + "loss": 2.8453, + "step": 5486 + }, + { + "epoch": 0.16270794413308423, + "grad_norm": 0.1425371766090393, + "learning_rate": 0.0009424687136888739, + "loss": 2.8472, + "step": 5487 + }, + { + "epoch": 0.16273759748539574, + "grad_norm": 0.12801159918308258, + "learning_rate": 0.0009424467996296289, + "loss": 2.8407, + "step": 5488 + }, + { + "epoch": 0.1627672508377072, + "grad_norm": 0.13716530799865723, + "learning_rate": 0.0009424248816524415, + "loss": 2.7978, + "step": 5489 + }, + { + "epoch": 0.1627969041900187, + "grad_norm": 0.13817305862903595, + "learning_rate": 0.0009424029597575056, + "loss": 2.8165, + "step": 5490 + }, + { + "epoch": 0.16282655754233016, + "grad_norm": 0.14590759575366974, + "learning_rate": 0.0009423810339450158, + "loss": 2.8486, + "step": 5491 + }, + { + "epoch": 0.16285621089464164, + "grad_norm": 0.12832151353359222, + "learning_rate": 0.000942359104215166, + "loss": 2.8309, + "step": 5492 + }, + { + "epoch": 0.16288586424695312, + "grad_norm": 0.14424915611743927, + "learning_rate": 0.0009423371705681505, + "loss": 2.8271, + "step": 5493 + }, + { + "epoch": 0.1629155175992646, + "grad_norm": 0.14043109118938446, + "learning_rate": 0.0009423152330041634, + "loss": 2.8511, + "step": 5494 + }, + { + "epoch": 0.16294517095157607, + "grad_norm": 0.14899902045726776, + "learning_rate": 0.0009422932915233988, + "loss": 2.8363, + "step": 5495 + }, + { + "epoch": 0.16297482430388754, + "grad_norm": 0.1570662409067154, + "learning_rate": 0.0009422713461260513, + "loss": 2.8054, + "step": 5496 + }, + { + "epoch": 0.16300447765619902, + "grad_norm": 0.17445439100265503, + "learning_rate": 0.0009422493968123151, + "loss": 2.8234, + "step": 5497 + }, + { + "epoch": 0.16303413100851052, + "grad_norm": 0.17150673270225525, + "learning_rate": 0.0009422274435823846, + "loss": 2.8038, + "step": 5498 + }, + { + "epoch": 0.163063784360822, + "grad_norm": 0.16986486315727234, + "learning_rate": 0.0009422054864364542, + "loss": 2.8457, + "step": 5499 + }, + { + "epoch": 0.16309343771313348, + "grad_norm": 0.18594226241111755, + "learning_rate": 0.0009421835253747182, + "loss": 2.8187, + "step": 5500 + }, + { + "epoch": 0.16312309106544495, + "grad_norm": 0.20332957804203033, + "learning_rate": 0.0009421615603973713, + "loss": 2.8532, + "step": 5501 + }, + { + "epoch": 0.16315274441775643, + "grad_norm": 0.18662844598293304, + "learning_rate": 0.0009421395915046078, + "loss": 2.8045, + "step": 5502 + }, + { + "epoch": 0.1631823977700679, + "grad_norm": 0.17453818023204803, + "learning_rate": 0.0009421176186966224, + "loss": 2.8379, + "step": 5503 + }, + { + "epoch": 0.16321205112237938, + "grad_norm": 0.18055459856987, + "learning_rate": 0.0009420956419736096, + "loss": 2.8232, + "step": 5504 + }, + { + "epoch": 0.16324170447469086, + "grad_norm": 0.16538161039352417, + "learning_rate": 0.0009420736613357639, + "loss": 2.8217, + "step": 5505 + }, + { + "epoch": 0.16327135782700233, + "grad_norm": 0.13385286927223206, + "learning_rate": 0.0009420516767832802, + "loss": 2.8225, + "step": 5506 + }, + { + "epoch": 0.1633010111793138, + "grad_norm": 0.15226230025291443, + "learning_rate": 0.0009420296883163529, + "loss": 2.8078, + "step": 5507 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 0.1667647808790207, + "learning_rate": 0.0009420076959351769, + "loss": 2.8003, + "step": 5508 + }, + { + "epoch": 0.1633603178839368, + "grad_norm": 0.18208448588848114, + "learning_rate": 0.0009419856996399469, + "loss": 2.793, + "step": 5509 + }, + { + "epoch": 0.16338997123624827, + "grad_norm": 0.213812917470932, + "learning_rate": 0.0009419636994308576, + "loss": 2.8295, + "step": 5510 + }, + { + "epoch": 0.16341962458855974, + "grad_norm": 0.21156038343906403, + "learning_rate": 0.0009419416953081039, + "loss": 2.8174, + "step": 5511 + }, + { + "epoch": 0.16344927794087122, + "grad_norm": 0.18433287739753723, + "learning_rate": 0.0009419196872718807, + "loss": 2.837, + "step": 5512 + }, + { + "epoch": 0.1634789312931827, + "grad_norm": 0.17207685112953186, + "learning_rate": 0.0009418976753223827, + "loss": 2.841, + "step": 5513 + }, + { + "epoch": 0.16350858464549417, + "grad_norm": 0.1565212458372116, + "learning_rate": 0.000941875659459805, + "loss": 2.819, + "step": 5514 + }, + { + "epoch": 0.16353823799780565, + "grad_norm": 0.15495021641254425, + "learning_rate": 0.0009418536396843425, + "loss": 2.8202, + "step": 5515 + }, + { + "epoch": 0.16356789135011712, + "grad_norm": 0.1527581810951233, + "learning_rate": 0.0009418316159961901, + "loss": 2.8397, + "step": 5516 + }, + { + "epoch": 0.1635975447024286, + "grad_norm": 0.14998500049114227, + "learning_rate": 0.000941809588395543, + "loss": 2.8629, + "step": 5517 + }, + { + "epoch": 0.16362719805474008, + "grad_norm": 0.15946228802204132, + "learning_rate": 0.000941787556882596, + "loss": 2.8599, + "step": 5518 + }, + { + "epoch": 0.16365685140705158, + "grad_norm": 0.16957420110702515, + "learning_rate": 0.0009417655214575446, + "loss": 2.8119, + "step": 5519 + }, + { + "epoch": 0.16368650475936306, + "grad_norm": 0.16348059475421906, + "learning_rate": 0.0009417434821205835, + "loss": 2.8343, + "step": 5520 + }, + { + "epoch": 0.16371615811167453, + "grad_norm": 0.17833949625492096, + "learning_rate": 0.0009417214388719081, + "loss": 2.795, + "step": 5521 + }, + { + "epoch": 0.163745811463986, + "grad_norm": 0.14603637158870697, + "learning_rate": 0.0009416993917117136, + "loss": 2.8097, + "step": 5522 + }, + { + "epoch": 0.16377546481629748, + "grad_norm": 0.13438087701797485, + "learning_rate": 0.000941677340640195, + "loss": 2.8549, + "step": 5523 + }, + { + "epoch": 0.16380511816860896, + "grad_norm": 0.16900339722633362, + "learning_rate": 0.0009416552856575478, + "loss": 2.8522, + "step": 5524 + }, + { + "epoch": 0.16383477152092044, + "grad_norm": 0.14939157664775848, + "learning_rate": 0.0009416332267639673, + "loss": 2.8397, + "step": 5525 + }, + { + "epoch": 0.1638644248732319, + "grad_norm": 0.1584809422492981, + "learning_rate": 0.0009416111639596488, + "loss": 2.7981, + "step": 5526 + }, + { + "epoch": 0.1638940782255434, + "grad_norm": 0.17796297371387482, + "learning_rate": 0.0009415890972447876, + "loss": 2.8477, + "step": 5527 + }, + { + "epoch": 0.16392373157785486, + "grad_norm": 0.17822478711605072, + "learning_rate": 0.0009415670266195791, + "loss": 2.8415, + "step": 5528 + }, + { + "epoch": 0.16395338493016637, + "grad_norm": 0.1635701209306717, + "learning_rate": 0.0009415449520842188, + "loss": 2.8252, + "step": 5529 + }, + { + "epoch": 0.16398303828247784, + "grad_norm": 0.13957063853740692, + "learning_rate": 0.0009415228736389021, + "loss": 2.8509, + "step": 5530 + }, + { + "epoch": 0.16401269163478932, + "grad_norm": 0.1515013724565506, + "learning_rate": 0.0009415007912838247, + "loss": 2.8222, + "step": 5531 + }, + { + "epoch": 0.1640423449871008, + "grad_norm": 0.14698730409145355, + "learning_rate": 0.000941478705019182, + "loss": 2.8083, + "step": 5532 + }, + { + "epoch": 0.16407199833941227, + "grad_norm": 0.14452150464057922, + "learning_rate": 0.0009414566148451695, + "loss": 2.8507, + "step": 5533 + }, + { + "epoch": 0.16410165169172375, + "grad_norm": 0.14096368849277496, + "learning_rate": 0.000941434520761983, + "loss": 2.8297, + "step": 5534 + }, + { + "epoch": 0.16413130504403523, + "grad_norm": 0.15172742307186127, + "learning_rate": 0.0009414124227698179, + "loss": 2.8082, + "step": 5535 + }, + { + "epoch": 0.1641609583963467, + "grad_norm": 0.15511977672576904, + "learning_rate": 0.0009413903208688701, + "loss": 2.7982, + "step": 5536 + }, + { + "epoch": 0.16419061174865818, + "grad_norm": 0.1640634536743164, + "learning_rate": 0.0009413682150593352, + "loss": 2.8322, + "step": 5537 + }, + { + "epoch": 0.16422026510096965, + "grad_norm": 0.15980322659015656, + "learning_rate": 0.0009413461053414092, + "loss": 2.8183, + "step": 5538 + }, + { + "epoch": 0.16424991845328113, + "grad_norm": 0.15703411400318146, + "learning_rate": 0.0009413239917152875, + "loss": 2.8186, + "step": 5539 + }, + { + "epoch": 0.16427957180559263, + "grad_norm": 0.16659682989120483, + "learning_rate": 0.0009413018741811661, + "loss": 2.8295, + "step": 5540 + }, + { + "epoch": 0.1643092251579041, + "grad_norm": 0.16851378977298737, + "learning_rate": 0.0009412797527392409, + "loss": 2.8429, + "step": 5541 + }, + { + "epoch": 0.16433887851021559, + "grad_norm": 0.1965087652206421, + "learning_rate": 0.0009412576273897078, + "loss": 2.7961, + "step": 5542 + }, + { + "epoch": 0.16436853186252706, + "grad_norm": 0.18442828953266144, + "learning_rate": 0.0009412354981327626, + "loss": 2.839, + "step": 5543 + }, + { + "epoch": 0.16439818521483854, + "grad_norm": 0.17001624405384064, + "learning_rate": 0.0009412133649686012, + "loss": 2.8359, + "step": 5544 + }, + { + "epoch": 0.16442783856715001, + "grad_norm": 0.15891975164413452, + "learning_rate": 0.00094119122789742, + "loss": 2.8343, + "step": 5545 + }, + { + "epoch": 0.1644574919194615, + "grad_norm": 0.1654941886663437, + "learning_rate": 0.0009411690869194145, + "loss": 2.8133, + "step": 5546 + }, + { + "epoch": 0.16448714527177297, + "grad_norm": 0.15750016272068024, + "learning_rate": 0.0009411469420347811, + "loss": 2.7519, + "step": 5547 + }, + { + "epoch": 0.16451679862408444, + "grad_norm": 0.1699523776769638, + "learning_rate": 0.0009411247932437159, + "loss": 2.8509, + "step": 5548 + }, + { + "epoch": 0.16454645197639592, + "grad_norm": 0.158513143658638, + "learning_rate": 0.0009411026405464148, + "loss": 2.8223, + "step": 5549 + }, + { + "epoch": 0.16457610532870742, + "grad_norm": 0.16921491920948029, + "learning_rate": 0.0009410804839430743, + "loss": 2.8324, + "step": 5550 + }, + { + "epoch": 0.1646057586810189, + "grad_norm": 0.1661670207977295, + "learning_rate": 0.0009410583234338901, + "loss": 2.8496, + "step": 5551 + }, + { + "epoch": 0.16463541203333037, + "grad_norm": 0.13771621882915497, + "learning_rate": 0.0009410361590190589, + "loss": 2.8513, + "step": 5552 + }, + { + "epoch": 0.16466506538564185, + "grad_norm": 0.14012819528579712, + "learning_rate": 0.0009410139906987769, + "loss": 2.8027, + "step": 5553 + }, + { + "epoch": 0.16469471873795333, + "grad_norm": 0.1420125514268875, + "learning_rate": 0.0009409918184732402, + "loss": 2.8639, + "step": 5554 + }, + { + "epoch": 0.1647243720902648, + "grad_norm": 0.1437399536371231, + "learning_rate": 0.0009409696423426453, + "loss": 2.8257, + "step": 5555 + }, + { + "epoch": 0.16475402544257628, + "grad_norm": 0.14940601587295532, + "learning_rate": 0.0009409474623071885, + "loss": 2.814, + "step": 5556 + }, + { + "epoch": 0.16478367879488776, + "grad_norm": 0.16686105728149414, + "learning_rate": 0.0009409252783670662, + "loss": 2.8537, + "step": 5557 + }, + { + "epoch": 0.16481333214719923, + "grad_norm": 0.16972537338733673, + "learning_rate": 0.0009409030905224749, + "loss": 2.8453, + "step": 5558 + }, + { + "epoch": 0.1648429854995107, + "grad_norm": 0.17827871441841125, + "learning_rate": 0.000940880898773611, + "loss": 2.8251, + "step": 5559 + }, + { + "epoch": 0.1648726388518222, + "grad_norm": 0.1790693998336792, + "learning_rate": 0.0009408587031206712, + "loss": 2.8354, + "step": 5560 + }, + { + "epoch": 0.1649022922041337, + "grad_norm": 0.2013409584760666, + "learning_rate": 0.0009408365035638519, + "loss": 2.8306, + "step": 5561 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 0.17670470476150513, + "learning_rate": 0.0009408143001033496, + "loss": 2.8003, + "step": 5562 + }, + { + "epoch": 0.16496159890875664, + "grad_norm": 0.15607190132141113, + "learning_rate": 0.0009407920927393611, + "loss": 2.8314, + "step": 5563 + }, + { + "epoch": 0.16499125226106812, + "grad_norm": 0.16893669962882996, + "learning_rate": 0.0009407698814720829, + "loss": 2.8406, + "step": 5564 + }, + { + "epoch": 0.1650209056133796, + "grad_norm": 0.16279147565364838, + "learning_rate": 0.0009407476663017116, + "loss": 2.7856, + "step": 5565 + }, + { + "epoch": 0.16505055896569107, + "grad_norm": 0.15948735177516937, + "learning_rate": 0.0009407254472284444, + "loss": 2.8191, + "step": 5566 + }, + { + "epoch": 0.16508021231800254, + "grad_norm": 0.15649592876434326, + "learning_rate": 0.0009407032242524774, + "loss": 2.7991, + "step": 5567 + }, + { + "epoch": 0.16510986567031402, + "grad_norm": 0.1570655256509781, + "learning_rate": 0.0009406809973740078, + "loss": 2.8508, + "step": 5568 + }, + { + "epoch": 0.1651395190226255, + "grad_norm": 0.16031621396541595, + "learning_rate": 0.0009406587665932324, + "loss": 2.7947, + "step": 5569 + }, + { + "epoch": 0.16516917237493697, + "grad_norm": 0.14814743399620056, + "learning_rate": 0.0009406365319103479, + "loss": 2.8346, + "step": 5570 + }, + { + "epoch": 0.16519882572724848, + "grad_norm": 0.15182937681674957, + "learning_rate": 0.0009406142933255512, + "loss": 2.8601, + "step": 5571 + }, + { + "epoch": 0.16522847907955995, + "grad_norm": 0.14330536127090454, + "learning_rate": 0.0009405920508390395, + "loss": 2.7954, + "step": 5572 + }, + { + "epoch": 0.16525813243187143, + "grad_norm": 0.13255277276039124, + "learning_rate": 0.0009405698044510094, + "loss": 2.8245, + "step": 5573 + }, + { + "epoch": 0.1652877857841829, + "grad_norm": 0.14633023738861084, + "learning_rate": 0.0009405475541616582, + "loss": 2.8186, + "step": 5574 + }, + { + "epoch": 0.16531743913649438, + "grad_norm": 0.15296624600887299, + "learning_rate": 0.0009405252999711828, + "loss": 2.8239, + "step": 5575 + }, + { + "epoch": 0.16534709248880586, + "grad_norm": 0.15408027172088623, + "learning_rate": 0.0009405030418797802, + "loss": 2.7986, + "step": 5576 + }, + { + "epoch": 0.16537674584111733, + "grad_norm": 0.14484573900699615, + "learning_rate": 0.0009404807798876475, + "loss": 2.821, + "step": 5577 + }, + { + "epoch": 0.1654063991934288, + "grad_norm": 0.1433567851781845, + "learning_rate": 0.0009404585139949819, + "loss": 2.8398, + "step": 5578 + }, + { + "epoch": 0.1654360525457403, + "grad_norm": 0.1463821530342102, + "learning_rate": 0.0009404362442019805, + "loss": 2.8173, + "step": 5579 + }, + { + "epoch": 0.16546570589805176, + "grad_norm": 0.16017934679985046, + "learning_rate": 0.0009404139705088407, + "loss": 2.8353, + "step": 5580 + }, + { + "epoch": 0.16549535925036327, + "grad_norm": 0.19526751339435577, + "learning_rate": 0.0009403916929157594, + "loss": 2.7894, + "step": 5581 + }, + { + "epoch": 0.16552501260267474, + "grad_norm": 0.2199636846780777, + "learning_rate": 0.0009403694114229343, + "loss": 2.8478, + "step": 5582 + }, + { + "epoch": 0.16555466595498622, + "grad_norm": 0.23111259937286377, + "learning_rate": 0.0009403471260305624, + "loss": 2.8169, + "step": 5583 + }, + { + "epoch": 0.1655843193072977, + "grad_norm": 0.21700282394886017, + "learning_rate": 0.0009403248367388411, + "loss": 2.8195, + "step": 5584 + }, + { + "epoch": 0.16561397265960917, + "grad_norm": 0.17215122282505035, + "learning_rate": 0.0009403025435479678, + "loss": 2.8131, + "step": 5585 + }, + { + "epoch": 0.16564362601192065, + "grad_norm": 0.15693607926368713, + "learning_rate": 0.0009402802464581397, + "loss": 2.8398, + "step": 5586 + }, + { + "epoch": 0.16567327936423212, + "grad_norm": 0.1599828600883484, + "learning_rate": 0.0009402579454695547, + "loss": 2.8346, + "step": 5587 + }, + { + "epoch": 0.1657029327165436, + "grad_norm": 0.15924693644046783, + "learning_rate": 0.0009402356405824099, + "loss": 2.8327, + "step": 5588 + }, + { + "epoch": 0.16573258606885508, + "grad_norm": 0.18057133257389069, + "learning_rate": 0.0009402133317969031, + "loss": 2.8585, + "step": 5589 + }, + { + "epoch": 0.16576223942116655, + "grad_norm": 0.17383596301078796, + "learning_rate": 0.0009401910191132314, + "loss": 2.8087, + "step": 5590 + }, + { + "epoch": 0.16579189277347803, + "grad_norm": 0.16797399520874023, + "learning_rate": 0.0009401687025315928, + "loss": 2.7859, + "step": 5591 + }, + { + "epoch": 0.16582154612578953, + "grad_norm": 0.15187004208564758, + "learning_rate": 0.0009401463820521849, + "loss": 2.843, + "step": 5592 + }, + { + "epoch": 0.165851199478101, + "grad_norm": 0.11895740032196045, + "learning_rate": 0.0009401240576752052, + "loss": 2.8216, + "step": 5593 + }, + { + "epoch": 0.16588085283041248, + "grad_norm": 0.13477419316768646, + "learning_rate": 0.0009401017294008514, + "loss": 2.8337, + "step": 5594 + }, + { + "epoch": 0.16591050618272396, + "grad_norm": 0.13261625170707703, + "learning_rate": 0.0009400793972293211, + "loss": 2.8496, + "step": 5595 + }, + { + "epoch": 0.16594015953503544, + "grad_norm": 0.1422228068113327, + "learning_rate": 0.0009400570611608123, + "loss": 2.7941, + "step": 5596 + }, + { + "epoch": 0.1659698128873469, + "grad_norm": 0.14000177383422852, + "learning_rate": 0.0009400347211955226, + "loss": 2.8086, + "step": 5597 + }, + { + "epoch": 0.1659994662396584, + "grad_norm": 0.13281632959842682, + "learning_rate": 0.0009400123773336502, + "loss": 2.7989, + "step": 5598 + }, + { + "epoch": 0.16602911959196986, + "grad_norm": 0.12887321412563324, + "learning_rate": 0.0009399900295753925, + "loss": 2.8214, + "step": 5599 + }, + { + "epoch": 0.16605877294428134, + "grad_norm": 0.13738445937633514, + "learning_rate": 0.0009399676779209473, + "loss": 2.8343, + "step": 5600 + }, + { + "epoch": 0.16608842629659282, + "grad_norm": 0.1488957703113556, + "learning_rate": 0.0009399453223705132, + "loss": 2.8132, + "step": 5601 + }, + { + "epoch": 0.16611807964890432, + "grad_norm": 0.1577356904745102, + "learning_rate": 0.0009399229629242876, + "loss": 2.8325, + "step": 5602 + }, + { + "epoch": 0.1661477330012158, + "grad_norm": 0.16745921969413757, + "learning_rate": 0.0009399005995824687, + "loss": 2.8364, + "step": 5603 + }, + { + "epoch": 0.16617738635352727, + "grad_norm": 0.18070003390312195, + "learning_rate": 0.0009398782323452544, + "loss": 2.7999, + "step": 5604 + }, + { + "epoch": 0.16620703970583875, + "grad_norm": 0.20369014143943787, + "learning_rate": 0.000939855861212843, + "loss": 2.8205, + "step": 5605 + }, + { + "epoch": 0.16623669305815023, + "grad_norm": 0.21617437899112701, + "learning_rate": 0.0009398334861854322, + "loss": 2.8212, + "step": 5606 + }, + { + "epoch": 0.1662663464104617, + "grad_norm": 0.18841207027435303, + "learning_rate": 0.0009398111072632205, + "loss": 2.8522, + "step": 5607 + }, + { + "epoch": 0.16629599976277318, + "grad_norm": 0.16759036481380463, + "learning_rate": 0.0009397887244464061, + "loss": 2.8387, + "step": 5608 + }, + { + "epoch": 0.16632565311508465, + "grad_norm": 0.1937573254108429, + "learning_rate": 0.0009397663377351868, + "loss": 2.8269, + "step": 5609 + }, + { + "epoch": 0.16635530646739613, + "grad_norm": 0.19871218502521515, + "learning_rate": 0.0009397439471297613, + "loss": 2.813, + "step": 5610 + }, + { + "epoch": 0.1663849598197076, + "grad_norm": 0.18179449439048767, + "learning_rate": 0.0009397215526303276, + "loss": 2.7858, + "step": 5611 + }, + { + "epoch": 0.1664146131720191, + "grad_norm": 0.18961527943611145, + "learning_rate": 0.0009396991542370839, + "loss": 2.8145, + "step": 5612 + }, + { + "epoch": 0.1664442665243306, + "grad_norm": 0.18985389173030853, + "learning_rate": 0.0009396767519502289, + "loss": 2.8569, + "step": 5613 + }, + { + "epoch": 0.16647391987664206, + "grad_norm": 0.16309143602848053, + "learning_rate": 0.0009396543457699609, + "loss": 2.8329, + "step": 5614 + }, + { + "epoch": 0.16650357322895354, + "grad_norm": 0.1592158079147339, + "learning_rate": 0.000939631935696478, + "loss": 2.8061, + "step": 5615 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 0.1500914990901947, + "learning_rate": 0.000939609521729979, + "loss": 2.8148, + "step": 5616 + }, + { + "epoch": 0.1665628799335765, + "grad_norm": 0.16647180914878845, + "learning_rate": 0.000939587103870662, + "loss": 2.813, + "step": 5617 + }, + { + "epoch": 0.16659253328588797, + "grad_norm": 0.16503065824508667, + "learning_rate": 0.0009395646821187259, + "loss": 2.8289, + "step": 5618 + }, + { + "epoch": 0.16662218663819944, + "grad_norm": 0.15698838233947754, + "learning_rate": 0.0009395422564743691, + "loss": 2.8378, + "step": 5619 + }, + { + "epoch": 0.16665183999051092, + "grad_norm": 0.14956003427505493, + "learning_rate": 0.0009395198269377901, + "loss": 2.8416, + "step": 5620 + }, + { + "epoch": 0.1666814933428224, + "grad_norm": 0.13599784672260284, + "learning_rate": 0.0009394973935091878, + "loss": 2.8504, + "step": 5621 + }, + { + "epoch": 0.16671114669513387, + "grad_norm": 0.1384897232055664, + "learning_rate": 0.0009394749561887604, + "loss": 2.8317, + "step": 5622 + }, + { + "epoch": 0.16674080004744538, + "grad_norm": 0.14258837699890137, + "learning_rate": 0.0009394525149767068, + "loss": 2.7975, + "step": 5623 + }, + { + "epoch": 0.16677045339975685, + "grad_norm": 0.1464400440454483, + "learning_rate": 0.0009394300698732259, + "loss": 2.803, + "step": 5624 + }, + { + "epoch": 0.16680010675206833, + "grad_norm": 0.14490114152431488, + "learning_rate": 0.0009394076208785163, + "loss": 2.7867, + "step": 5625 + }, + { + "epoch": 0.1668297601043798, + "grad_norm": 0.14480240643024445, + "learning_rate": 0.0009393851679927767, + "loss": 2.8125, + "step": 5626 + }, + { + "epoch": 0.16685941345669128, + "grad_norm": 0.14668038487434387, + "learning_rate": 0.0009393627112162061, + "loss": 2.8407, + "step": 5627 + }, + { + "epoch": 0.16688906680900276, + "grad_norm": 0.14106349647045135, + "learning_rate": 0.0009393402505490032, + "loss": 2.814, + "step": 5628 + }, + { + "epoch": 0.16691872016131423, + "grad_norm": 0.1648385226726532, + "learning_rate": 0.0009393177859913671, + "loss": 2.8106, + "step": 5629 + }, + { + "epoch": 0.1669483735136257, + "grad_norm": 0.16046887636184692, + "learning_rate": 0.0009392953175434964, + "loss": 2.8139, + "step": 5630 + }, + { + "epoch": 0.16697802686593718, + "grad_norm": 0.1492973119020462, + "learning_rate": 0.0009392728452055904, + "loss": 2.8128, + "step": 5631 + }, + { + "epoch": 0.16700768021824866, + "grad_norm": 0.2186843603849411, + "learning_rate": 0.000939250368977848, + "loss": 2.8105, + "step": 5632 + }, + { + "epoch": 0.16703733357056016, + "grad_norm": 0.26847997307777405, + "learning_rate": 0.000939227888860468, + "loss": 2.8454, + "step": 5633 + }, + { + "epoch": 0.16706698692287164, + "grad_norm": 0.20117700099945068, + "learning_rate": 0.0009392054048536498, + "loss": 2.775, + "step": 5634 + }, + { + "epoch": 0.16709664027518312, + "grad_norm": 0.1667526215314865, + "learning_rate": 0.0009391829169575924, + "loss": 2.8242, + "step": 5635 + }, + { + "epoch": 0.1671262936274946, + "grad_norm": 0.19203965365886688, + "learning_rate": 0.0009391604251724947, + "loss": 2.8205, + "step": 5636 + }, + { + "epoch": 0.16715594697980607, + "grad_norm": 0.21011556684970856, + "learning_rate": 0.0009391379294985563, + "loss": 2.7979, + "step": 5637 + }, + { + "epoch": 0.16718560033211755, + "grad_norm": 0.18866966664791107, + "learning_rate": 0.0009391154299359758, + "loss": 2.8222, + "step": 5638 + }, + { + "epoch": 0.16721525368442902, + "grad_norm": 0.1378319263458252, + "learning_rate": 0.0009390929264849532, + "loss": 2.8532, + "step": 5639 + }, + { + "epoch": 0.1672449070367405, + "grad_norm": 0.15321306884288788, + "learning_rate": 0.0009390704191456871, + "loss": 2.826, + "step": 5640 + }, + { + "epoch": 0.16727456038905197, + "grad_norm": 0.15122702717781067, + "learning_rate": 0.0009390479079183771, + "loss": 2.7729, + "step": 5641 + }, + { + "epoch": 0.16730421374136345, + "grad_norm": 0.13017234206199646, + "learning_rate": 0.0009390253928032226, + "loss": 2.8051, + "step": 5642 + }, + { + "epoch": 0.16733386709367493, + "grad_norm": 0.13211163878440857, + "learning_rate": 0.0009390028738004228, + "loss": 2.8522, + "step": 5643 + }, + { + "epoch": 0.16736352044598643, + "grad_norm": 0.14228098094463348, + "learning_rate": 0.0009389803509101773, + "loss": 2.8487, + "step": 5644 + }, + { + "epoch": 0.1673931737982979, + "grad_norm": 0.12162463366985321, + "learning_rate": 0.0009389578241326855, + "loss": 2.7903, + "step": 5645 + }, + { + "epoch": 0.16742282715060938, + "grad_norm": 0.13609765470027924, + "learning_rate": 0.0009389352934681467, + "loss": 2.8288, + "step": 5646 + }, + { + "epoch": 0.16745248050292086, + "grad_norm": 0.15532757341861725, + "learning_rate": 0.0009389127589167606, + "loss": 2.8628, + "step": 5647 + }, + { + "epoch": 0.16748213385523233, + "grad_norm": 0.1717224419116974, + "learning_rate": 0.0009388902204787265, + "loss": 2.8416, + "step": 5648 + }, + { + "epoch": 0.1675117872075438, + "grad_norm": 0.16261056065559387, + "learning_rate": 0.0009388676781542443, + "loss": 2.8027, + "step": 5649 + }, + { + "epoch": 0.1675414405598553, + "grad_norm": 0.1438755989074707, + "learning_rate": 0.0009388451319435135, + "loss": 2.8223, + "step": 5650 + }, + { + "epoch": 0.16757109391216676, + "grad_norm": 0.13570639491081238, + "learning_rate": 0.0009388225818467337, + "loss": 2.8109, + "step": 5651 + }, + { + "epoch": 0.16760074726447824, + "grad_norm": 0.1344454288482666, + "learning_rate": 0.0009388000278641046, + "loss": 2.8293, + "step": 5652 + }, + { + "epoch": 0.16763040061678972, + "grad_norm": 0.1665033996105194, + "learning_rate": 0.000938777469995826, + "loss": 2.852, + "step": 5653 + }, + { + "epoch": 0.16766005396910122, + "grad_norm": 0.17521177232265472, + "learning_rate": 0.0009387549082420975, + "loss": 2.7935, + "step": 5654 + }, + { + "epoch": 0.1676897073214127, + "grad_norm": 0.14615637063980103, + "learning_rate": 0.000938732342603119, + "loss": 2.8453, + "step": 5655 + }, + { + "epoch": 0.16771936067372417, + "grad_norm": 0.1477258801460266, + "learning_rate": 0.0009387097730790904, + "loss": 2.8156, + "step": 5656 + }, + { + "epoch": 0.16774901402603565, + "grad_norm": 0.15181533992290497, + "learning_rate": 0.0009386871996702114, + "loss": 2.8353, + "step": 5657 + }, + { + "epoch": 0.16777866737834712, + "grad_norm": 0.1977846622467041, + "learning_rate": 0.0009386646223766818, + "loss": 2.7929, + "step": 5658 + }, + { + "epoch": 0.1678083207306586, + "grad_norm": 0.6999161243438721, + "learning_rate": 0.0009386420411987017, + "loss": 2.8003, + "step": 5659 + }, + { + "epoch": 0.16783797408297008, + "grad_norm": 0.3111397325992584, + "learning_rate": 0.0009386194561364712, + "loss": 2.8689, + "step": 5660 + }, + { + "epoch": 0.16786762743528155, + "grad_norm": 0.2570805847644806, + "learning_rate": 0.0009385968671901901, + "loss": 2.8456, + "step": 5661 + }, + { + "epoch": 0.16789728078759303, + "grad_norm": 0.29452627897262573, + "learning_rate": 0.0009385742743600584, + "loss": 2.8224, + "step": 5662 + }, + { + "epoch": 0.1679269341399045, + "grad_norm": 0.27764490246772766, + "learning_rate": 0.0009385516776462761, + "loss": 2.8582, + "step": 5663 + }, + { + "epoch": 0.167956587492216, + "grad_norm": 0.27824655175209045, + "learning_rate": 0.0009385290770490437, + "loss": 2.8744, + "step": 5664 + }, + { + "epoch": 0.16798624084452748, + "grad_norm": 0.29515230655670166, + "learning_rate": 0.0009385064725685608, + "loss": 2.8247, + "step": 5665 + }, + { + "epoch": 0.16801589419683896, + "grad_norm": 0.2766202390193939, + "learning_rate": 0.000938483864205028, + "loss": 2.8505, + "step": 5666 + }, + { + "epoch": 0.16804554754915044, + "grad_norm": 0.19570975005626678, + "learning_rate": 0.0009384612519586453, + "loss": 2.8062, + "step": 5667 + }, + { + "epoch": 0.1680752009014619, + "grad_norm": 0.17164066433906555, + "learning_rate": 0.0009384386358296128, + "loss": 2.8597, + "step": 5668 + }, + { + "epoch": 0.1681048542537734, + "grad_norm": 0.18719516694545746, + "learning_rate": 0.000938416015818131, + "loss": 2.8601, + "step": 5669 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 0.1635117083787918, + "learning_rate": 0.0009383933919244001, + "loss": 2.8357, + "step": 5670 + }, + { + "epoch": 0.16816416095839634, + "grad_norm": 0.17360857129096985, + "learning_rate": 0.0009383707641486206, + "loss": 2.8284, + "step": 5671 + }, + { + "epoch": 0.16819381431070782, + "grad_norm": 0.15752458572387695, + "learning_rate": 0.0009383481324909926, + "loss": 2.8264, + "step": 5672 + }, + { + "epoch": 0.1682234676630193, + "grad_norm": 0.15068082511425018, + "learning_rate": 0.0009383254969517167, + "loss": 2.8414, + "step": 5673 + }, + { + "epoch": 0.16825312101533077, + "grad_norm": 0.14932821691036224, + "learning_rate": 0.0009383028575309932, + "loss": 2.7797, + "step": 5674 + }, + { + "epoch": 0.16828277436764227, + "grad_norm": 0.14127889275550842, + "learning_rate": 0.0009382802142290228, + "loss": 2.8171, + "step": 5675 + }, + { + "epoch": 0.16831242771995375, + "grad_norm": 0.11982176452875137, + "learning_rate": 0.0009382575670460057, + "loss": 2.8364, + "step": 5676 + }, + { + "epoch": 0.16834208107226523, + "grad_norm": 0.1239599958062172, + "learning_rate": 0.0009382349159821428, + "loss": 2.799, + "step": 5677 + }, + { + "epoch": 0.1683717344245767, + "grad_norm": 0.12463834881782532, + "learning_rate": 0.0009382122610376344, + "loss": 2.8612, + "step": 5678 + }, + { + "epoch": 0.16840138777688818, + "grad_norm": 0.12452258914709091, + "learning_rate": 0.0009381896022126813, + "loss": 2.8421, + "step": 5679 + }, + { + "epoch": 0.16843104112919965, + "grad_norm": 0.13516339659690857, + "learning_rate": 0.0009381669395074839, + "loss": 2.8339, + "step": 5680 + }, + { + "epoch": 0.16846069448151113, + "grad_norm": 0.14795683324337006, + "learning_rate": 0.0009381442729222431, + "loss": 2.825, + "step": 5681 + }, + { + "epoch": 0.1684903478338226, + "grad_norm": 0.1421336978673935, + "learning_rate": 0.0009381216024571596, + "loss": 2.8288, + "step": 5682 + }, + { + "epoch": 0.16852000118613408, + "grad_norm": 0.12862233817577362, + "learning_rate": 0.0009380989281124342, + "loss": 2.8265, + "step": 5683 + }, + { + "epoch": 0.16854965453844556, + "grad_norm": 0.13591210544109344, + "learning_rate": 0.0009380762498882673, + "loss": 2.8492, + "step": 5684 + }, + { + "epoch": 0.16857930789075706, + "grad_norm": 0.13141632080078125, + "learning_rate": 0.0009380535677848603, + "loss": 2.8364, + "step": 5685 + }, + { + "epoch": 0.16860896124306854, + "grad_norm": 0.12200366705656052, + "learning_rate": 0.0009380308818024137, + "loss": 2.8534, + "step": 5686 + }, + { + "epoch": 0.16863861459538002, + "grad_norm": 0.11832255870103836, + "learning_rate": 0.0009380081919411284, + "loss": 2.7965, + "step": 5687 + }, + { + "epoch": 0.1686682679476915, + "grad_norm": 0.12049256265163422, + "learning_rate": 0.0009379854982012053, + "loss": 2.789, + "step": 5688 + }, + { + "epoch": 0.16869792130000297, + "grad_norm": 0.13367925584316254, + "learning_rate": 0.0009379628005828455, + "loss": 2.7855, + "step": 5689 + }, + { + "epoch": 0.16872757465231444, + "grad_norm": 0.1583154797554016, + "learning_rate": 0.0009379400990862501, + "loss": 2.8125, + "step": 5690 + }, + { + "epoch": 0.16875722800462592, + "grad_norm": 0.17680159211158752, + "learning_rate": 0.0009379173937116198, + "loss": 2.7902, + "step": 5691 + }, + { + "epoch": 0.1687868813569374, + "grad_norm": 0.18953700363636017, + "learning_rate": 0.0009378946844591558, + "loss": 2.7928, + "step": 5692 + }, + { + "epoch": 0.16881653470924887, + "grad_norm": 0.17761555314064026, + "learning_rate": 0.0009378719713290592, + "loss": 2.8139, + "step": 5693 + }, + { + "epoch": 0.16884618806156035, + "grad_norm": 0.16743218898773193, + "learning_rate": 0.0009378492543215311, + "loss": 2.826, + "step": 5694 + }, + { + "epoch": 0.16887584141387182, + "grad_norm": 0.14818130433559418, + "learning_rate": 0.0009378265334367728, + "loss": 2.8045, + "step": 5695 + }, + { + "epoch": 0.16890549476618333, + "grad_norm": 0.1558290719985962, + "learning_rate": 0.0009378038086749853, + "loss": 2.8119, + "step": 5696 + }, + { + "epoch": 0.1689351481184948, + "grad_norm": 0.1643151491880417, + "learning_rate": 0.00093778108003637, + "loss": 2.8307, + "step": 5697 + }, + { + "epoch": 0.16896480147080628, + "grad_norm": 0.15307825803756714, + "learning_rate": 0.0009377583475211281, + "loss": 2.807, + "step": 5698 + }, + { + "epoch": 0.16899445482311776, + "grad_norm": 0.15773917734622955, + "learning_rate": 0.0009377356111294608, + "loss": 2.801, + "step": 5699 + }, + { + "epoch": 0.16902410817542923, + "grad_norm": 0.14216221868991852, + "learning_rate": 0.0009377128708615696, + "loss": 2.8319, + "step": 5700 + }, + { + "epoch": 0.1690537615277407, + "grad_norm": 0.1320670247077942, + "learning_rate": 0.0009376901267176558, + "loss": 2.7995, + "step": 5701 + }, + { + "epoch": 0.16908341488005219, + "grad_norm": 0.1350332498550415, + "learning_rate": 0.0009376673786979209, + "loss": 2.8148, + "step": 5702 + }, + { + "epoch": 0.16911306823236366, + "grad_norm": 0.1538075953722, + "learning_rate": 0.000937644626802566, + "loss": 2.8182, + "step": 5703 + }, + { + "epoch": 0.16914272158467514, + "grad_norm": 0.1677468866109848, + "learning_rate": 0.0009376218710317929, + "loss": 2.8444, + "step": 5704 + }, + { + "epoch": 0.1691723749369866, + "grad_norm": 0.1654919981956482, + "learning_rate": 0.0009375991113858031, + "loss": 2.8087, + "step": 5705 + }, + { + "epoch": 0.16920202828929812, + "grad_norm": 0.17622928321361542, + "learning_rate": 0.000937576347864798, + "loss": 2.8556, + "step": 5706 + }, + { + "epoch": 0.1692316816416096, + "grad_norm": 0.18518483638763428, + "learning_rate": 0.0009375535804689792, + "loss": 2.8355, + "step": 5707 + }, + { + "epoch": 0.16926133499392107, + "grad_norm": 0.20471936464309692, + "learning_rate": 0.0009375308091985483, + "loss": 2.8111, + "step": 5708 + }, + { + "epoch": 0.16929098834623255, + "grad_norm": 0.17928707599639893, + "learning_rate": 0.0009375080340537072, + "loss": 2.8122, + "step": 5709 + }, + { + "epoch": 0.16932064169854402, + "grad_norm": 0.15118929743766785, + "learning_rate": 0.0009374852550346572, + "loss": 2.8603, + "step": 5710 + }, + { + "epoch": 0.1693502950508555, + "grad_norm": 0.16285394132137299, + "learning_rate": 0.0009374624721416001, + "loss": 2.7758, + "step": 5711 + }, + { + "epoch": 0.16937994840316697, + "grad_norm": 0.15815916657447815, + "learning_rate": 0.0009374396853747378, + "loss": 2.8319, + "step": 5712 + }, + { + "epoch": 0.16940960175547845, + "grad_norm": 0.14319367706775665, + "learning_rate": 0.0009374168947342721, + "loss": 2.8023, + "step": 5713 + }, + { + "epoch": 0.16943925510778993, + "grad_norm": 0.14518587291240692, + "learning_rate": 0.0009373941002204046, + "loss": 2.8197, + "step": 5714 + }, + { + "epoch": 0.1694689084601014, + "grad_norm": 0.15266835689544678, + "learning_rate": 0.0009373713018333373, + "loss": 2.823, + "step": 5715 + }, + { + "epoch": 0.1694985618124129, + "grad_norm": 0.15601779520511627, + "learning_rate": 0.000937348499573272, + "loss": 2.8341, + "step": 5716 + }, + { + "epoch": 0.16952821516472438, + "grad_norm": 0.16412559151649475, + "learning_rate": 0.0009373256934404107, + "loss": 2.842, + "step": 5717 + }, + { + "epoch": 0.16955786851703586, + "grad_norm": 0.16297130286693573, + "learning_rate": 0.0009373028834349554, + "loss": 2.8233, + "step": 5718 + }, + { + "epoch": 0.16958752186934734, + "grad_norm": 0.1564776748418808, + "learning_rate": 0.000937280069557108, + "loss": 2.8109, + "step": 5719 + }, + { + "epoch": 0.1696171752216588, + "grad_norm": 0.16743190586566925, + "learning_rate": 0.0009372572518070704, + "loss": 2.8215, + "step": 5720 + }, + { + "epoch": 0.1696468285739703, + "grad_norm": 0.17091825604438782, + "learning_rate": 0.0009372344301850448, + "loss": 2.8213, + "step": 5721 + }, + { + "epoch": 0.16967648192628176, + "grad_norm": 0.17861692607402802, + "learning_rate": 0.0009372116046912334, + "loss": 2.825, + "step": 5722 + }, + { + "epoch": 0.16970613527859324, + "grad_norm": 0.19086787104606628, + "learning_rate": 0.0009371887753258379, + "loss": 2.7954, + "step": 5723 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 0.1883491575717926, + "learning_rate": 0.0009371659420890611, + "loss": 2.8231, + "step": 5724 + }, + { + "epoch": 0.1697654419832162, + "grad_norm": 0.19186215102672577, + "learning_rate": 0.0009371431049811046, + "loss": 2.8176, + "step": 5725 + }, + { + "epoch": 0.16979509533552767, + "grad_norm": 0.1911315768957138, + "learning_rate": 0.000937120264002171, + "loss": 2.8238, + "step": 5726 + }, + { + "epoch": 0.16982474868783917, + "grad_norm": 0.17076687514781952, + "learning_rate": 0.0009370974191524624, + "loss": 2.781, + "step": 5727 + }, + { + "epoch": 0.16985440204015065, + "grad_norm": 0.15746740996837616, + "learning_rate": 0.0009370745704321812, + "loss": 2.8615, + "step": 5728 + }, + { + "epoch": 0.16988405539246212, + "grad_norm": 0.175408735871315, + "learning_rate": 0.0009370517178415295, + "loss": 2.8432, + "step": 5729 + }, + { + "epoch": 0.1699137087447736, + "grad_norm": 0.15500202775001526, + "learning_rate": 0.0009370288613807098, + "loss": 2.8388, + "step": 5730 + }, + { + "epoch": 0.16994336209708508, + "grad_norm": 0.15511725842952728, + "learning_rate": 0.0009370060010499247, + "loss": 2.8043, + "step": 5731 + }, + { + "epoch": 0.16997301544939655, + "grad_norm": 0.16031746566295624, + "learning_rate": 0.0009369831368493764, + "loss": 2.8128, + "step": 5732 + }, + { + "epoch": 0.17000266880170803, + "grad_norm": 0.15012294054031372, + "learning_rate": 0.0009369602687792673, + "loss": 2.8442, + "step": 5733 + }, + { + "epoch": 0.1700323221540195, + "grad_norm": 0.13456258177757263, + "learning_rate": 0.0009369373968398002, + "loss": 2.8266, + "step": 5734 + }, + { + "epoch": 0.17006197550633098, + "grad_norm": 0.1357855647802353, + "learning_rate": 0.0009369145210311774, + "loss": 2.8194, + "step": 5735 + }, + { + "epoch": 0.17009162885864246, + "grad_norm": 0.13497686386108398, + "learning_rate": 0.0009368916413536014, + "loss": 2.8124, + "step": 5736 + }, + { + "epoch": 0.17012128221095396, + "grad_norm": 0.12129607051610947, + "learning_rate": 0.000936868757807275, + "loss": 2.8349, + "step": 5737 + }, + { + "epoch": 0.17015093556326544, + "grad_norm": 0.1288996785879135, + "learning_rate": 0.0009368458703924008, + "loss": 2.7772, + "step": 5738 + }, + { + "epoch": 0.1701805889155769, + "grad_norm": 0.12002832442522049, + "learning_rate": 0.0009368229791091813, + "loss": 2.8167, + "step": 5739 + }, + { + "epoch": 0.1702102422678884, + "grad_norm": 0.13707920908927917, + "learning_rate": 0.0009368000839578194, + "loss": 2.8289, + "step": 5740 + }, + { + "epoch": 0.17023989562019987, + "grad_norm": 0.14377295970916748, + "learning_rate": 0.0009367771849385178, + "loss": 2.8101, + "step": 5741 + }, + { + "epoch": 0.17026954897251134, + "grad_norm": 0.13792169094085693, + "learning_rate": 0.0009367542820514794, + "loss": 2.8414, + "step": 5742 + }, + { + "epoch": 0.17029920232482282, + "grad_norm": 0.13510774075984955, + "learning_rate": 0.0009367313752969066, + "loss": 2.8302, + "step": 5743 + }, + { + "epoch": 0.1703288556771343, + "grad_norm": 0.14271195232868195, + "learning_rate": 0.0009367084646750029, + "loss": 2.8312, + "step": 5744 + }, + { + "epoch": 0.17035850902944577, + "grad_norm": 0.17629042267799377, + "learning_rate": 0.0009366855501859704, + "loss": 2.8242, + "step": 5745 + }, + { + "epoch": 0.17038816238175725, + "grad_norm": 0.1791563332080841, + "learning_rate": 0.0009366626318300125, + "loss": 2.8231, + "step": 5746 + }, + { + "epoch": 0.17041781573406872, + "grad_norm": 0.1527532935142517, + "learning_rate": 0.0009366397096073321, + "loss": 2.8416, + "step": 5747 + }, + { + "epoch": 0.17044746908638023, + "grad_norm": 0.18798001110553741, + "learning_rate": 0.000936616783518132, + "loss": 2.8197, + "step": 5748 + }, + { + "epoch": 0.1704771224386917, + "grad_norm": 0.21023549139499664, + "learning_rate": 0.0009365938535626155, + "loss": 2.8581, + "step": 5749 + }, + { + "epoch": 0.17050677579100318, + "grad_norm": 0.2147420346736908, + "learning_rate": 0.0009365709197409854, + "loss": 2.8283, + "step": 5750 + }, + { + "epoch": 0.17053642914331466, + "grad_norm": 0.2591547667980194, + "learning_rate": 0.0009365479820534448, + "loss": 2.8061, + "step": 5751 + }, + { + "epoch": 0.17056608249562613, + "grad_norm": 0.22847320139408112, + "learning_rate": 0.0009365250405001971, + "loss": 2.8065, + "step": 5752 + }, + { + "epoch": 0.1705957358479376, + "grad_norm": 0.2006351351737976, + "learning_rate": 0.000936502095081445, + "loss": 2.7933, + "step": 5753 + }, + { + "epoch": 0.17062538920024908, + "grad_norm": 0.16714638471603394, + "learning_rate": 0.000936479145797392, + "loss": 2.8193, + "step": 5754 + }, + { + "epoch": 0.17065504255256056, + "grad_norm": 0.14750376343727112, + "learning_rate": 0.0009364561926482413, + "loss": 2.8409, + "step": 5755 + }, + { + "epoch": 0.17068469590487204, + "grad_norm": 0.1684347540140152, + "learning_rate": 0.0009364332356341962, + "loss": 2.8457, + "step": 5756 + }, + { + "epoch": 0.1707143492571835, + "grad_norm": 0.15864898264408112, + "learning_rate": 0.0009364102747554597, + "loss": 2.8118, + "step": 5757 + }, + { + "epoch": 0.17074400260949502, + "grad_norm": 0.14688071608543396, + "learning_rate": 0.0009363873100122353, + "loss": 2.842, + "step": 5758 + }, + { + "epoch": 0.1707736559618065, + "grad_norm": 0.15055280923843384, + "learning_rate": 0.0009363643414047265, + "loss": 2.8182, + "step": 5759 + }, + { + "epoch": 0.17080330931411797, + "grad_norm": 0.13949713110923767, + "learning_rate": 0.0009363413689331365, + "loss": 2.8001, + "step": 5760 + }, + { + "epoch": 0.17083296266642944, + "grad_norm": 0.14345334470272064, + "learning_rate": 0.0009363183925976687, + "loss": 2.7965, + "step": 5761 + }, + { + "epoch": 0.17086261601874092, + "grad_norm": 0.1619623452425003, + "learning_rate": 0.0009362954123985268, + "loss": 2.8131, + "step": 5762 + }, + { + "epoch": 0.1708922693710524, + "grad_norm": 0.13182039558887482, + "learning_rate": 0.000936272428335914, + "loss": 2.8273, + "step": 5763 + }, + { + "epoch": 0.17092192272336387, + "grad_norm": 0.13356615602970123, + "learning_rate": 0.0009362494404100339, + "loss": 2.8097, + "step": 5764 + }, + { + "epoch": 0.17095157607567535, + "grad_norm": 0.14688874781131744, + "learning_rate": 0.0009362264486210903, + "loss": 2.7766, + "step": 5765 + }, + { + "epoch": 0.17098122942798682, + "grad_norm": 0.13954977691173553, + "learning_rate": 0.0009362034529692866, + "loss": 2.8651, + "step": 5766 + }, + { + "epoch": 0.1710108827802983, + "grad_norm": 0.1625709980726242, + "learning_rate": 0.0009361804534548264, + "loss": 2.8172, + "step": 5767 + }, + { + "epoch": 0.1710405361326098, + "grad_norm": 0.17843152582645416, + "learning_rate": 0.0009361574500779133, + "loss": 2.8213, + "step": 5768 + }, + { + "epoch": 0.17107018948492128, + "grad_norm": 0.15109828114509583, + "learning_rate": 0.0009361344428387513, + "loss": 2.7984, + "step": 5769 + }, + { + "epoch": 0.17109984283723276, + "grad_norm": 0.12978129088878632, + "learning_rate": 0.0009361114317375438, + "loss": 2.775, + "step": 5770 + }, + { + "epoch": 0.17112949618954423, + "grad_norm": 0.14426597952842712, + "learning_rate": 0.0009360884167744949, + "loss": 2.8194, + "step": 5771 + }, + { + "epoch": 0.1711591495418557, + "grad_norm": 0.13277587294578552, + "learning_rate": 0.000936065397949808, + "loss": 2.8287, + "step": 5772 + }, + { + "epoch": 0.17118880289416719, + "grad_norm": 0.14558245241641998, + "learning_rate": 0.0009360423752636873, + "loss": 2.8376, + "step": 5773 + }, + { + "epoch": 0.17121845624647866, + "grad_norm": 0.17862890660762787, + "learning_rate": 0.0009360193487163365, + "loss": 2.7948, + "step": 5774 + }, + { + "epoch": 0.17124810959879014, + "grad_norm": 0.18623322248458862, + "learning_rate": 0.0009359963183079596, + "loss": 2.7993, + "step": 5775 + }, + { + "epoch": 0.17127776295110161, + "grad_norm": 0.19558225572109222, + "learning_rate": 0.0009359732840387603, + "loss": 2.817, + "step": 5776 + }, + { + "epoch": 0.1713074163034131, + "grad_norm": 0.17593751847743988, + "learning_rate": 0.0009359502459089428, + "loss": 2.8502, + "step": 5777 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 0.15629316866397858, + "learning_rate": 0.000935927203918711, + "loss": 2.8192, + "step": 5778 + }, + { + "epoch": 0.17136672300803607, + "grad_norm": 0.1728130280971527, + "learning_rate": 0.000935904158068269, + "loss": 2.8619, + "step": 5779 + }, + { + "epoch": 0.17139637636034755, + "grad_norm": 0.18045568466186523, + "learning_rate": 0.0009358811083578209, + "loss": 2.8226, + "step": 5780 + }, + { + "epoch": 0.17142602971265902, + "grad_norm": 0.1651383489370346, + "learning_rate": 0.0009358580547875708, + "loss": 2.8098, + "step": 5781 + }, + { + "epoch": 0.1714556830649705, + "grad_norm": 0.1624825894832611, + "learning_rate": 0.0009358349973577227, + "loss": 2.8066, + "step": 5782 + }, + { + "epoch": 0.17148533641728197, + "grad_norm": 0.17582745850086212, + "learning_rate": 0.000935811936068481, + "loss": 2.8416, + "step": 5783 + }, + { + "epoch": 0.17151498976959345, + "grad_norm": 0.1945316195487976, + "learning_rate": 0.0009357888709200497, + "loss": 2.8254, + "step": 5784 + }, + { + "epoch": 0.17154464312190493, + "grad_norm": 0.16712065041065216, + "learning_rate": 0.0009357658019126333, + "loss": 2.8477, + "step": 5785 + }, + { + "epoch": 0.1715742964742164, + "grad_norm": 0.1418396383523941, + "learning_rate": 0.0009357427290464358, + "loss": 2.8183, + "step": 5786 + }, + { + "epoch": 0.17160394982652788, + "grad_norm": 0.15589234232902527, + "learning_rate": 0.0009357196523216616, + "loss": 2.791, + "step": 5787 + }, + { + "epoch": 0.17163360317883936, + "grad_norm": 0.14917610585689545, + "learning_rate": 0.0009356965717385152, + "loss": 2.808, + "step": 5788 + }, + { + "epoch": 0.17166325653115086, + "grad_norm": 0.1469610184431076, + "learning_rate": 0.0009356734872972008, + "loss": 2.8234, + "step": 5789 + }, + { + "epoch": 0.17169290988346234, + "grad_norm": 0.15650057792663574, + "learning_rate": 0.0009356503989979229, + "loss": 2.8291, + "step": 5790 + }, + { + "epoch": 0.1717225632357738, + "grad_norm": 0.14456459879875183, + "learning_rate": 0.000935627306840886, + "loss": 2.8601, + "step": 5791 + }, + { + "epoch": 0.1717522165880853, + "grad_norm": 0.158082515001297, + "learning_rate": 0.0009356042108262945, + "loss": 2.8428, + "step": 5792 + }, + { + "epoch": 0.17178186994039676, + "grad_norm": 0.17827662825584412, + "learning_rate": 0.0009355811109543528, + "loss": 2.8155, + "step": 5793 + }, + { + "epoch": 0.17181152329270824, + "grad_norm": 0.19548752903938293, + "learning_rate": 0.0009355580072252658, + "loss": 2.8216, + "step": 5794 + }, + { + "epoch": 0.17184117664501972, + "grad_norm": 0.2251298725605011, + "learning_rate": 0.0009355348996392378, + "loss": 2.8156, + "step": 5795 + }, + { + "epoch": 0.1718708299973312, + "grad_norm": 0.22020253539085388, + "learning_rate": 0.0009355117881964735, + "loss": 2.8045, + "step": 5796 + }, + { + "epoch": 0.17190048334964267, + "grad_norm": 0.16847547888755798, + "learning_rate": 0.0009354886728971776, + "loss": 2.8429, + "step": 5797 + }, + { + "epoch": 0.17193013670195414, + "grad_norm": 0.1722267121076584, + "learning_rate": 0.0009354655537415546, + "loss": 2.8325, + "step": 5798 + }, + { + "epoch": 0.17195979005426562, + "grad_norm": 0.1675681471824646, + "learning_rate": 0.0009354424307298095, + "loss": 2.8114, + "step": 5799 + }, + { + "epoch": 0.17198944340657712, + "grad_norm": 0.15134449303150177, + "learning_rate": 0.000935419303862147, + "loss": 2.8061, + "step": 5800 + }, + { + "epoch": 0.1720190967588886, + "grad_norm": 0.15537774562835693, + "learning_rate": 0.0009353961731387717, + "loss": 2.8211, + "step": 5801 + }, + { + "epoch": 0.17204875011120008, + "grad_norm": 0.18891046941280365, + "learning_rate": 0.0009353730385598887, + "loss": 2.7668, + "step": 5802 + }, + { + "epoch": 0.17207840346351155, + "grad_norm": 0.19272162020206451, + "learning_rate": 0.0009353499001257025, + "loss": 2.8039, + "step": 5803 + }, + { + "epoch": 0.17210805681582303, + "grad_norm": 0.14740437269210815, + "learning_rate": 0.0009353267578364184, + "loss": 2.8213, + "step": 5804 + }, + { + "epoch": 0.1721377101681345, + "grad_norm": 0.13203388452529907, + "learning_rate": 0.000935303611692241, + "loss": 2.8339, + "step": 5805 + }, + { + "epoch": 0.17216736352044598, + "grad_norm": 0.17294633388519287, + "learning_rate": 0.0009352804616933754, + "loss": 2.858, + "step": 5806 + }, + { + "epoch": 0.17219701687275746, + "grad_norm": 0.2173391878604889, + "learning_rate": 0.0009352573078400267, + "loss": 2.8126, + "step": 5807 + }, + { + "epoch": 0.17222667022506893, + "grad_norm": 0.16607603430747986, + "learning_rate": 0.0009352341501323997, + "loss": 2.8004, + "step": 5808 + }, + { + "epoch": 0.1722563235773804, + "grad_norm": 0.154518261551857, + "learning_rate": 0.0009352109885706997, + "loss": 2.8245, + "step": 5809 + }, + { + "epoch": 0.17228597692969191, + "grad_norm": 0.16347171366214752, + "learning_rate": 0.0009351878231551317, + "loss": 2.8049, + "step": 5810 + }, + { + "epoch": 0.1723156302820034, + "grad_norm": 0.1395288109779358, + "learning_rate": 0.0009351646538859009, + "loss": 2.8228, + "step": 5811 + }, + { + "epoch": 0.17234528363431487, + "grad_norm": 0.135933980345726, + "learning_rate": 0.0009351414807632121, + "loss": 2.8228, + "step": 5812 + }, + { + "epoch": 0.17237493698662634, + "grad_norm": 0.13206486403942108, + "learning_rate": 0.000935118303787271, + "loss": 2.8252, + "step": 5813 + }, + { + "epoch": 0.17240459033893782, + "grad_norm": 0.14816971123218536, + "learning_rate": 0.0009350951229582827, + "loss": 2.7895, + "step": 5814 + }, + { + "epoch": 0.1724342436912493, + "grad_norm": 0.14089573919773102, + "learning_rate": 0.0009350719382764523, + "loss": 2.8249, + "step": 5815 + }, + { + "epoch": 0.17246389704356077, + "grad_norm": 0.1412590891122818, + "learning_rate": 0.0009350487497419852, + "loss": 2.8587, + "step": 5816 + }, + { + "epoch": 0.17249355039587225, + "grad_norm": 0.1610218584537506, + "learning_rate": 0.0009350255573550868, + "loss": 2.8246, + "step": 5817 + }, + { + "epoch": 0.17252320374818372, + "grad_norm": 0.16899071633815765, + "learning_rate": 0.0009350023611159624, + "loss": 2.795, + "step": 5818 + }, + { + "epoch": 0.1725528571004952, + "grad_norm": 0.14771030843257904, + "learning_rate": 0.0009349791610248175, + "loss": 2.7856, + "step": 5819 + }, + { + "epoch": 0.1725825104528067, + "grad_norm": 0.1386219561100006, + "learning_rate": 0.0009349559570818574, + "loss": 2.809, + "step": 5820 + }, + { + "epoch": 0.17261216380511818, + "grad_norm": 0.12690602242946625, + "learning_rate": 0.0009349327492872876, + "loss": 2.8321, + "step": 5821 + }, + { + "epoch": 0.17264181715742966, + "grad_norm": 0.11439143121242523, + "learning_rate": 0.0009349095376413137, + "loss": 2.8245, + "step": 5822 + }, + { + "epoch": 0.17267147050974113, + "grad_norm": 0.12992936372756958, + "learning_rate": 0.000934886322144141, + "loss": 2.7774, + "step": 5823 + }, + { + "epoch": 0.1727011238620526, + "grad_norm": 0.14639884233474731, + "learning_rate": 0.0009348631027959755, + "loss": 2.8052, + "step": 5824 + }, + { + "epoch": 0.17273077721436408, + "grad_norm": 0.15597601234912872, + "learning_rate": 0.0009348398795970225, + "loss": 2.8299, + "step": 5825 + }, + { + "epoch": 0.17276043056667556, + "grad_norm": 0.16539551317691803, + "learning_rate": 0.0009348166525474878, + "loss": 2.7841, + "step": 5826 + }, + { + "epoch": 0.17279008391898704, + "grad_norm": 0.13873256742954254, + "learning_rate": 0.0009347934216475769, + "loss": 2.8144, + "step": 5827 + }, + { + "epoch": 0.1728197372712985, + "grad_norm": 0.15198615193367004, + "learning_rate": 0.0009347701868974959, + "loss": 2.8341, + "step": 5828 + }, + { + "epoch": 0.17284939062361, + "grad_norm": 0.16502425074577332, + "learning_rate": 0.0009347469482974499, + "loss": 2.8205, + "step": 5829 + }, + { + "epoch": 0.17287904397592146, + "grad_norm": 0.17187513411045074, + "learning_rate": 0.0009347237058476452, + "loss": 2.7726, + "step": 5830 + }, + { + "epoch": 0.17290869732823297, + "grad_norm": 0.1845099925994873, + "learning_rate": 0.0009347004595482875, + "loss": 2.8017, + "step": 5831 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 0.17293308675289154, + "learning_rate": 0.0009346772093995826, + "loss": 2.8148, + "step": 5832 + }, + { + "epoch": 0.17296800403285592, + "grad_norm": 0.18794599175453186, + "learning_rate": 0.0009346539554017363, + "loss": 2.8189, + "step": 5833 + }, + { + "epoch": 0.1729976573851674, + "grad_norm": 0.22835947573184967, + "learning_rate": 0.0009346306975549546, + "loss": 2.8252, + "step": 5834 + }, + { + "epoch": 0.17302731073747887, + "grad_norm": 0.22526632249355316, + "learning_rate": 0.0009346074358594436, + "loss": 2.7737, + "step": 5835 + }, + { + "epoch": 0.17305696408979035, + "grad_norm": 0.18897177278995514, + "learning_rate": 0.0009345841703154092, + "loss": 2.7763, + "step": 5836 + }, + { + "epoch": 0.17308661744210183, + "grad_norm": 0.18740932643413544, + "learning_rate": 0.0009345609009230572, + "loss": 2.8437, + "step": 5837 + }, + { + "epoch": 0.1731162707944133, + "grad_norm": 0.1648246943950653, + "learning_rate": 0.0009345376276825939, + "loss": 2.8224, + "step": 5838 + }, + { + "epoch": 0.17314592414672478, + "grad_norm": 0.16709007322788239, + "learning_rate": 0.0009345143505942254, + "loss": 2.8305, + "step": 5839 + }, + { + "epoch": 0.17317557749903625, + "grad_norm": 0.170688197016716, + "learning_rate": 0.0009344910696581577, + "loss": 2.8162, + "step": 5840 + }, + { + "epoch": 0.17320523085134776, + "grad_norm": 0.14879584312438965, + "learning_rate": 0.000934467784874597, + "loss": 2.8165, + "step": 5841 + }, + { + "epoch": 0.17323488420365923, + "grad_norm": 0.1448744833469391, + "learning_rate": 0.0009344444962437494, + "loss": 2.823, + "step": 5842 + }, + { + "epoch": 0.1732645375559707, + "grad_norm": 0.15464720129966736, + "learning_rate": 0.0009344212037658213, + "loss": 2.8363, + "step": 5843 + }, + { + "epoch": 0.1732941909082822, + "grad_norm": 0.1508876532316208, + "learning_rate": 0.0009343979074410189, + "loss": 2.8149, + "step": 5844 + }, + { + "epoch": 0.17332384426059366, + "grad_norm": 0.1485559344291687, + "learning_rate": 0.0009343746072695484, + "loss": 2.8261, + "step": 5845 + }, + { + "epoch": 0.17335349761290514, + "grad_norm": 0.16231687366962433, + "learning_rate": 0.0009343513032516162, + "loss": 2.81, + "step": 5846 + }, + { + "epoch": 0.17338315096521661, + "grad_norm": 0.159539595246315, + "learning_rate": 0.0009343279953874286, + "loss": 2.8142, + "step": 5847 + }, + { + "epoch": 0.1734128043175281, + "grad_norm": 0.16692030429840088, + "learning_rate": 0.0009343046836771923, + "loss": 2.8302, + "step": 5848 + }, + { + "epoch": 0.17344245766983957, + "grad_norm": 0.1743062436580658, + "learning_rate": 0.0009342813681211131, + "loss": 2.8309, + "step": 5849 + }, + { + "epoch": 0.17347211102215104, + "grad_norm": 0.17983663082122803, + "learning_rate": 0.0009342580487193981, + "loss": 2.8451, + "step": 5850 + }, + { + "epoch": 0.17350176437446252, + "grad_norm": 0.14811331033706665, + "learning_rate": 0.0009342347254722535, + "loss": 2.8114, + "step": 5851 + }, + { + "epoch": 0.17353141772677402, + "grad_norm": 0.165654718875885, + "learning_rate": 0.0009342113983798859, + "loss": 2.8312, + "step": 5852 + }, + { + "epoch": 0.1735610710790855, + "grad_norm": 0.17632247507572174, + "learning_rate": 0.0009341880674425017, + "loss": 2.8176, + "step": 5853 + }, + { + "epoch": 0.17359072443139698, + "grad_norm": 0.13914433121681213, + "learning_rate": 0.0009341647326603078, + "loss": 2.8196, + "step": 5854 + }, + { + "epoch": 0.17362037778370845, + "grad_norm": 0.12616799771785736, + "learning_rate": 0.0009341413940335107, + "loss": 2.8205, + "step": 5855 + }, + { + "epoch": 0.17365003113601993, + "grad_norm": 0.1543000191450119, + "learning_rate": 0.0009341180515623168, + "loss": 2.8252, + "step": 5856 + }, + { + "epoch": 0.1736796844883314, + "grad_norm": 0.1421639323234558, + "learning_rate": 0.0009340947052469331, + "loss": 2.8369, + "step": 5857 + }, + { + "epoch": 0.17370933784064288, + "grad_norm": 0.14245840907096863, + "learning_rate": 0.0009340713550875663, + "loss": 2.8062, + "step": 5858 + }, + { + "epoch": 0.17373899119295436, + "grad_norm": 0.12328243255615234, + "learning_rate": 0.0009340480010844232, + "loss": 2.8158, + "step": 5859 + }, + { + "epoch": 0.17376864454526583, + "grad_norm": 0.17010259628295898, + "learning_rate": 0.0009340246432377106, + "loss": 2.811, + "step": 5860 + }, + { + "epoch": 0.1737982978975773, + "grad_norm": 0.21353858709335327, + "learning_rate": 0.0009340012815476352, + "loss": 2.8123, + "step": 5861 + }, + { + "epoch": 0.1738279512498888, + "grad_norm": 0.21662907302379608, + "learning_rate": 0.000933977916014404, + "loss": 2.8372, + "step": 5862 + }, + { + "epoch": 0.1738576046022003, + "grad_norm": 0.1988820731639862, + "learning_rate": 0.0009339545466382238, + "loss": 2.8322, + "step": 5863 + }, + { + "epoch": 0.17388725795451176, + "grad_norm": 0.1734703779220581, + "learning_rate": 0.0009339311734193016, + "loss": 2.8332, + "step": 5864 + }, + { + "epoch": 0.17391691130682324, + "grad_norm": 0.1654847413301468, + "learning_rate": 0.0009339077963578443, + "loss": 2.8379, + "step": 5865 + }, + { + "epoch": 0.17394656465913472, + "grad_norm": 0.15779103338718414, + "learning_rate": 0.0009338844154540593, + "loss": 2.8038, + "step": 5866 + }, + { + "epoch": 0.1739762180114462, + "grad_norm": 0.16541540622711182, + "learning_rate": 0.0009338610307081531, + "loss": 2.8045, + "step": 5867 + }, + { + "epoch": 0.17400587136375767, + "grad_norm": 0.16718246042728424, + "learning_rate": 0.000933837642120333, + "loss": 2.8385, + "step": 5868 + }, + { + "epoch": 0.17403552471606915, + "grad_norm": 0.12391752004623413, + "learning_rate": 0.0009338142496908062, + "loss": 2.8364, + "step": 5869 + }, + { + "epoch": 0.17406517806838062, + "grad_norm": 0.12340950220823288, + "learning_rate": 0.0009337908534197796, + "loss": 2.828, + "step": 5870 + }, + { + "epoch": 0.1740948314206921, + "grad_norm": 0.12789441645145416, + "learning_rate": 0.0009337674533074607, + "loss": 2.7837, + "step": 5871 + }, + { + "epoch": 0.1741244847730036, + "grad_norm": 0.123862124979496, + "learning_rate": 0.0009337440493540565, + "loss": 2.8111, + "step": 5872 + }, + { + "epoch": 0.17415413812531508, + "grad_norm": 0.11917206645011902, + "learning_rate": 0.0009337206415597741, + "loss": 2.8357, + "step": 5873 + }, + { + "epoch": 0.17418379147762655, + "grad_norm": 0.13044431805610657, + "learning_rate": 0.0009336972299248212, + "loss": 2.8252, + "step": 5874 + }, + { + "epoch": 0.17421344482993803, + "grad_norm": 0.1428149938583374, + "learning_rate": 0.0009336738144494048, + "loss": 2.7707, + "step": 5875 + }, + { + "epoch": 0.1742430981822495, + "grad_norm": 0.14976854622364044, + "learning_rate": 0.0009336503951337324, + "loss": 2.8476, + "step": 5876 + }, + { + "epoch": 0.17427275153456098, + "grad_norm": 0.15681838989257812, + "learning_rate": 0.0009336269719780113, + "loss": 2.7866, + "step": 5877 + }, + { + "epoch": 0.17430240488687246, + "grad_norm": 0.15838474035263062, + "learning_rate": 0.0009336035449824489, + "loss": 2.8502, + "step": 5878 + }, + { + "epoch": 0.17433205823918393, + "grad_norm": 0.15086351335048676, + "learning_rate": 0.0009335801141472527, + "loss": 2.8485, + "step": 5879 + }, + { + "epoch": 0.1743617115914954, + "grad_norm": 0.16688990592956543, + "learning_rate": 0.0009335566794726302, + "loss": 2.7892, + "step": 5880 + }, + { + "epoch": 0.1743913649438069, + "grad_norm": 0.1835193634033203, + "learning_rate": 0.0009335332409587887, + "loss": 2.7924, + "step": 5881 + }, + { + "epoch": 0.17442101829611836, + "grad_norm": 0.20814552903175354, + "learning_rate": 0.0009335097986059361, + "loss": 2.8025, + "step": 5882 + }, + { + "epoch": 0.17445067164842987, + "grad_norm": 0.20994840562343597, + "learning_rate": 0.0009334863524142798, + "loss": 2.8369, + "step": 5883 + }, + { + "epoch": 0.17448032500074134, + "grad_norm": 0.18820597231388092, + "learning_rate": 0.0009334629023840274, + "loss": 2.7962, + "step": 5884 + }, + { + "epoch": 0.17450997835305282, + "grad_norm": 0.20467519760131836, + "learning_rate": 0.0009334394485153866, + "loss": 2.8314, + "step": 5885 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 0.16132239997386932, + "learning_rate": 0.0009334159908085651, + "loss": 2.7872, + "step": 5886 + }, + { + "epoch": 0.17456928505767577, + "grad_norm": 0.13813821971416473, + "learning_rate": 0.0009333925292637707, + "loss": 2.8233, + "step": 5887 + }, + { + "epoch": 0.17459893840998725, + "grad_norm": 0.15548734366893768, + "learning_rate": 0.000933369063881211, + "loss": 2.813, + "step": 5888 + }, + { + "epoch": 0.17462859176229872, + "grad_norm": 0.15085740387439728, + "learning_rate": 0.0009333455946610938, + "loss": 2.7607, + "step": 5889 + }, + { + "epoch": 0.1746582451146102, + "grad_norm": 0.13018228113651276, + "learning_rate": 0.0009333221216036269, + "loss": 2.8006, + "step": 5890 + }, + { + "epoch": 0.17468789846692168, + "grad_norm": 0.13464206457138062, + "learning_rate": 0.0009332986447090185, + "loss": 2.8083, + "step": 5891 + }, + { + "epoch": 0.17471755181923315, + "grad_norm": 0.1280928999185562, + "learning_rate": 0.000933275163977476, + "loss": 2.815, + "step": 5892 + }, + { + "epoch": 0.17474720517154466, + "grad_norm": 0.15195585787296295, + "learning_rate": 0.0009332516794092077, + "loss": 2.825, + "step": 5893 + }, + { + "epoch": 0.17477685852385613, + "grad_norm": 0.16180957853794098, + "learning_rate": 0.0009332281910044214, + "loss": 2.8055, + "step": 5894 + }, + { + "epoch": 0.1748065118761676, + "grad_norm": 0.16659283638000488, + "learning_rate": 0.000933204698763325, + "loss": 2.7916, + "step": 5895 + }, + { + "epoch": 0.17483616522847908, + "grad_norm": 0.1482149362564087, + "learning_rate": 0.0009331812026861266, + "loss": 2.8109, + "step": 5896 + }, + { + "epoch": 0.17486581858079056, + "grad_norm": 0.15074758231639862, + "learning_rate": 0.0009331577027730344, + "loss": 2.8284, + "step": 5897 + }, + { + "epoch": 0.17489547193310204, + "grad_norm": 0.16943055391311646, + "learning_rate": 0.0009331341990242563, + "loss": 2.8071, + "step": 5898 + }, + { + "epoch": 0.1749251252854135, + "grad_norm": 0.16543370485305786, + "learning_rate": 0.0009331106914400008, + "loss": 2.8192, + "step": 5899 + }, + { + "epoch": 0.174954778637725, + "grad_norm": 0.1301525980234146, + "learning_rate": 0.0009330871800204754, + "loss": 2.803, + "step": 5900 + }, + { + "epoch": 0.17498443199003647, + "grad_norm": 0.13323792815208435, + "learning_rate": 0.0009330636647658889, + "loss": 2.8005, + "step": 5901 + }, + { + "epoch": 0.17501408534234794, + "grad_norm": 0.14680834114551544, + "learning_rate": 0.0009330401456764492, + "loss": 2.8407, + "step": 5902 + }, + { + "epoch": 0.17504373869465942, + "grad_norm": 0.15093445777893066, + "learning_rate": 0.0009330166227523649, + "loss": 2.7988, + "step": 5903 + }, + { + "epoch": 0.17507339204697092, + "grad_norm": 0.14392045140266418, + "learning_rate": 0.0009329930959938439, + "loss": 2.8155, + "step": 5904 + }, + { + "epoch": 0.1751030453992824, + "grad_norm": 0.14462405443191528, + "learning_rate": 0.0009329695654010947, + "loss": 2.7944, + "step": 5905 + }, + { + "epoch": 0.17513269875159387, + "grad_norm": 0.1446317881345749, + "learning_rate": 0.0009329460309743257, + "loss": 2.7884, + "step": 5906 + }, + { + "epoch": 0.17516235210390535, + "grad_norm": 0.16097241640090942, + "learning_rate": 0.0009329224927137453, + "loss": 2.8154, + "step": 5907 + }, + { + "epoch": 0.17519200545621683, + "grad_norm": 0.16256429255008698, + "learning_rate": 0.000932898950619562, + "loss": 2.8462, + "step": 5908 + }, + { + "epoch": 0.1752216588085283, + "grad_norm": 0.14978471398353577, + "learning_rate": 0.000932875404691984, + "loss": 2.8161, + "step": 5909 + }, + { + "epoch": 0.17525131216083978, + "grad_norm": 0.16070371866226196, + "learning_rate": 0.0009328518549312202, + "loss": 2.8064, + "step": 5910 + }, + { + "epoch": 0.17528096551315125, + "grad_norm": 0.1704498827457428, + "learning_rate": 0.0009328283013374788, + "loss": 2.7783, + "step": 5911 + }, + { + "epoch": 0.17531061886546273, + "grad_norm": 0.17518815398216248, + "learning_rate": 0.0009328047439109685, + "loss": 2.8265, + "step": 5912 + }, + { + "epoch": 0.1753402722177742, + "grad_norm": 0.1780436933040619, + "learning_rate": 0.0009327811826518979, + "loss": 2.8157, + "step": 5913 + }, + { + "epoch": 0.1753699255700857, + "grad_norm": 0.2033616006374359, + "learning_rate": 0.0009327576175604756, + "loss": 2.8454, + "step": 5914 + }, + { + "epoch": 0.1753995789223972, + "grad_norm": 0.2174277901649475, + "learning_rate": 0.0009327340486369104, + "loss": 2.8342, + "step": 5915 + }, + { + "epoch": 0.17542923227470866, + "grad_norm": 0.20681919157505035, + "learning_rate": 0.0009327104758814109, + "loss": 2.8206, + "step": 5916 + }, + { + "epoch": 0.17545888562702014, + "grad_norm": 0.16730108857154846, + "learning_rate": 0.000932686899294186, + "loss": 2.8031, + "step": 5917 + }, + { + "epoch": 0.17548853897933162, + "grad_norm": 0.16949467360973358, + "learning_rate": 0.0009326633188754441, + "loss": 2.8145, + "step": 5918 + }, + { + "epoch": 0.1755181923316431, + "grad_norm": 0.18056198954582214, + "learning_rate": 0.0009326397346253943, + "loss": 2.7838, + "step": 5919 + }, + { + "epoch": 0.17554784568395457, + "grad_norm": 0.17232763767242432, + "learning_rate": 0.0009326161465442455, + "loss": 2.8301, + "step": 5920 + }, + { + "epoch": 0.17557749903626604, + "grad_norm": 0.14361867308616638, + "learning_rate": 0.0009325925546322064, + "loss": 2.8233, + "step": 5921 + }, + { + "epoch": 0.17560715238857752, + "grad_norm": 0.15280520915985107, + "learning_rate": 0.0009325689588894859, + "loss": 2.8136, + "step": 5922 + }, + { + "epoch": 0.175636805740889, + "grad_norm": 0.14191053807735443, + "learning_rate": 0.0009325453593162931, + "loss": 2.7933, + "step": 5923 + }, + { + "epoch": 0.1756664590932005, + "grad_norm": 0.14346130192279816, + "learning_rate": 0.000932521755912837, + "loss": 2.8033, + "step": 5924 + }, + { + "epoch": 0.17569611244551198, + "grad_norm": 0.14370784163475037, + "learning_rate": 0.0009324981486793263, + "loss": 2.8, + "step": 5925 + }, + { + "epoch": 0.17572576579782345, + "grad_norm": 0.1500864177942276, + "learning_rate": 0.0009324745376159705, + "loss": 2.8472, + "step": 5926 + }, + { + "epoch": 0.17575541915013493, + "grad_norm": 0.14825047552585602, + "learning_rate": 0.0009324509227229781, + "loss": 2.8166, + "step": 5927 + }, + { + "epoch": 0.1757850725024464, + "grad_norm": 0.12873214483261108, + "learning_rate": 0.0009324273040005589, + "loss": 2.7921, + "step": 5928 + }, + { + "epoch": 0.17581472585475788, + "grad_norm": 0.13033819198608398, + "learning_rate": 0.0009324036814489214, + "loss": 2.7834, + "step": 5929 + }, + { + "epoch": 0.17584437920706936, + "grad_norm": 0.12478241324424744, + "learning_rate": 0.0009323800550682753, + "loss": 2.7952, + "step": 5930 + }, + { + "epoch": 0.17587403255938083, + "grad_norm": 0.12917979061603546, + "learning_rate": 0.0009323564248588294, + "loss": 2.8055, + "step": 5931 + }, + { + "epoch": 0.1759036859116923, + "grad_norm": 0.149866983294487, + "learning_rate": 0.0009323327908207934, + "loss": 2.8171, + "step": 5932 + }, + { + "epoch": 0.17593333926400379, + "grad_norm": 0.15361061692237854, + "learning_rate": 0.0009323091529543761, + "loss": 2.8274, + "step": 5933 + }, + { + "epoch": 0.17596299261631526, + "grad_norm": 0.16344793140888214, + "learning_rate": 0.0009322855112597873, + "loss": 2.7548, + "step": 5934 + }, + { + "epoch": 0.17599264596862677, + "grad_norm": 0.184126615524292, + "learning_rate": 0.0009322618657372358, + "loss": 2.8001, + "step": 5935 + }, + { + "epoch": 0.17602229932093824, + "grad_norm": 0.20501045882701874, + "learning_rate": 0.0009322382163869314, + "loss": 2.8222, + "step": 5936 + }, + { + "epoch": 0.17605195267324972, + "grad_norm": 0.20609301328659058, + "learning_rate": 0.0009322145632090835, + "loss": 2.7894, + "step": 5937 + }, + { + "epoch": 0.1760816060255612, + "grad_norm": 0.17809177935123444, + "learning_rate": 0.0009321909062039014, + "loss": 2.8323, + "step": 5938 + }, + { + "epoch": 0.17611125937787267, + "grad_norm": 0.17980574071407318, + "learning_rate": 0.0009321672453715945, + "loss": 2.8201, + "step": 5939 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 0.17748932540416718, + "learning_rate": 0.0009321435807123726, + "loss": 2.8142, + "step": 5940 + }, + { + "epoch": 0.17617056608249562, + "grad_norm": 0.1423904150724411, + "learning_rate": 0.000932119912226445, + "loss": 2.8227, + "step": 5941 + }, + { + "epoch": 0.1762002194348071, + "grad_norm": 0.1603732705116272, + "learning_rate": 0.0009320962399140216, + "loss": 2.7878, + "step": 5942 + }, + { + "epoch": 0.17622987278711857, + "grad_norm": 0.15126164257526398, + "learning_rate": 0.0009320725637753115, + "loss": 2.8359, + "step": 5943 + }, + { + "epoch": 0.17625952613943005, + "grad_norm": 0.14446303248405457, + "learning_rate": 0.000932048883810525, + "loss": 2.802, + "step": 5944 + }, + { + "epoch": 0.17628917949174155, + "grad_norm": 0.13838636875152588, + "learning_rate": 0.0009320252000198715, + "loss": 2.835, + "step": 5945 + }, + { + "epoch": 0.17631883284405303, + "grad_norm": 0.14119133353233337, + "learning_rate": 0.0009320015124035606, + "loss": 2.793, + "step": 5946 + }, + { + "epoch": 0.1763484861963645, + "grad_norm": 0.16337443888187408, + "learning_rate": 0.000931977820961802, + "loss": 2.8234, + "step": 5947 + }, + { + "epoch": 0.17637813954867598, + "grad_norm": 0.1567762941122055, + "learning_rate": 0.0009319541256948058, + "loss": 2.802, + "step": 5948 + }, + { + "epoch": 0.17640779290098746, + "grad_norm": 0.163502037525177, + "learning_rate": 0.0009319304266027817, + "loss": 2.7949, + "step": 5949 + }, + { + "epoch": 0.17643744625329894, + "grad_norm": 0.1643218696117401, + "learning_rate": 0.0009319067236859394, + "loss": 2.7958, + "step": 5950 + }, + { + "epoch": 0.1764670996056104, + "grad_norm": 0.1483003944158554, + "learning_rate": 0.0009318830169444891, + "loss": 2.8234, + "step": 5951 + }, + { + "epoch": 0.1764967529579219, + "grad_norm": 0.14329075813293457, + "learning_rate": 0.0009318593063786405, + "loss": 2.8009, + "step": 5952 + }, + { + "epoch": 0.17652640631023336, + "grad_norm": 0.16287748515605927, + "learning_rate": 0.0009318355919886034, + "loss": 2.8116, + "step": 5953 + }, + { + "epoch": 0.17655605966254484, + "grad_norm": 0.192740336060524, + "learning_rate": 0.0009318118737745882, + "loss": 2.8201, + "step": 5954 + }, + { + "epoch": 0.17658571301485632, + "grad_norm": 0.21739135682582855, + "learning_rate": 0.0009317881517368048, + "loss": 2.7834, + "step": 5955 + }, + { + "epoch": 0.17661536636716782, + "grad_norm": 0.2260083556175232, + "learning_rate": 0.0009317644258754632, + "loss": 2.8096, + "step": 5956 + }, + { + "epoch": 0.1766450197194793, + "grad_norm": 0.1771126389503479, + "learning_rate": 0.0009317406961907732, + "loss": 2.772, + "step": 5957 + }, + { + "epoch": 0.17667467307179077, + "grad_norm": 0.142915740609169, + "learning_rate": 0.0009317169626829456, + "loss": 2.7842, + "step": 5958 + }, + { + "epoch": 0.17670432642410225, + "grad_norm": 0.16737009584903717, + "learning_rate": 0.0009316932253521901, + "loss": 2.8228, + "step": 5959 + }, + { + "epoch": 0.17673397977641372, + "grad_norm": 0.14943978190422058, + "learning_rate": 0.0009316694841987168, + "loss": 2.8074, + "step": 5960 + }, + { + "epoch": 0.1767636331287252, + "grad_norm": 0.1501806676387787, + "learning_rate": 0.0009316457392227363, + "loss": 2.78, + "step": 5961 + }, + { + "epoch": 0.17679328648103668, + "grad_norm": 0.14711835980415344, + "learning_rate": 0.0009316219904244587, + "loss": 2.7996, + "step": 5962 + }, + { + "epoch": 0.17682293983334815, + "grad_norm": 0.1271098107099533, + "learning_rate": 0.0009315982378040942, + "loss": 2.8304, + "step": 5963 + }, + { + "epoch": 0.17685259318565963, + "grad_norm": 0.14361697435379028, + "learning_rate": 0.0009315744813618532, + "loss": 2.8333, + "step": 5964 + }, + { + "epoch": 0.1768822465379711, + "grad_norm": 0.132212832570076, + "learning_rate": 0.0009315507210979462, + "loss": 2.8212, + "step": 5965 + }, + { + "epoch": 0.1769118998902826, + "grad_norm": 0.13037323951721191, + "learning_rate": 0.0009315269570125833, + "loss": 2.8156, + "step": 5966 + }, + { + "epoch": 0.17694155324259409, + "grad_norm": 0.15494702756404877, + "learning_rate": 0.0009315031891059753, + "loss": 2.8323, + "step": 5967 + }, + { + "epoch": 0.17697120659490556, + "grad_norm": 0.13570666313171387, + "learning_rate": 0.0009314794173783326, + "loss": 2.8112, + "step": 5968 + }, + { + "epoch": 0.17700085994721704, + "grad_norm": 0.13681429624557495, + "learning_rate": 0.0009314556418298654, + "loss": 2.847, + "step": 5969 + }, + { + "epoch": 0.1770305132995285, + "grad_norm": 0.14981898665428162, + "learning_rate": 0.0009314318624607845, + "loss": 2.809, + "step": 5970 + }, + { + "epoch": 0.17706016665184, + "grad_norm": 0.14731694757938385, + "learning_rate": 0.0009314080792713004, + "loss": 2.815, + "step": 5971 + }, + { + "epoch": 0.17708982000415147, + "grad_norm": 0.14531783759593964, + "learning_rate": 0.0009313842922616236, + "loss": 2.8279, + "step": 5972 + }, + { + "epoch": 0.17711947335646294, + "grad_norm": 0.14063195884227753, + "learning_rate": 0.000931360501431965, + "loss": 2.8251, + "step": 5973 + }, + { + "epoch": 0.17714912670877442, + "grad_norm": 0.155778169631958, + "learning_rate": 0.000931336706782535, + "loss": 2.8246, + "step": 5974 + }, + { + "epoch": 0.1771787800610859, + "grad_norm": 0.17460298538208008, + "learning_rate": 0.0009313129083135445, + "loss": 2.8278, + "step": 5975 + }, + { + "epoch": 0.1772084334133974, + "grad_norm": 0.18227209150791168, + "learning_rate": 0.0009312891060252041, + "loss": 2.8341, + "step": 5976 + }, + { + "epoch": 0.17723808676570887, + "grad_norm": 0.15572209656238556, + "learning_rate": 0.0009312652999177247, + "loss": 2.8013, + "step": 5977 + }, + { + "epoch": 0.17726774011802035, + "grad_norm": 0.1756381094455719, + "learning_rate": 0.0009312414899913171, + "loss": 2.8003, + "step": 5978 + }, + { + "epoch": 0.17729739347033183, + "grad_norm": 0.19813521206378937, + "learning_rate": 0.0009312176762461919, + "loss": 2.7883, + "step": 5979 + }, + { + "epoch": 0.1773270468226433, + "grad_norm": 0.19888877868652344, + "learning_rate": 0.0009311938586825604, + "loss": 2.7829, + "step": 5980 + }, + { + "epoch": 0.17735670017495478, + "grad_norm": 0.18158750236034393, + "learning_rate": 0.0009311700373006331, + "loss": 2.8072, + "step": 5981 + }, + { + "epoch": 0.17738635352726626, + "grad_norm": 0.15885788202285767, + "learning_rate": 0.0009311462121006211, + "loss": 2.8028, + "step": 5982 + }, + { + "epoch": 0.17741600687957773, + "grad_norm": 0.1429811418056488, + "learning_rate": 0.0009311223830827353, + "loss": 2.8106, + "step": 5983 + }, + { + "epoch": 0.1774456602318892, + "grad_norm": 0.16240689158439636, + "learning_rate": 0.000931098550247187, + "loss": 2.7996, + "step": 5984 + }, + { + "epoch": 0.17747531358420068, + "grad_norm": 0.16086861491203308, + "learning_rate": 0.0009310747135941869, + "loss": 2.8287, + "step": 5985 + }, + { + "epoch": 0.17750496693651216, + "grad_norm": 0.14003051817417145, + "learning_rate": 0.0009310508731239464, + "loss": 2.8186, + "step": 5986 + }, + { + "epoch": 0.17753462028882366, + "grad_norm": 0.1543777883052826, + "learning_rate": 0.0009310270288366762, + "loss": 2.8278, + "step": 5987 + }, + { + "epoch": 0.17756427364113514, + "grad_norm": 0.14915777742862701, + "learning_rate": 0.0009310031807325879, + "loss": 2.8111, + "step": 5988 + }, + { + "epoch": 0.17759392699344662, + "grad_norm": 0.17911431193351746, + "learning_rate": 0.0009309793288118923, + "loss": 2.8128, + "step": 5989 + }, + { + "epoch": 0.1776235803457581, + "grad_norm": 0.17733484506607056, + "learning_rate": 0.0009309554730748009, + "loss": 2.8041, + "step": 5990 + }, + { + "epoch": 0.17765323369806957, + "grad_norm": 0.1843222677707672, + "learning_rate": 0.0009309316135215247, + "loss": 2.8309, + "step": 5991 + }, + { + "epoch": 0.17768288705038104, + "grad_norm": 0.2006092220544815, + "learning_rate": 0.0009309077501522751, + "loss": 2.8145, + "step": 5992 + }, + { + "epoch": 0.17771254040269252, + "grad_norm": 0.21375735104084015, + "learning_rate": 0.0009308838829672633, + "loss": 2.8214, + "step": 5993 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 0.18922211229801178, + "learning_rate": 0.000930860011966701, + "loss": 2.8268, + "step": 5994 + }, + { + "epoch": 0.17777184710731547, + "grad_norm": 0.16662663221359253, + "learning_rate": 0.0009308361371507992, + "loss": 2.8264, + "step": 5995 + }, + { + "epoch": 0.17780150045962695, + "grad_norm": 0.155067577958107, + "learning_rate": 0.0009308122585197693, + "loss": 2.8268, + "step": 5996 + }, + { + "epoch": 0.17783115381193845, + "grad_norm": 0.15769197046756744, + "learning_rate": 0.0009307883760738231, + "loss": 2.8251, + "step": 5997 + }, + { + "epoch": 0.17786080716424993, + "grad_norm": 0.15767385065555573, + "learning_rate": 0.000930764489813172, + "loss": 2.8177, + "step": 5998 + }, + { + "epoch": 0.1778904605165614, + "grad_norm": 0.15622960031032562, + "learning_rate": 0.0009307405997380271, + "loss": 2.804, + "step": 5999 + }, + { + "epoch": 0.17792011386887288, + "grad_norm": 0.1496104598045349, + "learning_rate": 0.0009307167058486005, + "loss": 2.8086, + "step": 6000 + }, + { + "epoch": 0.17794976722118436, + "grad_norm": 0.12050718069076538, + "learning_rate": 0.0009306928081451035, + "loss": 2.7951, + "step": 6001 + }, + { + "epoch": 0.17797942057349583, + "grad_norm": 0.13878211379051208, + "learning_rate": 0.0009306689066277478, + "loss": 2.8554, + "step": 6002 + }, + { + "epoch": 0.1780090739258073, + "grad_norm": 0.1820317655801773, + "learning_rate": 0.0009306450012967448, + "loss": 2.8134, + "step": 6003 + }, + { + "epoch": 0.17803872727811879, + "grad_norm": 0.18868932127952576, + "learning_rate": 0.0009306210921523066, + "loss": 2.8309, + "step": 6004 + }, + { + "epoch": 0.17806838063043026, + "grad_norm": 0.19894173741340637, + "learning_rate": 0.0009305971791946446, + "loss": 2.8247, + "step": 6005 + }, + { + "epoch": 0.17809803398274174, + "grad_norm": 0.19302111864089966, + "learning_rate": 0.0009305732624239707, + "loss": 2.7974, + "step": 6006 + }, + { + "epoch": 0.17812768733505321, + "grad_norm": 0.1914704144001007, + "learning_rate": 0.0009305493418404967, + "loss": 2.8554, + "step": 6007 + }, + { + "epoch": 0.17815734068736472, + "grad_norm": 0.21250773966312408, + "learning_rate": 0.0009305254174444342, + "loss": 2.8044, + "step": 6008 + }, + { + "epoch": 0.1781869940396762, + "grad_norm": 0.19350583851337433, + "learning_rate": 0.0009305014892359954, + "loss": 2.8094, + "step": 6009 + }, + { + "epoch": 0.17821664739198767, + "grad_norm": 0.17686983942985535, + "learning_rate": 0.000930477557215392, + "loss": 2.8186, + "step": 6010 + }, + { + "epoch": 0.17824630074429915, + "grad_norm": 0.16552165150642395, + "learning_rate": 0.0009304536213828358, + "loss": 2.8205, + "step": 6011 + }, + { + "epoch": 0.17827595409661062, + "grad_norm": 0.1359218806028366, + "learning_rate": 0.0009304296817385392, + "loss": 2.8297, + "step": 6012 + }, + { + "epoch": 0.1783056074489221, + "grad_norm": 0.1578502506017685, + "learning_rate": 0.0009304057382827135, + "loss": 2.7896, + "step": 6013 + }, + { + "epoch": 0.17833526080123357, + "grad_norm": 0.16742491722106934, + "learning_rate": 0.0009303817910155714, + "loss": 2.7928, + "step": 6014 + }, + { + "epoch": 0.17836491415354505, + "grad_norm": 0.1664547324180603, + "learning_rate": 0.0009303578399373245, + "loss": 2.7916, + "step": 6015 + }, + { + "epoch": 0.17839456750585653, + "grad_norm": 0.14934921264648438, + "learning_rate": 0.0009303338850481853, + "loss": 2.8099, + "step": 6016 + }, + { + "epoch": 0.178424220858168, + "grad_norm": 0.1367069035768509, + "learning_rate": 0.0009303099263483655, + "loss": 2.8216, + "step": 6017 + }, + { + "epoch": 0.1784538742104795, + "grad_norm": 0.14390257000923157, + "learning_rate": 0.0009302859638380774, + "loss": 2.8306, + "step": 6018 + }, + { + "epoch": 0.17848352756279098, + "grad_norm": 0.12219356000423431, + "learning_rate": 0.0009302619975175332, + "loss": 2.792, + "step": 6019 + }, + { + "epoch": 0.17851318091510246, + "grad_norm": 0.14629703760147095, + "learning_rate": 0.0009302380273869453, + "loss": 2.815, + "step": 6020 + }, + { + "epoch": 0.17854283426741394, + "grad_norm": 0.1693582981824875, + "learning_rate": 0.0009302140534465258, + "loss": 2.824, + "step": 6021 + }, + { + "epoch": 0.1785724876197254, + "grad_norm": 0.1429632157087326, + "learning_rate": 0.000930190075696487, + "loss": 2.8108, + "step": 6022 + }, + { + "epoch": 0.1786021409720369, + "grad_norm": 0.1306992620229721, + "learning_rate": 0.0009301660941370411, + "loss": 2.7872, + "step": 6023 + }, + { + "epoch": 0.17863179432434836, + "grad_norm": 0.1451852172613144, + "learning_rate": 0.0009301421087684008, + "loss": 2.8295, + "step": 6024 + }, + { + "epoch": 0.17866144767665984, + "grad_norm": 0.15886634588241577, + "learning_rate": 0.0009301181195907782, + "loss": 2.8329, + "step": 6025 + }, + { + "epoch": 0.17869110102897132, + "grad_norm": 0.17327529191970825, + "learning_rate": 0.0009300941266043857, + "loss": 2.8204, + "step": 6026 + }, + { + "epoch": 0.1787207543812828, + "grad_norm": 0.17004211246967316, + "learning_rate": 0.0009300701298094361, + "loss": 2.8344, + "step": 6027 + }, + { + "epoch": 0.1787504077335943, + "grad_norm": 0.16763156652450562, + "learning_rate": 0.0009300461292061415, + "loss": 2.8299, + "step": 6028 + }, + { + "epoch": 0.17878006108590577, + "grad_norm": 0.18870288133621216, + "learning_rate": 0.0009300221247947147, + "loss": 2.8142, + "step": 6029 + }, + { + "epoch": 0.17880971443821725, + "grad_norm": 0.18980303406715393, + "learning_rate": 0.0009299981165753683, + "loss": 2.8219, + "step": 6030 + }, + { + "epoch": 0.17883936779052872, + "grad_norm": 0.19861111044883728, + "learning_rate": 0.0009299741045483145, + "loss": 2.7995, + "step": 6031 + }, + { + "epoch": 0.1788690211428402, + "grad_norm": 0.18575206398963928, + "learning_rate": 0.0009299500887137664, + "loss": 2.8196, + "step": 6032 + }, + { + "epoch": 0.17889867449515168, + "grad_norm": 0.19347688555717468, + "learning_rate": 0.0009299260690719364, + "loss": 2.7827, + "step": 6033 + }, + { + "epoch": 0.17892832784746315, + "grad_norm": 0.18249094486236572, + "learning_rate": 0.0009299020456230373, + "loss": 2.8038, + "step": 6034 + }, + { + "epoch": 0.17895798119977463, + "grad_norm": 0.15959906578063965, + "learning_rate": 0.0009298780183672817, + "loss": 2.8169, + "step": 6035 + }, + { + "epoch": 0.1789876345520861, + "grad_norm": 0.1339084357023239, + "learning_rate": 0.0009298539873048826, + "loss": 2.8254, + "step": 6036 + }, + { + "epoch": 0.17901728790439758, + "grad_norm": 0.13837836682796478, + "learning_rate": 0.0009298299524360525, + "loss": 2.8031, + "step": 6037 + }, + { + "epoch": 0.17904694125670906, + "grad_norm": 0.13838721811771393, + "learning_rate": 0.0009298059137610045, + "loss": 2.8167, + "step": 6038 + }, + { + "epoch": 0.17907659460902056, + "grad_norm": 0.13149099051952362, + "learning_rate": 0.0009297818712799514, + "loss": 2.8516, + "step": 6039 + }, + { + "epoch": 0.17910624796133204, + "grad_norm": 0.15936845541000366, + "learning_rate": 0.000929757824993106, + "loss": 2.8068, + "step": 6040 + }, + { + "epoch": 0.17913590131364351, + "grad_norm": 0.1671646237373352, + "learning_rate": 0.0009297337749006812, + "loss": 2.8298, + "step": 6041 + }, + { + "epoch": 0.179165554665955, + "grad_norm": 0.1670813262462616, + "learning_rate": 0.0009297097210028902, + "loss": 2.8499, + "step": 6042 + }, + { + "epoch": 0.17919520801826647, + "grad_norm": 0.17812801897525787, + "learning_rate": 0.0009296856632999458, + "loss": 2.8214, + "step": 6043 + }, + { + "epoch": 0.17922486137057794, + "grad_norm": 0.1573038399219513, + "learning_rate": 0.0009296616017920612, + "loss": 2.8502, + "step": 6044 + }, + { + "epoch": 0.17925451472288942, + "grad_norm": 0.15605948865413666, + "learning_rate": 0.0009296375364794492, + "loss": 2.7942, + "step": 6045 + }, + { + "epoch": 0.1792841680752009, + "grad_norm": 0.14333868026733398, + "learning_rate": 0.0009296134673623231, + "loss": 2.825, + "step": 6046 + }, + { + "epoch": 0.17931382142751237, + "grad_norm": 0.1397712528705597, + "learning_rate": 0.0009295893944408959, + "loss": 2.8191, + "step": 6047 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 0.1529347002506256, + "learning_rate": 0.0009295653177153811, + "loss": 2.7845, + "step": 6048 + }, + { + "epoch": 0.17937312813213535, + "grad_norm": 0.13999336957931519, + "learning_rate": 0.0009295412371859918, + "loss": 2.7824, + "step": 6049 + }, + { + "epoch": 0.17940278148444683, + "grad_norm": 0.13467958569526672, + "learning_rate": 0.0009295171528529407, + "loss": 2.8495, + "step": 6050 + }, + { + "epoch": 0.1794324348367583, + "grad_norm": 0.15321798622608185, + "learning_rate": 0.0009294930647164417, + "loss": 2.7977, + "step": 6051 + }, + { + "epoch": 0.17946208818906978, + "grad_norm": 0.14159846305847168, + "learning_rate": 0.000929468972776708, + "loss": 2.7947, + "step": 6052 + }, + { + "epoch": 0.17949174154138126, + "grad_norm": 0.14077578485012054, + "learning_rate": 0.0009294448770339526, + "loss": 2.7752, + "step": 6053 + }, + { + "epoch": 0.17952139489369273, + "grad_norm": 0.148127943277359, + "learning_rate": 0.0009294207774883892, + "loss": 2.8228, + "step": 6054 + }, + { + "epoch": 0.1795510482460042, + "grad_norm": 0.15710124373435974, + "learning_rate": 0.0009293966741402311, + "loss": 2.8301, + "step": 6055 + }, + { + "epoch": 0.17958070159831568, + "grad_norm": 0.18985766172409058, + "learning_rate": 0.0009293725669896918, + "loss": 2.792, + "step": 6056 + }, + { + "epoch": 0.17961035495062716, + "grad_norm": 0.20380206406116486, + "learning_rate": 0.0009293484560369847, + "loss": 2.8026, + "step": 6057 + }, + { + "epoch": 0.17964000830293864, + "grad_norm": 0.17349013686180115, + "learning_rate": 0.0009293243412823234, + "loss": 2.8002, + "step": 6058 + }, + { + "epoch": 0.1796696616552501, + "grad_norm": 0.1576664000749588, + "learning_rate": 0.0009293002227259211, + "loss": 2.8234, + "step": 6059 + }, + { + "epoch": 0.17969931500756162, + "grad_norm": 0.14094333350658417, + "learning_rate": 0.000929276100367992, + "loss": 2.8454, + "step": 6060 + }, + { + "epoch": 0.1797289683598731, + "grad_norm": 0.12599541246891022, + "learning_rate": 0.0009292519742087491, + "loss": 2.8139, + "step": 6061 + }, + { + "epoch": 0.17975862171218457, + "grad_norm": 0.1588713377714157, + "learning_rate": 0.0009292278442484063, + "loss": 2.8545, + "step": 6062 + }, + { + "epoch": 0.17978827506449604, + "grad_norm": 0.1691366732120514, + "learning_rate": 0.0009292037104871773, + "loss": 2.7587, + "step": 6063 + }, + { + "epoch": 0.17981792841680752, + "grad_norm": 0.15919756889343262, + "learning_rate": 0.0009291795729252759, + "loss": 2.7672, + "step": 6064 + }, + { + "epoch": 0.179847581769119, + "grad_norm": 0.1730828881263733, + "learning_rate": 0.0009291554315629156, + "loss": 2.7964, + "step": 6065 + }, + { + "epoch": 0.17987723512143047, + "grad_norm": 0.18328678607940674, + "learning_rate": 0.0009291312864003102, + "loss": 2.8285, + "step": 6066 + }, + { + "epoch": 0.17990688847374195, + "grad_norm": 0.18986457586288452, + "learning_rate": 0.0009291071374376736, + "loss": 2.842, + "step": 6067 + }, + { + "epoch": 0.17993654182605343, + "grad_norm": 0.16823270916938782, + "learning_rate": 0.0009290829846752197, + "loss": 2.8164, + "step": 6068 + }, + { + "epoch": 0.1799661951783649, + "grad_norm": 0.1555660218000412, + "learning_rate": 0.0009290588281131624, + "loss": 2.8165, + "step": 6069 + }, + { + "epoch": 0.1799958485306764, + "grad_norm": 0.16438202559947968, + "learning_rate": 0.0009290346677517155, + "loss": 2.7652, + "step": 6070 + }, + { + "epoch": 0.18002550188298788, + "grad_norm": 0.15171664953231812, + "learning_rate": 0.000929010503591093, + "loss": 2.7862, + "step": 6071 + }, + { + "epoch": 0.18005515523529936, + "grad_norm": 0.1273617446422577, + "learning_rate": 0.0009289863356315087, + "loss": 2.7972, + "step": 6072 + }, + { + "epoch": 0.18008480858761083, + "grad_norm": 0.14030516147613525, + "learning_rate": 0.0009289621638731769, + "loss": 2.8212, + "step": 6073 + }, + { + "epoch": 0.1801144619399223, + "grad_norm": 0.15076079964637756, + "learning_rate": 0.0009289379883163116, + "loss": 2.8092, + "step": 6074 + }, + { + "epoch": 0.1801441152922338, + "grad_norm": 0.14731547236442566, + "learning_rate": 0.0009289138089611267, + "loss": 2.8382, + "step": 6075 + }, + { + "epoch": 0.18017376864454526, + "grad_norm": 0.1484602391719818, + "learning_rate": 0.0009288896258078363, + "loss": 2.7979, + "step": 6076 + }, + { + "epoch": 0.18020342199685674, + "grad_norm": 0.15980184078216553, + "learning_rate": 0.0009288654388566546, + "loss": 2.8547, + "step": 6077 + }, + { + "epoch": 0.18023307534916821, + "grad_norm": 0.17443503439426422, + "learning_rate": 0.000928841248107796, + "loss": 2.8139, + "step": 6078 + }, + { + "epoch": 0.1802627287014797, + "grad_norm": 0.18405333161354065, + "learning_rate": 0.0009288170535614745, + "loss": 2.7917, + "step": 6079 + }, + { + "epoch": 0.1802923820537912, + "grad_norm": 0.18562203645706177, + "learning_rate": 0.0009287928552179043, + "loss": 2.8131, + "step": 6080 + }, + { + "epoch": 0.18032203540610267, + "grad_norm": 0.1485193818807602, + "learning_rate": 0.0009287686530772999, + "loss": 2.7984, + "step": 6081 + }, + { + "epoch": 0.18035168875841415, + "grad_norm": 0.13165311515331268, + "learning_rate": 0.0009287444471398754, + "loss": 2.8006, + "step": 6082 + }, + { + "epoch": 0.18038134211072562, + "grad_norm": 0.1428060084581375, + "learning_rate": 0.0009287202374058453, + "loss": 2.8048, + "step": 6083 + }, + { + "epoch": 0.1804109954630371, + "grad_norm": 0.15856482088565826, + "learning_rate": 0.0009286960238754238, + "loss": 2.8313, + "step": 6084 + }, + { + "epoch": 0.18044064881534858, + "grad_norm": 0.176906019449234, + "learning_rate": 0.0009286718065488253, + "loss": 2.8117, + "step": 6085 + }, + { + "epoch": 0.18047030216766005, + "grad_norm": 0.17448574304580688, + "learning_rate": 0.0009286475854262646, + "loss": 2.7866, + "step": 6086 + }, + { + "epoch": 0.18049995551997153, + "grad_norm": 0.17896954715251923, + "learning_rate": 0.0009286233605079559, + "loss": 2.8089, + "step": 6087 + }, + { + "epoch": 0.180529608872283, + "grad_norm": 0.16213257610797882, + "learning_rate": 0.0009285991317941138, + "loss": 2.7834, + "step": 6088 + }, + { + "epoch": 0.18055926222459448, + "grad_norm": 0.13380883634090424, + "learning_rate": 0.0009285748992849528, + "loss": 2.8057, + "step": 6089 + }, + { + "epoch": 0.18058891557690596, + "grad_norm": 0.15750418603420258, + "learning_rate": 0.0009285506629806875, + "loss": 2.8436, + "step": 6090 + }, + { + "epoch": 0.18061856892921746, + "grad_norm": 0.14035271108150482, + "learning_rate": 0.0009285264228815325, + "loss": 2.8324, + "step": 6091 + }, + { + "epoch": 0.18064822228152894, + "grad_norm": 0.14190392196178436, + "learning_rate": 0.0009285021789877024, + "loss": 2.8027, + "step": 6092 + }, + { + "epoch": 0.1806778756338404, + "grad_norm": 0.15126828849315643, + "learning_rate": 0.0009284779312994121, + "loss": 2.8184, + "step": 6093 + }, + { + "epoch": 0.1807075289861519, + "grad_norm": 0.1459982693195343, + "learning_rate": 0.0009284536798168762, + "loss": 2.8058, + "step": 6094 + }, + { + "epoch": 0.18073718233846336, + "grad_norm": 0.15874320268630981, + "learning_rate": 0.0009284294245403091, + "loss": 2.8298, + "step": 6095 + }, + { + "epoch": 0.18076683569077484, + "grad_norm": 0.16652511060237885, + "learning_rate": 0.0009284051654699262, + "loss": 2.822, + "step": 6096 + }, + { + "epoch": 0.18079648904308632, + "grad_norm": 0.18606379628181458, + "learning_rate": 0.0009283809026059419, + "loss": 2.794, + "step": 6097 + }, + { + "epoch": 0.1808261423953978, + "grad_norm": 0.19501258432865143, + "learning_rate": 0.0009283566359485713, + "loss": 2.8358, + "step": 6098 + }, + { + "epoch": 0.18085579574770927, + "grad_norm": 0.16847264766693115, + "learning_rate": 0.0009283323654980291, + "loss": 2.8063, + "step": 6099 + }, + { + "epoch": 0.18088544910002075, + "grad_norm": 0.1481509655714035, + "learning_rate": 0.0009283080912545303, + "loss": 2.8157, + "step": 6100 + }, + { + "epoch": 0.18091510245233225, + "grad_norm": 0.17671974003314972, + "learning_rate": 0.0009282838132182898, + "loss": 2.8073, + "step": 6101 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 0.16930992901325226, + "learning_rate": 0.0009282595313895225, + "loss": 2.8009, + "step": 6102 + }, + { + "epoch": 0.1809744091569552, + "grad_norm": 0.1663116067647934, + "learning_rate": 0.0009282352457684438, + "loss": 2.8022, + "step": 6103 + }, + { + "epoch": 0.18100406250926668, + "grad_norm": 0.14637745916843414, + "learning_rate": 0.0009282109563552683, + "loss": 2.8323, + "step": 6104 + }, + { + "epoch": 0.18103371586157815, + "grad_norm": 0.15195965766906738, + "learning_rate": 0.0009281866631502114, + "loss": 2.8135, + "step": 6105 + }, + { + "epoch": 0.18106336921388963, + "grad_norm": 0.16517126560211182, + "learning_rate": 0.0009281623661534879, + "loss": 2.8219, + "step": 6106 + }, + { + "epoch": 0.1810930225662011, + "grad_norm": 0.15485529601573944, + "learning_rate": 0.0009281380653653133, + "loss": 2.8485, + "step": 6107 + }, + { + "epoch": 0.18112267591851258, + "grad_norm": 0.14238813519477844, + "learning_rate": 0.0009281137607859028, + "loss": 2.7765, + "step": 6108 + }, + { + "epoch": 0.18115232927082406, + "grad_norm": 0.15171730518341064, + "learning_rate": 0.0009280894524154713, + "loss": 2.7856, + "step": 6109 + }, + { + "epoch": 0.18118198262313553, + "grad_norm": 0.13541314005851746, + "learning_rate": 0.0009280651402542342, + "loss": 2.8313, + "step": 6110 + }, + { + "epoch": 0.181211635975447, + "grad_norm": 0.13226193189620972, + "learning_rate": 0.0009280408243024068, + "loss": 2.7817, + "step": 6111 + }, + { + "epoch": 0.18124128932775851, + "grad_norm": 0.14365120232105255, + "learning_rate": 0.0009280165045602045, + "loss": 2.7818, + "step": 6112 + }, + { + "epoch": 0.18127094268007, + "grad_norm": 0.12874175608158112, + "learning_rate": 0.0009279921810278424, + "loss": 2.7897, + "step": 6113 + }, + { + "epoch": 0.18130059603238147, + "grad_norm": 0.13446082174777985, + "learning_rate": 0.0009279678537055363, + "loss": 2.8209, + "step": 6114 + }, + { + "epoch": 0.18133024938469294, + "grad_norm": 0.15968620777130127, + "learning_rate": 0.0009279435225935012, + "loss": 2.7759, + "step": 6115 + }, + { + "epoch": 0.18135990273700442, + "grad_norm": 0.16272199153900146, + "learning_rate": 0.0009279191876919528, + "loss": 2.7942, + "step": 6116 + }, + { + "epoch": 0.1813895560893159, + "grad_norm": 0.15077349543571472, + "learning_rate": 0.0009278948490011068, + "loss": 2.7848, + "step": 6117 + }, + { + "epoch": 0.18141920944162737, + "grad_norm": 0.14427252113819122, + "learning_rate": 0.0009278705065211781, + "loss": 2.7705, + "step": 6118 + }, + { + "epoch": 0.18144886279393885, + "grad_norm": 0.15308167040348053, + "learning_rate": 0.0009278461602523828, + "loss": 2.7855, + "step": 6119 + }, + { + "epoch": 0.18147851614625032, + "grad_norm": 0.17262181639671326, + "learning_rate": 0.0009278218101949364, + "loss": 2.8008, + "step": 6120 + }, + { + "epoch": 0.1815081694985618, + "grad_norm": 0.16255773603916168, + "learning_rate": 0.0009277974563490543, + "loss": 2.8021, + "step": 6121 + }, + { + "epoch": 0.1815378228508733, + "grad_norm": 0.14801445603370667, + "learning_rate": 0.0009277730987149525, + "loss": 2.8064, + "step": 6122 + }, + { + "epoch": 0.18156747620318478, + "grad_norm": 0.15280698239803314, + "learning_rate": 0.0009277487372928462, + "loss": 2.8176, + "step": 6123 + }, + { + "epoch": 0.18159712955549626, + "grad_norm": 0.1549866497516632, + "learning_rate": 0.0009277243720829515, + "loss": 2.7749, + "step": 6124 + }, + { + "epoch": 0.18162678290780773, + "grad_norm": 0.15803420543670654, + "learning_rate": 0.0009277000030854841, + "loss": 2.7812, + "step": 6125 + }, + { + "epoch": 0.1816564362601192, + "grad_norm": 0.14338034391403198, + "learning_rate": 0.0009276756303006597, + "loss": 2.8059, + "step": 6126 + }, + { + "epoch": 0.18168608961243068, + "grad_norm": 0.12482702732086182, + "learning_rate": 0.0009276512537286943, + "loss": 2.8126, + "step": 6127 + }, + { + "epoch": 0.18171574296474216, + "grad_norm": 0.1392524391412735, + "learning_rate": 0.0009276268733698034, + "loss": 2.843, + "step": 6128 + }, + { + "epoch": 0.18174539631705364, + "grad_norm": 0.14341309666633606, + "learning_rate": 0.0009276024892242034, + "loss": 2.8076, + "step": 6129 + }, + { + "epoch": 0.1817750496693651, + "grad_norm": 0.13397158682346344, + "learning_rate": 0.0009275781012921099, + "loss": 2.8061, + "step": 6130 + }, + { + "epoch": 0.1818047030216766, + "grad_norm": 0.12418258190155029, + "learning_rate": 0.0009275537095737389, + "loss": 2.8311, + "step": 6131 + }, + { + "epoch": 0.1818343563739881, + "grad_norm": 0.14799514412879944, + "learning_rate": 0.0009275293140693064, + "loss": 2.8119, + "step": 6132 + }, + { + "epoch": 0.18186400972629957, + "grad_norm": 0.17451255023479462, + "learning_rate": 0.0009275049147790285, + "loss": 2.8002, + "step": 6133 + }, + { + "epoch": 0.18189366307861105, + "grad_norm": 0.19897966086864471, + "learning_rate": 0.0009274805117031211, + "loss": 2.7923, + "step": 6134 + }, + { + "epoch": 0.18192331643092252, + "grad_norm": 0.19442011415958405, + "learning_rate": 0.0009274561048418004, + "loss": 2.8117, + "step": 6135 + }, + { + "epoch": 0.181952969783234, + "grad_norm": 0.1947750449180603, + "learning_rate": 0.0009274316941952825, + "loss": 2.8173, + "step": 6136 + }, + { + "epoch": 0.18198262313554547, + "grad_norm": 0.21902503073215485, + "learning_rate": 0.0009274072797637837, + "loss": 2.8137, + "step": 6137 + }, + { + "epoch": 0.18201227648785695, + "grad_norm": 0.2190064638853073, + "learning_rate": 0.00092738286154752, + "loss": 2.8581, + "step": 6138 + }, + { + "epoch": 0.18204192984016843, + "grad_norm": 0.17944495379924774, + "learning_rate": 0.0009273584395467077, + "loss": 2.8172, + "step": 6139 + }, + { + "epoch": 0.1820715831924799, + "grad_norm": 0.15542496740818024, + "learning_rate": 0.0009273340137615631, + "loss": 2.8189, + "step": 6140 + }, + { + "epoch": 0.18210123654479138, + "grad_norm": 0.1477007269859314, + "learning_rate": 0.0009273095841923025, + "loss": 2.8196, + "step": 6141 + }, + { + "epoch": 0.18213088989710285, + "grad_norm": 0.13685934245586395, + "learning_rate": 0.000927285150839142, + "loss": 2.7698, + "step": 6142 + }, + { + "epoch": 0.18216054324941436, + "grad_norm": 0.16759075224399567, + "learning_rate": 0.0009272607137022983, + "loss": 2.7863, + "step": 6143 + }, + { + "epoch": 0.18219019660172583, + "grad_norm": 0.16615968942642212, + "learning_rate": 0.0009272362727819877, + "loss": 2.7888, + "step": 6144 + }, + { + "epoch": 0.1822198499540373, + "grad_norm": 0.16843821108341217, + "learning_rate": 0.0009272118280784263, + "loss": 2.8041, + "step": 6145 + }, + { + "epoch": 0.1822495033063488, + "grad_norm": 0.1660458743572235, + "learning_rate": 0.0009271873795918311, + "loss": 2.7927, + "step": 6146 + }, + { + "epoch": 0.18227915665866026, + "grad_norm": 0.14428728818893433, + "learning_rate": 0.0009271629273224182, + "loss": 2.8117, + "step": 6147 + }, + { + "epoch": 0.18230881001097174, + "grad_norm": 0.1628129780292511, + "learning_rate": 0.0009271384712704043, + "loss": 2.8383, + "step": 6148 + }, + { + "epoch": 0.18233846336328322, + "grad_norm": 0.15507225692272186, + "learning_rate": 0.0009271140114360059, + "loss": 2.7987, + "step": 6149 + }, + { + "epoch": 0.1823681167155947, + "grad_norm": 0.1681796908378601, + "learning_rate": 0.0009270895478194396, + "loss": 2.792, + "step": 6150 + }, + { + "epoch": 0.18239777006790617, + "grad_norm": 0.18378311395645142, + "learning_rate": 0.0009270650804209222, + "loss": 2.7945, + "step": 6151 + }, + { + "epoch": 0.18242742342021764, + "grad_norm": 0.20763300359249115, + "learning_rate": 0.00092704060924067, + "loss": 2.8174, + "step": 6152 + }, + { + "epoch": 0.18245707677252915, + "grad_norm": 0.16373227536678314, + "learning_rate": 0.0009270161342789, + "loss": 2.8113, + "step": 6153 + }, + { + "epoch": 0.18248673012484062, + "grad_norm": 0.16445782780647278, + "learning_rate": 0.0009269916555358289, + "loss": 2.8087, + "step": 6154 + }, + { + "epoch": 0.1825163834771521, + "grad_norm": 0.17371542751789093, + "learning_rate": 0.0009269671730116732, + "loss": 2.7895, + "step": 6155 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 0.15406586229801178, + "learning_rate": 0.00092694268670665, + "loss": 2.8136, + "step": 6156 + }, + { + "epoch": 0.18257569018177505, + "grad_norm": 0.15672627091407776, + "learning_rate": 0.0009269181966209761, + "loss": 2.7699, + "step": 6157 + }, + { + "epoch": 0.18260534353408653, + "grad_norm": 0.15063421428203583, + "learning_rate": 0.0009268937027548683, + "loss": 2.8167, + "step": 6158 + }, + { + "epoch": 0.182634996886398, + "grad_norm": 0.14440353214740753, + "learning_rate": 0.0009268692051085433, + "loss": 2.798, + "step": 6159 + }, + { + "epoch": 0.18266465023870948, + "grad_norm": 0.13951805233955383, + "learning_rate": 0.0009268447036822183, + "loss": 2.8024, + "step": 6160 + }, + { + "epoch": 0.18269430359102096, + "grad_norm": 0.14620792865753174, + "learning_rate": 0.0009268201984761102, + "loss": 2.8253, + "step": 6161 + }, + { + "epoch": 0.18272395694333243, + "grad_norm": 0.13426876068115234, + "learning_rate": 0.0009267956894904361, + "loss": 2.7735, + "step": 6162 + }, + { + "epoch": 0.1827536102956439, + "grad_norm": 0.14605803787708282, + "learning_rate": 0.0009267711767254128, + "loss": 2.8222, + "step": 6163 + }, + { + "epoch": 0.1827832636479554, + "grad_norm": 0.17760123312473297, + "learning_rate": 0.0009267466601812575, + "loss": 2.7855, + "step": 6164 + }, + { + "epoch": 0.1828129170002669, + "grad_norm": 0.17240120470523834, + "learning_rate": 0.0009267221398581873, + "loss": 2.7898, + "step": 6165 + }, + { + "epoch": 0.18284257035257837, + "grad_norm": 0.16974976658821106, + "learning_rate": 0.0009266976157564191, + "loss": 2.8345, + "step": 6166 + }, + { + "epoch": 0.18287222370488984, + "grad_norm": 0.1646159291267395, + "learning_rate": 0.0009266730878761705, + "loss": 2.8085, + "step": 6167 + }, + { + "epoch": 0.18290187705720132, + "grad_norm": 0.16438409686088562, + "learning_rate": 0.0009266485562176583, + "loss": 2.7985, + "step": 6168 + }, + { + "epoch": 0.1829315304095128, + "grad_norm": 0.18583185970783234, + "learning_rate": 0.0009266240207811001, + "loss": 2.7964, + "step": 6169 + }, + { + "epoch": 0.18296118376182427, + "grad_norm": 0.19748948514461517, + "learning_rate": 0.0009265994815667129, + "loss": 2.8312, + "step": 6170 + }, + { + "epoch": 0.18299083711413575, + "grad_norm": 0.19293230772018433, + "learning_rate": 0.0009265749385747139, + "loss": 2.8191, + "step": 6171 + }, + { + "epoch": 0.18302049046644722, + "grad_norm": 0.19490045309066772, + "learning_rate": 0.0009265503918053209, + "loss": 2.7996, + "step": 6172 + }, + { + "epoch": 0.1830501438187587, + "grad_norm": 0.19535569846630096, + "learning_rate": 0.0009265258412587507, + "loss": 2.8069, + "step": 6173 + }, + { + "epoch": 0.1830797971710702, + "grad_norm": 0.1851467788219452, + "learning_rate": 0.0009265012869352212, + "loss": 2.7976, + "step": 6174 + }, + { + "epoch": 0.18310945052338168, + "grad_norm": 0.15591588616371155, + "learning_rate": 0.0009264767288349494, + "loss": 2.8474, + "step": 6175 + }, + { + "epoch": 0.18313910387569315, + "grad_norm": 0.1463085412979126, + "learning_rate": 0.000926452166958153, + "loss": 2.8076, + "step": 6176 + }, + { + "epoch": 0.18316875722800463, + "grad_norm": 0.1507694572210312, + "learning_rate": 0.0009264276013050494, + "loss": 2.8172, + "step": 6177 + }, + { + "epoch": 0.1831984105803161, + "grad_norm": 0.15527062118053436, + "learning_rate": 0.0009264030318758562, + "loss": 2.8093, + "step": 6178 + }, + { + "epoch": 0.18322806393262758, + "grad_norm": 0.12126481533050537, + "learning_rate": 0.0009263784586707912, + "loss": 2.8235, + "step": 6179 + }, + { + "epoch": 0.18325771728493906, + "grad_norm": 0.14216969907283783, + "learning_rate": 0.0009263538816900716, + "loss": 2.8226, + "step": 6180 + }, + { + "epoch": 0.18328737063725054, + "grad_norm": 0.1502763330936432, + "learning_rate": 0.0009263293009339151, + "loss": 2.7619, + "step": 6181 + }, + { + "epoch": 0.183317023989562, + "grad_norm": 0.14281809329986572, + "learning_rate": 0.0009263047164025396, + "loss": 2.8398, + "step": 6182 + }, + { + "epoch": 0.1833466773418735, + "grad_norm": 0.1388070434331894, + "learning_rate": 0.0009262801280961626, + "loss": 2.8296, + "step": 6183 + }, + { + "epoch": 0.183376330694185, + "grad_norm": 0.14650437235832214, + "learning_rate": 0.000926255536015002, + "loss": 2.8335, + "step": 6184 + }, + { + "epoch": 0.18340598404649647, + "grad_norm": 0.14434783160686493, + "learning_rate": 0.0009262309401592753, + "loss": 2.7786, + "step": 6185 + }, + { + "epoch": 0.18343563739880794, + "grad_norm": 0.13142715394496918, + "learning_rate": 0.0009262063405292005, + "loss": 2.7957, + "step": 6186 + }, + { + "epoch": 0.18346529075111942, + "grad_norm": 0.15287119150161743, + "learning_rate": 0.0009261817371249955, + "loss": 2.8183, + "step": 6187 + }, + { + "epoch": 0.1834949441034309, + "grad_norm": 0.17743588984012604, + "learning_rate": 0.0009261571299468781, + "loss": 2.7747, + "step": 6188 + }, + { + "epoch": 0.18352459745574237, + "grad_norm": 0.19457659125328064, + "learning_rate": 0.000926132518995066, + "loss": 2.8162, + "step": 6189 + }, + { + "epoch": 0.18355425080805385, + "grad_norm": 0.23314721882343292, + "learning_rate": 0.0009261079042697773, + "loss": 2.8095, + "step": 6190 + }, + { + "epoch": 0.18358390416036532, + "grad_norm": 0.2287968099117279, + "learning_rate": 0.00092608328577123, + "loss": 2.827, + "step": 6191 + }, + { + "epoch": 0.1836135575126768, + "grad_norm": 0.19777965545654297, + "learning_rate": 0.0009260586634996422, + "loss": 2.7957, + "step": 6192 + }, + { + "epoch": 0.18364321086498828, + "grad_norm": 0.16516351699829102, + "learning_rate": 0.0009260340374552316, + "loss": 2.8363, + "step": 6193 + }, + { + "epoch": 0.18367286421729975, + "grad_norm": 0.15941013395786285, + "learning_rate": 0.0009260094076382166, + "loss": 2.8139, + "step": 6194 + }, + { + "epoch": 0.18370251756961126, + "grad_norm": 0.18699313700199127, + "learning_rate": 0.0009259847740488152, + "loss": 2.798, + "step": 6195 + }, + { + "epoch": 0.18373217092192273, + "grad_norm": 0.1684592068195343, + "learning_rate": 0.0009259601366872455, + "loss": 2.7946, + "step": 6196 + }, + { + "epoch": 0.1837618242742342, + "grad_norm": 0.177565798163414, + "learning_rate": 0.0009259354955537256, + "loss": 2.8399, + "step": 6197 + }, + { + "epoch": 0.18379147762654569, + "grad_norm": 0.15391963720321655, + "learning_rate": 0.0009259108506484738, + "loss": 2.7956, + "step": 6198 + }, + { + "epoch": 0.18382113097885716, + "grad_norm": 0.15332816541194916, + "learning_rate": 0.0009258862019717082, + "loss": 2.8117, + "step": 6199 + }, + { + "epoch": 0.18385078433116864, + "grad_norm": 0.14069406688213348, + "learning_rate": 0.0009258615495236474, + "loss": 2.7641, + "step": 6200 + }, + { + "epoch": 0.1838804376834801, + "grad_norm": 0.11909694969654083, + "learning_rate": 0.0009258368933045093, + "loss": 2.8001, + "step": 6201 + }, + { + "epoch": 0.1839100910357916, + "grad_norm": 0.13905659317970276, + "learning_rate": 0.0009258122333145126, + "loss": 2.822, + "step": 6202 + }, + { + "epoch": 0.18393974438810307, + "grad_norm": 0.1376218944787979, + "learning_rate": 0.0009257875695538754, + "loss": 2.8258, + "step": 6203 + }, + { + "epoch": 0.18396939774041454, + "grad_norm": 0.12396934628486633, + "learning_rate": 0.0009257629020228163, + "loss": 2.8161, + "step": 6204 + }, + { + "epoch": 0.18399905109272605, + "grad_norm": 0.12610450387001038, + "learning_rate": 0.0009257382307215533, + "loss": 2.8206, + "step": 6205 + }, + { + "epoch": 0.18402870444503752, + "grad_norm": 0.14111681282520294, + "learning_rate": 0.0009257135556503054, + "loss": 2.7847, + "step": 6206 + }, + { + "epoch": 0.184058357797349, + "grad_norm": 0.14869415760040283, + "learning_rate": 0.0009256888768092908, + "loss": 2.8312, + "step": 6207 + }, + { + "epoch": 0.18408801114966047, + "grad_norm": 0.15041621029376984, + "learning_rate": 0.0009256641941987283, + "loss": 2.8236, + "step": 6208 + }, + { + "epoch": 0.18411766450197195, + "grad_norm": 0.15473684668540955, + "learning_rate": 0.0009256395078188362, + "loss": 2.7791, + "step": 6209 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 0.15114064514636993, + "learning_rate": 0.0009256148176698332, + "loss": 2.7802, + "step": 6210 + }, + { + "epoch": 0.1841769712065949, + "grad_norm": 0.12290024012327194, + "learning_rate": 0.0009255901237519377, + "loss": 2.8195, + "step": 6211 + }, + { + "epoch": 0.18420662455890638, + "grad_norm": 0.1252467930316925, + "learning_rate": 0.000925565426065369, + "loss": 2.8125, + "step": 6212 + }, + { + "epoch": 0.18423627791121786, + "grad_norm": 0.16790665686130524, + "learning_rate": 0.0009255407246103451, + "loss": 2.8245, + "step": 6213 + }, + { + "epoch": 0.18426593126352933, + "grad_norm": 0.17910592257976532, + "learning_rate": 0.0009255160193870853, + "loss": 2.8275, + "step": 6214 + }, + { + "epoch": 0.1842955846158408, + "grad_norm": 0.17759811878204346, + "learning_rate": 0.0009254913103958079, + "loss": 2.815, + "step": 6215 + }, + { + "epoch": 0.1843252379681523, + "grad_norm": 0.18921563029289246, + "learning_rate": 0.000925466597636732, + "loss": 2.8078, + "step": 6216 + }, + { + "epoch": 0.1843548913204638, + "grad_norm": 0.2200654000043869, + "learning_rate": 0.0009254418811100763, + "loss": 2.8401, + "step": 6217 + }, + { + "epoch": 0.18438454467277526, + "grad_norm": 0.20734377205371857, + "learning_rate": 0.0009254171608160598, + "loss": 2.7997, + "step": 6218 + }, + { + "epoch": 0.18441419802508674, + "grad_norm": 0.17060166597366333, + "learning_rate": 0.0009253924367549013, + "loss": 2.7818, + "step": 6219 + }, + { + "epoch": 0.18444385137739822, + "grad_norm": 0.1840427666902542, + "learning_rate": 0.0009253677089268198, + "loss": 2.7952, + "step": 6220 + }, + { + "epoch": 0.1844735047297097, + "grad_norm": 0.19064615666866302, + "learning_rate": 0.0009253429773320341, + "loss": 2.8144, + "step": 6221 + }, + { + "epoch": 0.18450315808202117, + "grad_norm": 0.19106362760066986, + "learning_rate": 0.0009253182419707633, + "loss": 2.8142, + "step": 6222 + }, + { + "epoch": 0.18453281143433264, + "grad_norm": 0.18321433663368225, + "learning_rate": 0.0009252935028432266, + "loss": 2.8001, + "step": 6223 + }, + { + "epoch": 0.18456246478664412, + "grad_norm": 0.1809099167585373, + "learning_rate": 0.0009252687599496427, + "loss": 2.8066, + "step": 6224 + }, + { + "epoch": 0.1845921181389556, + "grad_norm": 0.14670507609844208, + "learning_rate": 0.0009252440132902312, + "loss": 2.8005, + "step": 6225 + }, + { + "epoch": 0.1846217714912671, + "grad_norm": 0.1644890457391739, + "learning_rate": 0.000925219262865211, + "loss": 2.8121, + "step": 6226 + }, + { + "epoch": 0.18465142484357858, + "grad_norm": 0.11721111088991165, + "learning_rate": 0.000925194508674801, + "loss": 2.7873, + "step": 6227 + }, + { + "epoch": 0.18468107819589005, + "grad_norm": 0.13328681886196136, + "learning_rate": 0.0009251697507192208, + "loss": 2.8265, + "step": 6228 + }, + { + "epoch": 0.18471073154820153, + "grad_norm": 0.12864713370800018, + "learning_rate": 0.0009251449889986894, + "loss": 2.7976, + "step": 6229 + }, + { + "epoch": 0.184740384900513, + "grad_norm": 0.12172053009271622, + "learning_rate": 0.0009251202235134262, + "loss": 2.7796, + "step": 6230 + }, + { + "epoch": 0.18477003825282448, + "grad_norm": 0.13603532314300537, + "learning_rate": 0.0009250954542636505, + "loss": 2.7853, + "step": 6231 + }, + { + "epoch": 0.18479969160513596, + "grad_norm": 0.14297065138816833, + "learning_rate": 0.0009250706812495815, + "loss": 2.8408, + "step": 6232 + }, + { + "epoch": 0.18482934495744743, + "grad_norm": 0.17857441306114197, + "learning_rate": 0.0009250459044714387, + "loss": 2.7966, + "step": 6233 + }, + { + "epoch": 0.1848589983097589, + "grad_norm": 0.20165584981441498, + "learning_rate": 0.0009250211239294414, + "loss": 2.8047, + "step": 6234 + }, + { + "epoch": 0.18488865166207039, + "grad_norm": 0.20191748440265656, + "learning_rate": 0.000924996339623809, + "loss": 2.8476, + "step": 6235 + }, + { + "epoch": 0.1849183050143819, + "grad_norm": 0.22198522090911865, + "learning_rate": 0.0009249715515547612, + "loss": 2.7959, + "step": 6236 + }, + { + "epoch": 0.18494795836669337, + "grad_norm": 0.21355046331882477, + "learning_rate": 0.0009249467597225174, + "loss": 2.8141, + "step": 6237 + }, + { + "epoch": 0.18497761171900484, + "grad_norm": 0.2043946236371994, + "learning_rate": 0.000924921964127297, + "loss": 2.8113, + "step": 6238 + }, + { + "epoch": 0.18500726507131632, + "grad_norm": 0.19149918854236603, + "learning_rate": 0.0009248971647693199, + "loss": 2.8104, + "step": 6239 + }, + { + "epoch": 0.1850369184236278, + "grad_norm": 0.2034929096698761, + "learning_rate": 0.0009248723616488053, + "loss": 2.7819, + "step": 6240 + }, + { + "epoch": 0.18506657177593927, + "grad_norm": 0.20034486055374146, + "learning_rate": 0.000924847554765973, + "loss": 2.8012, + "step": 6241 + }, + { + "epoch": 0.18509622512825075, + "grad_norm": 0.1560385525226593, + "learning_rate": 0.0009248227441210426, + "loss": 2.8119, + "step": 6242 + }, + { + "epoch": 0.18512587848056222, + "grad_norm": 0.14644281566143036, + "learning_rate": 0.000924797929714234, + "loss": 2.799, + "step": 6243 + }, + { + "epoch": 0.1851555318328737, + "grad_norm": 0.14958390593528748, + "learning_rate": 0.0009247731115457667, + "loss": 2.8057, + "step": 6244 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.13846224546432495, + "learning_rate": 0.0009247482896158608, + "loss": 2.7836, + "step": 6245 + }, + { + "epoch": 0.18521483853749665, + "grad_norm": 0.15999072790145874, + "learning_rate": 0.0009247234639247357, + "loss": 2.8493, + "step": 6246 + }, + { + "epoch": 0.18524449188980815, + "grad_norm": 0.15141446888446808, + "learning_rate": 0.0009246986344726114, + "loss": 2.765, + "step": 6247 + }, + { + "epoch": 0.18527414524211963, + "grad_norm": 0.12497622519731522, + "learning_rate": 0.000924673801259708, + "loss": 2.7741, + "step": 6248 + }, + { + "epoch": 0.1853037985944311, + "grad_norm": 0.14079061150550842, + "learning_rate": 0.000924648964286245, + "loss": 2.8386, + "step": 6249 + }, + { + "epoch": 0.18533345194674258, + "grad_norm": 0.1259281039237976, + "learning_rate": 0.0009246241235524427, + "loss": 2.8125, + "step": 6250 + }, + { + "epoch": 0.18536310529905406, + "grad_norm": 0.1346503645181656, + "learning_rate": 0.0009245992790585207, + "loss": 2.797, + "step": 6251 + }, + { + "epoch": 0.18539275865136554, + "grad_norm": 0.1421063393354416, + "learning_rate": 0.0009245744308046993, + "loss": 2.8039, + "step": 6252 + }, + { + "epoch": 0.185422412003677, + "grad_norm": 0.14714643359184265, + "learning_rate": 0.0009245495787911985, + "loss": 2.8084, + "step": 6253 + }, + { + "epoch": 0.1854520653559885, + "grad_norm": 0.1638144999742508, + "learning_rate": 0.0009245247230182382, + "loss": 2.7835, + "step": 6254 + }, + { + "epoch": 0.18548171870829996, + "grad_norm": 0.1842234581708908, + "learning_rate": 0.0009244998634860386, + "loss": 2.8158, + "step": 6255 + }, + { + "epoch": 0.18551137206061144, + "grad_norm": 0.21126680076122284, + "learning_rate": 0.00092447500019482, + "loss": 2.8275, + "step": 6256 + }, + { + "epoch": 0.18554102541292294, + "grad_norm": 0.21404996514320374, + "learning_rate": 0.0009244501331448023, + "loss": 2.8157, + "step": 6257 + }, + { + "epoch": 0.18557067876523442, + "grad_norm": 0.17353907227516174, + "learning_rate": 0.0009244252623362058, + "loss": 2.7595, + "step": 6258 + }, + { + "epoch": 0.1856003321175459, + "grad_norm": 0.14658032357692719, + "learning_rate": 0.0009244003877692509, + "loss": 2.8041, + "step": 6259 + }, + { + "epoch": 0.18562998546985737, + "grad_norm": 0.18031209707260132, + "learning_rate": 0.0009243755094441575, + "loss": 2.7976, + "step": 6260 + }, + { + "epoch": 0.18565963882216885, + "grad_norm": 0.15429289638996124, + "learning_rate": 0.0009243506273611463, + "loss": 2.7859, + "step": 6261 + }, + { + "epoch": 0.18568929217448032, + "grad_norm": 0.13673710823059082, + "learning_rate": 0.0009243257415204373, + "loss": 2.7905, + "step": 6262 + }, + { + "epoch": 0.1857189455267918, + "grad_norm": 0.14796030521392822, + "learning_rate": 0.0009243008519222511, + "loss": 2.846, + "step": 6263 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 0.15920425951480865, + "learning_rate": 0.0009242759585668081, + "loss": 2.7938, + "step": 6264 + }, + { + "epoch": 0.18577825223141475, + "grad_norm": 0.17053866386413574, + "learning_rate": 0.0009242510614543285, + "loss": 2.8011, + "step": 6265 + }, + { + "epoch": 0.18580790558372623, + "grad_norm": 0.14654774963855743, + "learning_rate": 0.000924226160585033, + "loss": 2.7948, + "step": 6266 + }, + { + "epoch": 0.1858375589360377, + "grad_norm": 0.12925848364830017, + "learning_rate": 0.0009242012559591422, + "loss": 2.7724, + "step": 6267 + }, + { + "epoch": 0.1858672122883492, + "grad_norm": 0.12732622027397156, + "learning_rate": 0.0009241763475768763, + "loss": 2.8426, + "step": 6268 + }, + { + "epoch": 0.18589686564066069, + "grad_norm": 0.12133900821208954, + "learning_rate": 0.0009241514354384559, + "loss": 2.7614, + "step": 6269 + }, + { + "epoch": 0.18592651899297216, + "grad_norm": 0.12478872388601303, + "learning_rate": 0.000924126519544102, + "loss": 2.7892, + "step": 6270 + }, + { + "epoch": 0.18595617234528364, + "grad_norm": 0.14018097519874573, + "learning_rate": 0.0009241015998940347, + "loss": 2.786, + "step": 6271 + }, + { + "epoch": 0.18598582569759511, + "grad_norm": 0.11530163884162903, + "learning_rate": 0.0009240766764884752, + "loss": 2.8202, + "step": 6272 + }, + { + "epoch": 0.1860154790499066, + "grad_norm": 0.13529759645462036, + "learning_rate": 0.0009240517493276438, + "loss": 2.7826, + "step": 6273 + }, + { + "epoch": 0.18604513240221807, + "grad_norm": 0.14925885200500488, + "learning_rate": 0.0009240268184117614, + "loss": 2.8094, + "step": 6274 + }, + { + "epoch": 0.18607478575452954, + "grad_norm": 0.15596114099025726, + "learning_rate": 0.0009240018837410488, + "loss": 2.7899, + "step": 6275 + }, + { + "epoch": 0.18610443910684102, + "grad_norm": 0.17215554416179657, + "learning_rate": 0.0009239769453157266, + "loss": 2.812, + "step": 6276 + }, + { + "epoch": 0.1861340924591525, + "grad_norm": 0.16240721940994263, + "learning_rate": 0.0009239520031360158, + "loss": 2.7997, + "step": 6277 + }, + { + "epoch": 0.186163745811464, + "grad_norm": 0.14768484234809875, + "learning_rate": 0.0009239270572021374, + "loss": 2.7645, + "step": 6278 + }, + { + "epoch": 0.18619339916377547, + "grad_norm": 0.1487039029598236, + "learning_rate": 0.0009239021075143119, + "loss": 2.7827, + "step": 6279 + }, + { + "epoch": 0.18622305251608695, + "grad_norm": 0.16191142797470093, + "learning_rate": 0.0009238771540727608, + "loss": 2.8128, + "step": 6280 + }, + { + "epoch": 0.18625270586839843, + "grad_norm": 0.16240869462490082, + "learning_rate": 0.0009238521968777045, + "loss": 2.8203, + "step": 6281 + }, + { + "epoch": 0.1862823592207099, + "grad_norm": 0.15796810388565063, + "learning_rate": 0.0009238272359293643, + "loss": 2.7883, + "step": 6282 + }, + { + "epoch": 0.18631201257302138, + "grad_norm": 0.13917551934719086, + "learning_rate": 0.0009238022712279611, + "loss": 2.7853, + "step": 6283 + }, + { + "epoch": 0.18634166592533286, + "grad_norm": 0.1411215215921402, + "learning_rate": 0.0009237773027737162, + "loss": 2.7887, + "step": 6284 + }, + { + "epoch": 0.18637131927764433, + "grad_norm": 0.13012810051441193, + "learning_rate": 0.0009237523305668505, + "loss": 2.8048, + "step": 6285 + }, + { + "epoch": 0.1864009726299558, + "grad_norm": 0.1603723019361496, + "learning_rate": 0.0009237273546075851, + "loss": 2.8186, + "step": 6286 + }, + { + "epoch": 0.18643062598226728, + "grad_norm": 0.17632536590099335, + "learning_rate": 0.0009237023748961412, + "loss": 2.8261, + "step": 6287 + }, + { + "epoch": 0.1864602793345788, + "grad_norm": 0.16870611906051636, + "learning_rate": 0.0009236773914327401, + "loss": 2.805, + "step": 6288 + }, + { + "epoch": 0.18648993268689026, + "grad_norm": 0.1632547378540039, + "learning_rate": 0.0009236524042176031, + "loss": 2.8185, + "step": 6289 + }, + { + "epoch": 0.18651958603920174, + "grad_norm": 0.17106662690639496, + "learning_rate": 0.0009236274132509513, + "loss": 2.788, + "step": 6290 + }, + { + "epoch": 0.18654923939151322, + "grad_norm": 0.18790671229362488, + "learning_rate": 0.0009236024185330058, + "loss": 2.8206, + "step": 6291 + }, + { + "epoch": 0.1865788927438247, + "grad_norm": 0.1911371350288391, + "learning_rate": 0.0009235774200639883, + "loss": 2.7901, + "step": 6292 + }, + { + "epoch": 0.18660854609613617, + "grad_norm": 0.15131939947605133, + "learning_rate": 0.0009235524178441202, + "loss": 2.8036, + "step": 6293 + }, + { + "epoch": 0.18663819944844764, + "grad_norm": 0.1558203101158142, + "learning_rate": 0.0009235274118736228, + "loss": 2.7893, + "step": 6294 + }, + { + "epoch": 0.18666785280075912, + "grad_norm": 0.16228535771369934, + "learning_rate": 0.0009235024021527171, + "loss": 2.8309, + "step": 6295 + }, + { + "epoch": 0.1866975061530706, + "grad_norm": 0.17881318926811218, + "learning_rate": 0.0009234773886816252, + "loss": 2.7892, + "step": 6296 + }, + { + "epoch": 0.18672715950538207, + "grad_norm": 0.15792474150657654, + "learning_rate": 0.0009234523714605683, + "loss": 2.8409, + "step": 6297 + }, + { + "epoch": 0.18675681285769355, + "grad_norm": 0.15587711334228516, + "learning_rate": 0.0009234273504897678, + "loss": 2.8076, + "step": 6298 + }, + { + "epoch": 0.18678646621000505, + "grad_norm": 0.16455797851085663, + "learning_rate": 0.0009234023257694457, + "loss": 2.7927, + "step": 6299 + }, + { + "epoch": 0.18681611956231653, + "grad_norm": 0.21620579063892365, + "learning_rate": 0.0009233772972998232, + "loss": 2.7954, + "step": 6300 + }, + { + "epoch": 0.186845772914628, + "grad_norm": 0.23545484244823456, + "learning_rate": 0.0009233522650811221, + "loss": 2.8027, + "step": 6301 + }, + { + "epoch": 0.18687542626693948, + "grad_norm": 0.21904081106185913, + "learning_rate": 0.0009233272291135639, + "loss": 2.8232, + "step": 6302 + }, + { + "epoch": 0.18690507961925096, + "grad_norm": 0.20429451763629913, + "learning_rate": 0.0009233021893973706, + "loss": 2.8207, + "step": 6303 + }, + { + "epoch": 0.18693473297156243, + "grad_norm": 0.19810296595096588, + "learning_rate": 0.0009232771459327636, + "loss": 2.8216, + "step": 6304 + }, + { + "epoch": 0.1869643863238739, + "grad_norm": 0.16509805619716644, + "learning_rate": 0.0009232520987199649, + "loss": 2.8105, + "step": 6305 + }, + { + "epoch": 0.1869940396761854, + "grad_norm": 0.18169334530830383, + "learning_rate": 0.0009232270477591962, + "loss": 2.8061, + "step": 6306 + }, + { + "epoch": 0.18702369302849686, + "grad_norm": 0.18017323315143585, + "learning_rate": 0.0009232019930506795, + "loss": 2.8422, + "step": 6307 + }, + { + "epoch": 0.18705334638080834, + "grad_norm": 0.16269339621067047, + "learning_rate": 0.0009231769345946361, + "loss": 2.8049, + "step": 6308 + }, + { + "epoch": 0.18708299973311984, + "grad_norm": 0.1789955049753189, + "learning_rate": 0.0009231518723912886, + "loss": 2.7715, + "step": 6309 + }, + { + "epoch": 0.18711265308543132, + "grad_norm": 0.15238800644874573, + "learning_rate": 0.0009231268064408587, + "loss": 2.8082, + "step": 6310 + }, + { + "epoch": 0.1871423064377428, + "grad_norm": 0.13761495053768158, + "learning_rate": 0.0009231017367435681, + "loss": 2.7945, + "step": 6311 + }, + { + "epoch": 0.18717195979005427, + "grad_norm": 0.14503873884677887, + "learning_rate": 0.0009230766632996392, + "loss": 2.818, + "step": 6312 + }, + { + "epoch": 0.18720161314236575, + "grad_norm": 0.14104695618152618, + "learning_rate": 0.0009230515861092936, + "loss": 2.7815, + "step": 6313 + }, + { + "epoch": 0.18723126649467722, + "grad_norm": 0.12897337973117828, + "learning_rate": 0.0009230265051727537, + "loss": 2.7745, + "step": 6314 + }, + { + "epoch": 0.1872609198469887, + "grad_norm": 0.1214556097984314, + "learning_rate": 0.0009230014204902415, + "loss": 2.7816, + "step": 6315 + }, + { + "epoch": 0.18729057319930018, + "grad_norm": 0.12329010665416718, + "learning_rate": 0.0009229763320619793, + "loss": 2.8499, + "step": 6316 + }, + { + "epoch": 0.18732022655161165, + "grad_norm": 0.1314002424478531, + "learning_rate": 0.0009229512398881887, + "loss": 2.7894, + "step": 6317 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 0.13643892109394073, + "learning_rate": 0.0009229261439690925, + "loss": 2.821, + "step": 6318 + }, + { + "epoch": 0.1873795332562346, + "grad_norm": 0.12743933498859406, + "learning_rate": 0.0009229010443049128, + "loss": 2.7691, + "step": 6319 + }, + { + "epoch": 0.1874091866085461, + "grad_norm": 0.12473535537719727, + "learning_rate": 0.0009228759408958716, + "loss": 2.8025, + "step": 6320 + }, + { + "epoch": 0.18743883996085758, + "grad_norm": 0.11511445045471191, + "learning_rate": 0.0009228508337421913, + "loss": 2.825, + "step": 6321 + }, + { + "epoch": 0.18746849331316906, + "grad_norm": 0.12636281549930573, + "learning_rate": 0.0009228257228440944, + "loss": 2.7779, + "step": 6322 + }, + { + "epoch": 0.18749814666548054, + "grad_norm": 0.13153360784053802, + "learning_rate": 0.000922800608201803, + "loss": 2.7785, + "step": 6323 + }, + { + "epoch": 0.187527800017792, + "grad_norm": 0.142369344830513, + "learning_rate": 0.0009227754898155398, + "loss": 2.7807, + "step": 6324 + }, + { + "epoch": 0.1875574533701035, + "grad_norm": 0.1475500464439392, + "learning_rate": 0.0009227503676855272, + "loss": 2.7819, + "step": 6325 + }, + { + "epoch": 0.18758710672241496, + "grad_norm": 0.1529967039823532, + "learning_rate": 0.0009227252418119871, + "loss": 2.8258, + "step": 6326 + }, + { + "epoch": 0.18761676007472644, + "grad_norm": 0.15937043726444244, + "learning_rate": 0.0009227001121951429, + "loss": 2.7982, + "step": 6327 + }, + { + "epoch": 0.18764641342703792, + "grad_norm": 0.14392180740833282, + "learning_rate": 0.0009226749788352162, + "loss": 2.7965, + "step": 6328 + }, + { + "epoch": 0.1876760667793494, + "grad_norm": 0.15446403622627258, + "learning_rate": 0.0009226498417324304, + "loss": 2.8076, + "step": 6329 + }, + { + "epoch": 0.1877057201316609, + "grad_norm": 0.17744839191436768, + "learning_rate": 0.0009226247008870074, + "loss": 2.8222, + "step": 6330 + }, + { + "epoch": 0.18773537348397237, + "grad_norm": 0.17624840140342712, + "learning_rate": 0.0009225995562991703, + "loss": 2.8108, + "step": 6331 + }, + { + "epoch": 0.18776502683628385, + "grad_norm": 0.18311846256256104, + "learning_rate": 0.0009225744079691417, + "loss": 2.8288, + "step": 6332 + }, + { + "epoch": 0.18779468018859533, + "grad_norm": 0.20513369143009186, + "learning_rate": 0.000922549255897144, + "loss": 2.7748, + "step": 6333 + }, + { + "epoch": 0.1878243335409068, + "grad_norm": 0.17478996515274048, + "learning_rate": 0.0009225241000834002, + "loss": 2.81, + "step": 6334 + }, + { + "epoch": 0.18785398689321828, + "grad_norm": 0.15814760327339172, + "learning_rate": 0.0009224989405281329, + "loss": 2.7877, + "step": 6335 + }, + { + "epoch": 0.18788364024552975, + "grad_norm": 0.1831313967704773, + "learning_rate": 0.000922473777231565, + "loss": 2.8163, + "step": 6336 + }, + { + "epoch": 0.18791329359784123, + "grad_norm": 0.1622621715068817, + "learning_rate": 0.0009224486101939192, + "loss": 2.8193, + "step": 6337 + }, + { + "epoch": 0.1879429469501527, + "grad_norm": 0.15971294045448303, + "learning_rate": 0.0009224234394154185, + "loss": 2.809, + "step": 6338 + }, + { + "epoch": 0.18797260030246418, + "grad_norm": 0.14727896451950073, + "learning_rate": 0.0009223982648962858, + "loss": 2.8051, + "step": 6339 + }, + { + "epoch": 0.1880022536547757, + "grad_norm": 0.1422933042049408, + "learning_rate": 0.0009223730866367439, + "loss": 2.7839, + "step": 6340 + }, + { + "epoch": 0.18803190700708716, + "grad_norm": 0.15545843541622162, + "learning_rate": 0.0009223479046370158, + "loss": 2.826, + "step": 6341 + }, + { + "epoch": 0.18806156035939864, + "grad_norm": 0.17358730733394623, + "learning_rate": 0.0009223227188973246, + "loss": 2.8267, + "step": 6342 + }, + { + "epoch": 0.18809121371171011, + "grad_norm": 0.17202341556549072, + "learning_rate": 0.0009222975294178933, + "loss": 2.8241, + "step": 6343 + }, + { + "epoch": 0.1881208670640216, + "grad_norm": 0.1998845487833023, + "learning_rate": 0.0009222723361989447, + "loss": 2.8223, + "step": 6344 + }, + { + "epoch": 0.18815052041633307, + "grad_norm": 0.19419525563716888, + "learning_rate": 0.0009222471392407021, + "loss": 2.8173, + "step": 6345 + }, + { + "epoch": 0.18818017376864454, + "grad_norm": 0.191616490483284, + "learning_rate": 0.0009222219385433886, + "loss": 2.8137, + "step": 6346 + }, + { + "epoch": 0.18820982712095602, + "grad_norm": 0.2253912091255188, + "learning_rate": 0.0009221967341072275, + "loss": 2.784, + "step": 6347 + }, + { + "epoch": 0.1882394804732675, + "grad_norm": 0.17878901958465576, + "learning_rate": 0.0009221715259324416, + "loss": 2.8229, + "step": 6348 + }, + { + "epoch": 0.18826913382557897, + "grad_norm": 0.16005748510360718, + "learning_rate": 0.0009221463140192546, + "loss": 2.8216, + "step": 6349 + }, + { + "epoch": 0.18829878717789045, + "grad_norm": 0.1587793380022049, + "learning_rate": 0.0009221210983678895, + "loss": 2.8028, + "step": 6350 + }, + { + "epoch": 0.18832844053020195, + "grad_norm": 0.14180156588554382, + "learning_rate": 0.0009220958789785696, + "loss": 2.8023, + "step": 6351 + }, + { + "epoch": 0.18835809388251343, + "grad_norm": 0.15146523714065552, + "learning_rate": 0.0009220706558515182, + "loss": 2.8097, + "step": 6352 + }, + { + "epoch": 0.1883877472348249, + "grad_norm": 0.12991167604923248, + "learning_rate": 0.0009220454289869586, + "loss": 2.8041, + "step": 6353 + }, + { + "epoch": 0.18841740058713638, + "grad_norm": 0.14216220378875732, + "learning_rate": 0.0009220201983851145, + "loss": 2.7954, + "step": 6354 + }, + { + "epoch": 0.18844705393944786, + "grad_norm": 0.14580020308494568, + "learning_rate": 0.0009219949640462091, + "loss": 2.8168, + "step": 6355 + }, + { + "epoch": 0.18847670729175933, + "grad_norm": 0.1398160308599472, + "learning_rate": 0.0009219697259704657, + "loss": 2.7946, + "step": 6356 + }, + { + "epoch": 0.1885063606440708, + "grad_norm": 0.143050417304039, + "learning_rate": 0.000921944484158108, + "loss": 2.7695, + "step": 6357 + }, + { + "epoch": 0.18853601399638228, + "grad_norm": 0.14489232003688812, + "learning_rate": 0.0009219192386093595, + "loss": 2.7913, + "step": 6358 + }, + { + "epoch": 0.18856566734869376, + "grad_norm": 0.14313659071922302, + "learning_rate": 0.0009218939893244437, + "loss": 2.7976, + "step": 6359 + }, + { + "epoch": 0.18859532070100524, + "grad_norm": 0.15663276612758636, + "learning_rate": 0.0009218687363035841, + "loss": 2.8102, + "step": 6360 + }, + { + "epoch": 0.18862497405331674, + "grad_norm": 0.1664617508649826, + "learning_rate": 0.0009218434795470045, + "loss": 2.8021, + "step": 6361 + }, + { + "epoch": 0.18865462740562822, + "grad_norm": 0.16163545846939087, + "learning_rate": 0.0009218182190549287, + "loss": 2.8273, + "step": 6362 + }, + { + "epoch": 0.1886842807579397, + "grad_norm": 0.1600019782781601, + "learning_rate": 0.0009217929548275799, + "loss": 2.7786, + "step": 6363 + }, + { + "epoch": 0.18871393411025117, + "grad_norm": 0.15636040270328522, + "learning_rate": 0.0009217676868651821, + "loss": 2.7772, + "step": 6364 + }, + { + "epoch": 0.18874358746256265, + "grad_norm": 0.1437072455883026, + "learning_rate": 0.0009217424151679592, + "loss": 2.8061, + "step": 6365 + }, + { + "epoch": 0.18877324081487412, + "grad_norm": 0.13599957525730133, + "learning_rate": 0.0009217171397361346, + "loss": 2.8052, + "step": 6366 + }, + { + "epoch": 0.1888028941671856, + "grad_norm": 0.15484091639518738, + "learning_rate": 0.0009216918605699325, + "loss": 2.8283, + "step": 6367 + }, + { + "epoch": 0.18883254751949707, + "grad_norm": 0.1570477932691574, + "learning_rate": 0.0009216665776695766, + "loss": 2.8104, + "step": 6368 + }, + { + "epoch": 0.18886220087180855, + "grad_norm": 0.13760808110237122, + "learning_rate": 0.0009216412910352907, + "loss": 2.8188, + "step": 6369 + }, + { + "epoch": 0.18889185422412003, + "grad_norm": 0.14898984134197235, + "learning_rate": 0.0009216160006672989, + "loss": 2.819, + "step": 6370 + }, + { + "epoch": 0.1889215075764315, + "grad_norm": 0.1652400642633438, + "learning_rate": 0.0009215907065658249, + "loss": 2.8036, + "step": 6371 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 0.15790493786334991, + "learning_rate": 0.000921565408731093, + "loss": 2.831, + "step": 6372 + }, + { + "epoch": 0.18898081428105448, + "grad_norm": 0.14232859015464783, + "learning_rate": 0.0009215401071633269, + "loss": 2.8283, + "step": 6373 + }, + { + "epoch": 0.18901046763336596, + "grad_norm": 0.18499723076820374, + "learning_rate": 0.0009215148018627508, + "loss": 2.7747, + "step": 6374 + }, + { + "epoch": 0.18904012098567743, + "grad_norm": 0.18385721743106842, + "learning_rate": 0.0009214894928295888, + "loss": 2.8206, + "step": 6375 + }, + { + "epoch": 0.1890697743379889, + "grad_norm": 0.18327800929546356, + "learning_rate": 0.0009214641800640651, + "loss": 2.7903, + "step": 6376 + }, + { + "epoch": 0.1890994276903004, + "grad_norm": 0.17258328199386597, + "learning_rate": 0.0009214388635664036, + "loss": 2.7985, + "step": 6377 + }, + { + "epoch": 0.18912908104261186, + "grad_norm": 0.1634451150894165, + "learning_rate": 0.0009214135433368287, + "loss": 2.7969, + "step": 6378 + }, + { + "epoch": 0.18915873439492334, + "grad_norm": 0.20567528903484344, + "learning_rate": 0.0009213882193755645, + "loss": 2.8013, + "step": 6379 + }, + { + "epoch": 0.18918838774723482, + "grad_norm": 0.18659675121307373, + "learning_rate": 0.0009213628916828353, + "loss": 2.782, + "step": 6380 + }, + { + "epoch": 0.1892180410995463, + "grad_norm": 0.14911767840385437, + "learning_rate": 0.0009213375602588654, + "loss": 2.8195, + "step": 6381 + }, + { + "epoch": 0.1892476944518578, + "grad_norm": 0.16228130459785461, + "learning_rate": 0.000921312225103879, + "loss": 2.7877, + "step": 6382 + }, + { + "epoch": 0.18927734780416927, + "grad_norm": 0.147771418094635, + "learning_rate": 0.0009212868862181005, + "loss": 2.8007, + "step": 6383 + }, + { + "epoch": 0.18930700115648075, + "grad_norm": 0.12897804379463196, + "learning_rate": 0.0009212615436017545, + "loss": 2.783, + "step": 6384 + }, + { + "epoch": 0.18933665450879222, + "grad_norm": 0.13470134139060974, + "learning_rate": 0.0009212361972550651, + "loss": 2.7853, + "step": 6385 + }, + { + "epoch": 0.1893663078611037, + "grad_norm": 0.13802435994148254, + "learning_rate": 0.0009212108471782569, + "loss": 2.7819, + "step": 6386 + }, + { + "epoch": 0.18939596121341518, + "grad_norm": 0.14716295897960663, + "learning_rate": 0.0009211854933715544, + "loss": 2.8073, + "step": 6387 + }, + { + "epoch": 0.18942561456572665, + "grad_norm": 0.1433330476284027, + "learning_rate": 0.0009211601358351818, + "loss": 2.8228, + "step": 6388 + }, + { + "epoch": 0.18945526791803813, + "grad_norm": 0.16401584446430206, + "learning_rate": 0.0009211347745693642, + "loss": 2.8033, + "step": 6389 + }, + { + "epoch": 0.1894849212703496, + "grad_norm": 0.16681095957756042, + "learning_rate": 0.0009211094095743258, + "loss": 2.8047, + "step": 6390 + }, + { + "epoch": 0.18951457462266108, + "grad_norm": 0.18927009403705597, + "learning_rate": 0.0009210840408502912, + "loss": 2.7764, + "step": 6391 + }, + { + "epoch": 0.18954422797497258, + "grad_norm": 0.21448107063770294, + "learning_rate": 0.0009210586683974854, + "loss": 2.8219, + "step": 6392 + }, + { + "epoch": 0.18957388132728406, + "grad_norm": 0.16607801616191864, + "learning_rate": 0.0009210332922161325, + "loss": 2.7643, + "step": 6393 + }, + { + "epoch": 0.18960353467959554, + "grad_norm": 0.13294246792793274, + "learning_rate": 0.0009210079123064576, + "loss": 2.7851, + "step": 6394 + }, + { + "epoch": 0.189633188031907, + "grad_norm": 0.17505744099617004, + "learning_rate": 0.0009209825286686855, + "loss": 2.8126, + "step": 6395 + }, + { + "epoch": 0.1896628413842185, + "grad_norm": 0.1649891436100006, + "learning_rate": 0.0009209571413030409, + "loss": 2.7826, + "step": 6396 + }, + { + "epoch": 0.18969249473652997, + "grad_norm": 0.1485239714384079, + "learning_rate": 0.0009209317502097483, + "loss": 2.797, + "step": 6397 + }, + { + "epoch": 0.18972214808884144, + "grad_norm": 0.15926142036914825, + "learning_rate": 0.000920906355389033, + "loss": 2.8232, + "step": 6398 + }, + { + "epoch": 0.18975180144115292, + "grad_norm": 0.14810357987880707, + "learning_rate": 0.0009208809568411196, + "loss": 2.8129, + "step": 6399 + }, + { + "epoch": 0.1897814547934644, + "grad_norm": 0.152918741106987, + "learning_rate": 0.0009208555545662332, + "loss": 2.7981, + "step": 6400 + }, + { + "epoch": 0.18981110814577587, + "grad_norm": 0.14766322076320648, + "learning_rate": 0.0009208301485645984, + "loss": 2.7722, + "step": 6401 + }, + { + "epoch": 0.18984076149808735, + "grad_norm": 0.14748941361904144, + "learning_rate": 0.0009208047388364405, + "loss": 2.8044, + "step": 6402 + }, + { + "epoch": 0.18987041485039885, + "grad_norm": 0.13788959383964539, + "learning_rate": 0.0009207793253819845, + "loss": 2.7657, + "step": 6403 + }, + { + "epoch": 0.18990006820271033, + "grad_norm": 0.13965953886508942, + "learning_rate": 0.0009207539082014553, + "loss": 2.8015, + "step": 6404 + }, + { + "epoch": 0.1899297215550218, + "grad_norm": 0.15633785724639893, + "learning_rate": 0.000920728487295078, + "loss": 2.8121, + "step": 6405 + }, + { + "epoch": 0.18995937490733328, + "grad_norm": 0.16002212464809418, + "learning_rate": 0.0009207030626630777, + "loss": 2.8249, + "step": 6406 + }, + { + "epoch": 0.18998902825964475, + "grad_norm": 0.1695968508720398, + "learning_rate": 0.0009206776343056795, + "loss": 2.7808, + "step": 6407 + }, + { + "epoch": 0.19001868161195623, + "grad_norm": 0.17158664762973785, + "learning_rate": 0.0009206522022231087, + "loss": 2.7667, + "step": 6408 + }, + { + "epoch": 0.1900483349642677, + "grad_norm": 0.16374461352825165, + "learning_rate": 0.0009206267664155906, + "loss": 2.7939, + "step": 6409 + }, + { + "epoch": 0.19007798831657918, + "grad_norm": 0.14709840714931488, + "learning_rate": 0.0009206013268833502, + "loss": 2.8191, + "step": 6410 + }, + { + "epoch": 0.19010764166889066, + "grad_norm": 0.13582660257816315, + "learning_rate": 0.0009205758836266128, + "loss": 2.7631, + "step": 6411 + }, + { + "epoch": 0.19013729502120214, + "grad_norm": 0.12715312838554382, + "learning_rate": 0.0009205504366456038, + "loss": 2.8396, + "step": 6412 + }, + { + "epoch": 0.19016694837351364, + "grad_norm": 0.12832750380039215, + "learning_rate": 0.0009205249859405484, + "loss": 2.7922, + "step": 6413 + }, + { + "epoch": 0.19019660172582512, + "grad_norm": 0.11477058380842209, + "learning_rate": 0.0009204995315116722, + "loss": 2.8194, + "step": 6414 + }, + { + "epoch": 0.1902262550781366, + "grad_norm": 0.1436341255903244, + "learning_rate": 0.0009204740733592005, + "loss": 2.7718, + "step": 6415 + }, + { + "epoch": 0.19025590843044807, + "grad_norm": 0.1587870866060257, + "learning_rate": 0.0009204486114833586, + "loss": 2.8045, + "step": 6416 + }, + { + "epoch": 0.19028556178275954, + "grad_norm": 0.1827036440372467, + "learning_rate": 0.000920423145884372, + "loss": 2.8003, + "step": 6417 + }, + { + "epoch": 0.19031521513507102, + "grad_norm": 0.18326370418071747, + "learning_rate": 0.0009203976765624664, + "loss": 2.7866, + "step": 6418 + }, + { + "epoch": 0.1903448684873825, + "grad_norm": 0.17327095568180084, + "learning_rate": 0.0009203722035178672, + "loss": 2.8137, + "step": 6419 + }, + { + "epoch": 0.19037452183969397, + "grad_norm": 0.16542993485927582, + "learning_rate": 0.0009203467267508, + "loss": 2.8338, + "step": 6420 + }, + { + "epoch": 0.19040417519200545, + "grad_norm": 0.1887187510728836, + "learning_rate": 0.0009203212462614902, + "loss": 2.8603, + "step": 6421 + }, + { + "epoch": 0.19043382854431692, + "grad_norm": 0.2224743813276291, + "learning_rate": 0.000920295762050164, + "loss": 2.7793, + "step": 6422 + }, + { + "epoch": 0.1904634818966284, + "grad_norm": 0.21381893754005432, + "learning_rate": 0.0009202702741170464, + "loss": 2.8175, + "step": 6423 + }, + { + "epoch": 0.1904931352489399, + "grad_norm": 0.19416527450084686, + "learning_rate": 0.0009202447824623634, + "loss": 2.8016, + "step": 6424 + }, + { + "epoch": 0.19052278860125138, + "grad_norm": 0.18273484706878662, + "learning_rate": 0.0009202192870863408, + "loss": 2.7771, + "step": 6425 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 0.18087904155254364, + "learning_rate": 0.0009201937879892042, + "loss": 2.8112, + "step": 6426 + }, + { + "epoch": 0.19058209530587433, + "grad_norm": 0.14650824666023254, + "learning_rate": 0.0009201682851711795, + "loss": 2.7987, + "step": 6427 + }, + { + "epoch": 0.1906117486581858, + "grad_norm": 0.14389315247535706, + "learning_rate": 0.0009201427786324926, + "loss": 2.7981, + "step": 6428 + }, + { + "epoch": 0.19064140201049729, + "grad_norm": 0.1448347568511963, + "learning_rate": 0.0009201172683733691, + "loss": 2.8016, + "step": 6429 + }, + { + "epoch": 0.19067105536280876, + "grad_norm": 0.13725906610488892, + "learning_rate": 0.0009200917543940352, + "loss": 2.7738, + "step": 6430 + }, + { + "epoch": 0.19070070871512024, + "grad_norm": 0.12620067596435547, + "learning_rate": 0.0009200662366947168, + "loss": 2.7996, + "step": 6431 + }, + { + "epoch": 0.1907303620674317, + "grad_norm": 0.1340709775686264, + "learning_rate": 0.0009200407152756396, + "loss": 2.7547, + "step": 6432 + }, + { + "epoch": 0.1907600154197432, + "grad_norm": 0.1263291984796524, + "learning_rate": 0.00092001519013703, + "loss": 2.8185, + "step": 6433 + }, + { + "epoch": 0.1907896687720547, + "grad_norm": 0.11738771200180054, + "learning_rate": 0.0009199896612791135, + "loss": 2.7993, + "step": 6434 + }, + { + "epoch": 0.19081932212436617, + "grad_norm": 0.13227757811546326, + "learning_rate": 0.0009199641287021166, + "loss": 2.83, + "step": 6435 + }, + { + "epoch": 0.19084897547667765, + "grad_norm": 0.14566169679164886, + "learning_rate": 0.0009199385924062653, + "loss": 2.813, + "step": 6436 + }, + { + "epoch": 0.19087862882898912, + "grad_norm": 0.14830780029296875, + "learning_rate": 0.0009199130523917855, + "loss": 2.7894, + "step": 6437 + }, + { + "epoch": 0.1909082821813006, + "grad_norm": 0.15561705827713013, + "learning_rate": 0.0009198875086589038, + "loss": 2.8128, + "step": 6438 + }, + { + "epoch": 0.19093793553361207, + "grad_norm": 0.15925538539886475, + "learning_rate": 0.0009198619612078461, + "loss": 2.8054, + "step": 6439 + }, + { + "epoch": 0.19096758888592355, + "grad_norm": 0.1640673726797104, + "learning_rate": 0.0009198364100388384, + "loss": 2.7611, + "step": 6440 + }, + { + "epoch": 0.19099724223823503, + "grad_norm": 0.18819917738437653, + "learning_rate": 0.0009198108551521075, + "loss": 2.7888, + "step": 6441 + }, + { + "epoch": 0.1910268955905465, + "grad_norm": 0.19992990791797638, + "learning_rate": 0.0009197852965478792, + "loss": 2.8196, + "step": 6442 + }, + { + "epoch": 0.19105654894285798, + "grad_norm": 0.18426665663719177, + "learning_rate": 0.0009197597342263802, + "loss": 2.8079, + "step": 6443 + }, + { + "epoch": 0.19108620229516948, + "grad_norm": 0.1820862889289856, + "learning_rate": 0.0009197341681878368, + "loss": 2.7911, + "step": 6444 + }, + { + "epoch": 0.19111585564748096, + "grad_norm": 0.17318372428417206, + "learning_rate": 0.0009197085984324751, + "loss": 2.7567, + "step": 6445 + }, + { + "epoch": 0.19114550899979244, + "grad_norm": 0.16266131401062012, + "learning_rate": 0.0009196830249605217, + "loss": 2.7995, + "step": 6446 + }, + { + "epoch": 0.1911751623521039, + "grad_norm": 0.16356158256530762, + "learning_rate": 0.0009196574477722033, + "loss": 2.8019, + "step": 6447 + }, + { + "epoch": 0.1912048157044154, + "grad_norm": 0.16901759803295135, + "learning_rate": 0.000919631866867746, + "loss": 2.7864, + "step": 6448 + }, + { + "epoch": 0.19123446905672686, + "grad_norm": 0.1831536740064621, + "learning_rate": 0.0009196062822473765, + "loss": 2.7988, + "step": 6449 + }, + { + "epoch": 0.19126412240903834, + "grad_norm": 0.18819572031497955, + "learning_rate": 0.0009195806939113213, + "loss": 2.8187, + "step": 6450 + }, + { + "epoch": 0.19129377576134982, + "grad_norm": 0.18167199194431305, + "learning_rate": 0.0009195551018598072, + "loss": 2.8191, + "step": 6451 + }, + { + "epoch": 0.1913234291136613, + "grad_norm": 0.16323183476924896, + "learning_rate": 0.0009195295060930605, + "loss": 2.8135, + "step": 6452 + }, + { + "epoch": 0.19135308246597277, + "grad_norm": 0.15692271292209625, + "learning_rate": 0.000919503906611308, + "loss": 2.7911, + "step": 6453 + }, + { + "epoch": 0.19138273581828424, + "grad_norm": 0.15781351923942566, + "learning_rate": 0.0009194783034147764, + "loss": 2.7895, + "step": 6454 + }, + { + "epoch": 0.19141238917059575, + "grad_norm": 0.16151674091815948, + "learning_rate": 0.0009194526965036927, + "loss": 2.8037, + "step": 6455 + }, + { + "epoch": 0.19144204252290722, + "grad_norm": 0.1476372629404068, + "learning_rate": 0.000919427085878283, + "loss": 2.7799, + "step": 6456 + }, + { + "epoch": 0.1914716958752187, + "grad_norm": 0.14026054739952087, + "learning_rate": 0.0009194014715387746, + "loss": 2.8228, + "step": 6457 + }, + { + "epoch": 0.19150134922753018, + "grad_norm": 0.13326327502727509, + "learning_rate": 0.0009193758534853942, + "loss": 2.7635, + "step": 6458 + }, + { + "epoch": 0.19153100257984165, + "grad_norm": 0.14217299222946167, + "learning_rate": 0.0009193502317183687, + "loss": 2.7935, + "step": 6459 + }, + { + "epoch": 0.19156065593215313, + "grad_norm": 0.14762336015701294, + "learning_rate": 0.0009193246062379248, + "loss": 2.8053, + "step": 6460 + }, + { + "epoch": 0.1915903092844646, + "grad_norm": 0.15179772675037384, + "learning_rate": 0.0009192989770442897, + "loss": 2.7968, + "step": 6461 + }, + { + "epoch": 0.19161996263677608, + "grad_norm": 0.15506015717983246, + "learning_rate": 0.0009192733441376899, + "loss": 2.818, + "step": 6462 + }, + { + "epoch": 0.19164961598908756, + "grad_norm": 0.15324318408966064, + "learning_rate": 0.0009192477075183529, + "loss": 2.8178, + "step": 6463 + }, + { + "epoch": 0.19167926934139903, + "grad_norm": 0.13316866755485535, + "learning_rate": 0.0009192220671865055, + "loss": 2.792, + "step": 6464 + }, + { + "epoch": 0.19170892269371054, + "grad_norm": 0.13103258609771729, + "learning_rate": 0.0009191964231423746, + "loss": 2.8178, + "step": 6465 + }, + { + "epoch": 0.191738576046022, + "grad_norm": 0.1394791305065155, + "learning_rate": 0.0009191707753861875, + "loss": 2.7736, + "step": 6466 + }, + { + "epoch": 0.1917682293983335, + "grad_norm": 0.15681494772434235, + "learning_rate": 0.0009191451239181712, + "loss": 2.8266, + "step": 6467 + }, + { + "epoch": 0.19179788275064497, + "grad_norm": 0.15527427196502686, + "learning_rate": 0.0009191194687385529, + "loss": 2.7615, + "step": 6468 + }, + { + "epoch": 0.19182753610295644, + "grad_norm": 0.16374213993549347, + "learning_rate": 0.0009190938098475598, + "loss": 2.7993, + "step": 6469 + }, + { + "epoch": 0.19185718945526792, + "grad_norm": 0.16174283623695374, + "learning_rate": 0.000919068147245419, + "loss": 2.784, + "step": 6470 + }, + { + "epoch": 0.1918868428075794, + "grad_norm": 0.1742928922176361, + "learning_rate": 0.0009190424809323579, + "loss": 2.8163, + "step": 6471 + }, + { + "epoch": 0.19191649615989087, + "grad_norm": 0.16827236115932465, + "learning_rate": 0.0009190168109086037, + "loss": 2.7661, + "step": 6472 + }, + { + "epoch": 0.19194614951220235, + "grad_norm": 0.15497316420078278, + "learning_rate": 0.0009189911371743837, + "loss": 2.7776, + "step": 6473 + }, + { + "epoch": 0.19197580286451382, + "grad_norm": 0.15349256992340088, + "learning_rate": 0.0009189654597299252, + "loss": 2.7824, + "step": 6474 + }, + { + "epoch": 0.1920054562168253, + "grad_norm": 0.14490948617458344, + "learning_rate": 0.0009189397785754558, + "loss": 2.7855, + "step": 6475 + }, + { + "epoch": 0.1920351095691368, + "grad_norm": 0.14076842367649078, + "learning_rate": 0.0009189140937112026, + "loss": 2.7967, + "step": 6476 + }, + { + "epoch": 0.19206476292144828, + "grad_norm": 0.13764099776744843, + "learning_rate": 0.0009188884051373931, + "loss": 2.821, + "step": 6477 + }, + { + "epoch": 0.19209441627375975, + "grad_norm": 0.18179872632026672, + "learning_rate": 0.000918862712854255, + "loss": 2.7692, + "step": 6478 + }, + { + "epoch": 0.19212406962607123, + "grad_norm": 0.16791081428527832, + "learning_rate": 0.0009188370168620158, + "loss": 2.771, + "step": 6479 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 0.1496018022298813, + "learning_rate": 0.0009188113171609029, + "loss": 2.804, + "step": 6480 + }, + { + "epoch": 0.19218337633069418, + "grad_norm": 0.14779409766197205, + "learning_rate": 0.0009187856137511436, + "loss": 2.8102, + "step": 6481 + }, + { + "epoch": 0.19221302968300566, + "grad_norm": 0.16420486569404602, + "learning_rate": 0.0009187599066329662, + "loss": 2.816, + "step": 6482 + }, + { + "epoch": 0.19224268303531714, + "grad_norm": 0.19651183485984802, + "learning_rate": 0.0009187341958065977, + "loss": 2.7891, + "step": 6483 + }, + { + "epoch": 0.1922723363876286, + "grad_norm": 0.2250952422618866, + "learning_rate": 0.0009187084812722661, + "loss": 2.7915, + "step": 6484 + }, + { + "epoch": 0.1923019897399401, + "grad_norm": 0.21654652059078217, + "learning_rate": 0.000918682763030199, + "loss": 2.7983, + "step": 6485 + }, + { + "epoch": 0.1923316430922516, + "grad_norm": 0.17400521039962769, + "learning_rate": 0.0009186570410806241, + "loss": 2.7786, + "step": 6486 + }, + { + "epoch": 0.19236129644456307, + "grad_norm": 0.1647787243127823, + "learning_rate": 0.0009186313154237693, + "loss": 2.8035, + "step": 6487 + }, + { + "epoch": 0.19239094979687454, + "grad_norm": 0.16793827712535858, + "learning_rate": 0.0009186055860598624, + "loss": 2.7864, + "step": 6488 + }, + { + "epoch": 0.19242060314918602, + "grad_norm": 0.16576232016086578, + "learning_rate": 0.0009185798529891311, + "loss": 2.7875, + "step": 6489 + }, + { + "epoch": 0.1924502565014975, + "grad_norm": 0.15614137053489685, + "learning_rate": 0.0009185541162118036, + "loss": 2.8108, + "step": 6490 + }, + { + "epoch": 0.19247990985380897, + "grad_norm": 0.1607801616191864, + "learning_rate": 0.0009185283757281073, + "loss": 2.7996, + "step": 6491 + }, + { + "epoch": 0.19250956320612045, + "grad_norm": 0.15151143074035645, + "learning_rate": 0.0009185026315382704, + "loss": 2.7747, + "step": 6492 + }, + { + "epoch": 0.19253921655843192, + "grad_norm": 0.14333707094192505, + "learning_rate": 0.0009184768836425209, + "loss": 2.8294, + "step": 6493 + }, + { + "epoch": 0.1925688699107434, + "grad_norm": 0.15646697580814362, + "learning_rate": 0.0009184511320410868, + "loss": 2.7902, + "step": 6494 + }, + { + "epoch": 0.19259852326305488, + "grad_norm": 0.15121214091777802, + "learning_rate": 0.0009184253767341961, + "loss": 2.7959, + "step": 6495 + }, + { + "epoch": 0.19262817661536638, + "grad_norm": 0.16015011072158813, + "learning_rate": 0.0009183996177220768, + "loss": 2.82, + "step": 6496 + }, + { + "epoch": 0.19265782996767786, + "grad_norm": 0.19392448663711548, + "learning_rate": 0.0009183738550049571, + "loss": 2.8468, + "step": 6497 + }, + { + "epoch": 0.19268748331998933, + "grad_norm": 0.21687746047973633, + "learning_rate": 0.0009183480885830651, + "loss": 2.7925, + "step": 6498 + }, + { + "epoch": 0.1927171366723008, + "grad_norm": 0.21672308444976807, + "learning_rate": 0.000918322318456629, + "loss": 2.783, + "step": 6499 + }, + { + "epoch": 0.19274679002461229, + "grad_norm": 0.19633203744888306, + "learning_rate": 0.0009182965446258768, + "loss": 2.844, + "step": 6500 + }, + { + "epoch": 0.19277644337692376, + "grad_norm": 0.18212738633155823, + "learning_rate": 0.0009182707670910372, + "loss": 2.8081, + "step": 6501 + }, + { + "epoch": 0.19280609672923524, + "grad_norm": 0.1820749044418335, + "learning_rate": 0.0009182449858523379, + "loss": 2.7867, + "step": 6502 + }, + { + "epoch": 0.19283575008154671, + "grad_norm": 0.16708162426948547, + "learning_rate": 0.0009182192009100077, + "loss": 2.8295, + "step": 6503 + }, + { + "epoch": 0.1928654034338582, + "grad_norm": 0.1415918618440628, + "learning_rate": 0.0009181934122642746, + "loss": 2.8112, + "step": 6504 + }, + { + "epoch": 0.19289505678616967, + "grad_norm": 0.1470811516046524, + "learning_rate": 0.0009181676199153669, + "loss": 2.795, + "step": 6505 + }, + { + "epoch": 0.19292471013848114, + "grad_norm": 0.14821918308734894, + "learning_rate": 0.0009181418238635134, + "loss": 2.7613, + "step": 6506 + }, + { + "epoch": 0.19295436349079265, + "grad_norm": 0.14925561845302582, + "learning_rate": 0.000918116024108942, + "loss": 2.801, + "step": 6507 + }, + { + "epoch": 0.19298401684310412, + "grad_norm": 0.1761871725320816, + "learning_rate": 0.0009180902206518815, + "loss": 2.824, + "step": 6508 + }, + { + "epoch": 0.1930136701954156, + "grad_norm": 0.14271144568920135, + "learning_rate": 0.0009180644134925604, + "loss": 2.8082, + "step": 6509 + }, + { + "epoch": 0.19304332354772707, + "grad_norm": 0.14884956181049347, + "learning_rate": 0.0009180386026312073, + "loss": 2.7888, + "step": 6510 + }, + { + "epoch": 0.19307297690003855, + "grad_norm": 0.15564365684986115, + "learning_rate": 0.0009180127880680504, + "loss": 2.7988, + "step": 6511 + }, + { + "epoch": 0.19310263025235003, + "grad_norm": 0.1498330980539322, + "learning_rate": 0.0009179869698033186, + "loss": 2.7646, + "step": 6512 + }, + { + "epoch": 0.1931322836046615, + "grad_norm": 0.14719897508621216, + "learning_rate": 0.0009179611478372405, + "loss": 2.788, + "step": 6513 + }, + { + "epoch": 0.19316193695697298, + "grad_norm": 0.14431044459342957, + "learning_rate": 0.0009179353221700446, + "loss": 2.8194, + "step": 6514 + }, + { + "epoch": 0.19319159030928446, + "grad_norm": 0.1324024349451065, + "learning_rate": 0.0009179094928019596, + "loss": 2.8106, + "step": 6515 + }, + { + "epoch": 0.19322124366159593, + "grad_norm": 0.1284191906452179, + "learning_rate": 0.0009178836597332145, + "loss": 2.7892, + "step": 6516 + }, + { + "epoch": 0.19325089701390744, + "grad_norm": 0.14732185006141663, + "learning_rate": 0.0009178578229640377, + "loss": 2.8227, + "step": 6517 + }, + { + "epoch": 0.1932805503662189, + "grad_norm": 0.1463342308998108, + "learning_rate": 0.0009178319824946583, + "loss": 2.8055, + "step": 6518 + }, + { + "epoch": 0.1933102037185304, + "grad_norm": 0.1396140456199646, + "learning_rate": 0.000917806138325305, + "loss": 2.77, + "step": 6519 + }, + { + "epoch": 0.19333985707084186, + "grad_norm": 0.1315201222896576, + "learning_rate": 0.0009177802904562065, + "loss": 2.772, + "step": 6520 + }, + { + "epoch": 0.19336951042315334, + "grad_norm": 0.127304807305336, + "learning_rate": 0.0009177544388875918, + "loss": 2.7991, + "step": 6521 + }, + { + "epoch": 0.19339916377546482, + "grad_norm": 0.14139048755168915, + "learning_rate": 0.0009177285836196898, + "loss": 2.7849, + "step": 6522 + }, + { + "epoch": 0.1934288171277763, + "grad_norm": 0.15385600924491882, + "learning_rate": 0.0009177027246527296, + "loss": 2.8151, + "step": 6523 + }, + { + "epoch": 0.19345847048008777, + "grad_norm": 0.14790011942386627, + "learning_rate": 0.00091767686198694, + "loss": 2.7752, + "step": 6524 + }, + { + "epoch": 0.19348812383239924, + "grad_norm": 0.12878413498401642, + "learning_rate": 0.0009176509956225503, + "loss": 2.788, + "step": 6525 + }, + { + "epoch": 0.19351777718471072, + "grad_norm": 0.13027839362621307, + "learning_rate": 0.0009176251255597892, + "loss": 2.7786, + "step": 6526 + }, + { + "epoch": 0.1935474305370222, + "grad_norm": 0.14472104609012604, + "learning_rate": 0.0009175992517988858, + "loss": 2.7871, + "step": 6527 + }, + { + "epoch": 0.1935770838893337, + "grad_norm": 0.14521530270576477, + "learning_rate": 0.0009175733743400694, + "loss": 2.8079, + "step": 6528 + }, + { + "epoch": 0.19360673724164518, + "grad_norm": 0.15710628032684326, + "learning_rate": 0.0009175474931835692, + "loss": 2.8052, + "step": 6529 + }, + { + "epoch": 0.19363639059395665, + "grad_norm": 0.15895754098892212, + "learning_rate": 0.0009175216083296142, + "loss": 2.7498, + "step": 6530 + }, + { + "epoch": 0.19366604394626813, + "grad_norm": 0.15615858137607574, + "learning_rate": 0.0009174957197784338, + "loss": 2.8045, + "step": 6531 + }, + { + "epoch": 0.1936956972985796, + "grad_norm": 0.16119930148124695, + "learning_rate": 0.0009174698275302571, + "loss": 2.8208, + "step": 6532 + }, + { + "epoch": 0.19372535065089108, + "grad_norm": 0.17272073030471802, + "learning_rate": 0.0009174439315853133, + "loss": 2.7668, + "step": 6533 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 0.14139795303344727, + "learning_rate": 0.000917418031943832, + "loss": 2.8141, + "step": 6534 + }, + { + "epoch": 0.19378465735551403, + "grad_norm": 0.16848810017108917, + "learning_rate": 0.0009173921286060422, + "loss": 2.8138, + "step": 6535 + }, + { + "epoch": 0.1938143107078255, + "grad_norm": 0.1804548054933548, + "learning_rate": 0.0009173662215721737, + "loss": 2.8169, + "step": 6536 + }, + { + "epoch": 0.193843964060137, + "grad_norm": 0.17879807949066162, + "learning_rate": 0.0009173403108424554, + "loss": 2.7716, + "step": 6537 + }, + { + "epoch": 0.1938736174124485, + "grad_norm": 0.18974153697490692, + "learning_rate": 0.0009173143964171171, + "loss": 2.7757, + "step": 6538 + }, + { + "epoch": 0.19390327076475997, + "grad_norm": 0.16885091364383698, + "learning_rate": 0.0009172884782963884, + "loss": 2.8293, + "step": 6539 + }, + { + "epoch": 0.19393292411707144, + "grad_norm": 0.16170330345630646, + "learning_rate": 0.0009172625564804984, + "loss": 2.8052, + "step": 6540 + }, + { + "epoch": 0.19396257746938292, + "grad_norm": 0.13790594041347504, + "learning_rate": 0.0009172366309696768, + "loss": 2.8068, + "step": 6541 + }, + { + "epoch": 0.1939922308216944, + "grad_norm": 0.19258975982666016, + "learning_rate": 0.0009172107017641533, + "loss": 2.8009, + "step": 6542 + }, + { + "epoch": 0.19402188417400587, + "grad_norm": 0.1445634365081787, + "learning_rate": 0.0009171847688641574, + "loss": 2.7528, + "step": 6543 + }, + { + "epoch": 0.19405153752631735, + "grad_norm": 0.15893089771270752, + "learning_rate": 0.0009171588322699187, + "loss": 2.7954, + "step": 6544 + }, + { + "epoch": 0.19408119087862882, + "grad_norm": 0.15672068297863007, + "learning_rate": 0.0009171328919816671, + "loss": 2.8065, + "step": 6545 + }, + { + "epoch": 0.1941108442309403, + "grad_norm": 0.15284477174282074, + "learning_rate": 0.0009171069479996319, + "loss": 2.8012, + "step": 6546 + }, + { + "epoch": 0.19414049758325178, + "grad_norm": 0.14416038990020752, + "learning_rate": 0.0009170810003240432, + "loss": 2.8196, + "step": 6547 + }, + { + "epoch": 0.19417015093556325, + "grad_norm": 0.1454204022884369, + "learning_rate": 0.0009170550489551308, + "loss": 2.8375, + "step": 6548 + }, + { + "epoch": 0.19419980428787476, + "grad_norm": 0.15613579750061035, + "learning_rate": 0.0009170290938931242, + "loss": 2.7843, + "step": 6549 + }, + { + "epoch": 0.19422945764018623, + "grad_norm": 0.15272682905197144, + "learning_rate": 0.0009170031351382535, + "loss": 2.7917, + "step": 6550 + }, + { + "epoch": 0.1942591109924977, + "grad_norm": 0.1774848997592926, + "learning_rate": 0.0009169771726907483, + "loss": 2.7947, + "step": 6551 + }, + { + "epoch": 0.19428876434480918, + "grad_norm": 0.1680494099855423, + "learning_rate": 0.0009169512065508388, + "loss": 2.8098, + "step": 6552 + }, + { + "epoch": 0.19431841769712066, + "grad_norm": 0.1761755347251892, + "learning_rate": 0.0009169252367187546, + "loss": 2.8084, + "step": 6553 + }, + { + "epoch": 0.19434807104943214, + "grad_norm": 0.1667827069759369, + "learning_rate": 0.0009168992631947261, + "loss": 2.8159, + "step": 6554 + }, + { + "epoch": 0.1943777244017436, + "grad_norm": 0.1415640115737915, + "learning_rate": 0.0009168732859789829, + "loss": 2.8129, + "step": 6555 + }, + { + "epoch": 0.1944073777540551, + "grad_norm": 0.130636528134346, + "learning_rate": 0.0009168473050717553, + "loss": 2.8058, + "step": 6556 + }, + { + "epoch": 0.19443703110636656, + "grad_norm": 0.12604311108589172, + "learning_rate": 0.0009168213204732732, + "loss": 2.7853, + "step": 6557 + }, + { + "epoch": 0.19446668445867804, + "grad_norm": 0.12284918129444122, + "learning_rate": 0.0009167953321837668, + "loss": 2.7819, + "step": 6558 + }, + { + "epoch": 0.19449633781098954, + "grad_norm": 0.12013433128595352, + "learning_rate": 0.0009167693402034662, + "loss": 2.747, + "step": 6559 + }, + { + "epoch": 0.19452599116330102, + "grad_norm": 0.11621319502592087, + "learning_rate": 0.0009167433445326015, + "loss": 2.7669, + "step": 6560 + }, + { + "epoch": 0.1945556445156125, + "grad_norm": 0.12168595939874649, + "learning_rate": 0.000916717345171403, + "loss": 2.819, + "step": 6561 + }, + { + "epoch": 0.19458529786792397, + "grad_norm": 0.1321256458759308, + "learning_rate": 0.0009166913421201009, + "loss": 2.7699, + "step": 6562 + }, + { + "epoch": 0.19461495122023545, + "grad_norm": 0.15533211827278137, + "learning_rate": 0.0009166653353789254, + "loss": 2.7898, + "step": 6563 + }, + { + "epoch": 0.19464460457254693, + "grad_norm": 0.1872967928647995, + "learning_rate": 0.0009166393249481069, + "loss": 2.7874, + "step": 6564 + }, + { + "epoch": 0.1946742579248584, + "grad_norm": 0.15204471349716187, + "learning_rate": 0.0009166133108278756, + "loss": 2.7855, + "step": 6565 + }, + { + "epoch": 0.19470391127716988, + "grad_norm": 0.14751635491847992, + "learning_rate": 0.0009165872930184618, + "loss": 2.783, + "step": 6566 + }, + { + "epoch": 0.19473356462948135, + "grad_norm": 0.15126340091228485, + "learning_rate": 0.0009165612715200962, + "loss": 2.7603, + "step": 6567 + }, + { + "epoch": 0.19476321798179283, + "grad_norm": 0.13862326741218567, + "learning_rate": 0.000916535246333009, + "loss": 2.8013, + "step": 6568 + }, + { + "epoch": 0.19479287133410433, + "grad_norm": 0.15187738835811615, + "learning_rate": 0.0009165092174574307, + "loss": 2.7825, + "step": 6569 + }, + { + "epoch": 0.1948225246864158, + "grad_norm": 0.13958853483200073, + "learning_rate": 0.0009164831848935917, + "loss": 2.8058, + "step": 6570 + }, + { + "epoch": 0.1948521780387273, + "grad_norm": 0.15117962658405304, + "learning_rate": 0.0009164571486417226, + "loss": 2.7794, + "step": 6571 + }, + { + "epoch": 0.19488183139103876, + "grad_norm": 0.15523618459701538, + "learning_rate": 0.0009164311087020541, + "loss": 2.8235, + "step": 6572 + }, + { + "epoch": 0.19491148474335024, + "grad_norm": 0.14778195321559906, + "learning_rate": 0.0009164050650748165, + "loss": 2.8145, + "step": 6573 + }, + { + "epoch": 0.19494113809566171, + "grad_norm": 0.13771952688694, + "learning_rate": 0.0009163790177602407, + "loss": 2.8214, + "step": 6574 + }, + { + "epoch": 0.1949707914479732, + "grad_norm": 0.13986870646476746, + "learning_rate": 0.0009163529667585573, + "loss": 2.8062, + "step": 6575 + }, + { + "epoch": 0.19500044480028467, + "grad_norm": 0.1355365812778473, + "learning_rate": 0.0009163269120699968, + "loss": 2.782, + "step": 6576 + }, + { + "epoch": 0.19503009815259614, + "grad_norm": 0.1523858606815338, + "learning_rate": 0.00091630085369479, + "loss": 2.8036, + "step": 6577 + }, + { + "epoch": 0.19505975150490762, + "grad_norm": 0.17841093242168427, + "learning_rate": 0.0009162747916331678, + "loss": 2.807, + "step": 6578 + }, + { + "epoch": 0.1950894048572191, + "grad_norm": 0.22827976942062378, + "learning_rate": 0.0009162487258853606, + "loss": 2.8164, + "step": 6579 + }, + { + "epoch": 0.1951190582095306, + "grad_norm": 0.3111794888973236, + "learning_rate": 0.0009162226564515997, + "loss": 2.7756, + "step": 6580 + }, + { + "epoch": 0.19514871156184208, + "grad_norm": 0.28613194823265076, + "learning_rate": 0.0009161965833321158, + "loss": 2.8031, + "step": 6581 + }, + { + "epoch": 0.19517836491415355, + "grad_norm": 0.2408144474029541, + "learning_rate": 0.0009161705065271395, + "loss": 2.7875, + "step": 6582 + }, + { + "epoch": 0.19520801826646503, + "grad_norm": 0.2706257700920105, + "learning_rate": 0.0009161444260369021, + "loss": 2.788, + "step": 6583 + }, + { + "epoch": 0.1952376716187765, + "grad_norm": 0.15178877115249634, + "learning_rate": 0.0009161183418616343, + "loss": 2.7959, + "step": 6584 + }, + { + "epoch": 0.19526732497108798, + "grad_norm": 0.1945624202489853, + "learning_rate": 0.0009160922540015673, + "loss": 2.8062, + "step": 6585 + }, + { + "epoch": 0.19529697832339946, + "grad_norm": 0.22066941857337952, + "learning_rate": 0.0009160661624569318, + "loss": 2.7761, + "step": 6586 + }, + { + "epoch": 0.19532663167571093, + "grad_norm": 0.17642255127429962, + "learning_rate": 0.0009160400672279591, + "loss": 2.7946, + "step": 6587 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 0.17379285395145416, + "learning_rate": 0.0009160139683148801, + "loss": 2.7755, + "step": 6588 + }, + { + "epoch": 0.19538593838033388, + "grad_norm": 0.15351659059524536, + "learning_rate": 0.0009159878657179261, + "loss": 2.8219, + "step": 6589 + }, + { + "epoch": 0.1954155917326454, + "grad_norm": 0.13991251587867737, + "learning_rate": 0.0009159617594373281, + "loss": 2.8198, + "step": 6590 + }, + { + "epoch": 0.19544524508495686, + "grad_norm": 0.14488008618354797, + "learning_rate": 0.0009159356494733173, + "loss": 2.7845, + "step": 6591 + }, + { + "epoch": 0.19547489843726834, + "grad_norm": 0.12686863541603088, + "learning_rate": 0.0009159095358261249, + "loss": 2.7933, + "step": 6592 + }, + { + "epoch": 0.19550455178957982, + "grad_norm": 0.1411711424589157, + "learning_rate": 0.0009158834184959824, + "loss": 2.7885, + "step": 6593 + }, + { + "epoch": 0.1955342051418913, + "grad_norm": 0.12190788239240646, + "learning_rate": 0.0009158572974831206, + "loss": 2.7905, + "step": 6594 + }, + { + "epoch": 0.19556385849420277, + "grad_norm": 0.145472452044487, + "learning_rate": 0.000915831172787771, + "loss": 2.7889, + "step": 6595 + }, + { + "epoch": 0.19559351184651425, + "grad_norm": 0.121730275452137, + "learning_rate": 0.0009158050444101652, + "loss": 2.8059, + "step": 6596 + }, + { + "epoch": 0.19562316519882572, + "grad_norm": 0.13137778639793396, + "learning_rate": 0.0009157789123505342, + "loss": 2.8114, + "step": 6597 + }, + { + "epoch": 0.1956528185511372, + "grad_norm": 0.1333608627319336, + "learning_rate": 0.0009157527766091097, + "loss": 2.7968, + "step": 6598 + }, + { + "epoch": 0.19568247190344867, + "grad_norm": 0.1377691626548767, + "learning_rate": 0.0009157266371861229, + "loss": 2.8091, + "step": 6599 + }, + { + "epoch": 0.19571212525576015, + "grad_norm": 0.13755746185779572, + "learning_rate": 0.0009157004940818054, + "loss": 2.7978, + "step": 6600 + }, + { + "epoch": 0.19574177860807165, + "grad_norm": 0.12241709977388382, + "learning_rate": 0.0009156743472963887, + "loss": 2.8084, + "step": 6601 + }, + { + "epoch": 0.19577143196038313, + "grad_norm": 0.12751567363739014, + "learning_rate": 0.0009156481968301042, + "loss": 2.8153, + "step": 6602 + }, + { + "epoch": 0.1958010853126946, + "grad_norm": 0.13026933372020721, + "learning_rate": 0.0009156220426831839, + "loss": 2.8048, + "step": 6603 + }, + { + "epoch": 0.19583073866500608, + "grad_norm": 0.14028991758823395, + "learning_rate": 0.0009155958848558587, + "loss": 2.7795, + "step": 6604 + }, + { + "epoch": 0.19586039201731756, + "grad_norm": 0.1372918039560318, + "learning_rate": 0.0009155697233483608, + "loss": 2.8354, + "step": 6605 + }, + { + "epoch": 0.19589004536962903, + "grad_norm": 0.14151202142238617, + "learning_rate": 0.0009155435581609215, + "loss": 2.8251, + "step": 6606 + }, + { + "epoch": 0.1959196987219405, + "grad_norm": 0.1489420384168625, + "learning_rate": 0.0009155173892937727, + "loss": 2.7761, + "step": 6607 + }, + { + "epoch": 0.195949352074252, + "grad_norm": 0.13546791672706604, + "learning_rate": 0.0009154912167471463, + "loss": 2.7808, + "step": 6608 + }, + { + "epoch": 0.19597900542656346, + "grad_norm": 0.13034318387508392, + "learning_rate": 0.0009154650405212737, + "loss": 2.776, + "step": 6609 + }, + { + "epoch": 0.19600865877887494, + "grad_norm": 0.1338922381401062, + "learning_rate": 0.0009154388606163868, + "loss": 2.817, + "step": 6610 + }, + { + "epoch": 0.19603831213118644, + "grad_norm": 0.13531139492988586, + "learning_rate": 0.0009154126770327175, + "loss": 2.7766, + "step": 6611 + }, + { + "epoch": 0.19606796548349792, + "grad_norm": 0.13429154455661774, + "learning_rate": 0.0009153864897704977, + "loss": 2.8058, + "step": 6612 + }, + { + "epoch": 0.1960976188358094, + "grad_norm": 0.15606549382209778, + "learning_rate": 0.0009153602988299592, + "loss": 2.8179, + "step": 6613 + }, + { + "epoch": 0.19612727218812087, + "grad_norm": 0.18272000551223755, + "learning_rate": 0.000915334104211334, + "loss": 2.7847, + "step": 6614 + }, + { + "epoch": 0.19615692554043235, + "grad_norm": 0.17956972122192383, + "learning_rate": 0.0009153079059148541, + "loss": 2.7924, + "step": 6615 + }, + { + "epoch": 0.19618657889274382, + "grad_norm": 0.17088927328586578, + "learning_rate": 0.0009152817039407513, + "loss": 2.7498, + "step": 6616 + }, + { + "epoch": 0.1962162322450553, + "grad_norm": 0.15531465411186218, + "learning_rate": 0.0009152554982892575, + "loss": 2.7999, + "step": 6617 + }, + { + "epoch": 0.19624588559736678, + "grad_norm": 0.1165902391076088, + "learning_rate": 0.0009152292889606053, + "loss": 2.7683, + "step": 6618 + }, + { + "epoch": 0.19627553894967825, + "grad_norm": 0.1479395180940628, + "learning_rate": 0.0009152030759550265, + "loss": 2.8313, + "step": 6619 + }, + { + "epoch": 0.19630519230198973, + "grad_norm": 0.16072165966033936, + "learning_rate": 0.000915176859272753, + "loss": 2.8141, + "step": 6620 + }, + { + "epoch": 0.19633484565430123, + "grad_norm": 0.14853888750076294, + "learning_rate": 0.0009151506389140173, + "loss": 2.8419, + "step": 6621 + }, + { + "epoch": 0.1963644990066127, + "grad_norm": 0.14349843561649323, + "learning_rate": 0.0009151244148790513, + "loss": 2.7717, + "step": 6622 + }, + { + "epoch": 0.19639415235892418, + "grad_norm": 0.15596209466457367, + "learning_rate": 0.0009150981871680875, + "loss": 2.8098, + "step": 6623 + }, + { + "epoch": 0.19642380571123566, + "grad_norm": 0.1562730222940445, + "learning_rate": 0.0009150719557813579, + "loss": 2.8132, + "step": 6624 + }, + { + "epoch": 0.19645345906354714, + "grad_norm": 0.16757206618785858, + "learning_rate": 0.0009150457207190947, + "loss": 2.8373, + "step": 6625 + }, + { + "epoch": 0.1964831124158586, + "grad_norm": 0.17721284925937653, + "learning_rate": 0.0009150194819815307, + "loss": 2.8158, + "step": 6626 + }, + { + "epoch": 0.1965127657681701, + "grad_norm": 0.17790041863918304, + "learning_rate": 0.0009149932395688979, + "loss": 2.7871, + "step": 6627 + }, + { + "epoch": 0.19654241912048157, + "grad_norm": 0.18110090494155884, + "learning_rate": 0.0009149669934814287, + "loss": 2.7817, + "step": 6628 + }, + { + "epoch": 0.19657207247279304, + "grad_norm": 0.16934221982955933, + "learning_rate": 0.0009149407437193556, + "loss": 2.8009, + "step": 6629 + }, + { + "epoch": 0.19660172582510452, + "grad_norm": 0.140511155128479, + "learning_rate": 0.0009149144902829107, + "loss": 2.7784, + "step": 6630 + }, + { + "epoch": 0.196631379177416, + "grad_norm": 0.1414857655763626, + "learning_rate": 0.0009148882331723271, + "loss": 2.7796, + "step": 6631 + }, + { + "epoch": 0.1966610325297275, + "grad_norm": 0.13402003049850464, + "learning_rate": 0.0009148619723878369, + "loss": 2.7619, + "step": 6632 + }, + { + "epoch": 0.19669068588203897, + "grad_norm": 0.1411210298538208, + "learning_rate": 0.0009148357079296726, + "loss": 2.7894, + "step": 6633 + }, + { + "epoch": 0.19672033923435045, + "grad_norm": 0.14824479818344116, + "learning_rate": 0.000914809439798067, + "loss": 2.8184, + "step": 6634 + }, + { + "epoch": 0.19674999258666193, + "grad_norm": 0.1394042670726776, + "learning_rate": 0.0009147831679932525, + "loss": 2.8069, + "step": 6635 + }, + { + "epoch": 0.1967796459389734, + "grad_norm": 0.13254688680171967, + "learning_rate": 0.0009147568925154621, + "loss": 2.791, + "step": 6636 + }, + { + "epoch": 0.19680929929128488, + "grad_norm": 0.15415075421333313, + "learning_rate": 0.0009147306133649281, + "loss": 2.7816, + "step": 6637 + }, + { + "epoch": 0.19683895264359635, + "grad_norm": 0.1613154411315918, + "learning_rate": 0.0009147043305418833, + "loss": 2.8082, + "step": 6638 + }, + { + "epoch": 0.19686860599590783, + "grad_norm": 0.16441293060779572, + "learning_rate": 0.0009146780440465605, + "loss": 2.775, + "step": 6639 + }, + { + "epoch": 0.1968982593482193, + "grad_norm": 0.17566922307014465, + "learning_rate": 0.0009146517538791924, + "loss": 2.7727, + "step": 6640 + }, + { + "epoch": 0.19692791270053078, + "grad_norm": 0.16329775750637054, + "learning_rate": 0.0009146254600400119, + "loss": 2.8109, + "step": 6641 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 0.1621376872062683, + "learning_rate": 0.0009145991625292517, + "loss": 2.7879, + "step": 6642 + }, + { + "epoch": 0.19698721940515376, + "grad_norm": 0.17255502939224243, + "learning_rate": 0.0009145728613471448, + "loss": 2.8079, + "step": 6643 + }, + { + "epoch": 0.19701687275746524, + "grad_norm": 0.17357970774173737, + "learning_rate": 0.0009145465564939239, + "loss": 2.7908, + "step": 6644 + }, + { + "epoch": 0.19704652610977672, + "grad_norm": 0.17130975425243378, + "learning_rate": 0.0009145202479698223, + "loss": 2.8075, + "step": 6645 + }, + { + "epoch": 0.1970761794620882, + "grad_norm": 0.18897469341754913, + "learning_rate": 0.0009144939357750728, + "loss": 2.805, + "step": 6646 + }, + { + "epoch": 0.19710583281439967, + "grad_norm": 0.21292564272880554, + "learning_rate": 0.0009144676199099082, + "loss": 2.7811, + "step": 6647 + }, + { + "epoch": 0.19713548616671114, + "grad_norm": 0.2056148201227188, + "learning_rate": 0.0009144413003745617, + "loss": 2.7363, + "step": 6648 + }, + { + "epoch": 0.19716513951902262, + "grad_norm": 0.17436206340789795, + "learning_rate": 0.0009144149771692664, + "loss": 2.7957, + "step": 6649 + }, + { + "epoch": 0.1971947928713341, + "grad_norm": 0.16162966191768646, + "learning_rate": 0.0009143886502942553, + "loss": 2.7915, + "step": 6650 + }, + { + "epoch": 0.19722444622364557, + "grad_norm": 0.13827449083328247, + "learning_rate": 0.0009143623197497616, + "loss": 2.8236, + "step": 6651 + }, + { + "epoch": 0.19725409957595705, + "grad_norm": 0.1436968296766281, + "learning_rate": 0.0009143359855360184, + "loss": 2.7751, + "step": 6652 + }, + { + "epoch": 0.19728375292826855, + "grad_norm": 0.15310965478420258, + "learning_rate": 0.000914309647653259, + "loss": 2.7906, + "step": 6653 + }, + { + "epoch": 0.19731340628058003, + "grad_norm": 0.14611349999904633, + "learning_rate": 0.0009142833061017163, + "loss": 2.7927, + "step": 6654 + }, + { + "epoch": 0.1973430596328915, + "grad_norm": 0.14223700761795044, + "learning_rate": 0.0009142569608816243, + "loss": 2.812, + "step": 6655 + }, + { + "epoch": 0.19737271298520298, + "grad_norm": 0.1583101749420166, + "learning_rate": 0.0009142306119932154, + "loss": 2.8008, + "step": 6656 + }, + { + "epoch": 0.19740236633751446, + "grad_norm": 0.11849336326122284, + "learning_rate": 0.0009142042594367235, + "loss": 2.7675, + "step": 6657 + }, + { + "epoch": 0.19743201968982593, + "grad_norm": 0.13072237372398376, + "learning_rate": 0.0009141779032123816, + "loss": 2.8093, + "step": 6658 + }, + { + "epoch": 0.1974616730421374, + "grad_norm": 0.14059844613075256, + "learning_rate": 0.0009141515433204235, + "loss": 2.7945, + "step": 6659 + }, + { + "epoch": 0.19749132639444889, + "grad_norm": 0.1634356528520584, + "learning_rate": 0.0009141251797610823, + "loss": 2.778, + "step": 6660 + }, + { + "epoch": 0.19752097974676036, + "grad_norm": 0.15561839938163757, + "learning_rate": 0.0009140988125345915, + "loss": 2.7902, + "step": 6661 + }, + { + "epoch": 0.19755063309907184, + "grad_norm": 0.14265236258506775, + "learning_rate": 0.0009140724416411847, + "loss": 2.8115, + "step": 6662 + }, + { + "epoch": 0.19758028645138334, + "grad_norm": 0.15002872049808502, + "learning_rate": 0.0009140460670810954, + "loss": 2.8128, + "step": 6663 + }, + { + "epoch": 0.19760993980369482, + "grad_norm": 0.15857625007629395, + "learning_rate": 0.0009140196888545571, + "loss": 2.8142, + "step": 6664 + }, + { + "epoch": 0.1976395931560063, + "grad_norm": 0.15163002908229828, + "learning_rate": 0.0009139933069618033, + "loss": 2.8104, + "step": 6665 + }, + { + "epoch": 0.19766924650831777, + "grad_norm": 0.14217126369476318, + "learning_rate": 0.0009139669214030677, + "loss": 2.7796, + "step": 6666 + }, + { + "epoch": 0.19769889986062925, + "grad_norm": 0.14517813920974731, + "learning_rate": 0.0009139405321785841, + "loss": 2.7992, + "step": 6667 + }, + { + "epoch": 0.19772855321294072, + "grad_norm": 0.14870192110538483, + "learning_rate": 0.0009139141392885859, + "loss": 2.8219, + "step": 6668 + }, + { + "epoch": 0.1977582065652522, + "grad_norm": 0.17043666541576385, + "learning_rate": 0.000913887742733307, + "loss": 2.7816, + "step": 6669 + }, + { + "epoch": 0.19778785991756367, + "grad_norm": 0.1796177327632904, + "learning_rate": 0.0009138613425129811, + "loss": 2.7933, + "step": 6670 + }, + { + "epoch": 0.19781751326987515, + "grad_norm": 0.18894100189208984, + "learning_rate": 0.000913834938627842, + "loss": 2.7965, + "step": 6671 + }, + { + "epoch": 0.19784716662218663, + "grad_norm": 0.18497665226459503, + "learning_rate": 0.0009138085310781233, + "loss": 2.7987, + "step": 6672 + }, + { + "epoch": 0.19787681997449813, + "grad_norm": 0.14620855450630188, + "learning_rate": 0.0009137821198640592, + "loss": 2.806, + "step": 6673 + }, + { + "epoch": 0.1979064733268096, + "grad_norm": 0.1353984922170639, + "learning_rate": 0.0009137557049858833, + "loss": 2.8111, + "step": 6674 + }, + { + "epoch": 0.19793612667912108, + "grad_norm": 0.15037207305431366, + "learning_rate": 0.0009137292864438298, + "loss": 2.7865, + "step": 6675 + }, + { + "epoch": 0.19796578003143256, + "grad_norm": 0.14079385995864868, + "learning_rate": 0.0009137028642381323, + "loss": 2.8104, + "step": 6676 + }, + { + "epoch": 0.19799543338374404, + "grad_norm": 0.14766737818717957, + "learning_rate": 0.000913676438369025, + "loss": 2.8348, + "step": 6677 + }, + { + "epoch": 0.1980250867360555, + "grad_norm": 0.1500820368528366, + "learning_rate": 0.0009136500088367418, + "loss": 2.8136, + "step": 6678 + }, + { + "epoch": 0.198054740088367, + "grad_norm": 0.17889554798603058, + "learning_rate": 0.0009136235756415168, + "loss": 2.7921, + "step": 6679 + }, + { + "epoch": 0.19808439344067846, + "grad_norm": 0.19232411682605743, + "learning_rate": 0.000913597138783584, + "loss": 2.7456, + "step": 6680 + }, + { + "epoch": 0.19811404679298994, + "grad_norm": 0.16632984578609467, + "learning_rate": 0.0009135706982631775, + "loss": 2.809, + "step": 6681 + }, + { + "epoch": 0.19814370014530142, + "grad_norm": 0.1827220767736435, + "learning_rate": 0.0009135442540805315, + "loss": 2.7879, + "step": 6682 + }, + { + "epoch": 0.1981733534976129, + "grad_norm": 0.1984982192516327, + "learning_rate": 0.0009135178062358802, + "loss": 2.8134, + "step": 6683 + }, + { + "epoch": 0.1982030068499244, + "grad_norm": 0.16941186785697937, + "learning_rate": 0.0009134913547294576, + "loss": 2.7704, + "step": 6684 + }, + { + "epoch": 0.19823266020223587, + "grad_norm": 0.13571567833423615, + "learning_rate": 0.0009134648995614982, + "loss": 2.7817, + "step": 6685 + }, + { + "epoch": 0.19826231355454735, + "grad_norm": 0.1686936616897583, + "learning_rate": 0.0009134384407322362, + "loss": 2.7796, + "step": 6686 + }, + { + "epoch": 0.19829196690685882, + "grad_norm": 0.1581472009420395, + "learning_rate": 0.0009134119782419058, + "loss": 2.7832, + "step": 6687 + }, + { + "epoch": 0.1983216202591703, + "grad_norm": 0.13781699538230896, + "learning_rate": 0.0009133855120907414, + "loss": 2.8147, + "step": 6688 + }, + { + "epoch": 0.19835127361148178, + "grad_norm": 0.15761080384254456, + "learning_rate": 0.0009133590422789773, + "loss": 2.8087, + "step": 6689 + }, + { + "epoch": 0.19838092696379325, + "grad_norm": 0.14407281577587128, + "learning_rate": 0.0009133325688068479, + "loss": 2.7921, + "step": 6690 + }, + { + "epoch": 0.19841058031610473, + "grad_norm": 0.14983536303043365, + "learning_rate": 0.0009133060916745876, + "loss": 2.7626, + "step": 6691 + }, + { + "epoch": 0.1984402336684162, + "grad_norm": 0.15811526775360107, + "learning_rate": 0.000913279610882431, + "loss": 2.7825, + "step": 6692 + }, + { + "epoch": 0.19846988702072768, + "grad_norm": 0.15218181908130646, + "learning_rate": 0.0009132531264306126, + "loss": 2.8178, + "step": 6693 + }, + { + "epoch": 0.19849954037303918, + "grad_norm": 0.14589543640613556, + "learning_rate": 0.0009132266383193666, + "loss": 2.7668, + "step": 6694 + }, + { + "epoch": 0.19852919372535066, + "grad_norm": 0.16436555981636047, + "learning_rate": 0.000913200146548928, + "loss": 2.8166, + "step": 6695 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 0.16005823016166687, + "learning_rate": 0.000913173651119531, + "loss": 2.8332, + "step": 6696 + }, + { + "epoch": 0.1985885004299736, + "grad_norm": 0.15947416424751282, + "learning_rate": 0.0009131471520314105, + "loss": 2.8157, + "step": 6697 + }, + { + "epoch": 0.1986181537822851, + "grad_norm": 0.1872698962688446, + "learning_rate": 0.0009131206492848012, + "loss": 2.8012, + "step": 6698 + }, + { + "epoch": 0.19864780713459657, + "grad_norm": 0.20343391597270966, + "learning_rate": 0.0009130941428799373, + "loss": 2.7839, + "step": 6699 + }, + { + "epoch": 0.19867746048690804, + "grad_norm": 0.18336835503578186, + "learning_rate": 0.0009130676328170542, + "loss": 2.7877, + "step": 6700 + }, + { + "epoch": 0.19870711383921952, + "grad_norm": 0.18483871221542358, + "learning_rate": 0.0009130411190963861, + "loss": 2.8153, + "step": 6701 + }, + { + "epoch": 0.198736767191531, + "grad_norm": 0.1545083075761795, + "learning_rate": 0.000913014601718168, + "loss": 2.743, + "step": 6702 + }, + { + "epoch": 0.19876642054384247, + "grad_norm": 0.16338148713111877, + "learning_rate": 0.0009129880806826347, + "loss": 2.8144, + "step": 6703 + }, + { + "epoch": 0.19879607389615395, + "grad_norm": 0.1904965192079544, + "learning_rate": 0.0009129615559900213, + "loss": 2.822, + "step": 6704 + }, + { + "epoch": 0.19882572724846545, + "grad_norm": 0.16431191563606262, + "learning_rate": 0.0009129350276405622, + "loss": 2.8309, + "step": 6705 + }, + { + "epoch": 0.19885538060077693, + "grad_norm": 0.16481538116931915, + "learning_rate": 0.0009129084956344927, + "loss": 2.7718, + "step": 6706 + }, + { + "epoch": 0.1988850339530884, + "grad_norm": 0.17817991971969604, + "learning_rate": 0.0009128819599720472, + "loss": 2.7815, + "step": 6707 + }, + { + "epoch": 0.19891468730539988, + "grad_norm": 0.16375118494033813, + "learning_rate": 0.0009128554206534616, + "loss": 2.7739, + "step": 6708 + }, + { + "epoch": 0.19894434065771135, + "grad_norm": 0.1589677631855011, + "learning_rate": 0.00091282887767897, + "loss": 2.7937, + "step": 6709 + }, + { + "epoch": 0.19897399401002283, + "grad_norm": 0.12804493308067322, + "learning_rate": 0.000912802331048808, + "loss": 2.8299, + "step": 6710 + }, + { + "epoch": 0.1990036473623343, + "grad_norm": 0.14611656963825226, + "learning_rate": 0.0009127757807632106, + "loss": 2.8197, + "step": 6711 + }, + { + "epoch": 0.19903330071464578, + "grad_norm": 0.16249164938926697, + "learning_rate": 0.0009127492268224127, + "loss": 2.7884, + "step": 6712 + }, + { + "epoch": 0.19906295406695726, + "grad_norm": 0.1381813883781433, + "learning_rate": 0.0009127226692266495, + "loss": 2.8117, + "step": 6713 + }, + { + "epoch": 0.19909260741926874, + "grad_norm": 0.12993982434272766, + "learning_rate": 0.0009126961079761562, + "loss": 2.8133, + "step": 6714 + }, + { + "epoch": 0.19912226077158024, + "grad_norm": 0.1587754637002945, + "learning_rate": 0.0009126695430711681, + "loss": 2.8163, + "step": 6715 + }, + { + "epoch": 0.19915191412389172, + "grad_norm": 0.1547616422176361, + "learning_rate": 0.0009126429745119203, + "loss": 2.8403, + "step": 6716 + }, + { + "epoch": 0.1991815674762032, + "grad_norm": 0.16004320979118347, + "learning_rate": 0.0009126164022986483, + "loss": 2.7487, + "step": 6717 + }, + { + "epoch": 0.19921122082851467, + "grad_norm": 0.1505814641714096, + "learning_rate": 0.000912589826431587, + "loss": 2.7808, + "step": 6718 + }, + { + "epoch": 0.19924087418082614, + "grad_norm": 0.12199939787387848, + "learning_rate": 0.0009125632469109722, + "loss": 2.8064, + "step": 6719 + }, + { + "epoch": 0.19927052753313762, + "grad_norm": 0.14508213102817535, + "learning_rate": 0.0009125366637370389, + "loss": 2.7702, + "step": 6720 + }, + { + "epoch": 0.1993001808854491, + "grad_norm": 0.15013107657432556, + "learning_rate": 0.0009125100769100226, + "loss": 2.7947, + "step": 6721 + }, + { + "epoch": 0.19932983423776057, + "grad_norm": 0.23685790598392487, + "learning_rate": 0.0009124834864301588, + "loss": 2.7697, + "step": 6722 + }, + { + "epoch": 0.19935948759007205, + "grad_norm": 0.11769643425941467, + "learning_rate": 0.0009124568922976829, + "loss": 2.7854, + "step": 6723 + }, + { + "epoch": 0.19938914094238352, + "grad_norm": 0.1453617811203003, + "learning_rate": 0.0009124302945128305, + "loss": 2.7698, + "step": 6724 + }, + { + "epoch": 0.19941879429469503, + "grad_norm": 0.1485142707824707, + "learning_rate": 0.0009124036930758371, + "loss": 2.8074, + "step": 6725 + }, + { + "epoch": 0.1994484476470065, + "grad_norm": 0.1422424465417862, + "learning_rate": 0.000912377087986938, + "loss": 2.7901, + "step": 6726 + }, + { + "epoch": 0.19947810099931798, + "grad_norm": 0.12826095521450043, + "learning_rate": 0.0009123504792463692, + "loss": 2.8322, + "step": 6727 + }, + { + "epoch": 0.19950775435162946, + "grad_norm": 0.12967351078987122, + "learning_rate": 0.000912323866854366, + "loss": 2.7971, + "step": 6728 + }, + { + "epoch": 0.19953740770394093, + "grad_norm": 0.13897913694381714, + "learning_rate": 0.0009122972508111642, + "loss": 2.7792, + "step": 6729 + }, + { + "epoch": 0.1995670610562524, + "grad_norm": 0.15856961905956268, + "learning_rate": 0.0009122706311169994, + "loss": 2.7753, + "step": 6730 + }, + { + "epoch": 0.19959671440856389, + "grad_norm": 0.13304680585861206, + "learning_rate": 0.0009122440077721077, + "loss": 2.7784, + "step": 6731 + }, + { + "epoch": 0.19962636776087536, + "grad_norm": 0.1366063356399536, + "learning_rate": 0.0009122173807767243, + "loss": 2.7581, + "step": 6732 + }, + { + "epoch": 0.19965602111318684, + "grad_norm": 0.15570011734962463, + "learning_rate": 0.0009121907501310853, + "loss": 2.8058, + "step": 6733 + }, + { + "epoch": 0.19968567446549831, + "grad_norm": 0.1556847095489502, + "learning_rate": 0.0009121641158354264, + "loss": 2.7814, + "step": 6734 + }, + { + "epoch": 0.1997153278178098, + "grad_norm": 0.16068769991397858, + "learning_rate": 0.0009121374778899836, + "loss": 2.7731, + "step": 6735 + }, + { + "epoch": 0.1997449811701213, + "grad_norm": 0.17545783519744873, + "learning_rate": 0.0009121108362949926, + "loss": 2.7777, + "step": 6736 + }, + { + "epoch": 0.19977463452243277, + "grad_norm": 0.1752755045890808, + "learning_rate": 0.0009120841910506894, + "loss": 2.7859, + "step": 6737 + }, + { + "epoch": 0.19980428787474425, + "grad_norm": 0.1974053531885147, + "learning_rate": 0.0009120575421573101, + "loss": 2.7979, + "step": 6738 + }, + { + "epoch": 0.19983394122705572, + "grad_norm": 0.20668953657150269, + "learning_rate": 0.0009120308896150904, + "loss": 2.8025, + "step": 6739 + }, + { + "epoch": 0.1998635945793672, + "grad_norm": 0.1630457490682602, + "learning_rate": 0.0009120042334242665, + "loss": 2.8145, + "step": 6740 + }, + { + "epoch": 0.19989324793167867, + "grad_norm": 0.15351706743240356, + "learning_rate": 0.0009119775735850744, + "loss": 2.7896, + "step": 6741 + }, + { + "epoch": 0.19992290128399015, + "grad_norm": 0.15997695922851562, + "learning_rate": 0.0009119509100977501, + "loss": 2.7997, + "step": 6742 + }, + { + "epoch": 0.19995255463630163, + "grad_norm": 0.14817248284816742, + "learning_rate": 0.0009119242429625298, + "loss": 2.7973, + "step": 6743 + }, + { + "epoch": 0.1999822079886131, + "grad_norm": 0.17166350781917572, + "learning_rate": 0.0009118975721796496, + "loss": 2.7845, + "step": 6744 + }, + { + "epoch": 0.20001186134092458, + "grad_norm": 0.14628367125988007, + "learning_rate": 0.0009118708977493457, + "loss": 2.8105, + "step": 6745 + }, + { + "epoch": 0.20004151469323608, + "grad_norm": 0.15714189410209656, + "learning_rate": 0.0009118442196718545, + "loss": 2.7867, + "step": 6746 + }, + { + "epoch": 0.20007116804554756, + "grad_norm": 0.17080748081207275, + "learning_rate": 0.0009118175379474119, + "loss": 2.7566, + "step": 6747 + }, + { + "epoch": 0.20010082139785904, + "grad_norm": 0.1423821598291397, + "learning_rate": 0.0009117908525762542, + "loss": 2.7675, + "step": 6748 + }, + { + "epoch": 0.2001304747501705, + "grad_norm": 0.15276744961738586, + "learning_rate": 0.0009117641635586181, + "loss": 2.7653, + "step": 6749 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 0.16546879708766937, + "learning_rate": 0.0009117374708947394, + "loss": 2.8142, + "step": 6750 + }, + { + "epoch": 0.20018978145479346, + "grad_norm": 0.19058389961719513, + "learning_rate": 0.0009117107745848547, + "loss": 2.7951, + "step": 6751 + }, + { + "epoch": 0.20021943480710494, + "grad_norm": 0.2009740173816681, + "learning_rate": 0.0009116840746292006, + "loss": 2.7863, + "step": 6752 + }, + { + "epoch": 0.20024908815941642, + "grad_norm": 0.20079566538333893, + "learning_rate": 0.0009116573710280133, + "loss": 2.795, + "step": 6753 + }, + { + "epoch": 0.2002787415117279, + "grad_norm": 0.18387913703918457, + "learning_rate": 0.0009116306637815293, + "loss": 2.7947, + "step": 6754 + }, + { + "epoch": 0.20030839486403937, + "grad_norm": 0.21525384485721588, + "learning_rate": 0.0009116039528899851, + "loss": 2.795, + "step": 6755 + }, + { + "epoch": 0.20033804821635084, + "grad_norm": 0.17394126951694489, + "learning_rate": 0.0009115772383536171, + "loss": 2.8293, + "step": 6756 + }, + { + "epoch": 0.20036770156866235, + "grad_norm": 0.12600256502628326, + "learning_rate": 0.0009115505201726623, + "loss": 2.7564, + "step": 6757 + }, + { + "epoch": 0.20039735492097382, + "grad_norm": 0.16637079417705536, + "learning_rate": 0.0009115237983473569, + "loss": 2.8145, + "step": 6758 + }, + { + "epoch": 0.2004270082732853, + "grad_norm": 0.1453644037246704, + "learning_rate": 0.0009114970728779376, + "loss": 2.8216, + "step": 6759 + }, + { + "epoch": 0.20045666162559678, + "grad_norm": 0.16209067404270172, + "learning_rate": 0.0009114703437646412, + "loss": 2.8095, + "step": 6760 + }, + { + "epoch": 0.20048631497790825, + "grad_norm": 0.16624166071414948, + "learning_rate": 0.000911443611007704, + "loss": 2.8181, + "step": 6761 + }, + { + "epoch": 0.20051596833021973, + "grad_norm": 0.1598643958568573, + "learning_rate": 0.0009114168746073633, + "loss": 2.8235, + "step": 6762 + }, + { + "epoch": 0.2005456216825312, + "grad_norm": 0.14565224945545197, + "learning_rate": 0.0009113901345638554, + "loss": 2.7986, + "step": 6763 + }, + { + "epoch": 0.20057527503484268, + "grad_norm": 0.14199547469615936, + "learning_rate": 0.0009113633908774171, + "loss": 2.814, + "step": 6764 + }, + { + "epoch": 0.20060492838715416, + "grad_norm": 0.1505195051431656, + "learning_rate": 0.0009113366435482857, + "loss": 2.8017, + "step": 6765 + }, + { + "epoch": 0.20063458173946563, + "grad_norm": 0.14525343477725983, + "learning_rate": 0.0009113098925766975, + "loss": 2.7971, + "step": 6766 + }, + { + "epoch": 0.20066423509177714, + "grad_norm": 0.14062215387821198, + "learning_rate": 0.0009112831379628896, + "loss": 2.7633, + "step": 6767 + }, + { + "epoch": 0.20069388844408861, + "grad_norm": 0.1521756947040558, + "learning_rate": 0.0009112563797070989, + "loss": 2.8111, + "step": 6768 + }, + { + "epoch": 0.2007235417964001, + "grad_norm": 0.1435297578573227, + "learning_rate": 0.0009112296178095625, + "loss": 2.7601, + "step": 6769 + }, + { + "epoch": 0.20075319514871157, + "grad_norm": 0.12736549973487854, + "learning_rate": 0.0009112028522705171, + "loss": 2.7726, + "step": 6770 + }, + { + "epoch": 0.20078284850102304, + "grad_norm": 0.13647980988025665, + "learning_rate": 0.0009111760830902001, + "loss": 2.8358, + "step": 6771 + }, + { + "epoch": 0.20081250185333452, + "grad_norm": 0.17152559757232666, + "learning_rate": 0.000911149310268848, + "loss": 2.8145, + "step": 6772 + }, + { + "epoch": 0.200842155205646, + "grad_norm": 0.19318275153636932, + "learning_rate": 0.0009111225338066984, + "loss": 2.8043, + "step": 6773 + }, + { + "epoch": 0.20087180855795747, + "grad_norm": 0.1684049814939499, + "learning_rate": 0.0009110957537039881, + "loss": 2.7924, + "step": 6774 + }, + { + "epoch": 0.20090146191026895, + "grad_norm": 0.153742253780365, + "learning_rate": 0.0009110689699609544, + "loss": 2.7627, + "step": 6775 + }, + { + "epoch": 0.20093111526258042, + "grad_norm": 0.16801518201828003, + "learning_rate": 0.0009110421825778343, + "loss": 2.8085, + "step": 6776 + }, + { + "epoch": 0.20096076861489193, + "grad_norm": 0.14678789675235748, + "learning_rate": 0.0009110153915548654, + "loss": 2.8288, + "step": 6777 + }, + { + "epoch": 0.2009904219672034, + "grad_norm": 0.13960124552249908, + "learning_rate": 0.0009109885968922844, + "loss": 2.777, + "step": 6778 + }, + { + "epoch": 0.20102007531951488, + "grad_norm": 0.15339301526546478, + "learning_rate": 0.0009109617985903289, + "loss": 2.8135, + "step": 6779 + }, + { + "epoch": 0.20104972867182636, + "grad_norm": 0.15386982262134552, + "learning_rate": 0.000910934996649236, + "loss": 2.8062, + "step": 6780 + }, + { + "epoch": 0.20107938202413783, + "grad_norm": 0.16421566903591156, + "learning_rate": 0.0009109081910692434, + "loss": 2.7807, + "step": 6781 + }, + { + "epoch": 0.2011090353764493, + "grad_norm": 0.16129885613918304, + "learning_rate": 0.0009108813818505881, + "loss": 2.7962, + "step": 6782 + }, + { + "epoch": 0.20113868872876078, + "grad_norm": 0.1476738303899765, + "learning_rate": 0.0009108545689935076, + "loss": 2.8018, + "step": 6783 + }, + { + "epoch": 0.20116834208107226, + "grad_norm": 0.15046444535255432, + "learning_rate": 0.0009108277524982394, + "loss": 2.8026, + "step": 6784 + }, + { + "epoch": 0.20119799543338374, + "grad_norm": 0.13832709193229675, + "learning_rate": 0.000910800932365021, + "loss": 2.7751, + "step": 6785 + }, + { + "epoch": 0.2012276487856952, + "grad_norm": 0.14064636826515198, + "learning_rate": 0.0009107741085940897, + "loss": 2.7934, + "step": 6786 + }, + { + "epoch": 0.2012573021380067, + "grad_norm": 0.14954175055027008, + "learning_rate": 0.0009107472811856834, + "loss": 2.7943, + "step": 6787 + }, + { + "epoch": 0.2012869554903182, + "grad_norm": 0.14624746143817902, + "learning_rate": 0.0009107204501400392, + "loss": 2.7725, + "step": 6788 + }, + { + "epoch": 0.20131660884262967, + "grad_norm": 0.1511099934577942, + "learning_rate": 0.0009106936154573951, + "loss": 2.8146, + "step": 6789 + }, + { + "epoch": 0.20134626219494114, + "grad_norm": 0.14337478578090668, + "learning_rate": 0.0009106667771379883, + "loss": 2.7963, + "step": 6790 + }, + { + "epoch": 0.20137591554725262, + "grad_norm": 0.17881284654140472, + "learning_rate": 0.0009106399351820569, + "loss": 2.7722, + "step": 6791 + }, + { + "epoch": 0.2014055688995641, + "grad_norm": 0.1989162117242813, + "learning_rate": 0.0009106130895898383, + "loss": 2.7912, + "step": 6792 + }, + { + "epoch": 0.20143522225187557, + "grad_norm": 0.22895070910453796, + "learning_rate": 0.0009105862403615703, + "loss": 2.7939, + "step": 6793 + }, + { + "epoch": 0.20146487560418705, + "grad_norm": 0.23768660426139832, + "learning_rate": 0.0009105593874974907, + "loss": 2.7737, + "step": 6794 + }, + { + "epoch": 0.20149452895649853, + "grad_norm": 0.18003474175930023, + "learning_rate": 0.0009105325309978372, + "loss": 2.8054, + "step": 6795 + }, + { + "epoch": 0.20152418230881, + "grad_norm": 0.19095779955387115, + "learning_rate": 0.0009105056708628477, + "loss": 2.8311, + "step": 6796 + }, + { + "epoch": 0.20155383566112148, + "grad_norm": 0.16031514108181, + "learning_rate": 0.0009104788070927601, + "loss": 2.7694, + "step": 6797 + }, + { + "epoch": 0.20158348901343298, + "grad_norm": 0.1501225382089615, + "learning_rate": 0.0009104519396878121, + "loss": 2.7998, + "step": 6798 + }, + { + "epoch": 0.20161314236574446, + "grad_norm": 0.17628705501556396, + "learning_rate": 0.0009104250686482418, + "loss": 2.7858, + "step": 6799 + }, + { + "epoch": 0.20164279571805593, + "grad_norm": 0.16925202310085297, + "learning_rate": 0.0009103981939742869, + "loss": 2.7732, + "step": 6800 + }, + { + "epoch": 0.2016724490703674, + "grad_norm": 0.15695247054100037, + "learning_rate": 0.0009103713156661858, + "loss": 2.821, + "step": 6801 + }, + { + "epoch": 0.20170210242267889, + "grad_norm": 0.14728772640228271, + "learning_rate": 0.0009103444337241761, + "loss": 2.7908, + "step": 6802 + }, + { + "epoch": 0.20173175577499036, + "grad_norm": 0.13961099088191986, + "learning_rate": 0.0009103175481484961, + "loss": 2.779, + "step": 6803 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 0.14039547741413116, + "learning_rate": 0.0009102906589393836, + "loss": 2.7712, + "step": 6804 + }, + { + "epoch": 0.20179106247961331, + "grad_norm": 0.12531901895999908, + "learning_rate": 0.0009102637660970772, + "loss": 2.7938, + "step": 6805 + }, + { + "epoch": 0.2018207158319248, + "grad_norm": 0.12872843444347382, + "learning_rate": 0.0009102368696218144, + "loss": 2.7905, + "step": 6806 + }, + { + "epoch": 0.20185036918423627, + "grad_norm": 0.13083498179912567, + "learning_rate": 0.000910209969513834, + "loss": 2.7572, + "step": 6807 + }, + { + "epoch": 0.20188002253654774, + "grad_norm": 0.1560780555009842, + "learning_rate": 0.0009101830657733736, + "loss": 2.7783, + "step": 6808 + }, + { + "epoch": 0.20190967588885925, + "grad_norm": 0.16897061467170715, + "learning_rate": 0.0009101561584006719, + "loss": 2.7856, + "step": 6809 + }, + { + "epoch": 0.20193932924117072, + "grad_norm": 0.1629425436258316, + "learning_rate": 0.0009101292473959671, + "loss": 2.7728, + "step": 6810 + }, + { + "epoch": 0.2019689825934822, + "grad_norm": 0.1593228578567505, + "learning_rate": 0.0009101023327594972, + "loss": 2.8132, + "step": 6811 + }, + { + "epoch": 0.20199863594579368, + "grad_norm": 0.16076216101646423, + "learning_rate": 0.000910075414491501, + "loss": 2.7669, + "step": 6812 + }, + { + "epoch": 0.20202828929810515, + "grad_norm": 0.15429557859897614, + "learning_rate": 0.0009100484925922166, + "loss": 2.7962, + "step": 6813 + }, + { + "epoch": 0.20205794265041663, + "grad_norm": 0.15539875626564026, + "learning_rate": 0.0009100215670618823, + "loss": 2.7601, + "step": 6814 + }, + { + "epoch": 0.2020875960027281, + "grad_norm": 0.1441596895456314, + "learning_rate": 0.0009099946379007367, + "loss": 2.8002, + "step": 6815 + }, + { + "epoch": 0.20211724935503958, + "grad_norm": 0.139064222574234, + "learning_rate": 0.0009099677051090181, + "loss": 2.7921, + "step": 6816 + }, + { + "epoch": 0.20214690270735106, + "grad_norm": 0.14041122794151306, + "learning_rate": 0.0009099407686869651, + "loss": 2.817, + "step": 6817 + }, + { + "epoch": 0.20217655605966253, + "grad_norm": 0.15354377031326294, + "learning_rate": 0.0009099138286348163, + "loss": 2.7607, + "step": 6818 + }, + { + "epoch": 0.20220620941197404, + "grad_norm": 0.13759788870811462, + "learning_rate": 0.0009098868849528101, + "loss": 2.7859, + "step": 6819 + }, + { + "epoch": 0.2022358627642855, + "grad_norm": 0.1435764580965042, + "learning_rate": 0.0009098599376411853, + "loss": 2.7785, + "step": 6820 + }, + { + "epoch": 0.202265516116597, + "grad_norm": 0.15353283286094666, + "learning_rate": 0.0009098329867001804, + "loss": 2.7495, + "step": 6821 + }, + { + "epoch": 0.20229516946890846, + "grad_norm": 0.16359516978263855, + "learning_rate": 0.0009098060321300341, + "loss": 2.8093, + "step": 6822 + }, + { + "epoch": 0.20232482282121994, + "grad_norm": 0.16797859966754913, + "learning_rate": 0.0009097790739309848, + "loss": 2.7745, + "step": 6823 + }, + { + "epoch": 0.20235447617353142, + "grad_norm": 0.15184137225151062, + "learning_rate": 0.0009097521121032717, + "loss": 2.7729, + "step": 6824 + }, + { + "epoch": 0.2023841295258429, + "grad_norm": 0.1350530982017517, + "learning_rate": 0.0009097251466471332, + "loss": 2.7979, + "step": 6825 + }, + { + "epoch": 0.20241378287815437, + "grad_norm": 0.14670905470848083, + "learning_rate": 0.0009096981775628082, + "loss": 2.803, + "step": 6826 + }, + { + "epoch": 0.20244343623046585, + "grad_norm": 0.15041938424110413, + "learning_rate": 0.0009096712048505355, + "loss": 2.7806, + "step": 6827 + }, + { + "epoch": 0.20247308958277732, + "grad_norm": 0.16901685297489166, + "learning_rate": 0.0009096442285105542, + "loss": 2.7832, + "step": 6828 + }, + { + "epoch": 0.20250274293508883, + "grad_norm": 0.17950180172920227, + "learning_rate": 0.0009096172485431027, + "loss": 2.7592, + "step": 6829 + }, + { + "epoch": 0.2025323962874003, + "grad_norm": 0.15803831815719604, + "learning_rate": 0.0009095902649484202, + "loss": 2.7887, + "step": 6830 + }, + { + "epoch": 0.20256204963971178, + "grad_norm": 0.1549506038427353, + "learning_rate": 0.0009095632777267456, + "loss": 2.7806, + "step": 6831 + }, + { + "epoch": 0.20259170299202325, + "grad_norm": 0.14260883629322052, + "learning_rate": 0.0009095362868783179, + "loss": 2.8007, + "step": 6832 + }, + { + "epoch": 0.20262135634433473, + "grad_norm": 0.1406385898590088, + "learning_rate": 0.0009095092924033761, + "loss": 2.7763, + "step": 6833 + }, + { + "epoch": 0.2026510096966462, + "grad_norm": 0.14592613279819489, + "learning_rate": 0.0009094822943021591, + "loss": 2.7887, + "step": 6834 + }, + { + "epoch": 0.20268066304895768, + "grad_norm": 0.13275378942489624, + "learning_rate": 0.0009094552925749062, + "loss": 2.7839, + "step": 6835 + }, + { + "epoch": 0.20271031640126916, + "grad_norm": 0.13771472871303558, + "learning_rate": 0.0009094282872218564, + "loss": 2.8111, + "step": 6836 + }, + { + "epoch": 0.20273996975358063, + "grad_norm": 0.15009430050849915, + "learning_rate": 0.0009094012782432488, + "loss": 2.8147, + "step": 6837 + }, + { + "epoch": 0.2027696231058921, + "grad_norm": 0.17072248458862305, + "learning_rate": 0.0009093742656393227, + "loss": 2.8117, + "step": 6838 + }, + { + "epoch": 0.2027992764582036, + "grad_norm": 0.20042093098163605, + "learning_rate": 0.0009093472494103171, + "loss": 2.7752, + "step": 6839 + }, + { + "epoch": 0.2028289298105151, + "grad_norm": 0.19746077060699463, + "learning_rate": 0.0009093202295564714, + "loss": 2.7483, + "step": 6840 + }, + { + "epoch": 0.20285858316282657, + "grad_norm": 0.16413000226020813, + "learning_rate": 0.0009092932060780248, + "loss": 2.7671, + "step": 6841 + }, + { + "epoch": 0.20288823651513804, + "grad_norm": 0.15408112108707428, + "learning_rate": 0.0009092661789752165, + "loss": 2.7867, + "step": 6842 + }, + { + "epoch": 0.20291788986744952, + "grad_norm": 0.15917696058750153, + "learning_rate": 0.0009092391482482861, + "loss": 2.7783, + "step": 6843 + }, + { + "epoch": 0.202947543219761, + "grad_norm": 0.17840595543384552, + "learning_rate": 0.0009092121138974727, + "loss": 2.8054, + "step": 6844 + }, + { + "epoch": 0.20297719657207247, + "grad_norm": 0.16193895041942596, + "learning_rate": 0.0009091850759230158, + "loss": 2.8004, + "step": 6845 + }, + { + "epoch": 0.20300684992438395, + "grad_norm": 0.1512862741947174, + "learning_rate": 0.0009091580343251549, + "loss": 2.8064, + "step": 6846 + }, + { + "epoch": 0.20303650327669542, + "grad_norm": 0.1492607444524765, + "learning_rate": 0.0009091309891041294, + "loss": 2.7809, + "step": 6847 + }, + { + "epoch": 0.2030661566290069, + "grad_norm": 0.158922016620636, + "learning_rate": 0.0009091039402601786, + "loss": 2.7889, + "step": 6848 + }, + { + "epoch": 0.20309580998131838, + "grad_norm": 0.1569708287715912, + "learning_rate": 0.0009090768877935422, + "loss": 2.827, + "step": 6849 + }, + { + "epoch": 0.20312546333362988, + "grad_norm": 0.14177227020263672, + "learning_rate": 0.00090904983170446, + "loss": 2.8047, + "step": 6850 + }, + { + "epoch": 0.20315511668594136, + "grad_norm": 0.13826300203800201, + "learning_rate": 0.000909022771993171, + "loss": 2.7574, + "step": 6851 + }, + { + "epoch": 0.20318477003825283, + "grad_norm": 0.15110532939434052, + "learning_rate": 0.0009089957086599154, + "loss": 2.7984, + "step": 6852 + }, + { + "epoch": 0.2032144233905643, + "grad_norm": 0.15185290575027466, + "learning_rate": 0.0009089686417049325, + "loss": 2.7709, + "step": 6853 + }, + { + "epoch": 0.20324407674287578, + "grad_norm": 0.15407241880893707, + "learning_rate": 0.000908941571128462, + "loss": 2.7696, + "step": 6854 + }, + { + "epoch": 0.20327373009518726, + "grad_norm": 0.14807280898094177, + "learning_rate": 0.0009089144969307437, + "loss": 2.8084, + "step": 6855 + }, + { + "epoch": 0.20330338344749874, + "grad_norm": 0.12098043411970139, + "learning_rate": 0.0009088874191120175, + "loss": 2.7573, + "step": 6856 + }, + { + "epoch": 0.2033330367998102, + "grad_norm": 0.11622253060340881, + "learning_rate": 0.0009088603376725228, + "loss": 2.7824, + "step": 6857 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 0.12894946336746216, + "learning_rate": 0.0009088332526124999, + "loss": 2.7637, + "step": 6858 + }, + { + "epoch": 0.20339234350443317, + "grad_norm": 0.13623909652233124, + "learning_rate": 0.0009088061639321881, + "loss": 2.8032, + "step": 6859 + }, + { + "epoch": 0.20342199685674464, + "grad_norm": 0.14022137224674225, + "learning_rate": 0.0009087790716318276, + "loss": 2.7924, + "step": 6860 + }, + { + "epoch": 0.20345165020905615, + "grad_norm": 0.1635662466287613, + "learning_rate": 0.0009087519757116585, + "loss": 2.7911, + "step": 6861 + }, + { + "epoch": 0.20348130356136762, + "grad_norm": 0.1588258594274521, + "learning_rate": 0.0009087248761719202, + "loss": 2.7823, + "step": 6862 + }, + { + "epoch": 0.2035109569136791, + "grad_norm": 0.17086894810199738, + "learning_rate": 0.000908697773012853, + "loss": 2.7932, + "step": 6863 + }, + { + "epoch": 0.20354061026599057, + "grad_norm": 0.2253570258617401, + "learning_rate": 0.0009086706662346971, + "loss": 2.7987, + "step": 6864 + }, + { + "epoch": 0.20357026361830205, + "grad_norm": 0.2191690355539322, + "learning_rate": 0.0009086435558376921, + "loss": 2.7915, + "step": 6865 + }, + { + "epoch": 0.20359991697061353, + "grad_norm": 0.1931920051574707, + "learning_rate": 0.0009086164418220784, + "loss": 2.7836, + "step": 6866 + }, + { + "epoch": 0.203629570322925, + "grad_norm": 0.1995297521352768, + "learning_rate": 0.0009085893241880958, + "loss": 2.7775, + "step": 6867 + }, + { + "epoch": 0.20365922367523648, + "grad_norm": 0.17439991235733032, + "learning_rate": 0.0009085622029359847, + "loss": 2.7999, + "step": 6868 + }, + { + "epoch": 0.20368887702754795, + "grad_norm": 0.17615707218647003, + "learning_rate": 0.0009085350780659851, + "loss": 2.7728, + "step": 6869 + }, + { + "epoch": 0.20371853037985943, + "grad_norm": 0.1497037410736084, + "learning_rate": 0.0009085079495783374, + "loss": 2.7596, + "step": 6870 + }, + { + "epoch": 0.20374818373217093, + "grad_norm": 0.16049312055110931, + "learning_rate": 0.0009084808174732815, + "loss": 2.8087, + "step": 6871 + }, + { + "epoch": 0.2037778370844824, + "grad_norm": 0.17188826203346252, + "learning_rate": 0.000908453681751058, + "loss": 2.7907, + "step": 6872 + }, + { + "epoch": 0.2038074904367939, + "grad_norm": 0.16068191826343536, + "learning_rate": 0.0009084265424119069, + "loss": 2.8261, + "step": 6873 + }, + { + "epoch": 0.20383714378910536, + "grad_norm": 0.16830262541770935, + "learning_rate": 0.0009083993994560689, + "loss": 2.7774, + "step": 6874 + }, + { + "epoch": 0.20386679714141684, + "grad_norm": 0.15593311190605164, + "learning_rate": 0.0009083722528837839, + "loss": 2.773, + "step": 6875 + }, + { + "epoch": 0.20389645049372832, + "grad_norm": 0.14576758444309235, + "learning_rate": 0.0009083451026952926, + "loss": 2.8363, + "step": 6876 + }, + { + "epoch": 0.2039261038460398, + "grad_norm": 0.13129067420959473, + "learning_rate": 0.0009083179488908353, + "loss": 2.7726, + "step": 6877 + }, + { + "epoch": 0.20395575719835127, + "grad_norm": 0.12613679468631744, + "learning_rate": 0.0009082907914706524, + "loss": 2.7966, + "step": 6878 + }, + { + "epoch": 0.20398541055066274, + "grad_norm": 0.1427369862794876, + "learning_rate": 0.0009082636304349845, + "loss": 2.778, + "step": 6879 + }, + { + "epoch": 0.20401506390297422, + "grad_norm": 0.135462686419487, + "learning_rate": 0.0009082364657840721, + "loss": 2.7904, + "step": 6880 + }, + { + "epoch": 0.20404471725528572, + "grad_norm": 0.1395183503627777, + "learning_rate": 0.0009082092975181557, + "loss": 2.7671, + "step": 6881 + }, + { + "epoch": 0.2040743706075972, + "grad_norm": 0.12538178265094757, + "learning_rate": 0.000908182125637476, + "loss": 2.7668, + "step": 6882 + }, + { + "epoch": 0.20410402395990868, + "grad_norm": 0.133860245347023, + "learning_rate": 0.0009081549501422734, + "loss": 2.7616, + "step": 6883 + }, + { + "epoch": 0.20413367731222015, + "grad_norm": 0.170717254281044, + "learning_rate": 0.0009081277710327886, + "loss": 2.781, + "step": 6884 + }, + { + "epoch": 0.20416333066453163, + "grad_norm": 0.12655699253082275, + "learning_rate": 0.0009081005883092625, + "loss": 2.7626, + "step": 6885 + }, + { + "epoch": 0.2041929840168431, + "grad_norm": 0.14193573594093323, + "learning_rate": 0.0009080734019719357, + "loss": 2.7763, + "step": 6886 + }, + { + "epoch": 0.20422263736915458, + "grad_norm": 0.15315797924995422, + "learning_rate": 0.0009080462120210486, + "loss": 2.7684, + "step": 6887 + }, + { + "epoch": 0.20425229072146606, + "grad_norm": 0.14064358174800873, + "learning_rate": 0.0009080190184568424, + "loss": 2.7771, + "step": 6888 + }, + { + "epoch": 0.20428194407377753, + "grad_norm": 0.13355207443237305, + "learning_rate": 0.000907991821279558, + "loss": 2.8172, + "step": 6889 + }, + { + "epoch": 0.204311597426089, + "grad_norm": 0.14628452062606812, + "learning_rate": 0.0009079646204894356, + "loss": 2.7779, + "step": 6890 + }, + { + "epoch": 0.20434125077840049, + "grad_norm": 0.1718171089887619, + "learning_rate": 0.0009079374160867167, + "loss": 2.7923, + "step": 6891 + }, + { + "epoch": 0.204370904130712, + "grad_norm": 0.18652483820915222, + "learning_rate": 0.0009079102080716418, + "loss": 2.7648, + "step": 6892 + }, + { + "epoch": 0.20440055748302347, + "grad_norm": 0.1968497782945633, + "learning_rate": 0.0009078829964444521, + "loss": 2.7798, + "step": 6893 + }, + { + "epoch": 0.20443021083533494, + "grad_norm": 0.1633245050907135, + "learning_rate": 0.0009078557812053884, + "loss": 2.7487, + "step": 6894 + }, + { + "epoch": 0.20445986418764642, + "grad_norm": 0.15613755583763123, + "learning_rate": 0.0009078285623546918, + "loss": 2.7724, + "step": 6895 + }, + { + "epoch": 0.2044895175399579, + "grad_norm": 0.1820656955242157, + "learning_rate": 0.0009078013398926032, + "loss": 2.803, + "step": 6896 + }, + { + "epoch": 0.20451917089226937, + "grad_norm": 0.16688179969787598, + "learning_rate": 0.0009077741138193638, + "loss": 2.8027, + "step": 6897 + }, + { + "epoch": 0.20454882424458085, + "grad_norm": 0.15048235654830933, + "learning_rate": 0.0009077468841352146, + "loss": 2.8042, + "step": 6898 + }, + { + "epoch": 0.20457847759689232, + "grad_norm": 0.16237521171569824, + "learning_rate": 0.0009077196508403967, + "loss": 2.8041, + "step": 6899 + }, + { + "epoch": 0.2046081309492038, + "grad_norm": 0.17461028695106506, + "learning_rate": 0.0009076924139351514, + "loss": 2.7831, + "step": 6900 + }, + { + "epoch": 0.20463778430151527, + "grad_norm": 0.16341473162174225, + "learning_rate": 0.0009076651734197198, + "loss": 2.7818, + "step": 6901 + }, + { + "epoch": 0.20466743765382678, + "grad_norm": 0.17498190701007843, + "learning_rate": 0.0009076379292943431, + "loss": 2.7811, + "step": 6902 + }, + { + "epoch": 0.20469709100613825, + "grad_norm": 0.15292391180992126, + "learning_rate": 0.0009076106815592624, + "loss": 2.7944, + "step": 6903 + }, + { + "epoch": 0.20472674435844973, + "grad_norm": 0.13904321193695068, + "learning_rate": 0.0009075834302147194, + "loss": 2.7618, + "step": 6904 + }, + { + "epoch": 0.2047563977107612, + "grad_norm": 0.14870357513427734, + "learning_rate": 0.0009075561752609552, + "loss": 2.7655, + "step": 6905 + }, + { + "epoch": 0.20478605106307268, + "grad_norm": 0.13426393270492554, + "learning_rate": 0.0009075289166982108, + "loss": 2.8148, + "step": 6906 + }, + { + "epoch": 0.20481570441538416, + "grad_norm": 0.13880695402622223, + "learning_rate": 0.0009075016545267281, + "loss": 2.7971, + "step": 6907 + }, + { + "epoch": 0.20484535776769563, + "grad_norm": 0.14353646337985992, + "learning_rate": 0.0009074743887467482, + "loss": 2.7922, + "step": 6908 + }, + { + "epoch": 0.2048750111200071, + "grad_norm": 0.1470014750957489, + "learning_rate": 0.0009074471193585128, + "loss": 2.8012, + "step": 6909 + }, + { + "epoch": 0.2049046644723186, + "grad_norm": 0.16579017043113708, + "learning_rate": 0.0009074198463622632, + "loss": 2.7949, + "step": 6910 + }, + { + "epoch": 0.20493431782463006, + "grad_norm": 0.16620056331157684, + "learning_rate": 0.0009073925697582408, + "loss": 2.8024, + "step": 6911 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 0.1666831225156784, + "learning_rate": 0.0009073652895466873, + "loss": 2.7861, + "step": 6912 + }, + { + "epoch": 0.20499362452925304, + "grad_norm": 0.1588447391986847, + "learning_rate": 0.0009073380057278442, + "loss": 2.7985, + "step": 6913 + }, + { + "epoch": 0.20502327788156452, + "grad_norm": 0.14904798567295074, + "learning_rate": 0.0009073107183019532, + "loss": 2.8034, + "step": 6914 + }, + { + "epoch": 0.205052931233876, + "grad_norm": 0.17395605146884918, + "learning_rate": 0.0009072834272692558, + "loss": 2.8, + "step": 6915 + }, + { + "epoch": 0.20508258458618747, + "grad_norm": 0.18610519170761108, + "learning_rate": 0.000907256132629994, + "loss": 2.7819, + "step": 6916 + }, + { + "epoch": 0.20511223793849895, + "grad_norm": 0.18991857767105103, + "learning_rate": 0.000907228834384409, + "loss": 2.7748, + "step": 6917 + }, + { + "epoch": 0.20514189129081042, + "grad_norm": 0.15572690963745117, + "learning_rate": 0.0009072015325327429, + "loss": 2.8148, + "step": 6918 + }, + { + "epoch": 0.2051715446431219, + "grad_norm": 0.1368672400712967, + "learning_rate": 0.0009071742270752373, + "loss": 2.7871, + "step": 6919 + }, + { + "epoch": 0.20520119799543338, + "grad_norm": 0.13959425687789917, + "learning_rate": 0.0009071469180121339, + "loss": 2.7954, + "step": 6920 + }, + { + "epoch": 0.20523085134774485, + "grad_norm": 0.14566683769226074, + "learning_rate": 0.0009071196053436748, + "loss": 2.8296, + "step": 6921 + }, + { + "epoch": 0.20526050470005633, + "grad_norm": 0.13225550949573517, + "learning_rate": 0.0009070922890701017, + "loss": 2.8122, + "step": 6922 + }, + { + "epoch": 0.20529015805236783, + "grad_norm": 0.13538944721221924, + "learning_rate": 0.0009070649691916564, + "loss": 2.7718, + "step": 6923 + }, + { + "epoch": 0.2053198114046793, + "grad_norm": 0.16131778061389923, + "learning_rate": 0.000907037645708581, + "loss": 2.7738, + "step": 6924 + }, + { + "epoch": 0.20534946475699078, + "grad_norm": 0.15642181038856506, + "learning_rate": 0.0009070103186211174, + "loss": 2.7851, + "step": 6925 + }, + { + "epoch": 0.20537911810930226, + "grad_norm": 0.15249289572238922, + "learning_rate": 0.0009069829879295075, + "loss": 2.8033, + "step": 6926 + }, + { + "epoch": 0.20540877146161374, + "grad_norm": 0.1661258339881897, + "learning_rate": 0.0009069556536339935, + "loss": 2.7861, + "step": 6927 + }, + { + "epoch": 0.2054384248139252, + "grad_norm": 0.17306533455848694, + "learning_rate": 0.0009069283157348172, + "loss": 2.7861, + "step": 6928 + }, + { + "epoch": 0.2054680781662367, + "grad_norm": 0.16893571615219116, + "learning_rate": 0.0009069009742322208, + "loss": 2.7853, + "step": 6929 + }, + { + "epoch": 0.20549773151854817, + "grad_norm": 0.168619766831398, + "learning_rate": 0.0009068736291264466, + "loss": 2.7846, + "step": 6930 + }, + { + "epoch": 0.20552738487085964, + "grad_norm": 0.1679241955280304, + "learning_rate": 0.0009068462804177364, + "loss": 2.8398, + "step": 6931 + }, + { + "epoch": 0.20555703822317112, + "grad_norm": 0.16438744962215424, + "learning_rate": 0.0009068189281063326, + "loss": 2.8229, + "step": 6932 + }, + { + "epoch": 0.20558669157548262, + "grad_norm": 0.17447973787784576, + "learning_rate": 0.0009067915721924775, + "loss": 2.8006, + "step": 6933 + }, + { + "epoch": 0.2056163449277941, + "grad_norm": 0.17512176930904388, + "learning_rate": 0.0009067642126764131, + "loss": 2.7631, + "step": 6934 + }, + { + "epoch": 0.20564599828010557, + "grad_norm": 0.1796824187040329, + "learning_rate": 0.0009067368495583818, + "loss": 2.8148, + "step": 6935 + }, + { + "epoch": 0.20567565163241705, + "grad_norm": 0.18780489265918732, + "learning_rate": 0.0009067094828386258, + "loss": 2.7689, + "step": 6936 + }, + { + "epoch": 0.20570530498472853, + "grad_norm": 0.18504200875759125, + "learning_rate": 0.0009066821125173877, + "loss": 2.7981, + "step": 6937 + }, + { + "epoch": 0.20573495833704, + "grad_norm": 0.1730486899614334, + "learning_rate": 0.0009066547385949095, + "loss": 2.7899, + "step": 6938 + }, + { + "epoch": 0.20576461168935148, + "grad_norm": 0.17082692682743073, + "learning_rate": 0.0009066273610714337, + "loss": 2.7666, + "step": 6939 + }, + { + "epoch": 0.20579426504166295, + "grad_norm": 0.16380852460861206, + "learning_rate": 0.0009065999799472031, + "loss": 2.7703, + "step": 6940 + }, + { + "epoch": 0.20582391839397443, + "grad_norm": 0.14805427193641663, + "learning_rate": 0.0009065725952224597, + "loss": 2.7846, + "step": 6941 + }, + { + "epoch": 0.2058535717462859, + "grad_norm": 0.1445765346288681, + "learning_rate": 0.0009065452068974463, + "loss": 2.7905, + "step": 6942 + }, + { + "epoch": 0.20588322509859738, + "grad_norm": 0.14487212896347046, + "learning_rate": 0.0009065178149724051, + "loss": 2.7964, + "step": 6943 + }, + { + "epoch": 0.2059128784509089, + "grad_norm": 0.12810258567333221, + "learning_rate": 0.0009064904194475791, + "loss": 2.8183, + "step": 6944 + }, + { + "epoch": 0.20594253180322036, + "grad_norm": 0.14600834250450134, + "learning_rate": 0.0009064630203232107, + "loss": 2.7784, + "step": 6945 + }, + { + "epoch": 0.20597218515553184, + "grad_norm": 0.16220775246620178, + "learning_rate": 0.0009064356175995423, + "loss": 2.8003, + "step": 6946 + }, + { + "epoch": 0.20600183850784332, + "grad_norm": 0.15122294425964355, + "learning_rate": 0.0009064082112768168, + "loss": 2.7849, + "step": 6947 + }, + { + "epoch": 0.2060314918601548, + "grad_norm": 0.15242119133472443, + "learning_rate": 0.0009063808013552768, + "loss": 2.7913, + "step": 6948 + }, + { + "epoch": 0.20606114521246627, + "grad_norm": 0.1642361432313919, + "learning_rate": 0.0009063533878351651, + "loss": 2.8068, + "step": 6949 + }, + { + "epoch": 0.20609079856477774, + "grad_norm": 0.17020446062088013, + "learning_rate": 0.0009063259707167244, + "loss": 2.7611, + "step": 6950 + }, + { + "epoch": 0.20612045191708922, + "grad_norm": 0.17355689406394958, + "learning_rate": 0.0009062985500001976, + "loss": 2.7672, + "step": 6951 + }, + { + "epoch": 0.2061501052694007, + "grad_norm": 0.15702661871910095, + "learning_rate": 0.0009062711256858271, + "loss": 2.7749, + "step": 6952 + }, + { + "epoch": 0.20617975862171217, + "grad_norm": 0.1339688003063202, + "learning_rate": 0.0009062436977738563, + "loss": 2.7838, + "step": 6953 + }, + { + "epoch": 0.20620941197402368, + "grad_norm": 0.13912269473075867, + "learning_rate": 0.0009062162662645278, + "loss": 2.7535, + "step": 6954 + }, + { + "epoch": 0.20623906532633515, + "grad_norm": 0.1552734673023224, + "learning_rate": 0.0009061888311580844, + "loss": 2.7942, + "step": 6955 + }, + { + "epoch": 0.20626871867864663, + "grad_norm": 0.13971258699893951, + "learning_rate": 0.0009061613924547694, + "loss": 2.7879, + "step": 6956 + }, + { + "epoch": 0.2062983720309581, + "grad_norm": 0.1534062922000885, + "learning_rate": 0.0009061339501548253, + "loss": 2.828, + "step": 6957 + }, + { + "epoch": 0.20632802538326958, + "grad_norm": 0.1718013882637024, + "learning_rate": 0.0009061065042584955, + "loss": 2.7987, + "step": 6958 + }, + { + "epoch": 0.20635767873558106, + "grad_norm": 0.1500110775232315, + "learning_rate": 0.0009060790547660229, + "loss": 2.772, + "step": 6959 + }, + { + "epoch": 0.20638733208789253, + "grad_norm": 0.14180070161819458, + "learning_rate": 0.0009060516016776506, + "loss": 2.8125, + "step": 6960 + }, + { + "epoch": 0.206416985440204, + "grad_norm": 0.15087655186653137, + "learning_rate": 0.0009060241449936216, + "loss": 2.7796, + "step": 6961 + }, + { + "epoch": 0.20644663879251549, + "grad_norm": 0.12799780070781708, + "learning_rate": 0.0009059966847141791, + "loss": 2.7506, + "step": 6962 + }, + { + "epoch": 0.20647629214482696, + "grad_norm": 0.13051638007164001, + "learning_rate": 0.0009059692208395662, + "loss": 2.7829, + "step": 6963 + }, + { + "epoch": 0.20650594549713844, + "grad_norm": 0.14397525787353516, + "learning_rate": 0.0009059417533700263, + "loss": 2.8161, + "step": 6964 + }, + { + "epoch": 0.20653559884944994, + "grad_norm": 0.14639171957969666, + "learning_rate": 0.0009059142823058024, + "loss": 2.8063, + "step": 6965 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 0.14481280744075775, + "learning_rate": 0.0009058868076471379, + "loss": 2.8038, + "step": 6966 + }, + { + "epoch": 0.2065949055540729, + "grad_norm": 0.1503281146287918, + "learning_rate": 0.000905859329394276, + "loss": 2.7896, + "step": 6967 + }, + { + "epoch": 0.20662455890638437, + "grad_norm": 0.16328376531600952, + "learning_rate": 0.0009058318475474602, + "loss": 2.7855, + "step": 6968 + }, + { + "epoch": 0.20665421225869585, + "grad_norm": 0.20674821734428406, + "learning_rate": 0.0009058043621069336, + "loss": 2.8351, + "step": 6969 + }, + { + "epoch": 0.20668386561100732, + "grad_norm": 0.23890742659568787, + "learning_rate": 0.0009057768730729399, + "loss": 2.8095, + "step": 6970 + }, + { + "epoch": 0.2067135189633188, + "grad_norm": 0.23734936118125916, + "learning_rate": 0.0009057493804457221, + "loss": 2.784, + "step": 6971 + }, + { + "epoch": 0.20674317231563027, + "grad_norm": 0.1646779626607895, + "learning_rate": 0.0009057218842255239, + "loss": 2.7729, + "step": 6972 + }, + { + "epoch": 0.20677282566794175, + "grad_norm": 0.1608525812625885, + "learning_rate": 0.000905694384412589, + "loss": 2.7724, + "step": 6973 + }, + { + "epoch": 0.20680247902025323, + "grad_norm": 0.17522911727428436, + "learning_rate": 0.0009056668810071605, + "loss": 2.778, + "step": 6974 + }, + { + "epoch": 0.20683213237256473, + "grad_norm": 0.14946086704730988, + "learning_rate": 0.0009056393740094823, + "loss": 2.774, + "step": 6975 + }, + { + "epoch": 0.2068617857248762, + "grad_norm": 0.16086305677890778, + "learning_rate": 0.0009056118634197976, + "loss": 2.8309, + "step": 6976 + }, + { + "epoch": 0.20689143907718768, + "grad_norm": 0.1477574110031128, + "learning_rate": 0.0009055843492383504, + "loss": 2.773, + "step": 6977 + }, + { + "epoch": 0.20692109242949916, + "grad_norm": 0.11786015331745148, + "learning_rate": 0.0009055568314653841, + "loss": 2.8396, + "step": 6978 + }, + { + "epoch": 0.20695074578181064, + "grad_norm": 0.13458341360092163, + "learning_rate": 0.0009055293101011424, + "loss": 2.7752, + "step": 6979 + }, + { + "epoch": 0.2069803991341221, + "grad_norm": 0.12914018332958221, + "learning_rate": 0.0009055017851458691, + "loss": 2.7768, + "step": 6980 + }, + { + "epoch": 0.2070100524864336, + "grad_norm": 0.1321297287940979, + "learning_rate": 0.000905474256599808, + "loss": 2.7953, + "step": 6981 + }, + { + "epoch": 0.20703970583874506, + "grad_norm": 0.12695921957492828, + "learning_rate": 0.0009054467244632025, + "loss": 2.8124, + "step": 6982 + }, + { + "epoch": 0.20706935919105654, + "grad_norm": 0.11241493374109268, + "learning_rate": 0.000905419188736297, + "loss": 2.7831, + "step": 6983 + }, + { + "epoch": 0.20709901254336802, + "grad_norm": 0.12214686721563339, + "learning_rate": 0.0009053916494193347, + "loss": 2.7938, + "step": 6984 + }, + { + "epoch": 0.20712866589567952, + "grad_norm": 0.13510537147521973, + "learning_rate": 0.0009053641065125599, + "loss": 2.8021, + "step": 6985 + }, + { + "epoch": 0.207158319247991, + "grad_norm": 0.13305692374706268, + "learning_rate": 0.0009053365600162163, + "loss": 2.7511, + "step": 6986 + }, + { + "epoch": 0.20718797260030247, + "grad_norm": 0.15102402865886688, + "learning_rate": 0.0009053090099305479, + "loss": 2.78, + "step": 6987 + }, + { + "epoch": 0.20721762595261395, + "grad_norm": 0.1644282341003418, + "learning_rate": 0.0009052814562557987, + "loss": 2.7803, + "step": 6988 + }, + { + "epoch": 0.20724727930492542, + "grad_norm": 0.17953844368457794, + "learning_rate": 0.0009052538989922126, + "loss": 2.7596, + "step": 6989 + }, + { + "epoch": 0.2072769326572369, + "grad_norm": 0.1995958834886551, + "learning_rate": 0.0009052263381400336, + "loss": 2.8103, + "step": 6990 + }, + { + "epoch": 0.20730658600954838, + "grad_norm": 0.18150591850280762, + "learning_rate": 0.000905198773699506, + "loss": 2.8127, + "step": 6991 + }, + { + "epoch": 0.20733623936185985, + "grad_norm": 0.16292250156402588, + "learning_rate": 0.0009051712056708735, + "loss": 2.7646, + "step": 6992 + }, + { + "epoch": 0.20736589271417133, + "grad_norm": 0.17407697439193726, + "learning_rate": 0.0009051436340543806, + "loss": 2.7938, + "step": 6993 + }, + { + "epoch": 0.2073955460664828, + "grad_norm": 0.15804490447044373, + "learning_rate": 0.0009051160588502712, + "loss": 2.778, + "step": 6994 + }, + { + "epoch": 0.20742519941879428, + "grad_norm": 0.15905551612377167, + "learning_rate": 0.0009050884800587896, + "loss": 2.7917, + "step": 6995 + }, + { + "epoch": 0.20745485277110579, + "grad_norm": 0.17169281840324402, + "learning_rate": 0.0009050608976801798, + "loss": 2.8246, + "step": 6996 + }, + { + "epoch": 0.20748450612341726, + "grad_norm": 0.16425800323486328, + "learning_rate": 0.0009050333117146864, + "loss": 2.7864, + "step": 6997 + }, + { + "epoch": 0.20751415947572874, + "grad_norm": 0.18875597417354584, + "learning_rate": 0.0009050057221625533, + "loss": 2.7956, + "step": 6998 + }, + { + "epoch": 0.20754381282804021, + "grad_norm": 0.19126856327056885, + "learning_rate": 0.0009049781290240254, + "loss": 2.8019, + "step": 6999 + }, + { + "epoch": 0.2075734661803517, + "grad_norm": 0.16964994370937347, + "learning_rate": 0.0009049505322993463, + "loss": 2.7762, + "step": 7000 + }, + { + "epoch": 0.20760311953266317, + "grad_norm": 0.1990709900856018, + "learning_rate": 0.0009049229319887609, + "loss": 2.7941, + "step": 7001 + }, + { + "epoch": 0.20763277288497464, + "grad_norm": 0.18085356056690216, + "learning_rate": 0.0009048953280925134, + "loss": 2.7811, + "step": 7002 + }, + { + "epoch": 0.20766242623728612, + "grad_norm": 0.1531788408756256, + "learning_rate": 0.0009048677206108482, + "loss": 2.7792, + "step": 7003 + }, + { + "epoch": 0.2076920795895976, + "grad_norm": 0.1522994339466095, + "learning_rate": 0.00090484010954401, + "loss": 2.7932, + "step": 7004 + }, + { + "epoch": 0.20772173294190907, + "grad_norm": 0.12115843594074249, + "learning_rate": 0.0009048124948922429, + "loss": 2.8062, + "step": 7005 + }, + { + "epoch": 0.20775138629422057, + "grad_norm": 0.1364111304283142, + "learning_rate": 0.0009047848766557917, + "loss": 2.7923, + "step": 7006 + }, + { + "epoch": 0.20778103964653205, + "grad_norm": 0.13594216108322144, + "learning_rate": 0.0009047572548349012, + "loss": 2.7611, + "step": 7007 + }, + { + "epoch": 0.20781069299884353, + "grad_norm": 0.13665032386779785, + "learning_rate": 0.0009047296294298155, + "loss": 2.7694, + "step": 7008 + }, + { + "epoch": 0.207840346351155, + "grad_norm": 0.11514446139335632, + "learning_rate": 0.0009047020004407795, + "loss": 2.7728, + "step": 7009 + }, + { + "epoch": 0.20786999970346648, + "grad_norm": 0.14380769431591034, + "learning_rate": 0.000904674367868038, + "loss": 2.7684, + "step": 7010 + }, + { + "epoch": 0.20789965305577796, + "grad_norm": 0.16740195453166962, + "learning_rate": 0.0009046467317118353, + "loss": 2.7883, + "step": 7011 + }, + { + "epoch": 0.20792930640808943, + "grad_norm": 0.1506134569644928, + "learning_rate": 0.0009046190919724164, + "loss": 2.7888, + "step": 7012 + }, + { + "epoch": 0.2079589597604009, + "grad_norm": 0.15641793608665466, + "learning_rate": 0.0009045914486500259, + "loss": 2.7777, + "step": 7013 + }, + { + "epoch": 0.20798861311271238, + "grad_norm": 0.1484629064798355, + "learning_rate": 0.0009045638017449089, + "loss": 2.8088, + "step": 7014 + }, + { + "epoch": 0.20801826646502386, + "grad_norm": 0.14318443834781647, + "learning_rate": 0.0009045361512573098, + "loss": 2.8142, + "step": 7015 + }, + { + "epoch": 0.20804791981733534, + "grad_norm": 0.1691257357597351, + "learning_rate": 0.0009045084971874737, + "loss": 2.7608, + "step": 7016 + }, + { + "epoch": 0.20807757316964684, + "grad_norm": 0.15573032200336456, + "learning_rate": 0.0009044808395356455, + "loss": 2.8025, + "step": 7017 + }, + { + "epoch": 0.20810722652195832, + "grad_norm": 0.1255567967891693, + "learning_rate": 0.00090445317830207, + "loss": 2.7687, + "step": 7018 + }, + { + "epoch": 0.2081368798742698, + "grad_norm": 0.12274730205535889, + "learning_rate": 0.0009044255134869921, + "loss": 2.7811, + "step": 7019 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 0.1387208253145218, + "learning_rate": 0.0009043978450906569, + "loss": 2.7706, + "step": 7020 + }, + { + "epoch": 0.20819618657889274, + "grad_norm": 0.1465868502855301, + "learning_rate": 0.0009043701731133094, + "loss": 2.7927, + "step": 7021 + }, + { + "epoch": 0.20822583993120422, + "grad_norm": 0.1546890139579773, + "learning_rate": 0.0009043424975551946, + "loss": 2.7769, + "step": 7022 + }, + { + "epoch": 0.2082554932835157, + "grad_norm": 0.17365871369838715, + "learning_rate": 0.0009043148184165575, + "loss": 2.8336, + "step": 7023 + }, + { + "epoch": 0.20828514663582717, + "grad_norm": 0.16554191708564758, + "learning_rate": 0.0009042871356976434, + "loss": 2.7875, + "step": 7024 + }, + { + "epoch": 0.20831479998813865, + "grad_norm": 0.14687374234199524, + "learning_rate": 0.0009042594493986972, + "loss": 2.7936, + "step": 7025 + }, + { + "epoch": 0.20834445334045013, + "grad_norm": 0.15654729306697845, + "learning_rate": 0.0009042317595199643, + "loss": 2.8048, + "step": 7026 + }, + { + "epoch": 0.20837410669276163, + "grad_norm": 0.16952309012413025, + "learning_rate": 0.0009042040660616897, + "loss": 2.7988, + "step": 7027 + }, + { + "epoch": 0.2084037600450731, + "grad_norm": 0.1956280618906021, + "learning_rate": 0.0009041763690241187, + "loss": 2.8161, + "step": 7028 + }, + { + "epoch": 0.20843341339738458, + "grad_norm": 0.22359497845172882, + "learning_rate": 0.0009041486684074967, + "loss": 2.7782, + "step": 7029 + }, + { + "epoch": 0.20846306674969606, + "grad_norm": 0.19275225698947906, + "learning_rate": 0.0009041209642120687, + "loss": 2.8347, + "step": 7030 + }, + { + "epoch": 0.20849272010200753, + "grad_norm": 0.1649741232395172, + "learning_rate": 0.0009040932564380804, + "loss": 2.7874, + "step": 7031 + }, + { + "epoch": 0.208522373454319, + "grad_norm": 0.2048693746328354, + "learning_rate": 0.0009040655450857768, + "loss": 2.7639, + "step": 7032 + }, + { + "epoch": 0.20855202680663049, + "grad_norm": 0.16377629339694977, + "learning_rate": 0.0009040378301554034, + "loss": 2.7784, + "step": 7033 + }, + { + "epoch": 0.20858168015894196, + "grad_norm": 0.14039361476898193, + "learning_rate": 0.0009040101116472057, + "loss": 2.7698, + "step": 7034 + }, + { + "epoch": 0.20861133351125344, + "grad_norm": 0.16522587835788727, + "learning_rate": 0.0009039823895614292, + "loss": 2.7901, + "step": 7035 + }, + { + "epoch": 0.20864098686356491, + "grad_norm": 0.14199087023735046, + "learning_rate": 0.0009039546638983192, + "loss": 2.7975, + "step": 7036 + }, + { + "epoch": 0.20867064021587642, + "grad_norm": 0.13658416271209717, + "learning_rate": 0.0009039269346581214, + "loss": 2.7868, + "step": 7037 + }, + { + "epoch": 0.2087002935681879, + "grad_norm": 0.1388077735900879, + "learning_rate": 0.0009038992018410813, + "loss": 2.7602, + "step": 7038 + }, + { + "epoch": 0.20872994692049937, + "grad_norm": 0.1288076639175415, + "learning_rate": 0.0009038714654474443, + "loss": 2.8184, + "step": 7039 + }, + { + "epoch": 0.20875960027281085, + "grad_norm": 0.11785583943128586, + "learning_rate": 0.0009038437254774563, + "loss": 2.7762, + "step": 7040 + }, + { + "epoch": 0.20878925362512232, + "grad_norm": 0.12602370977401733, + "learning_rate": 0.0009038159819313627, + "loss": 2.7953, + "step": 7041 + }, + { + "epoch": 0.2088189069774338, + "grad_norm": 0.12910118699073792, + "learning_rate": 0.0009037882348094093, + "loss": 2.8144, + "step": 7042 + }, + { + "epoch": 0.20884856032974528, + "grad_norm": 0.11692705005407333, + "learning_rate": 0.0009037604841118416, + "loss": 2.7602, + "step": 7043 + }, + { + "epoch": 0.20887821368205675, + "grad_norm": 0.11976955831050873, + "learning_rate": 0.0009037327298389058, + "loss": 2.7827, + "step": 7044 + }, + { + "epoch": 0.20890786703436823, + "grad_norm": 0.13901203870773315, + "learning_rate": 0.0009037049719908473, + "loss": 2.746, + "step": 7045 + }, + { + "epoch": 0.2089375203866797, + "grad_norm": 0.15528535842895508, + "learning_rate": 0.0009036772105679118, + "loss": 2.7822, + "step": 7046 + }, + { + "epoch": 0.20896717373899118, + "grad_norm": 0.147485613822937, + "learning_rate": 0.0009036494455703455, + "loss": 2.7966, + "step": 7047 + }, + { + "epoch": 0.20899682709130268, + "grad_norm": 0.16060520708560944, + "learning_rate": 0.0009036216769983939, + "loss": 2.789, + "step": 7048 + }, + { + "epoch": 0.20902648044361416, + "grad_norm": 0.16884300112724304, + "learning_rate": 0.0009035939048523032, + "loss": 2.7841, + "step": 7049 + }, + { + "epoch": 0.20905613379592564, + "grad_norm": 0.15641534328460693, + "learning_rate": 0.0009035661291323192, + "loss": 2.7619, + "step": 7050 + }, + { + "epoch": 0.2090857871482371, + "grad_norm": 0.15024767816066742, + "learning_rate": 0.0009035383498386878, + "loss": 2.7945, + "step": 7051 + }, + { + "epoch": 0.2091154405005486, + "grad_norm": 0.16829468309879303, + "learning_rate": 0.000903510566971655, + "loss": 2.7977, + "step": 7052 + }, + { + "epoch": 0.20914509385286006, + "grad_norm": 0.18368709087371826, + "learning_rate": 0.000903482780531467, + "loss": 2.7875, + "step": 7053 + }, + { + "epoch": 0.20917474720517154, + "grad_norm": 0.1869412660598755, + "learning_rate": 0.0009034549905183695, + "loss": 2.7744, + "step": 7054 + }, + { + "epoch": 0.20920440055748302, + "grad_norm": 0.20417799055576324, + "learning_rate": 0.0009034271969326092, + "loss": 2.8091, + "step": 7055 + }, + { + "epoch": 0.2092340539097945, + "grad_norm": 0.18174844980239868, + "learning_rate": 0.0009033993997744314, + "loss": 2.7858, + "step": 7056 + }, + { + "epoch": 0.20926370726210597, + "grad_norm": 0.14651024341583252, + "learning_rate": 0.0009033715990440829, + "loss": 2.7961, + "step": 7057 + }, + { + "epoch": 0.20929336061441747, + "grad_norm": 0.1587921530008316, + "learning_rate": 0.0009033437947418095, + "loss": 2.7997, + "step": 7058 + }, + { + "epoch": 0.20932301396672895, + "grad_norm": 0.17042799293994904, + "learning_rate": 0.0009033159868678577, + "loss": 2.755, + "step": 7059 + }, + { + "epoch": 0.20935266731904043, + "grad_norm": 0.17815297842025757, + "learning_rate": 0.0009032881754224737, + "loss": 2.7327, + "step": 7060 + }, + { + "epoch": 0.2093823206713519, + "grad_norm": 0.15096338093280792, + "learning_rate": 0.0009032603604059035, + "loss": 2.7762, + "step": 7061 + }, + { + "epoch": 0.20941197402366338, + "grad_norm": 0.15177400410175323, + "learning_rate": 0.0009032325418183937, + "loss": 2.767, + "step": 7062 + }, + { + "epoch": 0.20944162737597485, + "grad_norm": 0.1333780139684677, + "learning_rate": 0.0009032047196601905, + "loss": 2.7906, + "step": 7063 + }, + { + "epoch": 0.20947128072828633, + "grad_norm": 0.12781769037246704, + "learning_rate": 0.0009031768939315402, + "loss": 2.7883, + "step": 7064 + }, + { + "epoch": 0.2095009340805978, + "grad_norm": 0.14734195172786713, + "learning_rate": 0.0009031490646326894, + "loss": 2.7215, + "step": 7065 + }, + { + "epoch": 0.20953058743290928, + "grad_norm": 0.13457825779914856, + "learning_rate": 0.0009031212317638843, + "loss": 2.8003, + "step": 7066 + }, + { + "epoch": 0.20956024078522076, + "grad_norm": 0.1347389817237854, + "learning_rate": 0.0009030933953253717, + "loss": 2.8122, + "step": 7067 + }, + { + "epoch": 0.20958989413753223, + "grad_norm": 0.12842071056365967, + "learning_rate": 0.0009030655553173978, + "loss": 2.7892, + "step": 7068 + }, + { + "epoch": 0.20961954748984374, + "grad_norm": 0.13151681423187256, + "learning_rate": 0.0009030377117402092, + "loss": 2.7715, + "step": 7069 + }, + { + "epoch": 0.20964920084215521, + "grad_norm": 0.13686832785606384, + "learning_rate": 0.0009030098645940526, + "loss": 2.7841, + "step": 7070 + }, + { + "epoch": 0.2096788541944667, + "grad_norm": 0.14738836884498596, + "learning_rate": 0.0009029820138791744, + "loss": 2.8039, + "step": 7071 + }, + { + "epoch": 0.20970850754677817, + "grad_norm": 0.17553524672985077, + "learning_rate": 0.0009029541595958211, + "loss": 2.804, + "step": 7072 + }, + { + "epoch": 0.20973816089908964, + "grad_norm": 0.1888781040906906, + "learning_rate": 0.0009029263017442397, + "loss": 2.8005, + "step": 7073 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 0.18558292090892792, + "learning_rate": 0.000902898440324677, + "loss": 2.8118, + "step": 7074 + }, + { + "epoch": 0.2097974676037126, + "grad_norm": 0.18306943774223328, + "learning_rate": 0.0009028705753373791, + "loss": 2.8098, + "step": 7075 + }, + { + "epoch": 0.20982712095602407, + "grad_norm": 0.17039895057678223, + "learning_rate": 0.0009028427067825933, + "loss": 2.727, + "step": 7076 + }, + { + "epoch": 0.20985677430833555, + "grad_norm": 0.1985904425382614, + "learning_rate": 0.000902814834660566, + "loss": 2.7832, + "step": 7077 + }, + { + "epoch": 0.20988642766064702, + "grad_norm": 0.19520418345928192, + "learning_rate": 0.0009027869589715442, + "loss": 2.7978, + "step": 7078 + }, + { + "epoch": 0.20991608101295853, + "grad_norm": 0.16322147846221924, + "learning_rate": 0.0009027590797157749, + "loss": 2.7845, + "step": 7079 + }, + { + "epoch": 0.20994573436527, + "grad_norm": 0.18821747601032257, + "learning_rate": 0.0009027311968935048, + "loss": 2.7786, + "step": 7080 + }, + { + "epoch": 0.20997538771758148, + "grad_norm": 0.18755413591861725, + "learning_rate": 0.0009027033105049809, + "loss": 2.7848, + "step": 7081 + }, + { + "epoch": 0.21000504106989296, + "grad_norm": 0.19071592390537262, + "learning_rate": 0.00090267542055045, + "loss": 2.79, + "step": 7082 + }, + { + "epoch": 0.21003469442220443, + "grad_norm": 0.17248985171318054, + "learning_rate": 0.000902647527030159, + "loss": 2.8144, + "step": 7083 + }, + { + "epoch": 0.2100643477745159, + "grad_norm": 0.16327713429927826, + "learning_rate": 0.000902619629944355, + "loss": 2.7977, + "step": 7084 + }, + { + "epoch": 0.21009400112682738, + "grad_norm": 0.14017386734485626, + "learning_rate": 0.0009025917292932853, + "loss": 2.7762, + "step": 7085 + }, + { + "epoch": 0.21012365447913886, + "grad_norm": 0.12032225728034973, + "learning_rate": 0.0009025638250771966, + "loss": 2.793, + "step": 7086 + }, + { + "epoch": 0.21015330783145034, + "grad_norm": 0.15389034152030945, + "learning_rate": 0.0009025359172963361, + "loss": 2.7979, + "step": 7087 + }, + { + "epoch": 0.2101829611837618, + "grad_norm": 0.14905846118927002, + "learning_rate": 0.000902508005950951, + "loss": 2.8141, + "step": 7088 + }, + { + "epoch": 0.21021261453607332, + "grad_norm": 0.14237470924854279, + "learning_rate": 0.0009024800910412884, + "loss": 2.7424, + "step": 7089 + }, + { + "epoch": 0.2102422678883848, + "grad_norm": 0.1638885736465454, + "learning_rate": 0.0009024521725675956, + "loss": 2.7933, + "step": 7090 + }, + { + "epoch": 0.21027192124069627, + "grad_norm": 0.16687941551208496, + "learning_rate": 0.0009024242505301196, + "loss": 2.7669, + "step": 7091 + }, + { + "epoch": 0.21030157459300775, + "grad_norm": 0.1767580509185791, + "learning_rate": 0.0009023963249291078, + "loss": 2.7982, + "step": 7092 + }, + { + "epoch": 0.21033122794531922, + "grad_norm": 0.173883855342865, + "learning_rate": 0.0009023683957648077, + "loss": 2.7716, + "step": 7093 + }, + { + "epoch": 0.2103608812976307, + "grad_norm": 0.16268184781074524, + "learning_rate": 0.0009023404630374661, + "loss": 2.7701, + "step": 7094 + }, + { + "epoch": 0.21039053464994217, + "grad_norm": 0.1669096052646637, + "learning_rate": 0.0009023125267473308, + "loss": 2.7989, + "step": 7095 + }, + { + "epoch": 0.21042018800225365, + "grad_norm": 0.15552255511283875, + "learning_rate": 0.000902284586894649, + "loss": 2.7527, + "step": 7096 + }, + { + "epoch": 0.21044984135456513, + "grad_norm": 0.16074281930923462, + "learning_rate": 0.0009022566434796679, + "loss": 2.7804, + "step": 7097 + }, + { + "epoch": 0.2104794947068766, + "grad_norm": 0.15389280021190643, + "learning_rate": 0.0009022286965026356, + "loss": 2.7782, + "step": 7098 + }, + { + "epoch": 0.21050914805918808, + "grad_norm": 0.11302872747182846, + "learning_rate": 0.0009022007459637989, + "loss": 2.7873, + "step": 7099 + }, + { + "epoch": 0.21053880141149958, + "grad_norm": 0.12245526909828186, + "learning_rate": 0.0009021727918634055, + "loss": 2.7959, + "step": 7100 + }, + { + "epoch": 0.21056845476381106, + "grad_norm": 0.11811500042676926, + "learning_rate": 0.0009021448342017032, + "loss": 2.7636, + "step": 7101 + }, + { + "epoch": 0.21059810811612253, + "grad_norm": 0.11690552532672882, + "learning_rate": 0.000902116872978939, + "loss": 2.7701, + "step": 7102 + }, + { + "epoch": 0.210627761468434, + "grad_norm": 0.10816527158021927, + "learning_rate": 0.0009020889081953611, + "loss": 2.7692, + "step": 7103 + }, + { + "epoch": 0.2106574148207455, + "grad_norm": 0.12250029295682907, + "learning_rate": 0.000902060939851217, + "loss": 2.7689, + "step": 7104 + }, + { + "epoch": 0.21068706817305696, + "grad_norm": 0.14525744318962097, + "learning_rate": 0.0009020329679467543, + "loss": 2.7821, + "step": 7105 + }, + { + "epoch": 0.21071672152536844, + "grad_norm": 0.1672394722700119, + "learning_rate": 0.0009020049924822204, + "loss": 2.8046, + "step": 7106 + }, + { + "epoch": 0.21074637487767992, + "grad_norm": 0.18019206821918488, + "learning_rate": 0.0009019770134578635, + "loss": 2.7969, + "step": 7107 + }, + { + "epoch": 0.2107760282299914, + "grad_norm": 0.170858696103096, + "learning_rate": 0.0009019490308739311, + "loss": 2.7993, + "step": 7108 + }, + { + "epoch": 0.21080568158230287, + "grad_norm": 0.1523047536611557, + "learning_rate": 0.000901921044730671, + "loss": 2.8211, + "step": 7109 + }, + { + "epoch": 0.21083533493461437, + "grad_norm": 0.1311853528022766, + "learning_rate": 0.000901893055028331, + "loss": 2.7713, + "step": 7110 + }, + { + "epoch": 0.21086498828692585, + "grad_norm": 0.13084293901920319, + "learning_rate": 0.000901865061767159, + "loss": 2.7548, + "step": 7111 + }, + { + "epoch": 0.21089464163923732, + "grad_norm": 0.16061492264270782, + "learning_rate": 0.0009018370649474031, + "loss": 2.7889, + "step": 7112 + }, + { + "epoch": 0.2109242949915488, + "grad_norm": 0.14259348809719086, + "learning_rate": 0.0009018090645693109, + "loss": 2.812, + "step": 7113 + }, + { + "epoch": 0.21095394834386028, + "grad_norm": 0.1250423789024353, + "learning_rate": 0.0009017810606331305, + "loss": 2.8063, + "step": 7114 + }, + { + "epoch": 0.21098360169617175, + "grad_norm": 0.12362867593765259, + "learning_rate": 0.0009017530531391098, + "loss": 2.7835, + "step": 7115 + }, + { + "epoch": 0.21101325504848323, + "grad_norm": 0.14604492485523224, + "learning_rate": 0.0009017250420874968, + "loss": 2.7672, + "step": 7116 + }, + { + "epoch": 0.2110429084007947, + "grad_norm": 0.1556716114282608, + "learning_rate": 0.0009016970274785396, + "loss": 2.7822, + "step": 7117 + }, + { + "epoch": 0.21107256175310618, + "grad_norm": 0.1895604431629181, + "learning_rate": 0.0009016690093124865, + "loss": 2.778, + "step": 7118 + }, + { + "epoch": 0.21110221510541766, + "grad_norm": 0.20363575220108032, + "learning_rate": 0.0009016409875895852, + "loss": 2.7945, + "step": 7119 + }, + { + "epoch": 0.21113186845772913, + "grad_norm": 0.1763579100370407, + "learning_rate": 0.0009016129623100839, + "loss": 2.7779, + "step": 7120 + }, + { + "epoch": 0.21116152181004064, + "grad_norm": 0.1515868455171585, + "learning_rate": 0.000901584933474231, + "loss": 2.8182, + "step": 7121 + }, + { + "epoch": 0.2111911751623521, + "grad_norm": 0.1490892618894577, + "learning_rate": 0.0009015569010822746, + "loss": 2.8008, + "step": 7122 + }, + { + "epoch": 0.2112208285146636, + "grad_norm": 0.17163005471229553, + "learning_rate": 0.000901528865134463, + "loss": 2.8165, + "step": 7123 + }, + { + "epoch": 0.21125048186697507, + "grad_norm": 0.17474764585494995, + "learning_rate": 0.0009015008256310442, + "loss": 2.7795, + "step": 7124 + }, + { + "epoch": 0.21128013521928654, + "grad_norm": 0.1615917682647705, + "learning_rate": 0.0009014727825722668, + "loss": 2.8085, + "step": 7125 + }, + { + "epoch": 0.21130978857159802, + "grad_norm": 0.14523404836654663, + "learning_rate": 0.0009014447359583789, + "loss": 2.7839, + "step": 7126 + }, + { + "epoch": 0.2113394419239095, + "grad_norm": 0.1516755074262619, + "learning_rate": 0.0009014166857896291, + "loss": 2.7706, + "step": 7127 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 0.1717355102300644, + "learning_rate": 0.0009013886320662656, + "loss": 2.7652, + "step": 7128 + }, + { + "epoch": 0.21139874862853245, + "grad_norm": 0.16365906596183777, + "learning_rate": 0.0009013605747885367, + "loss": 2.7807, + "step": 7129 + }, + { + "epoch": 0.21142840198084392, + "grad_norm": 0.15182021260261536, + "learning_rate": 0.0009013325139566911, + "loss": 2.7952, + "step": 7130 + }, + { + "epoch": 0.21145805533315543, + "grad_norm": 0.1347564458847046, + "learning_rate": 0.0009013044495709772, + "loss": 2.8134, + "step": 7131 + }, + { + "epoch": 0.2114877086854669, + "grad_norm": 0.12863239645957947, + "learning_rate": 0.0009012763816316436, + "loss": 2.7686, + "step": 7132 + }, + { + "epoch": 0.21151736203777838, + "grad_norm": 0.1424695998430252, + "learning_rate": 0.0009012483101389388, + "loss": 2.7603, + "step": 7133 + }, + { + "epoch": 0.21154701539008985, + "grad_norm": 0.16902470588684082, + "learning_rate": 0.0009012202350931112, + "loss": 2.8018, + "step": 7134 + }, + { + "epoch": 0.21157666874240133, + "grad_norm": 0.20125602185726166, + "learning_rate": 0.0009011921564944096, + "loss": 2.78, + "step": 7135 + }, + { + "epoch": 0.2116063220947128, + "grad_norm": 0.19742657244205475, + "learning_rate": 0.0009011640743430827, + "loss": 2.8168, + "step": 7136 + }, + { + "epoch": 0.21163597544702428, + "grad_norm": 0.1919735223054886, + "learning_rate": 0.0009011359886393789, + "loss": 2.8209, + "step": 7137 + }, + { + "epoch": 0.21166562879933576, + "grad_norm": 0.19400008022785187, + "learning_rate": 0.0009011078993835471, + "loss": 2.7961, + "step": 7138 + }, + { + "epoch": 0.21169528215164723, + "grad_norm": 0.196564182639122, + "learning_rate": 0.0009010798065758361, + "loss": 2.7833, + "step": 7139 + }, + { + "epoch": 0.2117249355039587, + "grad_norm": 0.1511424034833908, + "learning_rate": 0.0009010517102164944, + "loss": 2.7974, + "step": 7140 + }, + { + "epoch": 0.21175458885627021, + "grad_norm": 0.16629059612751007, + "learning_rate": 0.000901023610305771, + "loss": 2.784, + "step": 7141 + }, + { + "epoch": 0.2117842422085817, + "grad_norm": 0.16692590713500977, + "learning_rate": 0.0009009955068439148, + "loss": 2.7756, + "step": 7142 + }, + { + "epoch": 0.21181389556089317, + "grad_norm": 0.14512652158737183, + "learning_rate": 0.0009009673998311745, + "loss": 2.7962, + "step": 7143 + }, + { + "epoch": 0.21184354891320464, + "grad_norm": 0.1546863615512848, + "learning_rate": 0.0009009392892677991, + "loss": 2.8211, + "step": 7144 + }, + { + "epoch": 0.21187320226551612, + "grad_norm": 0.15835972130298615, + "learning_rate": 0.0009009111751540374, + "loss": 2.8001, + "step": 7145 + }, + { + "epoch": 0.2119028556178276, + "grad_norm": 0.15760906040668488, + "learning_rate": 0.0009008830574901385, + "loss": 2.7744, + "step": 7146 + }, + { + "epoch": 0.21193250897013907, + "grad_norm": 0.14496269822120667, + "learning_rate": 0.0009008549362763512, + "loss": 2.7731, + "step": 7147 + }, + { + "epoch": 0.21196216232245055, + "grad_norm": 0.15087635815143585, + "learning_rate": 0.0009008268115129248, + "loss": 2.755, + "step": 7148 + }, + { + "epoch": 0.21199181567476202, + "grad_norm": 0.16607226431369781, + "learning_rate": 0.000900798683200108, + "loss": 2.7515, + "step": 7149 + }, + { + "epoch": 0.2120214690270735, + "grad_norm": 0.15659582614898682, + "learning_rate": 0.0009007705513381503, + "loss": 2.7759, + "step": 7150 + }, + { + "epoch": 0.21205112237938498, + "grad_norm": 0.14311161637306213, + "learning_rate": 0.0009007424159273004, + "loss": 2.79, + "step": 7151 + }, + { + "epoch": 0.21208077573169648, + "grad_norm": 0.13779820501804352, + "learning_rate": 0.0009007142769678076, + "loss": 2.76, + "step": 7152 + }, + { + "epoch": 0.21211042908400796, + "grad_norm": 0.1366264969110489, + "learning_rate": 0.0009006861344599212, + "loss": 2.8002, + "step": 7153 + }, + { + "epoch": 0.21214008243631943, + "grad_norm": 0.13564705848693848, + "learning_rate": 0.0009006579884038902, + "loss": 2.7654, + "step": 7154 + }, + { + "epoch": 0.2121697357886309, + "grad_norm": 0.13766586780548096, + "learning_rate": 0.0009006298387999641, + "loss": 2.7635, + "step": 7155 + }, + { + "epoch": 0.21219938914094238, + "grad_norm": 0.1425999253988266, + "learning_rate": 0.0009006016856483918, + "loss": 2.7927, + "step": 7156 + }, + { + "epoch": 0.21222904249325386, + "grad_norm": 0.14889977872371674, + "learning_rate": 0.000900573528949423, + "loss": 2.7776, + "step": 7157 + }, + { + "epoch": 0.21225869584556534, + "grad_norm": 0.15262246131896973, + "learning_rate": 0.0009005453687033067, + "loss": 2.7509, + "step": 7158 + }, + { + "epoch": 0.2122883491978768, + "grad_norm": 0.16172830760478973, + "learning_rate": 0.0009005172049102925, + "loss": 2.7774, + "step": 7159 + }, + { + "epoch": 0.2123180025501883, + "grad_norm": 0.1647965908050537, + "learning_rate": 0.0009004890375706296, + "loss": 2.7905, + "step": 7160 + }, + { + "epoch": 0.21234765590249977, + "grad_norm": 0.15223157405853271, + "learning_rate": 0.0009004608666845677, + "loss": 2.78, + "step": 7161 + }, + { + "epoch": 0.21237730925481127, + "grad_norm": 0.1287045180797577, + "learning_rate": 0.000900432692252356, + "loss": 2.7573, + "step": 7162 + }, + { + "epoch": 0.21240696260712275, + "grad_norm": 0.13201908767223358, + "learning_rate": 0.0009004045142742441, + "loss": 2.7765, + "step": 7163 + }, + { + "epoch": 0.21243661595943422, + "grad_norm": 0.14876939356327057, + "learning_rate": 0.0009003763327504815, + "loss": 2.7807, + "step": 7164 + }, + { + "epoch": 0.2124662693117457, + "grad_norm": 0.14219987392425537, + "learning_rate": 0.0009003481476813175, + "loss": 2.7792, + "step": 7165 + }, + { + "epoch": 0.21249592266405717, + "grad_norm": 0.16350609064102173, + "learning_rate": 0.0009003199590670023, + "loss": 2.7621, + "step": 7166 + }, + { + "epoch": 0.21252557601636865, + "grad_norm": 0.1820799559354782, + "learning_rate": 0.000900291766907785, + "loss": 2.7906, + "step": 7167 + }, + { + "epoch": 0.21255522936868013, + "grad_norm": 0.16340866684913635, + "learning_rate": 0.0009002635712039153, + "loss": 2.8106, + "step": 7168 + }, + { + "epoch": 0.2125848827209916, + "grad_norm": 0.16041499376296997, + "learning_rate": 0.0009002353719556431, + "loss": 2.7977, + "step": 7169 + }, + { + "epoch": 0.21261453607330308, + "grad_norm": 0.14663302898406982, + "learning_rate": 0.0009002071691632179, + "loss": 2.7723, + "step": 7170 + }, + { + "epoch": 0.21264418942561455, + "grad_norm": 0.1670663058757782, + "learning_rate": 0.0009001789628268896, + "loss": 2.7617, + "step": 7171 + }, + { + "epoch": 0.21267384277792603, + "grad_norm": 0.198011115193367, + "learning_rate": 0.0009001507529469079, + "loss": 2.7729, + "step": 7172 + }, + { + "epoch": 0.21270349613023753, + "grad_norm": 0.2007153034210205, + "learning_rate": 0.0009001225395235225, + "loss": 2.8186, + "step": 7173 + }, + { + "epoch": 0.212733149482549, + "grad_norm": 0.17422367632389069, + "learning_rate": 0.0009000943225569833, + "loss": 2.7956, + "step": 7174 + }, + { + "epoch": 0.2127628028348605, + "grad_norm": 0.15747639536857605, + "learning_rate": 0.0009000661020475404, + "loss": 2.8179, + "step": 7175 + }, + { + "epoch": 0.21279245618717196, + "grad_norm": 0.13177283108234406, + "learning_rate": 0.0009000378779954433, + "loss": 2.7797, + "step": 7176 + }, + { + "epoch": 0.21282210953948344, + "grad_norm": 0.11238181591033936, + "learning_rate": 0.0009000096504009423, + "loss": 2.7508, + "step": 7177 + }, + { + "epoch": 0.21285176289179492, + "grad_norm": 0.11721064895391464, + "learning_rate": 0.000899981419264287, + "loss": 2.7578, + "step": 7178 + }, + { + "epoch": 0.2128814162441064, + "grad_norm": 0.11719891428947449, + "learning_rate": 0.0008999531845857278, + "loss": 2.7957, + "step": 7179 + }, + { + "epoch": 0.21291106959641787, + "grad_norm": 0.1288411021232605, + "learning_rate": 0.0008999249463655144, + "loss": 2.81, + "step": 7180 + }, + { + "epoch": 0.21294072294872934, + "grad_norm": 0.14429891109466553, + "learning_rate": 0.0008998967046038968, + "loss": 2.7505, + "step": 7181 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 0.1540183424949646, + "learning_rate": 0.0008998684593011255, + "loss": 2.7873, + "step": 7182 + }, + { + "epoch": 0.21300002965335232, + "grad_norm": 0.1607450544834137, + "learning_rate": 0.0008998402104574501, + "loss": 2.8024, + "step": 7183 + }, + { + "epoch": 0.2130296830056638, + "grad_norm": 0.15039758384227753, + "learning_rate": 0.0008998119580731211, + "loss": 2.7687, + "step": 7184 + }, + { + "epoch": 0.21305933635797528, + "grad_norm": 0.1535564661026001, + "learning_rate": 0.0008997837021483887, + "loss": 2.7599, + "step": 7185 + }, + { + "epoch": 0.21308898971028675, + "grad_norm": 0.1511356234550476, + "learning_rate": 0.0008997554426835028, + "loss": 2.8154, + "step": 7186 + }, + { + "epoch": 0.21311864306259823, + "grad_norm": 0.13400216400623322, + "learning_rate": 0.000899727179678714, + "loss": 2.8102, + "step": 7187 + }, + { + "epoch": 0.2131482964149097, + "grad_norm": 0.138548344373703, + "learning_rate": 0.0008996989131342723, + "loss": 2.7765, + "step": 7188 + }, + { + "epoch": 0.21317794976722118, + "grad_norm": 0.14501741528511047, + "learning_rate": 0.0008996706430504282, + "loss": 2.7481, + "step": 7189 + }, + { + "epoch": 0.21320760311953266, + "grad_norm": 0.16550108790397644, + "learning_rate": 0.000899642369427432, + "loss": 2.7714, + "step": 7190 + }, + { + "epoch": 0.21323725647184413, + "grad_norm": 0.2136845588684082, + "learning_rate": 0.0008996140922655338, + "loss": 2.762, + "step": 7191 + }, + { + "epoch": 0.2132669098241556, + "grad_norm": 0.221516415476799, + "learning_rate": 0.0008995858115649844, + "loss": 2.7529, + "step": 7192 + }, + { + "epoch": 0.2132965631764671, + "grad_norm": 0.16879072785377502, + "learning_rate": 0.0008995575273260341, + "loss": 2.7922, + "step": 7193 + }, + { + "epoch": 0.2133262165287786, + "grad_norm": 0.18687058985233307, + "learning_rate": 0.0008995292395489331, + "loss": 2.7905, + "step": 7194 + }, + { + "epoch": 0.21335586988109007, + "grad_norm": 0.18590548634529114, + "learning_rate": 0.0008995009482339323, + "loss": 2.7326, + "step": 7195 + }, + { + "epoch": 0.21338552323340154, + "grad_norm": 0.18037402629852295, + "learning_rate": 0.000899472653381282, + "loss": 2.8116, + "step": 7196 + }, + { + "epoch": 0.21341517658571302, + "grad_norm": 0.16388989984989166, + "learning_rate": 0.0008994443549912328, + "loss": 2.7818, + "step": 7197 + }, + { + "epoch": 0.2134448299380245, + "grad_norm": 0.15408757328987122, + "learning_rate": 0.0008994160530640351, + "loss": 2.8153, + "step": 7198 + }, + { + "epoch": 0.21347448329033597, + "grad_norm": 0.15698590874671936, + "learning_rate": 0.0008993877475999399, + "loss": 2.7841, + "step": 7199 + }, + { + "epoch": 0.21350413664264745, + "grad_norm": 0.1546497493982315, + "learning_rate": 0.0008993594385991974, + "loss": 2.823, + "step": 7200 + }, + { + "epoch": 0.21353378999495892, + "grad_norm": 0.1347447633743286, + "learning_rate": 0.0008993311260620588, + "loss": 2.7933, + "step": 7201 + }, + { + "epoch": 0.2135634433472704, + "grad_norm": 0.13964805006980896, + "learning_rate": 0.0008993028099887743, + "loss": 2.7671, + "step": 7202 + }, + { + "epoch": 0.21359309669958187, + "grad_norm": 0.1665758192539215, + "learning_rate": 0.000899274490379595, + "loss": 2.7895, + "step": 7203 + }, + { + "epoch": 0.21362275005189338, + "grad_norm": 0.17197705805301666, + "learning_rate": 0.0008992461672347716, + "loss": 2.7299, + "step": 7204 + }, + { + "epoch": 0.21365240340420485, + "grad_norm": 0.1533140242099762, + "learning_rate": 0.0008992178405545548, + "loss": 2.8122, + "step": 7205 + }, + { + "epoch": 0.21368205675651633, + "grad_norm": 0.1499144583940506, + "learning_rate": 0.0008991895103391956, + "loss": 2.7771, + "step": 7206 + }, + { + "epoch": 0.2137117101088278, + "grad_norm": 0.1372266262769699, + "learning_rate": 0.0008991611765889446, + "loss": 2.7879, + "step": 7207 + }, + { + "epoch": 0.21374136346113928, + "grad_norm": 0.16069240868091583, + "learning_rate": 0.000899132839304053, + "loss": 2.7936, + "step": 7208 + }, + { + "epoch": 0.21377101681345076, + "grad_norm": 0.18742753565311432, + "learning_rate": 0.0008991044984847714, + "loss": 2.7834, + "step": 7209 + }, + { + "epoch": 0.21380067016576224, + "grad_norm": 0.18234902620315552, + "learning_rate": 0.0008990761541313511, + "loss": 2.7478, + "step": 7210 + }, + { + "epoch": 0.2138303235180737, + "grad_norm": 0.1816963255405426, + "learning_rate": 0.000899047806244043, + "loss": 2.7923, + "step": 7211 + }, + { + "epoch": 0.2138599768703852, + "grad_norm": 0.15905708074569702, + "learning_rate": 0.0008990194548230979, + "loss": 2.8019, + "step": 7212 + }, + { + "epoch": 0.21388963022269666, + "grad_norm": 0.14918367564678192, + "learning_rate": 0.0008989910998687673, + "loss": 2.7873, + "step": 7213 + }, + { + "epoch": 0.21391928357500817, + "grad_norm": 0.14556989073753357, + "learning_rate": 0.0008989627413813018, + "loss": 2.7517, + "step": 7214 + }, + { + "epoch": 0.21394893692731964, + "grad_norm": 0.14737078547477722, + "learning_rate": 0.0008989343793609529, + "loss": 2.8005, + "step": 7215 + }, + { + "epoch": 0.21397859027963112, + "grad_norm": 0.1444113403558731, + "learning_rate": 0.0008989060138079715, + "loss": 2.7879, + "step": 7216 + }, + { + "epoch": 0.2140082436319426, + "grad_norm": 0.13830538094043732, + "learning_rate": 0.0008988776447226088, + "loss": 2.7887, + "step": 7217 + }, + { + "epoch": 0.21403789698425407, + "grad_norm": 0.147503063082695, + "learning_rate": 0.0008988492721051163, + "loss": 2.7866, + "step": 7218 + }, + { + "epoch": 0.21406755033656555, + "grad_norm": 0.15197543799877167, + "learning_rate": 0.0008988208959557449, + "loss": 2.7563, + "step": 7219 + }, + { + "epoch": 0.21409720368887702, + "grad_norm": 0.15833020210266113, + "learning_rate": 0.0008987925162747461, + "loss": 2.7824, + "step": 7220 + }, + { + "epoch": 0.2141268570411885, + "grad_norm": 0.15941651165485382, + "learning_rate": 0.000898764133062371, + "loss": 2.7936, + "step": 7221 + }, + { + "epoch": 0.21415651039349998, + "grad_norm": 0.15694408118724823, + "learning_rate": 0.0008987357463188711, + "loss": 2.7541, + "step": 7222 + }, + { + "epoch": 0.21418616374581145, + "grad_norm": 0.16818158328533173, + "learning_rate": 0.0008987073560444977, + "loss": 2.7842, + "step": 7223 + }, + { + "epoch": 0.21421581709812293, + "grad_norm": 0.14591795206069946, + "learning_rate": 0.0008986789622395021, + "loss": 2.7967, + "step": 7224 + }, + { + "epoch": 0.21424547045043443, + "grad_norm": 0.12923908233642578, + "learning_rate": 0.0008986505649041361, + "loss": 2.772, + "step": 7225 + }, + { + "epoch": 0.2142751238027459, + "grad_norm": 0.15551498532295227, + "learning_rate": 0.0008986221640386509, + "loss": 2.7941, + "step": 7226 + }, + { + "epoch": 0.21430477715505739, + "grad_norm": 0.17137356102466583, + "learning_rate": 0.000898593759643298, + "loss": 2.795, + "step": 7227 + }, + { + "epoch": 0.21433443050736886, + "grad_norm": 0.15589503943920135, + "learning_rate": 0.0008985653517183288, + "loss": 2.7421, + "step": 7228 + }, + { + "epoch": 0.21436408385968034, + "grad_norm": 0.16798080503940582, + "learning_rate": 0.0008985369402639952, + "loss": 2.7866, + "step": 7229 + }, + { + "epoch": 0.21439373721199181, + "grad_norm": 0.16620607674121857, + "learning_rate": 0.0008985085252805483, + "loss": 2.7525, + "step": 7230 + }, + { + "epoch": 0.2144233905643033, + "grad_norm": 0.15330423414707184, + "learning_rate": 0.0008984801067682402, + "loss": 2.76, + "step": 7231 + }, + { + "epoch": 0.21445304391661477, + "grad_norm": 0.15514978766441345, + "learning_rate": 0.0008984516847273225, + "loss": 2.7669, + "step": 7232 + }, + { + "epoch": 0.21448269726892624, + "grad_norm": 0.14374719560146332, + "learning_rate": 0.0008984232591580465, + "loss": 2.7493, + "step": 7233 + }, + { + "epoch": 0.21451235062123772, + "grad_norm": 0.1537850797176361, + "learning_rate": 0.0008983948300606641, + "loss": 2.8072, + "step": 7234 + }, + { + "epoch": 0.21454200397354922, + "grad_norm": 0.14424417912960052, + "learning_rate": 0.0008983663974354273, + "loss": 2.7534, + "step": 7235 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 0.16297899186611176, + "learning_rate": 0.0008983379612825875, + "loss": 2.746, + "step": 7236 + }, + { + "epoch": 0.21460131067817217, + "grad_norm": 0.18014398217201233, + "learning_rate": 0.0008983095216023968, + "loss": 2.7846, + "step": 7237 + }, + { + "epoch": 0.21463096403048365, + "grad_norm": 0.2100510448217392, + "learning_rate": 0.0008982810783951069, + "loss": 2.8012, + "step": 7238 + }, + { + "epoch": 0.21466061738279513, + "grad_norm": 0.19508042931556702, + "learning_rate": 0.0008982526316609697, + "loss": 2.7692, + "step": 7239 + }, + { + "epoch": 0.2146902707351066, + "grad_norm": 0.15895415842533112, + "learning_rate": 0.000898224181400237, + "loss": 2.7605, + "step": 7240 + }, + { + "epoch": 0.21471992408741808, + "grad_norm": 0.17535936832427979, + "learning_rate": 0.000898195727613161, + "loss": 2.7767, + "step": 7241 + }, + { + "epoch": 0.21474957743972956, + "grad_norm": 0.19028156995773315, + "learning_rate": 0.0008981672702999933, + "loss": 2.7347, + "step": 7242 + }, + { + "epoch": 0.21477923079204103, + "grad_norm": 0.1684872955083847, + "learning_rate": 0.0008981388094609861, + "loss": 2.7768, + "step": 7243 + }, + { + "epoch": 0.2148088841443525, + "grad_norm": 0.14692552387714386, + "learning_rate": 0.0008981103450963915, + "loss": 2.7951, + "step": 7244 + }, + { + "epoch": 0.214838537496664, + "grad_norm": 0.16476935148239136, + "learning_rate": 0.0008980818772064613, + "loss": 2.7727, + "step": 7245 + }, + { + "epoch": 0.2148681908489755, + "grad_norm": 0.1629328578710556, + "learning_rate": 0.000898053405791448, + "loss": 2.8018, + "step": 7246 + }, + { + "epoch": 0.21489784420128696, + "grad_norm": 0.14903533458709717, + "learning_rate": 0.0008980249308516034, + "loss": 2.7531, + "step": 7247 + }, + { + "epoch": 0.21492749755359844, + "grad_norm": 0.14704929292201996, + "learning_rate": 0.0008979964523871796, + "loss": 2.7667, + "step": 7248 + }, + { + "epoch": 0.21495715090590992, + "grad_norm": 0.14150086045265198, + "learning_rate": 0.0008979679703984288, + "loss": 2.7692, + "step": 7249 + }, + { + "epoch": 0.2149868042582214, + "grad_norm": 0.13697996735572815, + "learning_rate": 0.0008979394848856035, + "loss": 2.7624, + "step": 7250 + }, + { + "epoch": 0.21501645761053287, + "grad_norm": 0.12450465559959412, + "learning_rate": 0.0008979109958489557, + "loss": 2.769, + "step": 7251 + }, + { + "epoch": 0.21504611096284434, + "grad_norm": 0.12708434462547302, + "learning_rate": 0.0008978825032887376, + "loss": 2.7635, + "step": 7252 + }, + { + "epoch": 0.21507576431515582, + "grad_norm": 0.11710634082555771, + "learning_rate": 0.0008978540072052019, + "loss": 2.7492, + "step": 7253 + }, + { + "epoch": 0.2151054176674673, + "grad_norm": 0.13630780577659607, + "learning_rate": 0.0008978255075986005, + "loss": 2.7607, + "step": 7254 + }, + { + "epoch": 0.21513507101977877, + "grad_norm": 0.13810008764266968, + "learning_rate": 0.0008977970044691859, + "loss": 2.7913, + "step": 7255 + }, + { + "epoch": 0.21516472437209028, + "grad_norm": 0.1533842384815216, + "learning_rate": 0.0008977684978172107, + "loss": 2.8047, + "step": 7256 + }, + { + "epoch": 0.21519437772440175, + "grad_norm": 0.1481626331806183, + "learning_rate": 0.0008977399876429271, + "loss": 2.8017, + "step": 7257 + }, + { + "epoch": 0.21522403107671323, + "grad_norm": 0.12466661632061005, + "learning_rate": 0.0008977114739465877, + "loss": 2.7496, + "step": 7258 + }, + { + "epoch": 0.2152536844290247, + "grad_norm": 0.16844376921653748, + "learning_rate": 0.0008976829567284447, + "loss": 2.7309, + "step": 7259 + }, + { + "epoch": 0.21528333778133618, + "grad_norm": 0.1855117231607437, + "learning_rate": 0.0008976544359887512, + "loss": 2.7914, + "step": 7260 + }, + { + "epoch": 0.21531299113364766, + "grad_norm": 0.1751808524131775, + "learning_rate": 0.0008976259117277592, + "loss": 2.7774, + "step": 7261 + }, + { + "epoch": 0.21534264448595913, + "grad_norm": 0.15798619389533997, + "learning_rate": 0.0008975973839457215, + "loss": 2.7431, + "step": 7262 + }, + { + "epoch": 0.2153722978382706, + "grad_norm": 0.1680150032043457, + "learning_rate": 0.0008975688526428909, + "loss": 2.7784, + "step": 7263 + }, + { + "epoch": 0.21540195119058209, + "grad_norm": 0.17659637331962585, + "learning_rate": 0.0008975403178195197, + "loss": 2.8329, + "step": 7264 + }, + { + "epoch": 0.21543160454289356, + "grad_norm": 0.1628921627998352, + "learning_rate": 0.0008975117794758607, + "loss": 2.7616, + "step": 7265 + }, + { + "epoch": 0.21546125789520507, + "grad_norm": 0.17250989377498627, + "learning_rate": 0.0008974832376121667, + "loss": 2.7464, + "step": 7266 + }, + { + "epoch": 0.21549091124751654, + "grad_norm": 0.16446568071842194, + "learning_rate": 0.0008974546922286906, + "loss": 2.7698, + "step": 7267 + }, + { + "epoch": 0.21552056459982802, + "grad_norm": 0.14606396853923798, + "learning_rate": 0.0008974261433256848, + "loss": 2.7447, + "step": 7268 + }, + { + "epoch": 0.2155502179521395, + "grad_norm": 0.134393110871315, + "learning_rate": 0.0008973975909034022, + "loss": 2.7773, + "step": 7269 + }, + { + "epoch": 0.21557987130445097, + "grad_norm": 0.13431230187416077, + "learning_rate": 0.000897369034962096, + "loss": 2.7653, + "step": 7270 + }, + { + "epoch": 0.21560952465676245, + "grad_norm": 0.13155928254127502, + "learning_rate": 0.0008973404755020185, + "loss": 2.7746, + "step": 7271 + }, + { + "epoch": 0.21563917800907392, + "grad_norm": 0.13619652390480042, + "learning_rate": 0.0008973119125234231, + "loss": 2.7891, + "step": 7272 + }, + { + "epoch": 0.2156688313613854, + "grad_norm": 0.14741885662078857, + "learning_rate": 0.0008972833460265624, + "loss": 2.7871, + "step": 7273 + }, + { + "epoch": 0.21569848471369688, + "grad_norm": 0.1753583550453186, + "learning_rate": 0.0008972547760116895, + "loss": 2.8085, + "step": 7274 + }, + { + "epoch": 0.21572813806600835, + "grad_norm": 0.17760051786899567, + "learning_rate": 0.0008972262024790574, + "loss": 2.7758, + "step": 7275 + }, + { + "epoch": 0.21575779141831983, + "grad_norm": 0.16155652701854706, + "learning_rate": 0.0008971976254289189, + "loss": 2.7675, + "step": 7276 + }, + { + "epoch": 0.21578744477063133, + "grad_norm": 0.1589616984128952, + "learning_rate": 0.0008971690448615275, + "loss": 2.8041, + "step": 7277 + }, + { + "epoch": 0.2158170981229428, + "grad_norm": 0.17266038060188293, + "learning_rate": 0.000897140460777136, + "loss": 2.7975, + "step": 7278 + }, + { + "epoch": 0.21584675147525428, + "grad_norm": 0.1818084865808487, + "learning_rate": 0.0008971118731759976, + "loss": 2.7979, + "step": 7279 + }, + { + "epoch": 0.21587640482756576, + "grad_norm": 0.18499042093753815, + "learning_rate": 0.0008970832820583652, + "loss": 2.7816, + "step": 7280 + }, + { + "epoch": 0.21590605817987724, + "grad_norm": 0.15480919182300568, + "learning_rate": 0.0008970546874244922, + "loss": 2.749, + "step": 7281 + }, + { + "epoch": 0.2159357115321887, + "grad_norm": 0.12730923295021057, + "learning_rate": 0.0008970260892746318, + "loss": 2.7015, + "step": 7282 + }, + { + "epoch": 0.2159653648845002, + "grad_norm": 0.1382388025522232, + "learning_rate": 0.0008969974876090374, + "loss": 2.7091, + "step": 7283 + }, + { + "epoch": 0.21599501823681166, + "grad_norm": 0.1452082395553589, + "learning_rate": 0.000896968882427962, + "loss": 2.8225, + "step": 7284 + }, + { + "epoch": 0.21602467158912314, + "grad_norm": 0.15811197459697723, + "learning_rate": 0.000896940273731659, + "loss": 2.7406, + "step": 7285 + }, + { + "epoch": 0.21605432494143462, + "grad_norm": 0.1589823067188263, + "learning_rate": 0.0008969116615203818, + "loss": 2.7718, + "step": 7286 + }, + { + "epoch": 0.21608397829374612, + "grad_norm": 0.15819749236106873, + "learning_rate": 0.0008968830457943836, + "loss": 2.7591, + "step": 7287 + }, + { + "epoch": 0.2161136316460576, + "grad_norm": 0.14982357621192932, + "learning_rate": 0.0008968544265539179, + "loss": 2.7651, + "step": 7288 + }, + { + "epoch": 0.21614328499836907, + "grad_norm": 0.14842891693115234, + "learning_rate": 0.0008968258037992383, + "loss": 2.7344, + "step": 7289 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 0.1345663070678711, + "learning_rate": 0.0008967971775305979, + "loss": 2.7867, + "step": 7290 + }, + { + "epoch": 0.21620259170299203, + "grad_norm": 0.13365596532821655, + "learning_rate": 0.0008967685477482506, + "loss": 2.8083, + "step": 7291 + }, + { + "epoch": 0.2162322450553035, + "grad_norm": 0.15899166464805603, + "learning_rate": 0.0008967399144524495, + "loss": 2.7615, + "step": 7292 + }, + { + "epoch": 0.21626189840761498, + "grad_norm": 0.17996548116207123, + "learning_rate": 0.0008967112776434485, + "loss": 2.7874, + "step": 7293 + }, + { + "epoch": 0.21629155175992645, + "grad_norm": 0.16624045372009277, + "learning_rate": 0.0008966826373215009, + "loss": 2.7724, + "step": 7294 + }, + { + "epoch": 0.21632120511223793, + "grad_norm": 0.15745045244693756, + "learning_rate": 0.0008966539934868605, + "loss": 2.7642, + "step": 7295 + }, + { + "epoch": 0.2163508584645494, + "grad_norm": 0.13724690675735474, + "learning_rate": 0.000896625346139781, + "loss": 2.7612, + "step": 7296 + }, + { + "epoch": 0.2163805118168609, + "grad_norm": 0.1521199494600296, + "learning_rate": 0.0008965966952805159, + "loss": 2.7786, + "step": 7297 + }, + { + "epoch": 0.21641016516917239, + "grad_norm": 0.15906915068626404, + "learning_rate": 0.000896568040909319, + "loss": 2.756, + "step": 7298 + }, + { + "epoch": 0.21643981852148386, + "grad_norm": 0.1584254503250122, + "learning_rate": 0.0008965393830264441, + "loss": 2.7569, + "step": 7299 + }, + { + "epoch": 0.21646947187379534, + "grad_norm": 0.15987202525138855, + "learning_rate": 0.0008965107216321449, + "loss": 2.7565, + "step": 7300 + }, + { + "epoch": 0.21649912522610681, + "grad_norm": 0.12120427191257477, + "learning_rate": 0.0008964820567266749, + "loss": 2.7746, + "step": 7301 + }, + { + "epoch": 0.2165287785784183, + "grad_norm": 0.1260526031255722, + "learning_rate": 0.0008964533883102885, + "loss": 2.7907, + "step": 7302 + }, + { + "epoch": 0.21655843193072977, + "grad_norm": 0.14193004369735718, + "learning_rate": 0.0008964247163832393, + "loss": 2.7707, + "step": 7303 + }, + { + "epoch": 0.21658808528304124, + "grad_norm": 0.1268797665834427, + "learning_rate": 0.0008963960409457812, + "loss": 2.8152, + "step": 7304 + }, + { + "epoch": 0.21661773863535272, + "grad_norm": 0.1332334578037262, + "learning_rate": 0.000896367361998168, + "loss": 2.7995, + "step": 7305 + }, + { + "epoch": 0.2166473919876642, + "grad_norm": 0.1571802943944931, + "learning_rate": 0.0008963386795406539, + "loss": 2.7359, + "step": 7306 + }, + { + "epoch": 0.21667704533997567, + "grad_norm": 0.14565041661262512, + "learning_rate": 0.0008963099935734927, + "loss": 2.7663, + "step": 7307 + }, + { + "epoch": 0.21670669869228718, + "grad_norm": 0.14109602570533752, + "learning_rate": 0.0008962813040969386, + "loss": 2.786, + "step": 7308 + }, + { + "epoch": 0.21673635204459865, + "grad_norm": 0.1714077740907669, + "learning_rate": 0.0008962526111112453, + "loss": 2.8058, + "step": 7309 + }, + { + "epoch": 0.21676600539691013, + "grad_norm": 0.1837899386882782, + "learning_rate": 0.0008962239146166673, + "loss": 2.7676, + "step": 7310 + }, + { + "epoch": 0.2167956587492216, + "grad_norm": 0.1600516140460968, + "learning_rate": 0.0008961952146134584, + "loss": 2.8232, + "step": 7311 + }, + { + "epoch": 0.21682531210153308, + "grad_norm": 0.15645700693130493, + "learning_rate": 0.0008961665111018728, + "loss": 2.8047, + "step": 7312 + }, + { + "epoch": 0.21685496545384456, + "grad_norm": 0.15131765604019165, + "learning_rate": 0.0008961378040821651, + "loss": 2.7708, + "step": 7313 + }, + { + "epoch": 0.21688461880615603, + "grad_norm": 0.1475325971841812, + "learning_rate": 0.0008961090935545888, + "loss": 2.7728, + "step": 7314 + }, + { + "epoch": 0.2169142721584675, + "grad_norm": 0.17089349031448364, + "learning_rate": 0.0008960803795193986, + "loss": 2.796, + "step": 7315 + }, + { + "epoch": 0.21694392551077898, + "grad_norm": 0.16517385840415955, + "learning_rate": 0.0008960516619768486, + "loss": 2.7573, + "step": 7316 + }, + { + "epoch": 0.21697357886309046, + "grad_norm": 0.13534729182720184, + "learning_rate": 0.0008960229409271933, + "loss": 2.7675, + "step": 7317 + }, + { + "epoch": 0.21700323221540196, + "grad_norm": 0.15069198608398438, + "learning_rate": 0.0008959942163706867, + "loss": 2.7792, + "step": 7318 + }, + { + "epoch": 0.21703288556771344, + "grad_norm": 0.13614222407341003, + "learning_rate": 0.0008959654883075835, + "loss": 2.7737, + "step": 7319 + }, + { + "epoch": 0.21706253892002492, + "grad_norm": 0.12809421122074127, + "learning_rate": 0.0008959367567381378, + "loss": 2.8052, + "step": 7320 + }, + { + "epoch": 0.2170921922723364, + "grad_norm": 0.1439678966999054, + "learning_rate": 0.0008959080216626043, + "loss": 2.8014, + "step": 7321 + }, + { + "epoch": 0.21712184562464787, + "grad_norm": 0.1467108577489853, + "learning_rate": 0.0008958792830812372, + "loss": 2.7611, + "step": 7322 + }, + { + "epoch": 0.21715149897695935, + "grad_norm": 0.1575440615415573, + "learning_rate": 0.0008958505409942912, + "loss": 2.7575, + "step": 7323 + }, + { + "epoch": 0.21718115232927082, + "grad_norm": 0.17994536459445953, + "learning_rate": 0.0008958217954020206, + "loss": 2.7681, + "step": 7324 + }, + { + "epoch": 0.2172108056815823, + "grad_norm": 0.19843198359012604, + "learning_rate": 0.0008957930463046801, + "loss": 2.7942, + "step": 7325 + }, + { + "epoch": 0.21724045903389377, + "grad_norm": 0.20844484865665436, + "learning_rate": 0.0008957642937025242, + "loss": 2.7721, + "step": 7326 + }, + { + "epoch": 0.21727011238620525, + "grad_norm": 0.17736341059207916, + "learning_rate": 0.0008957355375958076, + "loss": 2.7448, + "step": 7327 + }, + { + "epoch": 0.21729976573851673, + "grad_norm": 0.15550293028354645, + "learning_rate": 0.0008957067779847849, + "loss": 2.7774, + "step": 7328 + }, + { + "epoch": 0.21732941909082823, + "grad_norm": 0.17748354375362396, + "learning_rate": 0.0008956780148697108, + "loss": 2.8173, + "step": 7329 + }, + { + "epoch": 0.2173590724431397, + "grad_norm": 0.17054128646850586, + "learning_rate": 0.0008956492482508398, + "loss": 2.8028, + "step": 7330 + }, + { + "epoch": 0.21738872579545118, + "grad_norm": 0.15736395120620728, + "learning_rate": 0.0008956204781284269, + "loss": 2.7961, + "step": 7331 + }, + { + "epoch": 0.21741837914776266, + "grad_norm": 0.15646687150001526, + "learning_rate": 0.0008955917045027267, + "loss": 2.7692, + "step": 7332 + }, + { + "epoch": 0.21744803250007413, + "grad_norm": 0.13476116955280304, + "learning_rate": 0.0008955629273739941, + "loss": 2.7786, + "step": 7333 + }, + { + "epoch": 0.2174776858523856, + "grad_norm": 0.14476048946380615, + "learning_rate": 0.000895534146742484, + "loss": 2.8317, + "step": 7334 + }, + { + "epoch": 0.2175073392046971, + "grad_norm": 0.14272941648960114, + "learning_rate": 0.0008955053626084511, + "loss": 2.7449, + "step": 7335 + }, + { + "epoch": 0.21753699255700856, + "grad_norm": 0.1479666382074356, + "learning_rate": 0.00089547657497215, + "loss": 2.7692, + "step": 7336 + }, + { + "epoch": 0.21756664590932004, + "grad_norm": 0.14151740074157715, + "learning_rate": 0.0008954477838338363, + "loss": 2.7797, + "step": 7337 + }, + { + "epoch": 0.21759629926163152, + "grad_norm": 0.1304876059293747, + "learning_rate": 0.0008954189891937645, + "loss": 2.7454, + "step": 7338 + }, + { + "epoch": 0.21762595261394302, + "grad_norm": 0.14331761002540588, + "learning_rate": 0.0008953901910521896, + "loss": 2.7905, + "step": 7339 + }, + { + "epoch": 0.2176556059662545, + "grad_norm": 0.13337555527687073, + "learning_rate": 0.0008953613894093668, + "loss": 2.7729, + "step": 7340 + }, + { + "epoch": 0.21768525931856597, + "grad_norm": 0.1184181496500969, + "learning_rate": 0.0008953325842655511, + "loss": 2.753, + "step": 7341 + }, + { + "epoch": 0.21771491267087745, + "grad_norm": 0.1314995437860489, + "learning_rate": 0.0008953037756209974, + "loss": 2.7857, + "step": 7342 + }, + { + "epoch": 0.21774456602318892, + "grad_norm": 0.17166344821453094, + "learning_rate": 0.0008952749634759608, + "loss": 2.8096, + "step": 7343 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 0.17382898926734924, + "learning_rate": 0.0008952461478306967, + "loss": 2.7736, + "step": 7344 + }, + { + "epoch": 0.21780387272781188, + "grad_norm": 0.174271821975708, + "learning_rate": 0.0008952173286854602, + "loss": 2.7763, + "step": 7345 + }, + { + "epoch": 0.21783352608012335, + "grad_norm": 0.18017971515655518, + "learning_rate": 0.0008951885060405062, + "loss": 2.7725, + "step": 7346 + }, + { + "epoch": 0.21786317943243483, + "grad_norm": 0.18777601420879364, + "learning_rate": 0.0008951596798960901, + "loss": 2.7918, + "step": 7347 + }, + { + "epoch": 0.2178928327847463, + "grad_norm": 0.19278119504451752, + "learning_rate": 0.0008951308502524676, + "loss": 2.7403, + "step": 7348 + }, + { + "epoch": 0.2179224861370578, + "grad_norm": 0.1659878045320511, + "learning_rate": 0.0008951020171098933, + "loss": 2.7976, + "step": 7349 + }, + { + "epoch": 0.21795213948936928, + "grad_norm": 0.193878173828125, + "learning_rate": 0.0008950731804686227, + "loss": 2.771, + "step": 7350 + }, + { + "epoch": 0.21798179284168076, + "grad_norm": 0.19238179922103882, + "learning_rate": 0.0008950443403289114, + "loss": 2.8147, + "step": 7351 + }, + { + "epoch": 0.21801144619399224, + "grad_norm": 0.15835770964622498, + "learning_rate": 0.0008950154966910149, + "loss": 2.7788, + "step": 7352 + }, + { + "epoch": 0.2180410995463037, + "grad_norm": 0.17110273241996765, + "learning_rate": 0.0008949866495551881, + "loss": 2.7417, + "step": 7353 + }, + { + "epoch": 0.2180707528986152, + "grad_norm": 0.17665542662143707, + "learning_rate": 0.0008949577989216869, + "loss": 2.7869, + "step": 7354 + }, + { + "epoch": 0.21810040625092666, + "grad_norm": 0.15701189637184143, + "learning_rate": 0.0008949289447907665, + "loss": 2.7985, + "step": 7355 + }, + { + "epoch": 0.21813005960323814, + "grad_norm": 0.18237824738025665, + "learning_rate": 0.0008949000871626825, + "loss": 2.7718, + "step": 7356 + }, + { + "epoch": 0.21815971295554962, + "grad_norm": 0.1807079315185547, + "learning_rate": 0.0008948712260376903, + "loss": 2.8096, + "step": 7357 + }, + { + "epoch": 0.2181893663078611, + "grad_norm": 0.16638332605361938, + "learning_rate": 0.0008948423614160458, + "loss": 2.7489, + "step": 7358 + }, + { + "epoch": 0.21821901966017257, + "grad_norm": 0.17423024773597717, + "learning_rate": 0.0008948134932980043, + "loss": 2.807, + "step": 7359 + }, + { + "epoch": 0.21824867301248407, + "grad_norm": 0.15722216665744781, + "learning_rate": 0.0008947846216838216, + "loss": 2.7605, + "step": 7360 + }, + { + "epoch": 0.21827832636479555, + "grad_norm": 0.16545574367046356, + "learning_rate": 0.0008947557465737535, + "loss": 2.7911, + "step": 7361 + }, + { + "epoch": 0.21830797971710703, + "grad_norm": 0.13513296842575073, + "learning_rate": 0.0008947268679680553, + "loss": 2.7746, + "step": 7362 + }, + { + "epoch": 0.2183376330694185, + "grad_norm": 0.12408038228750229, + "learning_rate": 0.000894697985866983, + "loss": 2.7755, + "step": 7363 + }, + { + "epoch": 0.21836728642172998, + "grad_norm": 0.12577560544013977, + "learning_rate": 0.0008946691002707922, + "loss": 2.7657, + "step": 7364 + }, + { + "epoch": 0.21839693977404145, + "grad_norm": 0.11606646329164505, + "learning_rate": 0.0008946402111797387, + "loss": 2.7772, + "step": 7365 + }, + { + "epoch": 0.21842659312635293, + "grad_norm": 0.13763390481472015, + "learning_rate": 0.0008946113185940785, + "loss": 2.7922, + "step": 7366 + }, + { + "epoch": 0.2184562464786644, + "grad_norm": 0.13058313727378845, + "learning_rate": 0.0008945824225140676, + "loss": 2.7821, + "step": 7367 + }, + { + "epoch": 0.21848589983097588, + "grad_norm": 0.13296538591384888, + "learning_rate": 0.0008945535229399613, + "loss": 2.759, + "step": 7368 + }, + { + "epoch": 0.21851555318328736, + "grad_norm": 0.13006825745105743, + "learning_rate": 0.0008945246198720159, + "loss": 2.7827, + "step": 7369 + }, + { + "epoch": 0.21854520653559886, + "grad_norm": 0.1314118504524231, + "learning_rate": 0.0008944957133104872, + "loss": 2.7928, + "step": 7370 + }, + { + "epoch": 0.21857485988791034, + "grad_norm": 0.15394991636276245, + "learning_rate": 0.0008944668032556313, + "loss": 2.7564, + "step": 7371 + }, + { + "epoch": 0.21860451324022181, + "grad_norm": 0.16151994466781616, + "learning_rate": 0.0008944378897077041, + "loss": 2.7915, + "step": 7372 + }, + { + "epoch": 0.2186341665925333, + "grad_norm": 0.16747207939624786, + "learning_rate": 0.0008944089726669619, + "loss": 2.7439, + "step": 7373 + }, + { + "epoch": 0.21866381994484477, + "grad_norm": 0.15868651866912842, + "learning_rate": 0.0008943800521336604, + "loss": 2.7692, + "step": 7374 + }, + { + "epoch": 0.21869347329715624, + "grad_norm": 0.17780883610248566, + "learning_rate": 0.0008943511281080558, + "loss": 2.7795, + "step": 7375 + }, + { + "epoch": 0.21872312664946772, + "grad_norm": 0.18964385986328125, + "learning_rate": 0.0008943222005904043, + "loss": 2.7963, + "step": 7376 + }, + { + "epoch": 0.2187527800017792, + "grad_norm": 0.17951969802379608, + "learning_rate": 0.000894293269580962, + "loss": 2.7773, + "step": 7377 + }, + { + "epoch": 0.21878243335409067, + "grad_norm": 0.154854416847229, + "learning_rate": 0.0008942643350799852, + "loss": 2.7499, + "step": 7378 + }, + { + "epoch": 0.21881208670640215, + "grad_norm": 0.16278263926506042, + "learning_rate": 0.0008942353970877299, + "loss": 2.7873, + "step": 7379 + }, + { + "epoch": 0.21884174005871362, + "grad_norm": 0.17555060982704163, + "learning_rate": 0.0008942064556044526, + "loss": 2.7719, + "step": 7380 + }, + { + "epoch": 0.21887139341102513, + "grad_norm": 0.17950667440891266, + "learning_rate": 0.0008941775106304095, + "loss": 2.7658, + "step": 7381 + }, + { + "epoch": 0.2189010467633366, + "grad_norm": 0.16643349826335907, + "learning_rate": 0.0008941485621658569, + "loss": 2.785, + "step": 7382 + }, + { + "epoch": 0.21893070011564808, + "grad_norm": 0.14961567521095276, + "learning_rate": 0.000894119610211051, + "loss": 2.7944, + "step": 7383 + }, + { + "epoch": 0.21896035346795956, + "grad_norm": 0.1565902680158615, + "learning_rate": 0.0008940906547662484, + "loss": 2.7755, + "step": 7384 + }, + { + "epoch": 0.21899000682027103, + "grad_norm": 0.15751765668392181, + "learning_rate": 0.0008940616958317053, + "loss": 2.766, + "step": 7385 + }, + { + "epoch": 0.2190196601725825, + "grad_norm": 0.1428319364786148, + "learning_rate": 0.0008940327334076785, + "loss": 2.7677, + "step": 7386 + }, + { + "epoch": 0.21904931352489398, + "grad_norm": 0.1259852945804596, + "learning_rate": 0.0008940037674944239, + "loss": 2.7735, + "step": 7387 + }, + { + "epoch": 0.21907896687720546, + "grad_norm": 0.12243269383907318, + "learning_rate": 0.0008939747980921985, + "loss": 2.7764, + "step": 7388 + }, + { + "epoch": 0.21910862022951694, + "grad_norm": 0.14078491926193237, + "learning_rate": 0.0008939458252012585, + "loss": 2.7601, + "step": 7389 + }, + { + "epoch": 0.2191382735818284, + "grad_norm": 0.13527493178844452, + "learning_rate": 0.0008939168488218607, + "loss": 2.7698, + "step": 7390 + }, + { + "epoch": 0.21916792693413992, + "grad_norm": 0.1418587863445282, + "learning_rate": 0.0008938878689542615, + "loss": 2.7982, + "step": 7391 + }, + { + "epoch": 0.2191975802864514, + "grad_norm": 0.13236087560653687, + "learning_rate": 0.0008938588855987177, + "loss": 2.7898, + "step": 7392 + }, + { + "epoch": 0.21922723363876287, + "grad_norm": 0.13125836849212646, + "learning_rate": 0.000893829898755486, + "loss": 2.7935, + "step": 7393 + }, + { + "epoch": 0.21925688699107435, + "grad_norm": 0.13017158210277557, + "learning_rate": 0.0008938009084248226, + "loss": 2.7248, + "step": 7394 + }, + { + "epoch": 0.21928654034338582, + "grad_norm": 0.12280391901731491, + "learning_rate": 0.0008937719146069849, + "loss": 2.82, + "step": 7395 + }, + { + "epoch": 0.2193161936956973, + "grad_norm": 0.1345556527376175, + "learning_rate": 0.0008937429173022291, + "loss": 2.7802, + "step": 7396 + }, + { + "epoch": 0.21934584704800877, + "grad_norm": 0.1371321827173233, + "learning_rate": 0.0008937139165108123, + "loss": 2.7624, + "step": 7397 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 0.14098820090293884, + "learning_rate": 0.0008936849122329911, + "loss": 2.745, + "step": 7398 + }, + { + "epoch": 0.21940515375263173, + "grad_norm": 0.1561218947172165, + "learning_rate": 0.0008936559044690225, + "loss": 2.8114, + "step": 7399 + }, + { + "epoch": 0.2194348071049432, + "grad_norm": 0.1843462735414505, + "learning_rate": 0.0008936268932191631, + "loss": 2.7725, + "step": 7400 + }, + { + "epoch": 0.2194644604572547, + "grad_norm": 0.19116172194480896, + "learning_rate": 0.0008935978784836702, + "loss": 2.7987, + "step": 7401 + }, + { + "epoch": 0.21949411380956618, + "grad_norm": 0.18455369770526886, + "learning_rate": 0.0008935688602628005, + "loss": 2.7716, + "step": 7402 + }, + { + "epoch": 0.21952376716187766, + "grad_norm": 0.1959720402956009, + "learning_rate": 0.000893539838556811, + "loss": 2.7605, + "step": 7403 + }, + { + "epoch": 0.21955342051418913, + "grad_norm": 0.20531122386455536, + "learning_rate": 0.0008935108133659586, + "loss": 2.7519, + "step": 7404 + }, + { + "epoch": 0.2195830738665006, + "grad_norm": 0.16563770174980164, + "learning_rate": 0.0008934817846905004, + "loss": 2.7704, + "step": 7405 + }, + { + "epoch": 0.2196127272188121, + "grad_norm": 0.14233188331127167, + "learning_rate": 0.0008934527525306936, + "loss": 2.7762, + "step": 7406 + }, + { + "epoch": 0.21964238057112356, + "grad_norm": 0.14698056876659393, + "learning_rate": 0.0008934237168867949, + "loss": 2.7998, + "step": 7407 + }, + { + "epoch": 0.21967203392343504, + "grad_norm": 0.14062416553497314, + "learning_rate": 0.0008933946777590618, + "loss": 2.7584, + "step": 7408 + }, + { + "epoch": 0.21970168727574652, + "grad_norm": 0.12457569688558578, + "learning_rate": 0.0008933656351477514, + "loss": 2.7757, + "step": 7409 + }, + { + "epoch": 0.219731340628058, + "grad_norm": 0.12443752586841583, + "learning_rate": 0.0008933365890531206, + "loss": 2.7953, + "step": 7410 + }, + { + "epoch": 0.21976099398036947, + "grad_norm": 0.16159623861312866, + "learning_rate": 0.0008933075394754269, + "loss": 2.7913, + "step": 7411 + }, + { + "epoch": 0.21979064733268097, + "grad_norm": 0.1772221177816391, + "learning_rate": 0.0008932784864149275, + "loss": 2.7583, + "step": 7412 + }, + { + "epoch": 0.21982030068499245, + "grad_norm": 0.1934881955385208, + "learning_rate": 0.0008932494298718795, + "loss": 2.7928, + "step": 7413 + }, + { + "epoch": 0.21984995403730392, + "grad_norm": 0.1908227503299713, + "learning_rate": 0.0008932203698465402, + "loss": 2.7857, + "step": 7414 + }, + { + "epoch": 0.2198796073896154, + "grad_norm": 0.15358352661132812, + "learning_rate": 0.000893191306339167, + "loss": 2.7638, + "step": 7415 + }, + { + "epoch": 0.21990926074192688, + "grad_norm": 0.1583905816078186, + "learning_rate": 0.0008931622393500175, + "loss": 2.7732, + "step": 7416 + }, + { + "epoch": 0.21993891409423835, + "grad_norm": 0.15512333810329437, + "learning_rate": 0.0008931331688793488, + "loss": 2.8073, + "step": 7417 + }, + { + "epoch": 0.21996856744654983, + "grad_norm": 0.19268445670604706, + "learning_rate": 0.0008931040949274184, + "loss": 2.7779, + "step": 7418 + }, + { + "epoch": 0.2199982207988613, + "grad_norm": 0.15088224411010742, + "learning_rate": 0.0008930750174944837, + "loss": 2.7798, + "step": 7419 + }, + { + "epoch": 0.22002787415117278, + "grad_norm": 0.15421690046787262, + "learning_rate": 0.0008930459365808024, + "loss": 2.7767, + "step": 7420 + }, + { + "epoch": 0.22005752750348426, + "grad_norm": 0.13991039991378784, + "learning_rate": 0.0008930168521866318, + "loss": 2.7783, + "step": 7421 + }, + { + "epoch": 0.22008718085579576, + "grad_norm": 0.13174541294574738, + "learning_rate": 0.0008929877643122295, + "loss": 2.7525, + "step": 7422 + }, + { + "epoch": 0.22011683420810724, + "grad_norm": 0.12783777713775635, + "learning_rate": 0.0008929586729578531, + "loss": 2.7797, + "step": 7423 + }, + { + "epoch": 0.2201464875604187, + "grad_norm": 0.11832807213068008, + "learning_rate": 0.0008929295781237601, + "loss": 2.7547, + "step": 7424 + }, + { + "epoch": 0.2201761409127302, + "grad_norm": 0.11414944380521774, + "learning_rate": 0.0008929004798102083, + "loss": 2.7791, + "step": 7425 + }, + { + "epoch": 0.22020579426504167, + "grad_norm": 0.12860916554927826, + "learning_rate": 0.0008928713780174554, + "loss": 2.7692, + "step": 7426 + }, + { + "epoch": 0.22023544761735314, + "grad_norm": 0.1433970183134079, + "learning_rate": 0.000892842272745759, + "loss": 2.7589, + "step": 7427 + }, + { + "epoch": 0.22026510096966462, + "grad_norm": 0.15770287811756134, + "learning_rate": 0.0008928131639953767, + "loss": 2.7551, + "step": 7428 + }, + { + "epoch": 0.2202947543219761, + "grad_norm": 0.1632593274116516, + "learning_rate": 0.0008927840517665666, + "loss": 2.7321, + "step": 7429 + }, + { + "epoch": 0.22032440767428757, + "grad_norm": 0.1668098419904709, + "learning_rate": 0.0008927549360595861, + "loss": 2.78, + "step": 7430 + }, + { + "epoch": 0.22035406102659905, + "grad_norm": 0.1769678145647049, + "learning_rate": 0.0008927258168746935, + "loss": 2.7701, + "step": 7431 + }, + { + "epoch": 0.22038371437891052, + "grad_norm": 0.17182669043540955, + "learning_rate": 0.0008926966942121462, + "loss": 2.7807, + "step": 7432 + }, + { + "epoch": 0.22041336773122203, + "grad_norm": 0.16802076995372772, + "learning_rate": 0.0008926675680722022, + "loss": 2.7929, + "step": 7433 + }, + { + "epoch": 0.2204430210835335, + "grad_norm": 0.17860019207000732, + "learning_rate": 0.0008926384384551196, + "loss": 2.7862, + "step": 7434 + }, + { + "epoch": 0.22047267443584498, + "grad_norm": 0.18182827532291412, + "learning_rate": 0.0008926093053611561, + "loss": 2.7654, + "step": 7435 + }, + { + "epoch": 0.22050232778815645, + "grad_norm": 0.1495932638645172, + "learning_rate": 0.0008925801687905699, + "loss": 2.7563, + "step": 7436 + }, + { + "epoch": 0.22053198114046793, + "grad_norm": 0.13899795711040497, + "learning_rate": 0.0008925510287436189, + "loss": 2.7627, + "step": 7437 + }, + { + "epoch": 0.2205616344927794, + "grad_norm": 0.15738870203495026, + "learning_rate": 0.0008925218852205612, + "loss": 2.7612, + "step": 7438 + }, + { + "epoch": 0.22059128784509088, + "grad_norm": 0.15601332485675812, + "learning_rate": 0.0008924927382216549, + "loss": 2.7729, + "step": 7439 + }, + { + "epoch": 0.22062094119740236, + "grad_norm": 0.1420585662126541, + "learning_rate": 0.0008924635877471578, + "loss": 2.7904, + "step": 7440 + }, + { + "epoch": 0.22065059454971384, + "grad_norm": 0.1398031860589981, + "learning_rate": 0.0008924344337973285, + "loss": 2.802, + "step": 7441 + }, + { + "epoch": 0.2206802479020253, + "grad_norm": 0.14990024268627167, + "learning_rate": 0.0008924052763724248, + "loss": 2.7789, + "step": 7442 + }, + { + "epoch": 0.22070990125433682, + "grad_norm": 0.16316920518875122, + "learning_rate": 0.000892376115472705, + "loss": 2.8011, + "step": 7443 + }, + { + "epoch": 0.2207395546066483, + "grad_norm": 0.16795644164085388, + "learning_rate": 0.0008923469510984276, + "loss": 2.7683, + "step": 7444 + }, + { + "epoch": 0.22076920795895977, + "grad_norm": 0.1571473777294159, + "learning_rate": 0.0008923177832498504, + "loss": 2.7528, + "step": 7445 + }, + { + "epoch": 0.22079886131127124, + "grad_norm": 0.15177595615386963, + "learning_rate": 0.0008922886119272317, + "loss": 2.8032, + "step": 7446 + }, + { + "epoch": 0.22082851466358272, + "grad_norm": 0.1371406465768814, + "learning_rate": 0.0008922594371308304, + "loss": 2.7848, + "step": 7447 + }, + { + "epoch": 0.2208581680158942, + "grad_norm": 0.14576449990272522, + "learning_rate": 0.0008922302588609042, + "loss": 2.7478, + "step": 7448 + }, + { + "epoch": 0.22088782136820567, + "grad_norm": 0.16400571167469025, + "learning_rate": 0.0008922010771177119, + "loss": 2.7906, + "step": 7449 + }, + { + "epoch": 0.22091747472051715, + "grad_norm": 0.16273701190948486, + "learning_rate": 0.0008921718919015116, + "loss": 2.7563, + "step": 7450 + }, + { + "epoch": 0.22094712807282862, + "grad_norm": 0.15266768634319305, + "learning_rate": 0.0008921427032125618, + "loss": 2.7932, + "step": 7451 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 0.1508786827325821, + "learning_rate": 0.0008921135110511213, + "loss": 2.7851, + "step": 7452 + }, + { + "epoch": 0.2210064347774516, + "grad_norm": 0.1522080898284912, + "learning_rate": 0.0008920843154174481, + "loss": 2.7655, + "step": 7453 + }, + { + "epoch": 0.22103608812976308, + "grad_norm": 0.16922228038311005, + "learning_rate": 0.0008920551163118011, + "loss": 2.7425, + "step": 7454 + }, + { + "epoch": 0.22106574148207456, + "grad_norm": 0.1579950451850891, + "learning_rate": 0.0008920259137344389, + "loss": 2.7734, + "step": 7455 + }, + { + "epoch": 0.22109539483438603, + "grad_norm": 0.11892277002334595, + "learning_rate": 0.0008919967076856197, + "loss": 2.7452, + "step": 7456 + }, + { + "epoch": 0.2211250481866975, + "grad_norm": 0.12821581959724426, + "learning_rate": 0.0008919674981656025, + "loss": 2.7877, + "step": 7457 + }, + { + "epoch": 0.22115470153900899, + "grad_norm": 0.1250152885913849, + "learning_rate": 0.0008919382851746458, + "loss": 2.7665, + "step": 7458 + }, + { + "epoch": 0.22118435489132046, + "grad_norm": 0.13218796253204346, + "learning_rate": 0.0008919090687130082, + "loss": 2.7426, + "step": 7459 + }, + { + "epoch": 0.22121400824363194, + "grad_norm": 0.12526024878025055, + "learning_rate": 0.0008918798487809488, + "loss": 2.8048, + "step": 7460 + }, + { + "epoch": 0.2212436615959434, + "grad_norm": 0.14358609914779663, + "learning_rate": 0.0008918506253787258, + "loss": 2.7488, + "step": 7461 + }, + { + "epoch": 0.2212733149482549, + "grad_norm": 0.14694532752037048, + "learning_rate": 0.0008918213985065984, + "loss": 2.7726, + "step": 7462 + }, + { + "epoch": 0.22130296830056637, + "grad_norm": 0.17612342536449432, + "learning_rate": 0.0008917921681648252, + "loss": 2.7772, + "step": 7463 + }, + { + "epoch": 0.22133262165287787, + "grad_norm": 0.1931009143590927, + "learning_rate": 0.0008917629343536652, + "loss": 2.7781, + "step": 7464 + }, + { + "epoch": 0.22136227500518935, + "grad_norm": 0.17307139933109283, + "learning_rate": 0.000891733697073377, + "loss": 2.7764, + "step": 7465 + }, + { + "epoch": 0.22139192835750082, + "grad_norm": 0.15409961342811584, + "learning_rate": 0.0008917044563242198, + "loss": 2.7735, + "step": 7466 + }, + { + "epoch": 0.2214215817098123, + "grad_norm": 0.13438524305820465, + "learning_rate": 0.0008916752121064524, + "loss": 2.8021, + "step": 7467 + }, + { + "epoch": 0.22145123506212377, + "grad_norm": 0.15568578243255615, + "learning_rate": 0.0008916459644203337, + "loss": 2.7654, + "step": 7468 + }, + { + "epoch": 0.22148088841443525, + "grad_norm": 0.16661326587200165, + "learning_rate": 0.0008916167132661228, + "loss": 2.766, + "step": 7469 + }, + { + "epoch": 0.22151054176674673, + "grad_norm": 0.13920071721076965, + "learning_rate": 0.0008915874586440787, + "loss": 2.777, + "step": 7470 + }, + { + "epoch": 0.2215401951190582, + "grad_norm": 0.13111768662929535, + "learning_rate": 0.0008915582005544604, + "loss": 2.7726, + "step": 7471 + }, + { + "epoch": 0.22156984847136968, + "grad_norm": 0.1546841710805893, + "learning_rate": 0.0008915289389975269, + "loss": 2.7722, + "step": 7472 + }, + { + "epoch": 0.22159950182368116, + "grad_norm": 0.17110717296600342, + "learning_rate": 0.0008914996739735377, + "loss": 2.8208, + "step": 7473 + }, + { + "epoch": 0.22162915517599266, + "grad_norm": 0.18117927014827728, + "learning_rate": 0.0008914704054827517, + "loss": 2.7486, + "step": 7474 + }, + { + "epoch": 0.22165880852830414, + "grad_norm": 0.19045090675354004, + "learning_rate": 0.0008914411335254278, + "loss": 2.8034, + "step": 7475 + }, + { + "epoch": 0.2216884618806156, + "grad_norm": 0.19491985440254211, + "learning_rate": 0.0008914118581018255, + "loss": 2.7825, + "step": 7476 + }, + { + "epoch": 0.2217181152329271, + "grad_norm": 0.14677293598651886, + "learning_rate": 0.0008913825792122043, + "loss": 2.776, + "step": 7477 + }, + { + "epoch": 0.22174776858523856, + "grad_norm": 0.1351584494113922, + "learning_rate": 0.0008913532968568229, + "loss": 2.7836, + "step": 7478 + }, + { + "epoch": 0.22177742193755004, + "grad_norm": 0.16542206704616547, + "learning_rate": 0.0008913240110359409, + "loss": 2.7693, + "step": 7479 + }, + { + "epoch": 0.22180707528986152, + "grad_norm": 0.14866380393505096, + "learning_rate": 0.0008912947217498177, + "loss": 2.7781, + "step": 7480 + }, + { + "epoch": 0.221836728642173, + "grad_norm": 0.17770221829414368, + "learning_rate": 0.0008912654289987127, + "loss": 2.7654, + "step": 7481 + }, + { + "epoch": 0.22186638199448447, + "grad_norm": 0.17649418115615845, + "learning_rate": 0.000891236132782885, + "loss": 2.773, + "step": 7482 + }, + { + "epoch": 0.22189603534679594, + "grad_norm": 0.16358616948127747, + "learning_rate": 0.0008912068331025943, + "loss": 2.802, + "step": 7483 + }, + { + "epoch": 0.22192568869910742, + "grad_norm": 0.1522715538740158, + "learning_rate": 0.0008911775299580998, + "loss": 2.8256, + "step": 7484 + }, + { + "epoch": 0.22195534205141892, + "grad_norm": 0.1616431623697281, + "learning_rate": 0.0008911482233496612, + "loss": 2.7899, + "step": 7485 + }, + { + "epoch": 0.2219849954037304, + "grad_norm": 0.14146089553833008, + "learning_rate": 0.0008911189132775379, + "loss": 2.7795, + "step": 7486 + }, + { + "epoch": 0.22201464875604188, + "grad_norm": 0.12907056510448456, + "learning_rate": 0.0008910895997419894, + "loss": 2.7685, + "step": 7487 + }, + { + "epoch": 0.22204430210835335, + "grad_norm": 0.11599970608949661, + "learning_rate": 0.0008910602827432756, + "loss": 2.7529, + "step": 7488 + }, + { + "epoch": 0.22207395546066483, + "grad_norm": 0.1310170292854309, + "learning_rate": 0.0008910309622816557, + "loss": 2.7705, + "step": 7489 + }, + { + "epoch": 0.2221036088129763, + "grad_norm": 0.13592194020748138, + "learning_rate": 0.0008910016383573896, + "loss": 2.7778, + "step": 7490 + }, + { + "epoch": 0.22213326216528778, + "grad_norm": 0.13543663918972015, + "learning_rate": 0.0008909723109707369, + "loss": 2.7676, + "step": 7491 + }, + { + "epoch": 0.22216291551759926, + "grad_norm": 0.14112789928913116, + "learning_rate": 0.0008909429801219572, + "loss": 2.7827, + "step": 7492 + }, + { + "epoch": 0.22219256886991073, + "grad_norm": 0.14075934886932373, + "learning_rate": 0.0008909136458113103, + "loss": 2.7379, + "step": 7493 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.14218181371688843, + "learning_rate": 0.0008908843080390561, + "loss": 2.7691, + "step": 7494 + }, + { + "epoch": 0.2222518755745337, + "grad_norm": 0.15334266424179077, + "learning_rate": 0.000890854966805454, + "loss": 2.7717, + "step": 7495 + }, + { + "epoch": 0.2222815289268452, + "grad_norm": 0.14288905262947083, + "learning_rate": 0.0008908256221107642, + "loss": 2.7401, + "step": 7496 + }, + { + "epoch": 0.22231118227915667, + "grad_norm": 0.1399516612291336, + "learning_rate": 0.0008907962739552465, + "loss": 2.7614, + "step": 7497 + }, + { + "epoch": 0.22234083563146814, + "grad_norm": 0.12920509278774261, + "learning_rate": 0.0008907669223391606, + "loss": 2.7861, + "step": 7498 + }, + { + "epoch": 0.22237048898377962, + "grad_norm": 0.1497727334499359, + "learning_rate": 0.0008907375672627667, + "loss": 2.7869, + "step": 7499 + }, + { + "epoch": 0.2224001423360911, + "grad_norm": 0.16811731457710266, + "learning_rate": 0.0008907082087263246, + "loss": 2.7741, + "step": 7500 + }, + { + "epoch": 0.22242979568840257, + "grad_norm": 0.17568619549274445, + "learning_rate": 0.000890678846730094, + "loss": 2.7875, + "step": 7501 + }, + { + "epoch": 0.22245944904071405, + "grad_norm": 0.17123466730117798, + "learning_rate": 0.0008906494812743354, + "loss": 2.7714, + "step": 7502 + }, + { + "epoch": 0.22248910239302552, + "grad_norm": 0.1531820148229599, + "learning_rate": 0.0008906201123593084, + "loss": 2.7418, + "step": 7503 + }, + { + "epoch": 0.222518755745337, + "grad_norm": 0.15953925251960754, + "learning_rate": 0.0008905907399852733, + "loss": 2.7887, + "step": 7504 + }, + { + "epoch": 0.2225484090976485, + "grad_norm": 0.17794811725616455, + "learning_rate": 0.0008905613641524901, + "loss": 2.8012, + "step": 7505 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 0.1668381541967392, + "learning_rate": 0.0008905319848612192, + "loss": 2.7616, + "step": 7506 + }, + { + "epoch": 0.22260771580227146, + "grad_norm": 0.14994904398918152, + "learning_rate": 0.0008905026021117204, + "loss": 2.784, + "step": 7507 + }, + { + "epoch": 0.22263736915458293, + "grad_norm": 0.17063091695308685, + "learning_rate": 0.0008904732159042539, + "loss": 2.7308, + "step": 7508 + }, + { + "epoch": 0.2226670225068944, + "grad_norm": 0.1707906872034073, + "learning_rate": 0.0008904438262390803, + "loss": 2.7572, + "step": 7509 + }, + { + "epoch": 0.22269667585920588, + "grad_norm": 0.1449155956506729, + "learning_rate": 0.0008904144331164594, + "loss": 2.7851, + "step": 7510 + }, + { + "epoch": 0.22272632921151736, + "grad_norm": 0.1292160004377365, + "learning_rate": 0.0008903850365366517, + "loss": 2.7613, + "step": 7511 + }, + { + "epoch": 0.22275598256382884, + "grad_norm": 0.1304110437631607, + "learning_rate": 0.0008903556364999177, + "loss": 2.753, + "step": 7512 + }, + { + "epoch": 0.2227856359161403, + "grad_norm": 0.12509839236736298, + "learning_rate": 0.0008903262330065174, + "loss": 2.7771, + "step": 7513 + }, + { + "epoch": 0.2228152892684518, + "grad_norm": 0.13224582374095917, + "learning_rate": 0.0008902968260567113, + "loss": 2.8099, + "step": 7514 + }, + { + "epoch": 0.22284494262076326, + "grad_norm": 0.12769894301891327, + "learning_rate": 0.0008902674156507597, + "loss": 2.803, + "step": 7515 + }, + { + "epoch": 0.22287459597307477, + "grad_norm": 0.14509350061416626, + "learning_rate": 0.0008902380017889233, + "loss": 2.7591, + "step": 7516 + }, + { + "epoch": 0.22290424932538624, + "grad_norm": 0.14680679142475128, + "learning_rate": 0.0008902085844714624, + "loss": 2.7564, + "step": 7517 + }, + { + "epoch": 0.22293390267769772, + "grad_norm": 0.16811226308345795, + "learning_rate": 0.0008901791636986374, + "loss": 2.7866, + "step": 7518 + }, + { + "epoch": 0.2229635560300092, + "grad_norm": 0.17065447568893433, + "learning_rate": 0.000890149739470709, + "loss": 2.7561, + "step": 7519 + }, + { + "epoch": 0.22299320938232067, + "grad_norm": 0.16620558500289917, + "learning_rate": 0.0008901203117879377, + "loss": 2.7692, + "step": 7520 + }, + { + "epoch": 0.22302286273463215, + "grad_norm": 0.1365298330783844, + "learning_rate": 0.0008900908806505841, + "loss": 2.7853, + "step": 7521 + }, + { + "epoch": 0.22305251608694363, + "grad_norm": 0.15421025454998016, + "learning_rate": 0.0008900614460589087, + "loss": 2.7678, + "step": 7522 + }, + { + "epoch": 0.2230821694392551, + "grad_norm": 0.1619037687778473, + "learning_rate": 0.0008900320080131724, + "loss": 2.7843, + "step": 7523 + }, + { + "epoch": 0.22311182279156658, + "grad_norm": 0.18432006239891052, + "learning_rate": 0.0008900025665136356, + "loss": 2.7799, + "step": 7524 + }, + { + "epoch": 0.22314147614387805, + "grad_norm": 0.18754692375659943, + "learning_rate": 0.0008899731215605591, + "loss": 2.7821, + "step": 7525 + }, + { + "epoch": 0.22317112949618956, + "grad_norm": 0.18827231228351593, + "learning_rate": 0.0008899436731542037, + "loss": 2.7914, + "step": 7526 + }, + { + "epoch": 0.22320078284850103, + "grad_norm": 0.19177396595478058, + "learning_rate": 0.0008899142212948301, + "loss": 2.8053, + "step": 7527 + }, + { + "epoch": 0.2232304362008125, + "grad_norm": 0.15528421103954315, + "learning_rate": 0.0008898847659826993, + "loss": 2.8158, + "step": 7528 + }, + { + "epoch": 0.22326008955312399, + "grad_norm": 0.15145303308963776, + "learning_rate": 0.000889855307218072, + "loss": 2.7564, + "step": 7529 + }, + { + "epoch": 0.22328974290543546, + "grad_norm": 0.16892963647842407, + "learning_rate": 0.0008898258450012089, + "loss": 2.7529, + "step": 7530 + }, + { + "epoch": 0.22331939625774694, + "grad_norm": 0.15794877707958221, + "learning_rate": 0.0008897963793323711, + "loss": 2.7639, + "step": 7531 + }, + { + "epoch": 0.22334904961005841, + "grad_norm": 0.1739041656255722, + "learning_rate": 0.0008897669102118193, + "loss": 2.7913, + "step": 7532 + }, + { + "epoch": 0.2233787029623699, + "grad_norm": 0.19172915816307068, + "learning_rate": 0.0008897374376398146, + "loss": 2.7546, + "step": 7533 + }, + { + "epoch": 0.22340835631468137, + "grad_norm": 0.18408188223838806, + "learning_rate": 0.0008897079616166181, + "loss": 2.7808, + "step": 7534 + }, + { + "epoch": 0.22343800966699284, + "grad_norm": 0.1590842306613922, + "learning_rate": 0.0008896784821424908, + "loss": 2.7906, + "step": 7535 + }, + { + "epoch": 0.22346766301930432, + "grad_norm": 0.1572970747947693, + "learning_rate": 0.0008896489992176934, + "loss": 2.7608, + "step": 7536 + }, + { + "epoch": 0.22349731637161582, + "grad_norm": 0.14610296487808228, + "learning_rate": 0.0008896195128424876, + "loss": 2.7647, + "step": 7537 + }, + { + "epoch": 0.2235269697239273, + "grad_norm": 0.1468890756368637, + "learning_rate": 0.0008895900230171339, + "loss": 2.8214, + "step": 7538 + }, + { + "epoch": 0.22355662307623878, + "grad_norm": 0.1715775430202484, + "learning_rate": 0.0008895605297418936, + "loss": 2.7979, + "step": 7539 + }, + { + "epoch": 0.22358627642855025, + "grad_norm": 0.15594252943992615, + "learning_rate": 0.0008895310330170281, + "loss": 2.7977, + "step": 7540 + }, + { + "epoch": 0.22361592978086173, + "grad_norm": 0.15051357448101044, + "learning_rate": 0.0008895015328427984, + "loss": 2.7605, + "step": 7541 + }, + { + "epoch": 0.2236455831331732, + "grad_norm": 0.13933227956295013, + "learning_rate": 0.0008894720292194658, + "loss": 2.7514, + "step": 7542 + }, + { + "epoch": 0.22367523648548468, + "grad_norm": 0.15311598777770996, + "learning_rate": 0.0008894425221472915, + "loss": 2.7796, + "step": 7543 + }, + { + "epoch": 0.22370488983779616, + "grad_norm": 0.15415064990520477, + "learning_rate": 0.0008894130116265368, + "loss": 2.7687, + "step": 7544 + }, + { + "epoch": 0.22373454319010763, + "grad_norm": 0.1402336210012436, + "learning_rate": 0.0008893834976574631, + "loss": 2.7922, + "step": 7545 + }, + { + "epoch": 0.2237641965424191, + "grad_norm": 0.1306801587343216, + "learning_rate": 0.0008893539802403316, + "loss": 2.7545, + "step": 7546 + }, + { + "epoch": 0.2237938498947306, + "grad_norm": 0.1175156757235527, + "learning_rate": 0.0008893244593754037, + "loss": 2.7966, + "step": 7547 + }, + { + "epoch": 0.2238235032470421, + "grad_norm": 0.14047467708587646, + "learning_rate": 0.0008892949350629411, + "loss": 2.7928, + "step": 7548 + }, + { + "epoch": 0.22385315659935356, + "grad_norm": 0.15048713982105255, + "learning_rate": 0.0008892654073032049, + "loss": 2.7717, + "step": 7549 + }, + { + "epoch": 0.22388280995166504, + "grad_norm": 0.13009853661060333, + "learning_rate": 0.0008892358760964567, + "loss": 2.7748, + "step": 7550 + }, + { + "epoch": 0.22391246330397652, + "grad_norm": 0.11926960945129395, + "learning_rate": 0.0008892063414429581, + "loss": 2.7629, + "step": 7551 + }, + { + "epoch": 0.223942116656288, + "grad_norm": 0.153367817401886, + "learning_rate": 0.0008891768033429704, + "loss": 2.7543, + "step": 7552 + }, + { + "epoch": 0.22397177000859947, + "grad_norm": 0.16071127355098724, + "learning_rate": 0.0008891472617967554, + "loss": 2.8232, + "step": 7553 + }, + { + "epoch": 0.22400142336091095, + "grad_norm": 0.16478709876537323, + "learning_rate": 0.0008891177168045745, + "loss": 2.764, + "step": 7554 + }, + { + "epoch": 0.22403107671322242, + "grad_norm": 0.17337319254875183, + "learning_rate": 0.0008890881683666896, + "loss": 2.8151, + "step": 7555 + }, + { + "epoch": 0.2240607300655339, + "grad_norm": 0.15600910782814026, + "learning_rate": 0.000889058616483362, + "loss": 2.7697, + "step": 7556 + }, + { + "epoch": 0.2240903834178454, + "grad_norm": 0.141357421875, + "learning_rate": 0.0008890290611548537, + "loss": 2.7885, + "step": 7557 + }, + { + "epoch": 0.22412003677015688, + "grad_norm": 0.13781173527240753, + "learning_rate": 0.0008889995023814262, + "loss": 2.7497, + "step": 7558 + }, + { + "epoch": 0.22414969012246835, + "grad_norm": 0.13845078647136688, + "learning_rate": 0.0008889699401633414, + "loss": 2.7958, + "step": 7559 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 0.1413630247116089, + "learning_rate": 0.0008889403745008609, + "loss": 2.77, + "step": 7560 + }, + { + "epoch": 0.2242089968270913, + "grad_norm": 0.17656159400939941, + "learning_rate": 0.0008889108053942469, + "loss": 2.7616, + "step": 7561 + }, + { + "epoch": 0.22423865017940278, + "grad_norm": 0.19683052599430084, + "learning_rate": 0.0008888812328437607, + "loss": 2.8027, + "step": 7562 + }, + { + "epoch": 0.22426830353171426, + "grad_norm": 0.20057132840156555, + "learning_rate": 0.0008888516568496645, + "loss": 2.7635, + "step": 7563 + }, + { + "epoch": 0.22429795688402573, + "grad_norm": 0.19792132079601288, + "learning_rate": 0.0008888220774122202, + "loss": 2.7466, + "step": 7564 + }, + { + "epoch": 0.2243276102363372, + "grad_norm": 0.15959151089191437, + "learning_rate": 0.0008887924945316895, + "loss": 2.8293, + "step": 7565 + }, + { + "epoch": 0.2243572635886487, + "grad_norm": 0.14640355110168457, + "learning_rate": 0.0008887629082083346, + "loss": 2.7769, + "step": 7566 + }, + { + "epoch": 0.22438691694096016, + "grad_norm": 0.14661043882369995, + "learning_rate": 0.0008887333184424176, + "loss": 2.7601, + "step": 7567 + }, + { + "epoch": 0.22441657029327167, + "grad_norm": 0.14661645889282227, + "learning_rate": 0.0008887037252342001, + "loss": 2.746, + "step": 7568 + }, + { + "epoch": 0.22444622364558314, + "grad_norm": 0.1568950116634369, + "learning_rate": 0.0008886741285839446, + "loss": 2.7862, + "step": 7569 + }, + { + "epoch": 0.22447587699789462, + "grad_norm": 0.15192468464374542, + "learning_rate": 0.0008886445284919127, + "loss": 2.7561, + "step": 7570 + }, + { + "epoch": 0.2245055303502061, + "grad_norm": 0.1571810394525528, + "learning_rate": 0.0008886149249583671, + "loss": 2.7313, + "step": 7571 + }, + { + "epoch": 0.22453518370251757, + "grad_norm": 0.17280808091163635, + "learning_rate": 0.0008885853179835695, + "loss": 2.7678, + "step": 7572 + }, + { + "epoch": 0.22456483705482905, + "grad_norm": 0.16679167747497559, + "learning_rate": 0.0008885557075677821, + "loss": 2.7895, + "step": 7573 + }, + { + "epoch": 0.22459449040714052, + "grad_norm": 0.15046799182891846, + "learning_rate": 0.0008885260937112673, + "loss": 2.8009, + "step": 7574 + }, + { + "epoch": 0.224624143759452, + "grad_norm": 0.15667055547237396, + "learning_rate": 0.0008884964764142874, + "loss": 2.799, + "step": 7575 + }, + { + "epoch": 0.22465379711176348, + "grad_norm": 0.15894176065921783, + "learning_rate": 0.0008884668556771042, + "loss": 2.7194, + "step": 7576 + }, + { + "epoch": 0.22468345046407495, + "grad_norm": 0.1576077938079834, + "learning_rate": 0.0008884372314999805, + "loss": 2.7654, + "step": 7577 + }, + { + "epoch": 0.22471310381638646, + "grad_norm": 0.1376463919878006, + "learning_rate": 0.0008884076038831785, + "loss": 2.7664, + "step": 7578 + }, + { + "epoch": 0.22474275716869793, + "grad_norm": 0.1427226960659027, + "learning_rate": 0.0008883779728269604, + "loss": 2.7954, + "step": 7579 + }, + { + "epoch": 0.2247724105210094, + "grad_norm": 0.13806582987308502, + "learning_rate": 0.0008883483383315887, + "loss": 2.7606, + "step": 7580 + }, + { + "epoch": 0.22480206387332088, + "grad_norm": 0.13180674612522125, + "learning_rate": 0.0008883187003973259, + "loss": 2.752, + "step": 7581 + }, + { + "epoch": 0.22483171722563236, + "grad_norm": 0.15087665617465973, + "learning_rate": 0.0008882890590244341, + "loss": 2.7626, + "step": 7582 + }, + { + "epoch": 0.22486137057794384, + "grad_norm": 0.14779452979564667, + "learning_rate": 0.0008882594142131763, + "loss": 2.767, + "step": 7583 + }, + { + "epoch": 0.2248910239302553, + "grad_norm": 0.14177019894123077, + "learning_rate": 0.0008882297659638147, + "loss": 2.7826, + "step": 7584 + }, + { + "epoch": 0.2249206772825668, + "grad_norm": 0.13734084367752075, + "learning_rate": 0.0008882001142766117, + "loss": 2.7269, + "step": 7585 + }, + { + "epoch": 0.22495033063487826, + "grad_norm": 0.1426216959953308, + "learning_rate": 0.0008881704591518302, + "loss": 2.7864, + "step": 7586 + }, + { + "epoch": 0.22497998398718974, + "grad_norm": 0.16691026091575623, + "learning_rate": 0.0008881408005897327, + "loss": 2.7542, + "step": 7587 + }, + { + "epoch": 0.22500963733950122, + "grad_norm": 0.18193940818309784, + "learning_rate": 0.0008881111385905817, + "loss": 2.7449, + "step": 7588 + }, + { + "epoch": 0.22503929069181272, + "grad_norm": 0.1991291344165802, + "learning_rate": 0.00088808147315464, + "loss": 2.8027, + "step": 7589 + }, + { + "epoch": 0.2250689440441242, + "grad_norm": 0.17407700419425964, + "learning_rate": 0.0008880518042821702, + "loss": 2.7812, + "step": 7590 + }, + { + "epoch": 0.22509859739643567, + "grad_norm": 0.16616295278072357, + "learning_rate": 0.0008880221319734351, + "loss": 2.8106, + "step": 7591 + }, + { + "epoch": 0.22512825074874715, + "grad_norm": 0.17723031342029572, + "learning_rate": 0.0008879924562286974, + "loss": 2.8025, + "step": 7592 + }, + { + "epoch": 0.22515790410105863, + "grad_norm": 0.17042729258537292, + "learning_rate": 0.0008879627770482199, + "loss": 2.8151, + "step": 7593 + }, + { + "epoch": 0.2251875574533701, + "grad_norm": 0.1726001799106598, + "learning_rate": 0.0008879330944322654, + "loss": 2.7764, + "step": 7594 + }, + { + "epoch": 0.22521721080568158, + "grad_norm": 0.13882815837860107, + "learning_rate": 0.0008879034083810968, + "loss": 2.768, + "step": 7595 + }, + { + "epoch": 0.22524686415799305, + "grad_norm": 0.13608503341674805, + "learning_rate": 0.000887873718894977, + "loss": 2.7751, + "step": 7596 + }, + { + "epoch": 0.22527651751030453, + "grad_norm": 0.1478063464164734, + "learning_rate": 0.0008878440259741687, + "loss": 2.7789, + "step": 7597 + }, + { + "epoch": 0.225306170862616, + "grad_norm": 0.1572292447090149, + "learning_rate": 0.000887814329618935, + "loss": 2.7737, + "step": 7598 + }, + { + "epoch": 0.2253358242149275, + "grad_norm": 0.160882830619812, + "learning_rate": 0.0008877846298295389, + "loss": 2.8018, + "step": 7599 + }, + { + "epoch": 0.225365477567239, + "grad_norm": 0.14589837193489075, + "learning_rate": 0.0008877549266062435, + "loss": 2.7824, + "step": 7600 + }, + { + "epoch": 0.22539513091955046, + "grad_norm": 0.16431556642055511, + "learning_rate": 0.0008877252199493113, + "loss": 2.7459, + "step": 7601 + }, + { + "epoch": 0.22542478427186194, + "grad_norm": 0.15905901789665222, + "learning_rate": 0.000887695509859006, + "loss": 2.7865, + "step": 7602 + }, + { + "epoch": 0.22545443762417341, + "grad_norm": 0.15302124619483948, + "learning_rate": 0.0008876657963355903, + "loss": 2.802, + "step": 7603 + }, + { + "epoch": 0.2254840909764849, + "grad_norm": 0.1326592117547989, + "learning_rate": 0.0008876360793793275, + "loss": 2.8156, + "step": 7604 + }, + { + "epoch": 0.22551374432879637, + "grad_norm": 0.13434818387031555, + "learning_rate": 0.0008876063589904806, + "loss": 2.7823, + "step": 7605 + }, + { + "epoch": 0.22554339768110784, + "grad_norm": 0.14418621361255646, + "learning_rate": 0.0008875766351693128, + "loss": 2.7919, + "step": 7606 + }, + { + "epoch": 0.22557305103341932, + "grad_norm": 0.15549248456954956, + "learning_rate": 0.0008875469079160876, + "loss": 2.7669, + "step": 7607 + }, + { + "epoch": 0.2256027043857308, + "grad_norm": 0.1733989715576172, + "learning_rate": 0.0008875171772310679, + "loss": 2.7846, + "step": 7608 + }, + { + "epoch": 0.2256323577380423, + "grad_norm": 0.16837598383426666, + "learning_rate": 0.000887487443114517, + "loss": 2.7652, + "step": 7609 + }, + { + "epoch": 0.22566201109035378, + "grad_norm": 0.1664617359638214, + "learning_rate": 0.0008874577055666984, + "loss": 2.8009, + "step": 7610 + }, + { + "epoch": 0.22569166444266525, + "grad_norm": 0.1571362316608429, + "learning_rate": 0.0008874279645878753, + "loss": 2.7656, + "step": 7611 + }, + { + "epoch": 0.22572131779497673, + "grad_norm": 0.14202772080898285, + "learning_rate": 0.000887398220178311, + "loss": 2.8082, + "step": 7612 + }, + { + "epoch": 0.2257509711472882, + "grad_norm": 0.14861813187599182, + "learning_rate": 0.0008873684723382689, + "loss": 2.7984, + "step": 7613 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 0.12848101556301117, + "learning_rate": 0.0008873387210680126, + "loss": 2.7717, + "step": 7614 + }, + { + "epoch": 0.22581027785191116, + "grad_norm": 0.14485962688922882, + "learning_rate": 0.0008873089663678053, + "loss": 2.7446, + "step": 7615 + }, + { + "epoch": 0.22583993120422263, + "grad_norm": 0.15639038383960724, + "learning_rate": 0.0008872792082379109, + "loss": 2.7778, + "step": 7616 + }, + { + "epoch": 0.2258695845565341, + "grad_norm": 0.1494857668876648, + "learning_rate": 0.0008872494466785924, + "loss": 2.7578, + "step": 7617 + }, + { + "epoch": 0.22589923790884558, + "grad_norm": 0.14796291291713715, + "learning_rate": 0.0008872196816901137, + "loss": 2.7717, + "step": 7618 + }, + { + "epoch": 0.22592889126115706, + "grad_norm": 0.12992769479751587, + "learning_rate": 0.0008871899132727382, + "loss": 2.7788, + "step": 7619 + }, + { + "epoch": 0.22595854461346856, + "grad_norm": 0.15887393057346344, + "learning_rate": 0.0008871601414267295, + "loss": 2.7664, + "step": 7620 + }, + { + "epoch": 0.22598819796578004, + "grad_norm": 0.17181144654750824, + "learning_rate": 0.0008871303661523514, + "loss": 2.799, + "step": 7621 + }, + { + "epoch": 0.22601785131809152, + "grad_norm": 0.14473627507686615, + "learning_rate": 0.0008871005874498674, + "loss": 2.8017, + "step": 7622 + }, + { + "epoch": 0.226047504670403, + "grad_norm": 0.14700621366500854, + "learning_rate": 0.0008870708053195413, + "loss": 2.7962, + "step": 7623 + }, + { + "epoch": 0.22607715802271447, + "grad_norm": 0.1819409430027008, + "learning_rate": 0.0008870410197616368, + "loss": 2.8025, + "step": 7624 + }, + { + "epoch": 0.22610681137502595, + "grad_norm": 0.2039819359779358, + "learning_rate": 0.0008870112307764176, + "loss": 2.7624, + "step": 7625 + }, + { + "epoch": 0.22613646472733742, + "grad_norm": 0.21495072543621063, + "learning_rate": 0.0008869814383641474, + "loss": 2.793, + "step": 7626 + }, + { + "epoch": 0.2261661180796489, + "grad_norm": 0.19403257966041565, + "learning_rate": 0.0008869516425250902, + "loss": 2.7847, + "step": 7627 + }, + { + "epoch": 0.22619577143196037, + "grad_norm": 0.1476394236087799, + "learning_rate": 0.0008869218432595099, + "loss": 2.7736, + "step": 7628 + }, + { + "epoch": 0.22622542478427185, + "grad_norm": 0.15647128224372864, + "learning_rate": 0.0008868920405676701, + "loss": 2.7875, + "step": 7629 + }, + { + "epoch": 0.22625507813658335, + "grad_norm": 0.13880595564842224, + "learning_rate": 0.000886862234449835, + "loss": 2.7754, + "step": 7630 + }, + { + "epoch": 0.22628473148889483, + "grad_norm": 0.1504564732313156, + "learning_rate": 0.0008868324249062682, + "loss": 2.7676, + "step": 7631 + }, + { + "epoch": 0.2263143848412063, + "grad_norm": 0.15145675837993622, + "learning_rate": 0.0008868026119372342, + "loss": 2.7643, + "step": 7632 + }, + { + "epoch": 0.22634403819351778, + "grad_norm": 0.14697802066802979, + "learning_rate": 0.0008867727955429965, + "loss": 2.7813, + "step": 7633 + }, + { + "epoch": 0.22637369154582926, + "grad_norm": 0.18048542737960815, + "learning_rate": 0.0008867429757238194, + "loss": 2.7621, + "step": 7634 + }, + { + "epoch": 0.22640334489814073, + "grad_norm": 0.18577177822589874, + "learning_rate": 0.0008867131524799667, + "loss": 2.7583, + "step": 7635 + }, + { + "epoch": 0.2264329982504522, + "grad_norm": 0.14634168148040771, + "learning_rate": 0.0008866833258117029, + "loss": 2.7397, + "step": 7636 + }, + { + "epoch": 0.2264626516027637, + "grad_norm": 0.1445959210395813, + "learning_rate": 0.0008866534957192915, + "loss": 2.7695, + "step": 7637 + }, + { + "epoch": 0.22649230495507516, + "grad_norm": 0.14633087813854218, + "learning_rate": 0.0008866236622029974, + "loss": 2.7843, + "step": 7638 + }, + { + "epoch": 0.22652195830738664, + "grad_norm": 0.1571952849626541, + "learning_rate": 0.0008865938252630843, + "loss": 2.8006, + "step": 7639 + }, + { + "epoch": 0.22655161165969812, + "grad_norm": 0.1771729737520218, + "learning_rate": 0.0008865639848998164, + "loss": 2.7818, + "step": 7640 + }, + { + "epoch": 0.22658126501200962, + "grad_norm": 0.1509414166212082, + "learning_rate": 0.0008865341411134582, + "loss": 2.761, + "step": 7641 + }, + { + "epoch": 0.2266109183643211, + "grad_norm": 0.16831718385219574, + "learning_rate": 0.0008865042939042738, + "loss": 2.7733, + "step": 7642 + }, + { + "epoch": 0.22664057171663257, + "grad_norm": 0.17435842752456665, + "learning_rate": 0.0008864744432725275, + "loss": 2.7814, + "step": 7643 + }, + { + "epoch": 0.22667022506894405, + "grad_norm": 0.17166750133037567, + "learning_rate": 0.0008864445892184836, + "loss": 2.8062, + "step": 7644 + }, + { + "epoch": 0.22669987842125552, + "grad_norm": 0.1770479381084442, + "learning_rate": 0.0008864147317424068, + "loss": 2.7658, + "step": 7645 + }, + { + "epoch": 0.226729531773567, + "grad_norm": 0.17973940074443817, + "learning_rate": 0.000886384870844561, + "loss": 2.7351, + "step": 7646 + }, + { + "epoch": 0.22675918512587848, + "grad_norm": 0.17487910389900208, + "learning_rate": 0.0008863550065252108, + "loss": 2.769, + "step": 7647 + }, + { + "epoch": 0.22678883847818995, + "grad_norm": 0.14141535758972168, + "learning_rate": 0.0008863251387846207, + "loss": 2.7679, + "step": 7648 + }, + { + "epoch": 0.22681849183050143, + "grad_norm": 0.14781951904296875, + "learning_rate": 0.0008862952676230554, + "loss": 2.7768, + "step": 7649 + }, + { + "epoch": 0.2268481451828129, + "grad_norm": 0.15238313376903534, + "learning_rate": 0.0008862653930407789, + "loss": 2.7893, + "step": 7650 + }, + { + "epoch": 0.2268777985351244, + "grad_norm": 0.13421876728534698, + "learning_rate": 0.0008862355150380563, + "loss": 2.7338, + "step": 7651 + }, + { + "epoch": 0.22690745188743588, + "grad_norm": 0.1369776725769043, + "learning_rate": 0.0008862056336151518, + "loss": 2.7952, + "step": 7652 + }, + { + "epoch": 0.22693710523974736, + "grad_norm": 0.12975239753723145, + "learning_rate": 0.0008861757487723301, + "loss": 2.7813, + "step": 7653 + }, + { + "epoch": 0.22696675859205884, + "grad_norm": 0.13003335893154144, + "learning_rate": 0.000886145860509856, + "loss": 2.807, + "step": 7654 + }, + { + "epoch": 0.2269964119443703, + "grad_norm": 0.11575578153133392, + "learning_rate": 0.000886115968827994, + "loss": 2.7739, + "step": 7655 + }, + { + "epoch": 0.2270260652966818, + "grad_norm": 0.11351177841424942, + "learning_rate": 0.0008860860737270087, + "loss": 2.7466, + "step": 7656 + }, + { + "epoch": 0.22705571864899327, + "grad_norm": 0.11983473598957062, + "learning_rate": 0.0008860561752071649, + "loss": 2.7544, + "step": 7657 + }, + { + "epoch": 0.22708537200130474, + "grad_norm": 0.12938372790813446, + "learning_rate": 0.0008860262732687276, + "loss": 2.7824, + "step": 7658 + }, + { + "epoch": 0.22711502535361622, + "grad_norm": 0.14165936410427094, + "learning_rate": 0.0008859963679119612, + "loss": 2.781, + "step": 7659 + }, + { + "epoch": 0.2271446787059277, + "grad_norm": 0.14124137163162231, + "learning_rate": 0.0008859664591371308, + "loss": 2.7571, + "step": 7660 + }, + { + "epoch": 0.22717433205823917, + "grad_norm": 0.13786250352859497, + "learning_rate": 0.0008859365469445012, + "loss": 2.7438, + "step": 7661 + }, + { + "epoch": 0.22720398541055067, + "grad_norm": 0.15161746740341187, + "learning_rate": 0.0008859066313343371, + "loss": 2.7865, + "step": 7662 + }, + { + "epoch": 0.22723363876286215, + "grad_norm": 0.1760372668504715, + "learning_rate": 0.0008858767123069037, + "loss": 2.7707, + "step": 7663 + }, + { + "epoch": 0.22726329211517363, + "grad_norm": 0.1960250288248062, + "learning_rate": 0.0008858467898624657, + "loss": 2.7544, + "step": 7664 + }, + { + "epoch": 0.2272929454674851, + "grad_norm": 0.19649185240268707, + "learning_rate": 0.0008858168640012883, + "loss": 2.7711, + "step": 7665 + }, + { + "epoch": 0.22732259881979658, + "grad_norm": 0.1904785931110382, + "learning_rate": 0.000885786934723636, + "loss": 2.7643, + "step": 7666 + }, + { + "epoch": 0.22735225217210805, + "grad_norm": 0.1660098284482956, + "learning_rate": 0.0008857570020297744, + "loss": 2.7328, + "step": 7667 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 0.1543327271938324, + "learning_rate": 0.0008857270659199684, + "loss": 2.8077, + "step": 7668 + }, + { + "epoch": 0.227411558876731, + "grad_norm": 0.16545286774635315, + "learning_rate": 0.0008856971263944828, + "loss": 2.8095, + "step": 7669 + }, + { + "epoch": 0.22744121222904248, + "grad_norm": 0.16499610245227814, + "learning_rate": 0.0008856671834535831, + "loss": 2.7913, + "step": 7670 + }, + { + "epoch": 0.22747086558135396, + "grad_norm": 0.15400435030460358, + "learning_rate": 0.0008856372370975343, + "loss": 2.7893, + "step": 7671 + }, + { + "epoch": 0.22750051893366546, + "grad_norm": 0.15440940856933594, + "learning_rate": 0.0008856072873266015, + "loss": 2.7304, + "step": 7672 + }, + { + "epoch": 0.22753017228597694, + "grad_norm": 0.14202049374580383, + "learning_rate": 0.00088557733414105, + "loss": 2.7664, + "step": 7673 + }, + { + "epoch": 0.22755982563828842, + "grad_norm": 0.1372375190258026, + "learning_rate": 0.000885547377541145, + "loss": 2.8041, + "step": 7674 + }, + { + "epoch": 0.2275894789905999, + "grad_norm": 0.13050220906734467, + "learning_rate": 0.0008855174175271519, + "loss": 2.7506, + "step": 7675 + }, + { + "epoch": 0.22761913234291137, + "grad_norm": 0.13600757718086243, + "learning_rate": 0.0008854874540993357, + "loss": 2.7874, + "step": 7676 + }, + { + "epoch": 0.22764878569522284, + "grad_norm": 0.15195931494235992, + "learning_rate": 0.000885457487257962, + "loss": 2.7716, + "step": 7677 + }, + { + "epoch": 0.22767843904753432, + "grad_norm": 0.14443646371364594, + "learning_rate": 0.0008854275170032961, + "loss": 2.7705, + "step": 7678 + }, + { + "epoch": 0.2277080923998458, + "grad_norm": 0.13598394393920898, + "learning_rate": 0.0008853975433356034, + "loss": 2.7721, + "step": 7679 + }, + { + "epoch": 0.22773774575215727, + "grad_norm": 0.15940022468566895, + "learning_rate": 0.0008853675662551495, + "loss": 2.7786, + "step": 7680 + }, + { + "epoch": 0.22776739910446875, + "grad_norm": 0.18681982159614563, + "learning_rate": 0.0008853375857621993, + "loss": 2.781, + "step": 7681 + }, + { + "epoch": 0.22779705245678025, + "grad_norm": 0.17744697630405426, + "learning_rate": 0.0008853076018570189, + "loss": 2.78, + "step": 7682 + }, + { + "epoch": 0.22782670580909173, + "grad_norm": 0.16132062673568726, + "learning_rate": 0.0008852776145398733, + "loss": 2.7701, + "step": 7683 + }, + { + "epoch": 0.2278563591614032, + "grad_norm": 0.169771209359169, + "learning_rate": 0.0008852476238110285, + "loss": 2.796, + "step": 7684 + }, + { + "epoch": 0.22788601251371468, + "grad_norm": 0.1656927615404129, + "learning_rate": 0.0008852176296707498, + "loss": 2.7683, + "step": 7685 + }, + { + "epoch": 0.22791566586602616, + "grad_norm": 0.1697673797607422, + "learning_rate": 0.0008851876321193028, + "loss": 2.7933, + "step": 7686 + }, + { + "epoch": 0.22794531921833763, + "grad_norm": 0.15983846783638, + "learning_rate": 0.0008851576311569533, + "loss": 2.7524, + "step": 7687 + }, + { + "epoch": 0.2279749725706491, + "grad_norm": 0.15057754516601562, + "learning_rate": 0.0008851276267839669, + "loss": 2.773, + "step": 7688 + }, + { + "epoch": 0.22800462592296059, + "grad_norm": 0.1825229823589325, + "learning_rate": 0.0008850976190006093, + "loss": 2.7776, + "step": 7689 + }, + { + "epoch": 0.22803427927527206, + "grad_norm": 0.18604370951652527, + "learning_rate": 0.000885067607807146, + "loss": 2.7456, + "step": 7690 + }, + { + "epoch": 0.22806393262758354, + "grad_norm": 0.14444969594478607, + "learning_rate": 0.000885037593203843, + "loss": 2.781, + "step": 7691 + }, + { + "epoch": 0.228093585979895, + "grad_norm": 0.15878136456012726, + "learning_rate": 0.0008850075751909661, + "loss": 2.7504, + "step": 7692 + }, + { + "epoch": 0.22812323933220652, + "grad_norm": 0.15082916617393494, + "learning_rate": 0.000884977553768781, + "loss": 2.7093, + "step": 7693 + }, + { + "epoch": 0.228152892684518, + "grad_norm": 0.1467343121767044, + "learning_rate": 0.0008849475289375534, + "loss": 2.8074, + "step": 7694 + }, + { + "epoch": 0.22818254603682947, + "grad_norm": 0.13197743892669678, + "learning_rate": 0.0008849175006975496, + "loss": 2.7896, + "step": 7695 + }, + { + "epoch": 0.22821219938914095, + "grad_norm": 0.12994293868541718, + "learning_rate": 0.0008848874690490352, + "loss": 2.7756, + "step": 7696 + }, + { + "epoch": 0.22824185274145242, + "grad_norm": 0.13419941067695618, + "learning_rate": 0.0008848574339922761, + "loss": 2.783, + "step": 7697 + }, + { + "epoch": 0.2282715060937639, + "grad_norm": 0.13334231078624725, + "learning_rate": 0.0008848273955275385, + "loss": 2.7493, + "step": 7698 + }, + { + "epoch": 0.22830115944607537, + "grad_norm": 0.12411129474639893, + "learning_rate": 0.0008847973536550882, + "loss": 2.7474, + "step": 7699 + }, + { + "epoch": 0.22833081279838685, + "grad_norm": 0.15766926109790802, + "learning_rate": 0.0008847673083751912, + "loss": 2.77, + "step": 7700 + }, + { + "epoch": 0.22836046615069833, + "grad_norm": 0.17418648302555084, + "learning_rate": 0.0008847372596881137, + "loss": 2.7713, + "step": 7701 + }, + { + "epoch": 0.2283901195030098, + "grad_norm": 0.17976713180541992, + "learning_rate": 0.0008847072075941217, + "loss": 2.8069, + "step": 7702 + }, + { + "epoch": 0.2284197728553213, + "grad_norm": 0.16644278168678284, + "learning_rate": 0.0008846771520934814, + "loss": 2.7888, + "step": 7703 + }, + { + "epoch": 0.22844942620763278, + "grad_norm": 0.1350545883178711, + "learning_rate": 0.000884647093186459, + "loss": 2.7906, + "step": 7704 + }, + { + "epoch": 0.22847907955994426, + "grad_norm": 0.15195801854133606, + "learning_rate": 0.0008846170308733204, + "loss": 2.8061, + "step": 7705 + }, + { + "epoch": 0.22850873291225574, + "grad_norm": 0.14892570674419403, + "learning_rate": 0.0008845869651543319, + "loss": 2.7305, + "step": 7706 + }, + { + "epoch": 0.2285383862645672, + "grad_norm": 0.13249100744724274, + "learning_rate": 0.0008845568960297598, + "loss": 2.7519, + "step": 7707 + }, + { + "epoch": 0.2285680396168787, + "grad_norm": 0.12474086135625839, + "learning_rate": 0.0008845268234998707, + "loss": 2.7919, + "step": 7708 + }, + { + "epoch": 0.22859769296919016, + "grad_norm": 0.14478878676891327, + "learning_rate": 0.0008844967475649303, + "loss": 2.7547, + "step": 7709 + }, + { + "epoch": 0.22862734632150164, + "grad_norm": 0.1643766164779663, + "learning_rate": 0.0008844666682252053, + "loss": 2.7707, + "step": 7710 + }, + { + "epoch": 0.22865699967381312, + "grad_norm": 0.1713506430387497, + "learning_rate": 0.0008844365854809619, + "loss": 2.7556, + "step": 7711 + }, + { + "epoch": 0.2286866530261246, + "grad_norm": 0.16117876768112183, + "learning_rate": 0.0008844064993324666, + "loss": 2.7813, + "step": 7712 + }, + { + "epoch": 0.22871630637843607, + "grad_norm": 0.16354690492153168, + "learning_rate": 0.0008843764097799857, + "loss": 2.7953, + "step": 7713 + }, + { + "epoch": 0.22874595973074757, + "grad_norm": 0.16722267866134644, + "learning_rate": 0.0008843463168237858, + "loss": 2.7741, + "step": 7714 + }, + { + "epoch": 0.22877561308305905, + "grad_norm": 0.15877388417720795, + "learning_rate": 0.0008843162204641333, + "loss": 2.7568, + "step": 7715 + }, + { + "epoch": 0.22880526643537052, + "grad_norm": 0.12923069298267365, + "learning_rate": 0.0008842861207012945, + "loss": 2.7638, + "step": 7716 + }, + { + "epoch": 0.228834919787682, + "grad_norm": 0.1512320339679718, + "learning_rate": 0.0008842560175355364, + "loss": 2.7687, + "step": 7717 + }, + { + "epoch": 0.22886457313999348, + "grad_norm": 0.15246808528900146, + "learning_rate": 0.0008842259109671252, + "loss": 2.7507, + "step": 7718 + }, + { + "epoch": 0.22889422649230495, + "grad_norm": 0.13746953010559082, + "learning_rate": 0.0008841958009963276, + "loss": 2.7815, + "step": 7719 + }, + { + "epoch": 0.22892387984461643, + "grad_norm": 0.13658390939235687, + "learning_rate": 0.0008841656876234102, + "loss": 2.7623, + "step": 7720 + }, + { + "epoch": 0.2289535331969279, + "grad_norm": 0.13051332533359528, + "learning_rate": 0.0008841355708486397, + "loss": 2.7137, + "step": 7721 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 0.1387753039598465, + "learning_rate": 0.0008841054506722829, + "loss": 2.7571, + "step": 7722 + }, + { + "epoch": 0.22901283990155086, + "grad_norm": 0.14032384753227234, + "learning_rate": 0.0008840753270946063, + "loss": 2.7817, + "step": 7723 + }, + { + "epoch": 0.22904249325386236, + "grad_norm": 0.13849031925201416, + "learning_rate": 0.0008840452001158767, + "loss": 2.8151, + "step": 7724 + }, + { + "epoch": 0.22907214660617384, + "grad_norm": 0.1325313150882721, + "learning_rate": 0.0008840150697363611, + "loss": 2.7709, + "step": 7725 + }, + { + "epoch": 0.2291017999584853, + "grad_norm": 0.140472874045372, + "learning_rate": 0.0008839849359563261, + "loss": 2.7556, + "step": 7726 + }, + { + "epoch": 0.2291314533107968, + "grad_norm": 0.14045223593711853, + "learning_rate": 0.0008839547987760384, + "loss": 2.7629, + "step": 7727 + }, + { + "epoch": 0.22916110666310827, + "grad_norm": 0.15197128057479858, + "learning_rate": 0.0008839246581957652, + "loss": 2.7472, + "step": 7728 + }, + { + "epoch": 0.22919076001541974, + "grad_norm": 0.1254424750804901, + "learning_rate": 0.0008838945142157732, + "loss": 2.7716, + "step": 7729 + }, + { + "epoch": 0.22922041336773122, + "grad_norm": 0.13811102509498596, + "learning_rate": 0.0008838643668363296, + "loss": 2.7411, + "step": 7730 + }, + { + "epoch": 0.2292500667200427, + "grad_norm": 0.14159560203552246, + "learning_rate": 0.0008838342160577008, + "loss": 2.7828, + "step": 7731 + }, + { + "epoch": 0.22927972007235417, + "grad_norm": 0.14619526267051697, + "learning_rate": 0.0008838040618801544, + "loss": 2.7829, + "step": 7732 + }, + { + "epoch": 0.22930937342466565, + "grad_norm": 0.16883157193660736, + "learning_rate": 0.0008837739043039569, + "loss": 2.7959, + "step": 7733 + }, + { + "epoch": 0.22933902677697715, + "grad_norm": 0.18393103778362274, + "learning_rate": 0.0008837437433293758, + "loss": 2.7824, + "step": 7734 + }, + { + "epoch": 0.22936868012928863, + "grad_norm": 0.190599262714386, + "learning_rate": 0.0008837135789566779, + "loss": 2.7858, + "step": 7735 + }, + { + "epoch": 0.2293983334816001, + "grad_norm": 0.20922479033470154, + "learning_rate": 0.0008836834111861305, + "loss": 2.7737, + "step": 7736 + }, + { + "epoch": 0.22942798683391158, + "grad_norm": 0.1920759081840515, + "learning_rate": 0.0008836532400180007, + "loss": 2.7706, + "step": 7737 + }, + { + "epoch": 0.22945764018622306, + "grad_norm": 0.15635542571544647, + "learning_rate": 0.0008836230654525553, + "loss": 2.79, + "step": 7738 + }, + { + "epoch": 0.22948729353853453, + "grad_norm": 0.15610861778259277, + "learning_rate": 0.000883592887490062, + "loss": 2.7245, + "step": 7739 + }, + { + "epoch": 0.229516946890846, + "grad_norm": 0.16040214896202087, + "learning_rate": 0.0008835627061307878, + "loss": 2.7568, + "step": 7740 + }, + { + "epoch": 0.22954660024315748, + "grad_norm": 0.13834281265735626, + "learning_rate": 0.0008835325213750001, + "loss": 2.78, + "step": 7741 + }, + { + "epoch": 0.22957625359546896, + "grad_norm": 0.16517311334609985, + "learning_rate": 0.0008835023332229659, + "loss": 2.8147, + "step": 7742 + }, + { + "epoch": 0.22960590694778044, + "grad_norm": 0.17919370532035828, + "learning_rate": 0.000883472141674953, + "loss": 2.7575, + "step": 7743 + }, + { + "epoch": 0.2296355603000919, + "grad_norm": 0.18636459112167358, + "learning_rate": 0.0008834419467312282, + "loss": 2.7918, + "step": 7744 + }, + { + "epoch": 0.22966521365240342, + "grad_norm": 0.1870647817850113, + "learning_rate": 0.0008834117483920592, + "loss": 2.7621, + "step": 7745 + }, + { + "epoch": 0.2296948670047149, + "grad_norm": 0.1749151647090912, + "learning_rate": 0.0008833815466577133, + "loss": 2.7584, + "step": 7746 + }, + { + "epoch": 0.22972452035702637, + "grad_norm": 0.16568642854690552, + "learning_rate": 0.0008833513415284582, + "loss": 2.7694, + "step": 7747 + }, + { + "epoch": 0.22975417370933784, + "grad_norm": 0.18273794651031494, + "learning_rate": 0.000883321133004561, + "loss": 2.7676, + "step": 7748 + }, + { + "epoch": 0.22978382706164932, + "grad_norm": 0.1727350950241089, + "learning_rate": 0.0008832909210862894, + "loss": 2.7803, + "step": 7749 + }, + { + "epoch": 0.2298134804139608, + "grad_norm": 0.18306425213813782, + "learning_rate": 0.000883260705773911, + "loss": 2.7728, + "step": 7750 + }, + { + "epoch": 0.22984313376627227, + "grad_norm": 0.15938504040241241, + "learning_rate": 0.0008832304870676932, + "loss": 2.7877, + "step": 7751 + }, + { + "epoch": 0.22987278711858375, + "grad_norm": 0.13782326877117157, + "learning_rate": 0.0008832002649679036, + "loss": 2.7957, + "step": 7752 + }, + { + "epoch": 0.22990244047089523, + "grad_norm": 0.12466007471084595, + "learning_rate": 0.0008831700394748099, + "loss": 2.7881, + "step": 7753 + }, + { + "epoch": 0.2299320938232067, + "grad_norm": 0.150202214717865, + "learning_rate": 0.0008831398105886797, + "loss": 2.7773, + "step": 7754 + }, + { + "epoch": 0.2299617471755182, + "grad_norm": 0.1401636153459549, + "learning_rate": 0.0008831095783097809, + "loss": 2.7736, + "step": 7755 + }, + { + "epoch": 0.22999140052782968, + "grad_norm": 0.1236073449254036, + "learning_rate": 0.0008830793426383807, + "loss": 2.7867, + "step": 7756 + }, + { + "epoch": 0.23002105388014116, + "grad_norm": 0.12230802327394485, + "learning_rate": 0.0008830491035747474, + "loss": 2.7323, + "step": 7757 + }, + { + "epoch": 0.23005070723245263, + "grad_norm": 0.12295956909656525, + "learning_rate": 0.0008830188611191485, + "loss": 2.7628, + "step": 7758 + }, + { + "epoch": 0.2300803605847641, + "grad_norm": 0.12665784358978271, + "learning_rate": 0.0008829886152718519, + "loss": 2.8009, + "step": 7759 + }, + { + "epoch": 0.23011001393707559, + "grad_norm": 0.13403427600860596, + "learning_rate": 0.0008829583660331253, + "loss": 2.7843, + "step": 7760 + }, + { + "epoch": 0.23013966728938706, + "grad_norm": 0.14095644652843475, + "learning_rate": 0.0008829281134032366, + "loss": 2.7524, + "step": 7761 + }, + { + "epoch": 0.23016932064169854, + "grad_norm": 0.13782881200313568, + "learning_rate": 0.0008828978573824538, + "loss": 2.7488, + "step": 7762 + }, + { + "epoch": 0.23019897399401001, + "grad_norm": 0.14366382360458374, + "learning_rate": 0.0008828675979710448, + "loss": 2.785, + "step": 7763 + }, + { + "epoch": 0.2302286273463215, + "grad_norm": 0.15441621840000153, + "learning_rate": 0.0008828373351692773, + "loss": 2.7519, + "step": 7764 + }, + { + "epoch": 0.23025828069863297, + "grad_norm": 0.1659986972808838, + "learning_rate": 0.0008828070689774197, + "loss": 2.7713, + "step": 7765 + }, + { + "epoch": 0.23028793405094447, + "grad_norm": 0.15228399634361267, + "learning_rate": 0.0008827767993957396, + "loss": 2.7811, + "step": 7766 + }, + { + "epoch": 0.23031758740325595, + "grad_norm": 0.14483338594436646, + "learning_rate": 0.0008827465264245053, + "loss": 2.7807, + "step": 7767 + }, + { + "epoch": 0.23034724075556742, + "grad_norm": 0.15886424481868744, + "learning_rate": 0.0008827162500639849, + "loss": 2.8263, + "step": 7768 + }, + { + "epoch": 0.2303768941078789, + "grad_norm": 0.16587017476558685, + "learning_rate": 0.0008826859703144464, + "loss": 2.7769, + "step": 7769 + }, + { + "epoch": 0.23040654746019038, + "grad_norm": 0.18870611488819122, + "learning_rate": 0.000882655687176158, + "loss": 2.7941, + "step": 7770 + }, + { + "epoch": 0.23043620081250185, + "grad_norm": 0.19912591576576233, + "learning_rate": 0.0008826254006493876, + "loss": 2.7863, + "step": 7771 + }, + { + "epoch": 0.23046585416481333, + "grad_norm": 0.1846587210893631, + "learning_rate": 0.0008825951107344038, + "loss": 2.7767, + "step": 7772 + }, + { + "epoch": 0.2304955075171248, + "grad_norm": 0.15171392261981964, + "learning_rate": 0.0008825648174314745, + "loss": 2.7494, + "step": 7773 + }, + { + "epoch": 0.23052516086943628, + "grad_norm": 0.15327613055706024, + "learning_rate": 0.0008825345207408681, + "loss": 2.779, + "step": 7774 + }, + { + "epoch": 0.23055481422174776, + "grad_norm": 0.1563744992017746, + "learning_rate": 0.0008825042206628529, + "loss": 2.7567, + "step": 7775 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 0.1424528807401657, + "learning_rate": 0.0008824739171976971, + "loss": 2.7757, + "step": 7776 + }, + { + "epoch": 0.23061412092637074, + "grad_norm": 0.1466197967529297, + "learning_rate": 0.0008824436103456692, + "loss": 2.7884, + "step": 7777 + }, + { + "epoch": 0.2306437742786822, + "grad_norm": 0.12891337275505066, + "learning_rate": 0.0008824133001070375, + "loss": 2.7764, + "step": 7778 + }, + { + "epoch": 0.2306734276309937, + "grad_norm": 0.13261333107948303, + "learning_rate": 0.0008823829864820703, + "loss": 2.7694, + "step": 7779 + }, + { + "epoch": 0.23070308098330516, + "grad_norm": 0.12684784829616547, + "learning_rate": 0.0008823526694710361, + "loss": 2.7818, + "step": 7780 + }, + { + "epoch": 0.23073273433561664, + "grad_norm": 0.13521446287631989, + "learning_rate": 0.0008823223490742034, + "loss": 2.779, + "step": 7781 + }, + { + "epoch": 0.23076238768792812, + "grad_norm": 0.13422968983650208, + "learning_rate": 0.0008822920252918407, + "loss": 2.7722, + "step": 7782 + }, + { + "epoch": 0.2307920410402396, + "grad_norm": 0.15902428328990936, + "learning_rate": 0.0008822616981242165, + "loss": 2.7595, + "step": 7783 + }, + { + "epoch": 0.23082169439255107, + "grad_norm": 0.17202816903591156, + "learning_rate": 0.0008822313675715994, + "loss": 2.76, + "step": 7784 + }, + { + "epoch": 0.23085134774486255, + "grad_norm": 0.1833241879940033, + "learning_rate": 0.0008822010336342578, + "loss": 2.8116, + "step": 7785 + }, + { + "epoch": 0.23088100109717405, + "grad_norm": 0.19865189492702484, + "learning_rate": 0.0008821706963124605, + "loss": 2.8236, + "step": 7786 + }, + { + "epoch": 0.23091065444948553, + "grad_norm": 0.19673502445220947, + "learning_rate": 0.0008821403556064762, + "loss": 2.7601, + "step": 7787 + }, + { + "epoch": 0.230940307801797, + "grad_norm": 0.1511891782283783, + "learning_rate": 0.0008821100115165735, + "loss": 2.7989, + "step": 7788 + }, + { + "epoch": 0.23096996115410848, + "grad_norm": 0.13798628747463226, + "learning_rate": 0.0008820796640430208, + "loss": 2.7728, + "step": 7789 + }, + { + "epoch": 0.23099961450641995, + "grad_norm": 0.13650135695934296, + "learning_rate": 0.0008820493131860872, + "loss": 2.7519, + "step": 7790 + }, + { + "epoch": 0.23102926785873143, + "grad_norm": 0.13045670092105865, + "learning_rate": 0.0008820189589460414, + "loss": 2.7399, + "step": 7791 + }, + { + "epoch": 0.2310589212110429, + "grad_norm": 0.1284136176109314, + "learning_rate": 0.0008819886013231521, + "loss": 2.7288, + "step": 7792 + }, + { + "epoch": 0.23108857456335438, + "grad_norm": 0.15809836983680725, + "learning_rate": 0.0008819582403176882, + "loss": 2.8008, + "step": 7793 + }, + { + "epoch": 0.23111822791566586, + "grad_norm": 0.15211614966392517, + "learning_rate": 0.0008819278759299186, + "loss": 2.7439, + "step": 7794 + }, + { + "epoch": 0.23114788126797733, + "grad_norm": 0.14814992249011993, + "learning_rate": 0.0008818975081601118, + "loss": 2.7732, + "step": 7795 + }, + { + "epoch": 0.2311775346202888, + "grad_norm": 0.1497529000043869, + "learning_rate": 0.0008818671370085374, + "loss": 2.7548, + "step": 7796 + }, + { + "epoch": 0.23120718797260031, + "grad_norm": 0.15529055893421173, + "learning_rate": 0.0008818367624754637, + "loss": 2.8021, + "step": 7797 + }, + { + "epoch": 0.2312368413249118, + "grad_norm": 0.1473122090101242, + "learning_rate": 0.0008818063845611599, + "loss": 2.7381, + "step": 7798 + }, + { + "epoch": 0.23126649467722327, + "grad_norm": 0.13757316768169403, + "learning_rate": 0.0008817760032658953, + "loss": 2.7577, + "step": 7799 + }, + { + "epoch": 0.23129614802953474, + "grad_norm": 0.16873648762702942, + "learning_rate": 0.0008817456185899384, + "loss": 2.7526, + "step": 7800 + }, + { + "epoch": 0.23132580138184622, + "grad_norm": 0.15663057565689087, + "learning_rate": 0.0008817152305335586, + "loss": 2.7581, + "step": 7801 + }, + { + "epoch": 0.2313554547341577, + "grad_norm": 0.1500117927789688, + "learning_rate": 0.0008816848390970249, + "loss": 2.746, + "step": 7802 + }, + { + "epoch": 0.23138510808646917, + "grad_norm": 0.15620771050453186, + "learning_rate": 0.0008816544442806065, + "loss": 2.7402, + "step": 7803 + }, + { + "epoch": 0.23141476143878065, + "grad_norm": 0.15884551405906677, + "learning_rate": 0.0008816240460845725, + "loss": 2.7474, + "step": 7804 + }, + { + "epoch": 0.23144441479109212, + "grad_norm": 0.13177935779094696, + "learning_rate": 0.0008815936445091919, + "loss": 2.7449, + "step": 7805 + }, + { + "epoch": 0.2314740681434036, + "grad_norm": 0.13699789345264435, + "learning_rate": 0.0008815632395547342, + "loss": 2.7837, + "step": 7806 + }, + { + "epoch": 0.2315037214957151, + "grad_norm": 0.13602007925510406, + "learning_rate": 0.0008815328312214686, + "loss": 2.7621, + "step": 7807 + }, + { + "epoch": 0.23153337484802658, + "grad_norm": 0.13895143568515778, + "learning_rate": 0.0008815024195096641, + "loss": 2.7376, + "step": 7808 + }, + { + "epoch": 0.23156302820033806, + "grad_norm": 0.14261899888515472, + "learning_rate": 0.0008814720044195904, + "loss": 2.735, + "step": 7809 + }, + { + "epoch": 0.23159268155264953, + "grad_norm": 0.13867934048175812, + "learning_rate": 0.0008814415859515164, + "loss": 2.7797, + "step": 7810 + }, + { + "epoch": 0.231622334904961, + "grad_norm": 0.13237491250038147, + "learning_rate": 0.0008814111641057119, + "loss": 2.7559, + "step": 7811 + }, + { + "epoch": 0.23165198825727248, + "grad_norm": 0.14720705151557922, + "learning_rate": 0.000881380738882446, + "loss": 2.7652, + "step": 7812 + }, + { + "epoch": 0.23168164160958396, + "grad_norm": 0.15173596143722534, + "learning_rate": 0.0008813503102819881, + "loss": 2.7519, + "step": 7813 + }, + { + "epoch": 0.23171129496189544, + "grad_norm": 0.16313163936138153, + "learning_rate": 0.0008813198783046078, + "loss": 2.7937, + "step": 7814 + }, + { + "epoch": 0.2317409483142069, + "grad_norm": 0.16885003447532654, + "learning_rate": 0.0008812894429505745, + "loss": 2.7337, + "step": 7815 + }, + { + "epoch": 0.2317706016665184, + "grad_norm": 0.16794656217098236, + "learning_rate": 0.0008812590042201578, + "loss": 2.7812, + "step": 7816 + }, + { + "epoch": 0.23180025501882986, + "grad_norm": 0.19490021467208862, + "learning_rate": 0.0008812285621136271, + "loss": 2.7841, + "step": 7817 + }, + { + "epoch": 0.23182990837114137, + "grad_norm": 0.22616632282733917, + "learning_rate": 0.0008811981166312521, + "loss": 2.7696, + "step": 7818 + }, + { + "epoch": 0.23185956172345284, + "grad_norm": 0.21611525118350983, + "learning_rate": 0.0008811676677733022, + "loss": 2.7641, + "step": 7819 + }, + { + "epoch": 0.23188921507576432, + "grad_norm": 0.18303287029266357, + "learning_rate": 0.0008811372155400474, + "loss": 2.749, + "step": 7820 + }, + { + "epoch": 0.2319188684280758, + "grad_norm": 0.19389553368091583, + "learning_rate": 0.000881106759931757, + "loss": 2.7872, + "step": 7821 + }, + { + "epoch": 0.23194852178038727, + "grad_norm": 0.17722207307815552, + "learning_rate": 0.0008810763009487009, + "loss": 2.784, + "step": 7822 + }, + { + "epoch": 0.23197817513269875, + "grad_norm": 0.13412614166736603, + "learning_rate": 0.0008810458385911489, + "loss": 2.7715, + "step": 7823 + }, + { + "epoch": 0.23200782848501023, + "grad_norm": 0.15078599750995636, + "learning_rate": 0.0008810153728593703, + "loss": 2.774, + "step": 7824 + }, + { + "epoch": 0.2320374818373217, + "grad_norm": 0.15308071672916412, + "learning_rate": 0.0008809849037536353, + "loss": 2.7223, + "step": 7825 + }, + { + "epoch": 0.23206713518963318, + "grad_norm": 0.15161219239234924, + "learning_rate": 0.0008809544312742135, + "loss": 2.7964, + "step": 7826 + }, + { + "epoch": 0.23209678854194465, + "grad_norm": 0.17324791848659515, + "learning_rate": 0.000880923955421375, + "loss": 2.8119, + "step": 7827 + }, + { + "epoch": 0.23212644189425616, + "grad_norm": 0.15616796910762787, + "learning_rate": 0.0008808934761953893, + "loss": 2.7717, + "step": 7828 + }, + { + "epoch": 0.23215609524656763, + "grad_norm": 0.15814992785453796, + "learning_rate": 0.0008808629935965265, + "loss": 2.7561, + "step": 7829 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 0.14199590682983398, + "learning_rate": 0.0008808325076250566, + "loss": 2.7603, + "step": 7830 + }, + { + "epoch": 0.2322154019511906, + "grad_norm": 0.11481405794620514, + "learning_rate": 0.0008808020182812495, + "loss": 2.7965, + "step": 7831 + }, + { + "epoch": 0.23224505530350206, + "grad_norm": 0.128276064991951, + "learning_rate": 0.0008807715255653751, + "loss": 2.7482, + "step": 7832 + }, + { + "epoch": 0.23227470865581354, + "grad_norm": 0.12893101572990417, + "learning_rate": 0.0008807410294777035, + "loss": 2.7562, + "step": 7833 + }, + { + "epoch": 0.23230436200812501, + "grad_norm": 0.1535082310438156, + "learning_rate": 0.0008807105300185047, + "loss": 2.7773, + "step": 7834 + }, + { + "epoch": 0.2323340153604365, + "grad_norm": 0.1653982698917389, + "learning_rate": 0.0008806800271880488, + "loss": 2.793, + "step": 7835 + }, + { + "epoch": 0.23236366871274797, + "grad_norm": 0.14844484627246857, + "learning_rate": 0.000880649520986606, + "loss": 2.7731, + "step": 7836 + }, + { + "epoch": 0.23239332206505944, + "grad_norm": 0.13699142634868622, + "learning_rate": 0.0008806190114144463, + "loss": 2.7665, + "step": 7837 + }, + { + "epoch": 0.23242297541737095, + "grad_norm": 0.13278800249099731, + "learning_rate": 0.0008805884984718399, + "loss": 2.7771, + "step": 7838 + }, + { + "epoch": 0.23245262876968242, + "grad_norm": 0.13420848548412323, + "learning_rate": 0.000880557982159057, + "loss": 2.7576, + "step": 7839 + }, + { + "epoch": 0.2324822821219939, + "grad_norm": 0.13146674633026123, + "learning_rate": 0.000880527462476368, + "loss": 2.7864, + "step": 7840 + }, + { + "epoch": 0.23251193547430538, + "grad_norm": 0.11496435105800629, + "learning_rate": 0.0008804969394240429, + "loss": 2.7339, + "step": 7841 + }, + { + "epoch": 0.23254158882661685, + "grad_norm": 0.11798793077468872, + "learning_rate": 0.000880466413002352, + "loss": 2.7716, + "step": 7842 + }, + { + "epoch": 0.23257124217892833, + "grad_norm": 0.11665964126586914, + "learning_rate": 0.000880435883211566, + "loss": 2.8037, + "step": 7843 + }, + { + "epoch": 0.2326008955312398, + "grad_norm": 0.11870971322059631, + "learning_rate": 0.0008804053500519547, + "loss": 2.7546, + "step": 7844 + }, + { + "epoch": 0.23263054888355128, + "grad_norm": 0.13137316703796387, + "learning_rate": 0.0008803748135237888, + "loss": 2.7515, + "step": 7845 + }, + { + "epoch": 0.23266020223586276, + "grad_norm": 0.14212141931056976, + "learning_rate": 0.0008803442736273386, + "loss": 2.7793, + "step": 7846 + }, + { + "epoch": 0.23268985558817423, + "grad_norm": 0.16273637115955353, + "learning_rate": 0.0008803137303628745, + "loss": 2.7576, + "step": 7847 + }, + { + "epoch": 0.2327195089404857, + "grad_norm": 0.18270045518875122, + "learning_rate": 0.0008802831837306672, + "loss": 2.772, + "step": 7848 + }, + { + "epoch": 0.2327491622927972, + "grad_norm": 0.2024768441915512, + "learning_rate": 0.0008802526337309868, + "loss": 2.7973, + "step": 7849 + }, + { + "epoch": 0.2327788156451087, + "grad_norm": 0.23630352318286896, + "learning_rate": 0.0008802220803641043, + "loss": 2.7621, + "step": 7850 + }, + { + "epoch": 0.23280846899742016, + "grad_norm": 0.28384366631507874, + "learning_rate": 0.00088019152363029, + "loss": 2.8052, + "step": 7851 + }, + { + "epoch": 0.23283812234973164, + "grad_norm": 0.20974887907505035, + "learning_rate": 0.0008801609635298145, + "loss": 2.7756, + "step": 7852 + }, + { + "epoch": 0.23286777570204312, + "grad_norm": 0.15335455536842346, + "learning_rate": 0.0008801304000629482, + "loss": 2.7374, + "step": 7853 + }, + { + "epoch": 0.2328974290543546, + "grad_norm": 0.18534666299819946, + "learning_rate": 0.0008800998332299621, + "loss": 2.7952, + "step": 7854 + }, + { + "epoch": 0.23292708240666607, + "grad_norm": 0.14545316994190216, + "learning_rate": 0.0008800692630311268, + "loss": 2.789, + "step": 7855 + }, + { + "epoch": 0.23295673575897755, + "grad_norm": 0.14956338703632355, + "learning_rate": 0.000880038689466713, + "loss": 2.766, + "step": 7856 + }, + { + "epoch": 0.23298638911128902, + "grad_norm": 0.13314418494701385, + "learning_rate": 0.0008800081125369911, + "loss": 2.7388, + "step": 7857 + }, + { + "epoch": 0.2330160424636005, + "grad_norm": 0.12809054553508759, + "learning_rate": 0.0008799775322422323, + "loss": 2.7547, + "step": 7858 + }, + { + "epoch": 0.233045695815912, + "grad_norm": 0.12626418471336365, + "learning_rate": 0.0008799469485827072, + "loss": 2.7571, + "step": 7859 + }, + { + "epoch": 0.23307534916822348, + "grad_norm": 0.1275675892829895, + "learning_rate": 0.0008799163615586868, + "loss": 2.7522, + "step": 7860 + }, + { + "epoch": 0.23310500252053495, + "grad_norm": 0.12754420936107635, + "learning_rate": 0.0008798857711704416, + "loss": 2.7453, + "step": 7861 + }, + { + "epoch": 0.23313465587284643, + "grad_norm": 0.12972351908683777, + "learning_rate": 0.0008798551774182428, + "loss": 2.7762, + "step": 7862 + }, + { + "epoch": 0.2331643092251579, + "grad_norm": 0.13311398029327393, + "learning_rate": 0.000879824580302361, + "loss": 2.7596, + "step": 7863 + }, + { + "epoch": 0.23319396257746938, + "grad_norm": 0.13763809204101562, + "learning_rate": 0.0008797939798230676, + "loss": 2.7345, + "step": 7864 + }, + { + "epoch": 0.23322361592978086, + "grad_norm": 0.1428224891424179, + "learning_rate": 0.0008797633759806331, + "loss": 2.7487, + "step": 7865 + }, + { + "epoch": 0.23325326928209233, + "grad_norm": 0.13591890037059784, + "learning_rate": 0.0008797327687753289, + "loss": 2.7907, + "step": 7866 + }, + { + "epoch": 0.2332829226344038, + "grad_norm": 0.12724359333515167, + "learning_rate": 0.0008797021582074258, + "loss": 2.7783, + "step": 7867 + }, + { + "epoch": 0.2333125759867153, + "grad_norm": 0.12670379877090454, + "learning_rate": 0.000879671544277195, + "loss": 2.716, + "step": 7868 + }, + { + "epoch": 0.23334222933902676, + "grad_norm": 0.12825855612754822, + "learning_rate": 0.0008796409269849073, + "loss": 2.7636, + "step": 7869 + }, + { + "epoch": 0.23337188269133827, + "grad_norm": 0.15127575397491455, + "learning_rate": 0.0008796103063308343, + "loss": 2.7416, + "step": 7870 + }, + { + "epoch": 0.23340153604364974, + "grad_norm": 0.15448389947414398, + "learning_rate": 0.0008795796823152466, + "loss": 2.8561, + "step": 7871 + }, + { + "epoch": 0.23343118939596122, + "grad_norm": 0.13659396767616272, + "learning_rate": 0.0008795490549384159, + "loss": 2.7938, + "step": 7872 + }, + { + "epoch": 0.2334608427482727, + "grad_norm": 0.14740462601184845, + "learning_rate": 0.0008795184242006129, + "loss": 2.7607, + "step": 7873 + }, + { + "epoch": 0.23349049610058417, + "grad_norm": 0.11695794761180878, + "learning_rate": 0.0008794877901021094, + "loss": 2.7786, + "step": 7874 + }, + { + "epoch": 0.23352014945289565, + "grad_norm": 0.10613825917243958, + "learning_rate": 0.0008794571526431762, + "loss": 2.7533, + "step": 7875 + }, + { + "epoch": 0.23354980280520712, + "grad_norm": 0.11570797860622406, + "learning_rate": 0.0008794265118240847, + "loss": 2.7526, + "step": 7876 + }, + { + "epoch": 0.2335794561575186, + "grad_norm": 0.1261027604341507, + "learning_rate": 0.0008793958676451066, + "loss": 2.7445, + "step": 7877 + }, + { + "epoch": 0.23360910950983008, + "grad_norm": 0.12994630634784698, + "learning_rate": 0.0008793652201065128, + "loss": 2.7527, + "step": 7878 + }, + { + "epoch": 0.23363876286214155, + "grad_norm": 0.1301358938217163, + "learning_rate": 0.0008793345692085748, + "loss": 2.7624, + "step": 7879 + }, + { + "epoch": 0.23366841621445306, + "grad_norm": 0.14383268356323242, + "learning_rate": 0.0008793039149515643, + "loss": 2.7574, + "step": 7880 + }, + { + "epoch": 0.23369806956676453, + "grad_norm": 0.1462443321943283, + "learning_rate": 0.0008792732573357523, + "loss": 2.7715, + "step": 7881 + }, + { + "epoch": 0.233727722919076, + "grad_norm": 0.1670101433992386, + "learning_rate": 0.0008792425963614105, + "loss": 2.7399, + "step": 7882 + }, + { + "epoch": 0.23375737627138748, + "grad_norm": 0.23321548104286194, + "learning_rate": 0.0008792119320288105, + "loss": 2.7744, + "step": 7883 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 0.24263109266757965, + "learning_rate": 0.0008791812643382238, + "loss": 2.7404, + "step": 7884 + }, + { + "epoch": 0.23381668297601044, + "grad_norm": 0.2240418642759323, + "learning_rate": 0.0008791505932899217, + "loss": 2.7769, + "step": 7885 + }, + { + "epoch": 0.2338463363283219, + "grad_norm": 0.25358670949935913, + "learning_rate": 0.0008791199188841764, + "loss": 2.7484, + "step": 7886 + }, + { + "epoch": 0.2338759896806334, + "grad_norm": 0.1623252034187317, + "learning_rate": 0.0008790892411212588, + "loss": 2.7758, + "step": 7887 + }, + { + "epoch": 0.23390564303294487, + "grad_norm": 0.15834908187389374, + "learning_rate": 0.0008790585600014409, + "loss": 2.7903, + "step": 7888 + }, + { + "epoch": 0.23393529638525634, + "grad_norm": 0.1615820676088333, + "learning_rate": 0.0008790278755249945, + "loss": 2.8153, + "step": 7889 + }, + { + "epoch": 0.23396494973756785, + "grad_norm": 0.17887327075004578, + "learning_rate": 0.0008789971876921913, + "loss": 2.7747, + "step": 7890 + }, + { + "epoch": 0.23399460308987932, + "grad_norm": 0.15963990986347198, + "learning_rate": 0.0008789664965033029, + "loss": 2.7901, + "step": 7891 + }, + { + "epoch": 0.2340242564421908, + "grad_norm": 0.14904777705669403, + "learning_rate": 0.000878935801958601, + "loss": 2.7129, + "step": 7892 + }, + { + "epoch": 0.23405390979450227, + "grad_norm": 0.1457272320985794, + "learning_rate": 0.0008789051040583576, + "loss": 2.7581, + "step": 7893 + }, + { + "epoch": 0.23408356314681375, + "grad_norm": 0.12658081948757172, + "learning_rate": 0.0008788744028028445, + "loss": 2.7532, + "step": 7894 + }, + { + "epoch": 0.23411321649912523, + "grad_norm": 0.13266237080097198, + "learning_rate": 0.0008788436981923335, + "loss": 2.7872, + "step": 7895 + }, + { + "epoch": 0.2341428698514367, + "grad_norm": 0.1320151835680008, + "learning_rate": 0.0008788129902270965, + "loss": 2.7594, + "step": 7896 + }, + { + "epoch": 0.23417252320374818, + "grad_norm": 0.12386034429073334, + "learning_rate": 0.0008787822789074056, + "loss": 2.7879, + "step": 7897 + }, + { + "epoch": 0.23420217655605965, + "grad_norm": 0.11705136299133301, + "learning_rate": 0.0008787515642335324, + "loss": 2.7744, + "step": 7898 + }, + { + "epoch": 0.23423182990837113, + "grad_norm": 0.12231305241584778, + "learning_rate": 0.0008787208462057492, + "loss": 2.7754, + "step": 7899 + }, + { + "epoch": 0.2342614832606826, + "grad_norm": 0.14326415956020355, + "learning_rate": 0.0008786901248243277, + "loss": 2.773, + "step": 7900 + }, + { + "epoch": 0.2342911366129941, + "grad_norm": 0.134212464094162, + "learning_rate": 0.0008786594000895404, + "loss": 2.7768, + "step": 7901 + }, + { + "epoch": 0.2343207899653056, + "grad_norm": 0.1392042338848114, + "learning_rate": 0.0008786286720016591, + "loss": 2.7586, + "step": 7902 + }, + { + "epoch": 0.23435044331761706, + "grad_norm": 0.12768493592739105, + "learning_rate": 0.0008785979405609559, + "loss": 2.7643, + "step": 7903 + }, + { + "epoch": 0.23438009666992854, + "grad_norm": 0.1319635659456253, + "learning_rate": 0.0008785672057677028, + "loss": 2.7514, + "step": 7904 + }, + { + "epoch": 0.23440975002224002, + "grad_norm": 0.14587636291980743, + "learning_rate": 0.0008785364676221722, + "loss": 2.7335, + "step": 7905 + }, + { + "epoch": 0.2344394033745515, + "grad_norm": 0.15146665275096893, + "learning_rate": 0.0008785057261246363, + "loss": 2.76, + "step": 7906 + }, + { + "epoch": 0.23446905672686297, + "grad_norm": 0.16188499331474304, + "learning_rate": 0.000878474981275367, + "loss": 2.7835, + "step": 7907 + }, + { + "epoch": 0.23449871007917444, + "grad_norm": 0.16475729644298553, + "learning_rate": 0.000878444233074637, + "loss": 2.749, + "step": 7908 + }, + { + "epoch": 0.23452836343148592, + "grad_norm": 0.1484954059123993, + "learning_rate": 0.0008784134815227183, + "loss": 2.7929, + "step": 7909 + }, + { + "epoch": 0.2345580167837974, + "grad_norm": 0.14349420368671417, + "learning_rate": 0.0008783827266198831, + "loss": 2.8115, + "step": 7910 + }, + { + "epoch": 0.2345876701361089, + "grad_norm": 0.15721943974494934, + "learning_rate": 0.0008783519683664042, + "loss": 2.8108, + "step": 7911 + }, + { + "epoch": 0.23461732348842038, + "grad_norm": 0.15801553428173065, + "learning_rate": 0.0008783212067625534, + "loss": 2.7551, + "step": 7912 + }, + { + "epoch": 0.23464697684073185, + "grad_norm": 0.13759540021419525, + "learning_rate": 0.0008782904418086035, + "loss": 2.7739, + "step": 7913 + }, + { + "epoch": 0.23467663019304333, + "grad_norm": 0.14773379266262054, + "learning_rate": 0.0008782596735048269, + "loss": 2.7712, + "step": 7914 + }, + { + "epoch": 0.2347062835453548, + "grad_norm": 0.1447540521621704, + "learning_rate": 0.0008782289018514958, + "loss": 2.7953, + "step": 7915 + }, + { + "epoch": 0.23473593689766628, + "grad_norm": 0.1326368749141693, + "learning_rate": 0.000878198126848883, + "loss": 2.7679, + "step": 7916 + }, + { + "epoch": 0.23476559024997776, + "grad_norm": 0.1316055804491043, + "learning_rate": 0.0008781673484972608, + "loss": 2.789, + "step": 7917 + }, + { + "epoch": 0.23479524360228923, + "grad_norm": 0.12169145792722702, + "learning_rate": 0.0008781365667969018, + "loss": 2.7055, + "step": 7918 + }, + { + "epoch": 0.2348248969546007, + "grad_norm": 0.1302434802055359, + "learning_rate": 0.0008781057817480786, + "loss": 2.7723, + "step": 7919 + }, + { + "epoch": 0.23485455030691219, + "grad_norm": 0.14369502663612366, + "learning_rate": 0.0008780749933510638, + "loss": 2.7652, + "step": 7920 + }, + { + "epoch": 0.23488420365922366, + "grad_norm": 0.16687771677970886, + "learning_rate": 0.00087804420160613, + "loss": 2.7807, + "step": 7921 + }, + { + "epoch": 0.23491385701153517, + "grad_norm": 0.18935084342956543, + "learning_rate": 0.0008780134065135499, + "loss": 2.7215, + "step": 7922 + }, + { + "epoch": 0.23494351036384664, + "grad_norm": 0.17648768424987793, + "learning_rate": 0.0008779826080735963, + "loss": 2.7853, + "step": 7923 + }, + { + "epoch": 0.23497316371615812, + "grad_norm": 0.16662061214447021, + "learning_rate": 0.0008779518062865418, + "loss": 2.7375, + "step": 7924 + }, + { + "epoch": 0.2350028170684696, + "grad_norm": 0.17632007598876953, + "learning_rate": 0.0008779210011526591, + "loss": 2.7586, + "step": 7925 + }, + { + "epoch": 0.23503247042078107, + "grad_norm": 0.19691763818264008, + "learning_rate": 0.0008778901926722212, + "loss": 2.7876, + "step": 7926 + }, + { + "epoch": 0.23506212377309255, + "grad_norm": 0.19041472673416138, + "learning_rate": 0.0008778593808455007, + "loss": 2.7501, + "step": 7927 + }, + { + "epoch": 0.23509177712540402, + "grad_norm": 0.150332972407341, + "learning_rate": 0.0008778285656727704, + "loss": 2.7642, + "step": 7928 + }, + { + "epoch": 0.2351214304777155, + "grad_norm": 0.16972123086452484, + "learning_rate": 0.0008777977471543035, + "loss": 2.7574, + "step": 7929 + }, + { + "epoch": 0.23515108383002697, + "grad_norm": 0.18751122057437897, + "learning_rate": 0.0008777669252903726, + "loss": 2.7794, + "step": 7930 + }, + { + "epoch": 0.23518073718233845, + "grad_norm": 0.1672779619693756, + "learning_rate": 0.0008777361000812507, + "loss": 2.7832, + "step": 7931 + }, + { + "epoch": 0.23521039053464995, + "grad_norm": 0.15705513954162598, + "learning_rate": 0.0008777052715272109, + "loss": 2.7271, + "step": 7932 + }, + { + "epoch": 0.23524004388696143, + "grad_norm": 0.16612419486045837, + "learning_rate": 0.0008776744396285261, + "loss": 2.7486, + "step": 7933 + }, + { + "epoch": 0.2352696972392729, + "grad_norm": 0.16472949087619781, + "learning_rate": 0.0008776436043854692, + "loss": 2.7878, + "step": 7934 + }, + { + "epoch": 0.23529935059158438, + "grad_norm": 0.13766279816627502, + "learning_rate": 0.0008776127657983135, + "loss": 2.7476, + "step": 7935 + }, + { + "epoch": 0.23532900394389586, + "grad_norm": 0.15047739446163177, + "learning_rate": 0.0008775819238673317, + "loss": 2.7728, + "step": 7936 + }, + { + "epoch": 0.23535865729620734, + "grad_norm": 0.13393503427505493, + "learning_rate": 0.0008775510785927974, + "loss": 2.776, + "step": 7937 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 0.14933013916015625, + "learning_rate": 0.0008775202299749834, + "loss": 2.7645, + "step": 7938 + }, + { + "epoch": 0.2354179640008303, + "grad_norm": 0.14264856278896332, + "learning_rate": 0.0008774893780141629, + "loss": 2.7519, + "step": 7939 + }, + { + "epoch": 0.23544761735314176, + "grad_norm": 0.15242785215377808, + "learning_rate": 0.0008774585227106093, + "loss": 2.7519, + "step": 7940 + }, + { + "epoch": 0.23547727070545324, + "grad_norm": 0.17771169543266296, + "learning_rate": 0.0008774276640645955, + "loss": 2.7998, + "step": 7941 + }, + { + "epoch": 0.23550692405776474, + "grad_norm": 0.16814950108528137, + "learning_rate": 0.0008773968020763951, + "loss": 2.7895, + "step": 7942 + }, + { + "epoch": 0.23553657741007622, + "grad_norm": 0.15446199476718903, + "learning_rate": 0.0008773659367462813, + "loss": 2.7625, + "step": 7943 + }, + { + "epoch": 0.2355662307623877, + "grad_norm": 0.14872941374778748, + "learning_rate": 0.0008773350680745273, + "loss": 2.7629, + "step": 7944 + }, + { + "epoch": 0.23559588411469917, + "grad_norm": 0.14183299243450165, + "learning_rate": 0.0008773041960614063, + "loss": 2.7757, + "step": 7945 + }, + { + "epoch": 0.23562553746701065, + "grad_norm": 0.1487589031457901, + "learning_rate": 0.0008772733207071922, + "loss": 2.7377, + "step": 7946 + }, + { + "epoch": 0.23565519081932212, + "grad_norm": 0.16332322359085083, + "learning_rate": 0.0008772424420121579, + "loss": 2.7363, + "step": 7947 + }, + { + "epoch": 0.2356848441716336, + "grad_norm": 0.15093262493610382, + "learning_rate": 0.0008772115599765771, + "loss": 2.791, + "step": 7948 + }, + { + "epoch": 0.23571449752394508, + "grad_norm": 0.14363670349121094, + "learning_rate": 0.0008771806746007231, + "loss": 2.7185, + "step": 7949 + }, + { + "epoch": 0.23574415087625655, + "grad_norm": 0.13052089512348175, + "learning_rate": 0.0008771497858848695, + "loss": 2.781, + "step": 7950 + }, + { + "epoch": 0.23577380422856803, + "grad_norm": 0.11471826583147049, + "learning_rate": 0.0008771188938292897, + "loss": 2.7873, + "step": 7951 + }, + { + "epoch": 0.2358034575808795, + "grad_norm": 0.14322784543037415, + "learning_rate": 0.0008770879984342577, + "loss": 2.7629, + "step": 7952 + }, + { + "epoch": 0.235833110933191, + "grad_norm": 0.1525477170944214, + "learning_rate": 0.0008770570997000464, + "loss": 2.7838, + "step": 7953 + }, + { + "epoch": 0.23586276428550249, + "grad_norm": 0.14683280885219574, + "learning_rate": 0.0008770261976269301, + "loss": 2.7668, + "step": 7954 + }, + { + "epoch": 0.23589241763781396, + "grad_norm": 0.1353396326303482, + "learning_rate": 0.0008769952922151818, + "loss": 2.7666, + "step": 7955 + }, + { + "epoch": 0.23592207099012544, + "grad_norm": 0.15426284074783325, + "learning_rate": 0.0008769643834650756, + "loss": 2.7593, + "step": 7956 + }, + { + "epoch": 0.2359517243424369, + "grad_norm": 0.1587265282869339, + "learning_rate": 0.0008769334713768851, + "loss": 2.7553, + "step": 7957 + }, + { + "epoch": 0.2359813776947484, + "grad_norm": 0.18026132881641388, + "learning_rate": 0.000876902555950884, + "loss": 2.7505, + "step": 7958 + }, + { + "epoch": 0.23601103104705987, + "grad_norm": 0.1708964705467224, + "learning_rate": 0.000876871637187346, + "loss": 2.7419, + "step": 7959 + }, + { + "epoch": 0.23604068439937134, + "grad_norm": 0.16443714499473572, + "learning_rate": 0.0008768407150865449, + "loss": 2.801, + "step": 7960 + }, + { + "epoch": 0.23607033775168282, + "grad_norm": 0.15858028829097748, + "learning_rate": 0.0008768097896487548, + "loss": 2.7731, + "step": 7961 + }, + { + "epoch": 0.2360999911039943, + "grad_norm": 0.14514917135238647, + "learning_rate": 0.0008767788608742493, + "loss": 2.7589, + "step": 7962 + }, + { + "epoch": 0.2361296444563058, + "grad_norm": 0.14539071917533875, + "learning_rate": 0.0008767479287633023, + "loss": 2.76, + "step": 7963 + }, + { + "epoch": 0.23615929780861727, + "grad_norm": 0.1418025642633438, + "learning_rate": 0.0008767169933161876, + "loss": 2.7621, + "step": 7964 + }, + { + "epoch": 0.23618895116092875, + "grad_norm": 0.15752717852592468, + "learning_rate": 0.0008766860545331794, + "loss": 2.7556, + "step": 7965 + }, + { + "epoch": 0.23621860451324023, + "grad_norm": 0.142378568649292, + "learning_rate": 0.0008766551124145515, + "loss": 2.7508, + "step": 7966 + }, + { + "epoch": 0.2362482578655517, + "grad_norm": 0.12171753495931625, + "learning_rate": 0.0008766241669605777, + "loss": 2.8026, + "step": 7967 + }, + { + "epoch": 0.23627791121786318, + "grad_norm": 0.13214559853076935, + "learning_rate": 0.0008765932181715325, + "loss": 2.7846, + "step": 7968 + }, + { + "epoch": 0.23630756457017466, + "grad_norm": 0.14740365743637085, + "learning_rate": 0.0008765622660476897, + "loss": 2.7462, + "step": 7969 + }, + { + "epoch": 0.23633721792248613, + "grad_norm": 0.13455404341220856, + "learning_rate": 0.0008765313105893233, + "loss": 2.7791, + "step": 7970 + }, + { + "epoch": 0.2363668712747976, + "grad_norm": 0.1397978514432907, + "learning_rate": 0.0008765003517967077, + "loss": 2.7393, + "step": 7971 + }, + { + "epoch": 0.23639652462710908, + "grad_norm": 0.15574227273464203, + "learning_rate": 0.0008764693896701165, + "loss": 2.7704, + "step": 7972 + }, + { + "epoch": 0.23642617797942056, + "grad_norm": 0.1570558100938797, + "learning_rate": 0.0008764384242098247, + "loss": 2.7671, + "step": 7973 + }, + { + "epoch": 0.23645583133173206, + "grad_norm": 0.16732481122016907, + "learning_rate": 0.0008764074554161057, + "loss": 2.7861, + "step": 7974 + }, + { + "epoch": 0.23648548468404354, + "grad_norm": 0.15903660655021667, + "learning_rate": 0.0008763764832892343, + "loss": 2.7874, + "step": 7975 + }, + { + "epoch": 0.23651513803635502, + "grad_norm": 0.17161153256893158, + "learning_rate": 0.0008763455078294842, + "loss": 2.7842, + "step": 7976 + }, + { + "epoch": 0.2365447913886665, + "grad_norm": 0.18254992365837097, + "learning_rate": 0.0008763145290371304, + "loss": 2.7742, + "step": 7977 + }, + { + "epoch": 0.23657444474097797, + "grad_norm": 0.17260538041591644, + "learning_rate": 0.0008762835469124466, + "loss": 2.7651, + "step": 7978 + }, + { + "epoch": 0.23660409809328944, + "grad_norm": 0.14440105855464935, + "learning_rate": 0.0008762525614557076, + "loss": 2.771, + "step": 7979 + }, + { + "epoch": 0.23663375144560092, + "grad_norm": 0.14607056975364685, + "learning_rate": 0.0008762215726671874, + "loss": 2.7655, + "step": 7980 + }, + { + "epoch": 0.2366634047979124, + "grad_norm": 0.15441353619098663, + "learning_rate": 0.0008761905805471607, + "loss": 2.7538, + "step": 7981 + }, + { + "epoch": 0.23669305815022387, + "grad_norm": 0.13873107731342316, + "learning_rate": 0.0008761595850959019, + "loss": 2.752, + "step": 7982 + }, + { + "epoch": 0.23672271150253535, + "grad_norm": 0.14131225645542145, + "learning_rate": 0.0008761285863136852, + "loss": 2.7366, + "step": 7983 + }, + { + "epoch": 0.23675236485484685, + "grad_norm": 0.15296687185764313, + "learning_rate": 0.0008760975842007855, + "loss": 2.7777, + "step": 7984 + }, + { + "epoch": 0.23678201820715833, + "grad_norm": 0.14141982793807983, + "learning_rate": 0.000876066578757477, + "loss": 2.7867, + "step": 7985 + }, + { + "epoch": 0.2368116715594698, + "grad_norm": 0.15476453304290771, + "learning_rate": 0.0008760355699840345, + "loss": 2.7539, + "step": 7986 + }, + { + "epoch": 0.23684132491178128, + "grad_norm": 0.14540110528469086, + "learning_rate": 0.0008760045578807324, + "loss": 2.7597, + "step": 7987 + }, + { + "epoch": 0.23687097826409276, + "grad_norm": 0.13703738152980804, + "learning_rate": 0.0008759735424478455, + "loss": 2.7608, + "step": 7988 + }, + { + "epoch": 0.23690063161640423, + "grad_norm": 0.13986578583717346, + "learning_rate": 0.0008759425236856482, + "loss": 2.8, + "step": 7989 + }, + { + "epoch": 0.2369302849687157, + "grad_norm": 0.14079929888248444, + "learning_rate": 0.0008759115015944155, + "loss": 2.8049, + "step": 7990 + }, + { + "epoch": 0.23695993832102719, + "grad_norm": 0.14255882799625397, + "learning_rate": 0.0008758804761744218, + "loss": 2.7843, + "step": 7991 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 0.12327823042869568, + "learning_rate": 0.0008758494474259419, + "loss": 2.7895, + "step": 7992 + }, + { + "epoch": 0.23701924502565014, + "grad_norm": 0.12207376956939697, + "learning_rate": 0.0008758184153492508, + "loss": 2.7449, + "step": 7993 + }, + { + "epoch": 0.23704889837796164, + "grad_norm": 0.12363265454769135, + "learning_rate": 0.0008757873799446231, + "loss": 2.7249, + "step": 7994 + }, + { + "epoch": 0.23707855173027312, + "grad_norm": 0.13445782661437988, + "learning_rate": 0.0008757563412123336, + "loss": 2.7598, + "step": 7995 + }, + { + "epoch": 0.2371082050825846, + "grad_norm": 0.1400202363729477, + "learning_rate": 0.0008757252991526572, + "loss": 2.7643, + "step": 7996 + }, + { + "epoch": 0.23713785843489607, + "grad_norm": 0.16513684391975403, + "learning_rate": 0.0008756942537658688, + "loss": 2.7517, + "step": 7997 + }, + { + "epoch": 0.23716751178720755, + "grad_norm": 0.20943032205104828, + "learning_rate": 0.0008756632050522432, + "loss": 2.7795, + "step": 7998 + }, + { + "epoch": 0.23719716513951902, + "grad_norm": 0.23302896320819855, + "learning_rate": 0.0008756321530120556, + "loss": 2.7605, + "step": 7999 + }, + { + "epoch": 0.2372268184918305, + "grad_norm": 0.21398892998695374, + "learning_rate": 0.0008756010976455807, + "loss": 2.7711, + "step": 8000 + }, + { + "epoch": 0.23725647184414198, + "grad_norm": 0.1673252135515213, + "learning_rate": 0.0008755700389530936, + "loss": 2.7374, + "step": 8001 + }, + { + "epoch": 0.23728612519645345, + "grad_norm": 0.208206444978714, + "learning_rate": 0.0008755389769348694, + "loss": 2.7708, + "step": 8002 + }, + { + "epoch": 0.23731577854876493, + "grad_norm": 0.19446630775928497, + "learning_rate": 0.000875507911591183, + "loss": 2.7377, + "step": 8003 + }, + { + "epoch": 0.2373454319010764, + "grad_norm": 0.15488363802433014, + "learning_rate": 0.0008754768429223098, + "loss": 2.7695, + "step": 8004 + }, + { + "epoch": 0.2373750852533879, + "grad_norm": 0.18656277656555176, + "learning_rate": 0.0008754457709285247, + "loss": 2.7527, + "step": 8005 + }, + { + "epoch": 0.23740473860569938, + "grad_norm": 0.18781566619873047, + "learning_rate": 0.0008754146956101025, + "loss": 2.7274, + "step": 8006 + }, + { + "epoch": 0.23743439195801086, + "grad_norm": 0.16828271746635437, + "learning_rate": 0.000875383616967319, + "loss": 2.7819, + "step": 8007 + }, + { + "epoch": 0.23746404531032234, + "grad_norm": 0.17127974331378937, + "learning_rate": 0.0008753525350004492, + "loss": 2.7436, + "step": 8008 + }, + { + "epoch": 0.2374936986626338, + "grad_norm": 0.12926945090293884, + "learning_rate": 0.0008753214497097681, + "loss": 2.7642, + "step": 8009 + }, + { + "epoch": 0.2375233520149453, + "grad_norm": 0.1414984166622162, + "learning_rate": 0.0008752903610955512, + "loss": 2.7727, + "step": 8010 + }, + { + "epoch": 0.23755300536725676, + "grad_norm": 0.1327488124370575, + "learning_rate": 0.0008752592691580738, + "loss": 2.8027, + "step": 8011 + }, + { + "epoch": 0.23758265871956824, + "grad_norm": 0.12404493987560272, + "learning_rate": 0.0008752281738976111, + "loss": 2.7361, + "step": 8012 + }, + { + "epoch": 0.23761231207187972, + "grad_norm": 0.11649817228317261, + "learning_rate": 0.0008751970753144385, + "loss": 2.7961, + "step": 8013 + }, + { + "epoch": 0.2376419654241912, + "grad_norm": 0.11539949476718903, + "learning_rate": 0.0008751659734088314, + "loss": 2.7809, + "step": 8014 + }, + { + "epoch": 0.2376716187765027, + "grad_norm": 0.12863342463970184, + "learning_rate": 0.0008751348681810651, + "loss": 2.7676, + "step": 8015 + }, + { + "epoch": 0.23770127212881417, + "grad_norm": 0.12373930960893631, + "learning_rate": 0.0008751037596314153, + "loss": 2.7648, + "step": 8016 + }, + { + "epoch": 0.23773092548112565, + "grad_norm": 0.12984733283519745, + "learning_rate": 0.0008750726477601574, + "loss": 2.7532, + "step": 8017 + }, + { + "epoch": 0.23776057883343713, + "grad_norm": 0.12501326203346252, + "learning_rate": 0.0008750415325675667, + "loss": 2.7696, + "step": 8018 + }, + { + "epoch": 0.2377902321857486, + "grad_norm": 0.1324038952589035, + "learning_rate": 0.0008750104140539189, + "loss": 2.7983, + "step": 8019 + }, + { + "epoch": 0.23781988553806008, + "grad_norm": 0.12254035472869873, + "learning_rate": 0.0008749792922194895, + "loss": 2.818, + "step": 8020 + }, + { + "epoch": 0.23784953889037155, + "grad_norm": 0.14361795783042908, + "learning_rate": 0.0008749481670645541, + "loss": 2.7681, + "step": 8021 + }, + { + "epoch": 0.23787919224268303, + "grad_norm": 0.1704871654510498, + "learning_rate": 0.0008749170385893883, + "loss": 2.7608, + "step": 8022 + }, + { + "epoch": 0.2379088455949945, + "grad_norm": 0.18168659508228302, + "learning_rate": 0.0008748859067942678, + "loss": 2.7682, + "step": 8023 + }, + { + "epoch": 0.23793849894730598, + "grad_norm": 0.17972728610038757, + "learning_rate": 0.0008748547716794682, + "loss": 2.8093, + "step": 8024 + }, + { + "epoch": 0.23796815229961746, + "grad_norm": 0.17943266034126282, + "learning_rate": 0.0008748236332452653, + "loss": 2.758, + "step": 8025 + }, + { + "epoch": 0.23799780565192896, + "grad_norm": 0.16635455191135406, + "learning_rate": 0.0008747924914919347, + "loss": 2.7404, + "step": 8026 + }, + { + "epoch": 0.23802745900424044, + "grad_norm": 0.15609128773212433, + "learning_rate": 0.0008747613464197523, + "loss": 2.7712, + "step": 8027 + }, + { + "epoch": 0.23805711235655191, + "grad_norm": 0.14459572732448578, + "learning_rate": 0.0008747301980289939, + "loss": 2.7715, + "step": 8028 + }, + { + "epoch": 0.2380867657088634, + "grad_norm": 0.13174298405647278, + "learning_rate": 0.0008746990463199352, + "loss": 2.7718, + "step": 8029 + }, + { + "epoch": 0.23811641906117487, + "grad_norm": 0.16035747528076172, + "learning_rate": 0.0008746678912928523, + "loss": 2.7875, + "step": 8030 + }, + { + "epoch": 0.23814607241348634, + "grad_norm": 0.13592207431793213, + "learning_rate": 0.0008746367329480207, + "loss": 2.7482, + "step": 8031 + }, + { + "epoch": 0.23817572576579782, + "grad_norm": 0.12968704104423523, + "learning_rate": 0.0008746055712857166, + "loss": 2.7475, + "step": 8032 + }, + { + "epoch": 0.2382053791181093, + "grad_norm": 0.15360647439956665, + "learning_rate": 0.000874574406306216, + "loss": 2.7901, + "step": 8033 + }, + { + "epoch": 0.23823503247042077, + "grad_norm": 0.15157976746559143, + "learning_rate": 0.0008745432380097946, + "loss": 2.7499, + "step": 8034 + }, + { + "epoch": 0.23826468582273225, + "grad_norm": 0.12417697161436081, + "learning_rate": 0.0008745120663967285, + "loss": 2.7786, + "step": 8035 + }, + { + "epoch": 0.23829433917504375, + "grad_norm": 0.1316055804491043, + "learning_rate": 0.0008744808914672939, + "loss": 2.719, + "step": 8036 + }, + { + "epoch": 0.23832399252735523, + "grad_norm": 0.14515526592731476, + "learning_rate": 0.0008744497132217666, + "loss": 2.7476, + "step": 8037 + }, + { + "epoch": 0.2383536458796667, + "grad_norm": 0.13620100915431976, + "learning_rate": 0.000874418531660423, + "loss": 2.7394, + "step": 8038 + }, + { + "epoch": 0.23838329923197818, + "grad_norm": 0.1314086765050888, + "learning_rate": 0.0008743873467835389, + "loss": 2.745, + "step": 8039 + }, + { + "epoch": 0.23841295258428966, + "grad_norm": 0.14225652813911438, + "learning_rate": 0.0008743561585913904, + "loss": 2.7414, + "step": 8040 + }, + { + "epoch": 0.23844260593660113, + "grad_norm": 0.1607133448123932, + "learning_rate": 0.0008743249670842541, + "loss": 2.7978, + "step": 8041 + }, + { + "epoch": 0.2384722592889126, + "grad_norm": 0.18328291177749634, + "learning_rate": 0.0008742937722624059, + "loss": 2.7689, + "step": 8042 + }, + { + "epoch": 0.23850191264122408, + "grad_norm": 0.2302754670381546, + "learning_rate": 0.0008742625741261221, + "loss": 2.7668, + "step": 8043 + }, + { + "epoch": 0.23853156599353556, + "grad_norm": 0.20513306558132172, + "learning_rate": 0.000874231372675679, + "loss": 2.7227, + "step": 8044 + }, + { + "epoch": 0.23856121934584704, + "grad_norm": 0.1672314703464508, + "learning_rate": 0.0008742001679113528, + "loss": 2.7658, + "step": 8045 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 0.1803247481584549, + "learning_rate": 0.0008741689598334199, + "loss": 2.7418, + "step": 8046 + }, + { + "epoch": 0.23862052605047002, + "grad_norm": 0.16404275596141815, + "learning_rate": 0.0008741377484421566, + "loss": 2.7809, + "step": 8047 + }, + { + "epoch": 0.2386501794027815, + "grad_norm": 0.14415545761585236, + "learning_rate": 0.0008741065337378394, + "loss": 2.777, + "step": 8048 + }, + { + "epoch": 0.23867983275509297, + "grad_norm": 0.15745997428894043, + "learning_rate": 0.0008740753157207446, + "loss": 2.7652, + "step": 8049 + }, + { + "epoch": 0.23870948610740444, + "grad_norm": 0.1642315685749054, + "learning_rate": 0.0008740440943911487, + "loss": 2.7583, + "step": 8050 + }, + { + "epoch": 0.23873913945971592, + "grad_norm": 0.20321603119373322, + "learning_rate": 0.0008740128697493282, + "loss": 2.7648, + "step": 8051 + }, + { + "epoch": 0.2387687928120274, + "grad_norm": 0.16245779395103455, + "learning_rate": 0.0008739816417955594, + "loss": 2.7317, + "step": 8052 + }, + { + "epoch": 0.23879844616433887, + "grad_norm": 0.15986144542694092, + "learning_rate": 0.000873950410530119, + "loss": 2.7335, + "step": 8053 + }, + { + "epoch": 0.23882809951665035, + "grad_norm": 0.13480734825134277, + "learning_rate": 0.0008739191759532835, + "loss": 2.7679, + "step": 8054 + }, + { + "epoch": 0.23885775286896183, + "grad_norm": 0.14455997943878174, + "learning_rate": 0.0008738879380653296, + "loss": 2.7742, + "step": 8055 + }, + { + "epoch": 0.2388874062212733, + "grad_norm": 0.14534436166286469, + "learning_rate": 0.0008738566968665338, + "loss": 2.7803, + "step": 8056 + }, + { + "epoch": 0.2389170595735848, + "grad_norm": 0.13404580950737, + "learning_rate": 0.0008738254523571727, + "loss": 2.7717, + "step": 8057 + }, + { + "epoch": 0.23894671292589628, + "grad_norm": 0.12712794542312622, + "learning_rate": 0.0008737942045375231, + "loss": 2.7221, + "step": 8058 + }, + { + "epoch": 0.23897636627820776, + "grad_norm": 0.11566556990146637, + "learning_rate": 0.0008737629534078617, + "loss": 2.7846, + "step": 8059 + }, + { + "epoch": 0.23900601963051923, + "grad_norm": 0.12073981761932373, + "learning_rate": 0.0008737316989684651, + "loss": 2.762, + "step": 8060 + }, + { + "epoch": 0.2390356729828307, + "grad_norm": 0.11612123250961304, + "learning_rate": 0.0008737004412196104, + "loss": 2.7201, + "step": 8061 + }, + { + "epoch": 0.2390653263351422, + "grad_norm": 0.12348908931016922, + "learning_rate": 0.0008736691801615739, + "loss": 2.7513, + "step": 8062 + }, + { + "epoch": 0.23909497968745366, + "grad_norm": 0.14587847888469696, + "learning_rate": 0.0008736379157946329, + "loss": 2.7744, + "step": 8063 + }, + { + "epoch": 0.23912463303976514, + "grad_norm": 0.1440543383359909, + "learning_rate": 0.0008736066481190637, + "loss": 2.7705, + "step": 8064 + }, + { + "epoch": 0.23915428639207661, + "grad_norm": 0.139703169465065, + "learning_rate": 0.0008735753771351437, + "loss": 2.7468, + "step": 8065 + }, + { + "epoch": 0.2391839397443881, + "grad_norm": 0.14787955582141876, + "learning_rate": 0.0008735441028431497, + "loss": 2.7572, + "step": 8066 + }, + { + "epoch": 0.2392135930966996, + "grad_norm": 0.1197848990559578, + "learning_rate": 0.0008735128252433582, + "loss": 2.7494, + "step": 8067 + }, + { + "epoch": 0.23924324644901107, + "grad_norm": 0.13408976793289185, + "learning_rate": 0.0008734815443360469, + "loss": 2.79, + "step": 8068 + }, + { + "epoch": 0.23927289980132255, + "grad_norm": 0.14308930933475494, + "learning_rate": 0.0008734502601214922, + "loss": 2.7662, + "step": 8069 + }, + { + "epoch": 0.23930255315363402, + "grad_norm": 0.14934669435024261, + "learning_rate": 0.0008734189725999714, + "loss": 2.7232, + "step": 8070 + }, + { + "epoch": 0.2393322065059455, + "grad_norm": 0.17002183198928833, + "learning_rate": 0.0008733876817717615, + "loss": 2.7848, + "step": 8071 + }, + { + "epoch": 0.23936185985825698, + "grad_norm": 0.15630388259887695, + "learning_rate": 0.0008733563876371397, + "loss": 2.7768, + "step": 8072 + }, + { + "epoch": 0.23939151321056845, + "grad_norm": 0.13836552202701569, + "learning_rate": 0.0008733250901963827, + "loss": 2.7921, + "step": 8073 + }, + { + "epoch": 0.23942116656287993, + "grad_norm": 0.1437065452337265, + "learning_rate": 0.0008732937894497684, + "loss": 2.7607, + "step": 8074 + }, + { + "epoch": 0.2394508199151914, + "grad_norm": 0.15561147034168243, + "learning_rate": 0.0008732624853975731, + "loss": 2.7964, + "step": 8075 + }, + { + "epoch": 0.23948047326750288, + "grad_norm": 0.15283897519111633, + "learning_rate": 0.0008732311780400746, + "loss": 2.8087, + "step": 8076 + }, + { + "epoch": 0.23951012661981436, + "grad_norm": 0.163782998919487, + "learning_rate": 0.00087319986737755, + "loss": 2.7567, + "step": 8077 + }, + { + "epoch": 0.23953977997212586, + "grad_norm": 0.16077063977718353, + "learning_rate": 0.0008731685534102765, + "loss": 2.7732, + "step": 8078 + }, + { + "epoch": 0.23956943332443734, + "grad_norm": 0.15797209739685059, + "learning_rate": 0.0008731372361385312, + "loss": 2.7831, + "step": 8079 + }, + { + "epoch": 0.2395990866767488, + "grad_norm": 0.14915023744106293, + "learning_rate": 0.0008731059155625919, + "loss": 2.7888, + "step": 8080 + }, + { + "epoch": 0.2396287400290603, + "grad_norm": 0.13703230023384094, + "learning_rate": 0.0008730745916827356, + "loss": 2.7352, + "step": 8081 + }, + { + "epoch": 0.23965839338137176, + "grad_norm": 0.13571543991565704, + "learning_rate": 0.0008730432644992397, + "loss": 2.7597, + "step": 8082 + }, + { + "epoch": 0.23968804673368324, + "grad_norm": 0.14553414285182953, + "learning_rate": 0.0008730119340123817, + "loss": 2.7675, + "step": 8083 + }, + { + "epoch": 0.23971770008599472, + "grad_norm": 0.16225966811180115, + "learning_rate": 0.000872980600222439, + "loss": 2.7256, + "step": 8084 + }, + { + "epoch": 0.2397473534383062, + "grad_norm": 0.1469906121492386, + "learning_rate": 0.000872949263129689, + "loss": 2.7635, + "step": 8085 + }, + { + "epoch": 0.23977700679061767, + "grad_norm": 0.1365465372800827, + "learning_rate": 0.0008729179227344092, + "loss": 2.748, + "step": 8086 + }, + { + "epoch": 0.23980666014292915, + "grad_norm": 0.13213098049163818, + "learning_rate": 0.0008728865790368774, + "loss": 2.756, + "step": 8087 + }, + { + "epoch": 0.23983631349524065, + "grad_norm": 0.13895471394062042, + "learning_rate": 0.0008728552320373708, + "loss": 2.7693, + "step": 8088 + }, + { + "epoch": 0.23986596684755213, + "grad_norm": 0.12194559723138809, + "learning_rate": 0.0008728238817361672, + "loss": 2.7513, + "step": 8089 + }, + { + "epoch": 0.2398956201998636, + "grad_norm": 0.1234259381890297, + "learning_rate": 0.000872792528133544, + "loss": 2.7856, + "step": 8090 + }, + { + "epoch": 0.23992527355217508, + "grad_norm": 0.1368485540151596, + "learning_rate": 0.0008727611712297791, + "loss": 2.7876, + "step": 8091 + }, + { + "epoch": 0.23995492690448655, + "grad_norm": 0.14633867144584656, + "learning_rate": 0.00087272981102515, + "loss": 2.7483, + "step": 8092 + }, + { + "epoch": 0.23998458025679803, + "grad_norm": 0.18470431864261627, + "learning_rate": 0.0008726984475199344, + "loss": 2.7599, + "step": 8093 + }, + { + "epoch": 0.2400142336091095, + "grad_norm": 0.1754523515701294, + "learning_rate": 0.0008726670807144101, + "loss": 2.7752, + "step": 8094 + }, + { + "epoch": 0.24004388696142098, + "grad_norm": 0.13736169040203094, + "learning_rate": 0.0008726357106088548, + "loss": 2.741, + "step": 8095 + }, + { + "epoch": 0.24007354031373246, + "grad_norm": 0.14291906356811523, + "learning_rate": 0.0008726043372035464, + "loss": 2.7728, + "step": 8096 + }, + { + "epoch": 0.24010319366604393, + "grad_norm": 0.16009017825126648, + "learning_rate": 0.0008725729604987626, + "loss": 2.7655, + "step": 8097 + }, + { + "epoch": 0.24013284701835544, + "grad_norm": 0.13982214033603668, + "learning_rate": 0.0008725415804947813, + "loss": 2.762, + "step": 8098 + }, + { + "epoch": 0.24016250037066691, + "grad_norm": 0.1552249789237976, + "learning_rate": 0.0008725101971918803, + "loss": 2.734, + "step": 8099 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 0.1533990502357483, + "learning_rate": 0.0008724788105903376, + "loss": 2.7263, + "step": 8100 + }, + { + "epoch": 0.24022180707528987, + "grad_norm": 0.15844053030014038, + "learning_rate": 0.0008724474206904311, + "loss": 2.7646, + "step": 8101 + }, + { + "epoch": 0.24025146042760134, + "grad_norm": 0.17497733235359192, + "learning_rate": 0.0008724160274924389, + "loss": 2.7673, + "step": 8102 + }, + { + "epoch": 0.24028111377991282, + "grad_norm": 0.19436651468276978, + "learning_rate": 0.0008723846309966385, + "loss": 2.7462, + "step": 8103 + }, + { + "epoch": 0.2403107671322243, + "grad_norm": 0.16440436244010925, + "learning_rate": 0.0008723532312033086, + "loss": 2.7388, + "step": 8104 + }, + { + "epoch": 0.24034042048453577, + "grad_norm": 0.1433306485414505, + "learning_rate": 0.0008723218281127268, + "loss": 2.7187, + "step": 8105 + }, + { + "epoch": 0.24037007383684725, + "grad_norm": 0.15714626014232635, + "learning_rate": 0.0008722904217251713, + "loss": 2.7843, + "step": 8106 + }, + { + "epoch": 0.24039972718915872, + "grad_norm": 0.1558142900466919, + "learning_rate": 0.0008722590120409204, + "loss": 2.7966, + "step": 8107 + }, + { + "epoch": 0.2404293805414702, + "grad_norm": 0.15748219192028046, + "learning_rate": 0.0008722275990602518, + "loss": 2.7622, + "step": 8108 + }, + { + "epoch": 0.2404590338937817, + "grad_norm": 0.15876685082912445, + "learning_rate": 0.000872196182783444, + "loss": 2.7542, + "step": 8109 + }, + { + "epoch": 0.24048868724609318, + "grad_norm": 0.14630666375160217, + "learning_rate": 0.0008721647632107751, + "loss": 2.7344, + "step": 8110 + }, + { + "epoch": 0.24051834059840466, + "grad_norm": 0.13773563504219055, + "learning_rate": 0.0008721333403425233, + "loss": 2.7774, + "step": 8111 + }, + { + "epoch": 0.24054799395071613, + "grad_norm": 0.14607802033424377, + "learning_rate": 0.000872101914178967, + "loss": 2.7559, + "step": 8112 + }, + { + "epoch": 0.2405776473030276, + "grad_norm": 0.14823105931282043, + "learning_rate": 0.0008720704847203845, + "loss": 2.7428, + "step": 8113 + }, + { + "epoch": 0.24060730065533908, + "grad_norm": 0.1544792503118515, + "learning_rate": 0.0008720390519670537, + "loss": 2.7699, + "step": 8114 + }, + { + "epoch": 0.24063695400765056, + "grad_norm": 0.1583496332168579, + "learning_rate": 0.0008720076159192534, + "loss": 2.7277, + "step": 8115 + }, + { + "epoch": 0.24066660735996204, + "grad_norm": 0.18216106295585632, + "learning_rate": 0.0008719761765772617, + "loss": 2.7962, + "step": 8116 + }, + { + "epoch": 0.2406962607122735, + "grad_norm": 0.19273746013641357, + "learning_rate": 0.0008719447339413571, + "loss": 2.7773, + "step": 8117 + }, + { + "epoch": 0.240725914064585, + "grad_norm": 0.1950848549604416, + "learning_rate": 0.0008719132880118182, + "loss": 2.7826, + "step": 8118 + }, + { + "epoch": 0.2407555674168965, + "grad_norm": 0.1638997346162796, + "learning_rate": 0.0008718818387889231, + "loss": 2.7505, + "step": 8119 + }, + { + "epoch": 0.24078522076920797, + "grad_norm": 0.15827269852161407, + "learning_rate": 0.0008718503862729508, + "loss": 2.7511, + "step": 8120 + }, + { + "epoch": 0.24081487412151945, + "grad_norm": 0.16761411726474762, + "learning_rate": 0.0008718189304641792, + "loss": 2.7809, + "step": 8121 + }, + { + "epoch": 0.24084452747383092, + "grad_norm": 0.1383691430091858, + "learning_rate": 0.0008717874713628873, + "loss": 2.793, + "step": 8122 + }, + { + "epoch": 0.2408741808261424, + "grad_norm": 0.1203489750623703, + "learning_rate": 0.0008717560089693535, + "loss": 2.7823, + "step": 8123 + }, + { + "epoch": 0.24090383417845387, + "grad_norm": 0.13041840493679047, + "learning_rate": 0.0008717245432838563, + "loss": 2.7412, + "step": 8124 + }, + { + "epoch": 0.24093348753076535, + "grad_norm": 0.12047265470027924, + "learning_rate": 0.0008716930743066746, + "loss": 2.7659, + "step": 8125 + }, + { + "epoch": 0.24096314088307683, + "grad_norm": 0.13757893443107605, + "learning_rate": 0.0008716616020380868, + "loss": 2.725, + "step": 8126 + }, + { + "epoch": 0.2409927942353883, + "grad_norm": 0.1582046002149582, + "learning_rate": 0.0008716301264783719, + "loss": 2.7469, + "step": 8127 + }, + { + "epoch": 0.24102244758769978, + "grad_norm": 0.16818389296531677, + "learning_rate": 0.0008715986476278084, + "loss": 2.7597, + "step": 8128 + }, + { + "epoch": 0.24105210094001125, + "grad_norm": 0.15733487904071808, + "learning_rate": 0.000871567165486675, + "loss": 2.7684, + "step": 8129 + }, + { + "epoch": 0.24108175429232276, + "grad_norm": 0.15903247892856598, + "learning_rate": 0.0008715356800552505, + "loss": 2.7646, + "step": 8130 + }, + { + "epoch": 0.24111140764463423, + "grad_norm": 0.15706491470336914, + "learning_rate": 0.000871504191333814, + "loss": 2.7722, + "step": 8131 + }, + { + "epoch": 0.2411410609969457, + "grad_norm": 0.1340179592370987, + "learning_rate": 0.0008714726993226439, + "loss": 2.7617, + "step": 8132 + }, + { + "epoch": 0.2411707143492572, + "grad_norm": 0.14248257875442505, + "learning_rate": 0.0008714412040220195, + "loss": 2.7238, + "step": 8133 + }, + { + "epoch": 0.24120036770156866, + "grad_norm": 0.14708751440048218, + "learning_rate": 0.0008714097054322194, + "loss": 2.7394, + "step": 8134 + }, + { + "epoch": 0.24123002105388014, + "grad_norm": 0.1383112221956253, + "learning_rate": 0.0008713782035535225, + "loss": 2.7833, + "step": 8135 + }, + { + "epoch": 0.24125967440619162, + "grad_norm": 0.12727388739585876, + "learning_rate": 0.000871346698386208, + "loss": 2.7513, + "step": 8136 + }, + { + "epoch": 0.2412893277585031, + "grad_norm": 0.12742362916469574, + "learning_rate": 0.0008713151899305547, + "loss": 2.7761, + "step": 8137 + }, + { + "epoch": 0.24131898111081457, + "grad_norm": 0.15152953565120697, + "learning_rate": 0.0008712836781868416, + "loss": 2.7736, + "step": 8138 + }, + { + "epoch": 0.24134863446312604, + "grad_norm": 0.15972132980823517, + "learning_rate": 0.0008712521631553478, + "loss": 2.7962, + "step": 8139 + }, + { + "epoch": 0.24137828781543755, + "grad_norm": 0.1901942938566208, + "learning_rate": 0.0008712206448363524, + "loss": 2.7761, + "step": 8140 + }, + { + "epoch": 0.24140794116774902, + "grad_norm": 0.22143304347991943, + "learning_rate": 0.0008711891232301345, + "loss": 2.7783, + "step": 8141 + }, + { + "epoch": 0.2414375945200605, + "grad_norm": 0.19204813241958618, + "learning_rate": 0.0008711575983369733, + "loss": 2.7573, + "step": 8142 + }, + { + "epoch": 0.24146724787237198, + "grad_norm": 0.13307394087314606, + "learning_rate": 0.0008711260701571477, + "loss": 2.7618, + "step": 8143 + }, + { + "epoch": 0.24149690122468345, + "grad_norm": 0.139821857213974, + "learning_rate": 0.000871094538690937, + "loss": 2.7451, + "step": 8144 + }, + { + "epoch": 0.24152655457699493, + "grad_norm": 0.1262185424566269, + "learning_rate": 0.0008710630039386207, + "loss": 2.782, + "step": 8145 + }, + { + "epoch": 0.2415562079293064, + "grad_norm": 0.12191040813922882, + "learning_rate": 0.0008710314659004777, + "loss": 2.7701, + "step": 8146 + }, + { + "epoch": 0.24158586128161788, + "grad_norm": 0.13524805009365082, + "learning_rate": 0.0008709999245767872, + "loss": 2.7398, + "step": 8147 + }, + { + "epoch": 0.24161551463392936, + "grad_norm": 0.1494104117155075, + "learning_rate": 0.0008709683799678289, + "loss": 2.7708, + "step": 8148 + }, + { + "epoch": 0.24164516798624083, + "grad_norm": 0.1388787180185318, + "learning_rate": 0.0008709368320738818, + "loss": 2.7902, + "step": 8149 + }, + { + "epoch": 0.24167482133855234, + "grad_norm": 0.14321576058864594, + "learning_rate": 0.0008709052808952254, + "loss": 2.7593, + "step": 8150 + }, + { + "epoch": 0.2417044746908638, + "grad_norm": 0.13616077601909637, + "learning_rate": 0.0008708737264321391, + "loss": 2.7477, + "step": 8151 + }, + { + "epoch": 0.2417341280431753, + "grad_norm": 0.13462768495082855, + "learning_rate": 0.0008708421686849025, + "loss": 2.7289, + "step": 8152 + }, + { + "epoch": 0.24176378139548677, + "grad_norm": 0.13905157148838043, + "learning_rate": 0.0008708106076537945, + "loss": 2.7578, + "step": 8153 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 0.1642356663942337, + "learning_rate": 0.0008707790433390949, + "loss": 2.7654, + "step": 8154 + }, + { + "epoch": 0.24182308810010972, + "grad_norm": 0.17760752141475677, + "learning_rate": 0.0008707474757410835, + "loss": 2.7705, + "step": 8155 + }, + { + "epoch": 0.2418527414524212, + "grad_norm": 0.16279608011245728, + "learning_rate": 0.0008707159048600395, + "loss": 2.786, + "step": 8156 + }, + { + "epoch": 0.24188239480473267, + "grad_norm": 0.1916700154542923, + "learning_rate": 0.0008706843306962425, + "loss": 2.7742, + "step": 8157 + }, + { + "epoch": 0.24191204815704415, + "grad_norm": 0.17921414971351624, + "learning_rate": 0.000870652753249972, + "loss": 2.7294, + "step": 8158 + }, + { + "epoch": 0.24194170150935562, + "grad_norm": 0.18507389724254608, + "learning_rate": 0.0008706211725215078, + "loss": 2.7778, + "step": 8159 + }, + { + "epoch": 0.2419713548616671, + "grad_norm": 0.18015733361244202, + "learning_rate": 0.0008705895885111296, + "loss": 2.7865, + "step": 8160 + }, + { + "epoch": 0.2420010082139786, + "grad_norm": 0.18326865136623383, + "learning_rate": 0.0008705580012191169, + "loss": 2.7481, + "step": 8161 + }, + { + "epoch": 0.24203066156629008, + "grad_norm": 0.16787131130695343, + "learning_rate": 0.0008705264106457497, + "loss": 2.7538, + "step": 8162 + }, + { + "epoch": 0.24206031491860155, + "grad_norm": 0.16142891347408295, + "learning_rate": 0.0008704948167913074, + "loss": 2.7413, + "step": 8163 + }, + { + "epoch": 0.24208996827091303, + "grad_norm": 0.14528898894786835, + "learning_rate": 0.0008704632196560697, + "loss": 2.7938, + "step": 8164 + }, + { + "epoch": 0.2421196216232245, + "grad_norm": 0.14659257233142853, + "learning_rate": 0.0008704316192403168, + "loss": 2.7234, + "step": 8165 + }, + { + "epoch": 0.24214927497553598, + "grad_norm": 0.16446766257286072, + "learning_rate": 0.0008704000155443283, + "loss": 2.7746, + "step": 8166 + }, + { + "epoch": 0.24217892832784746, + "grad_norm": 0.14560222625732422, + "learning_rate": 0.000870368408568384, + "loss": 2.742, + "step": 8167 + }, + { + "epoch": 0.24220858168015894, + "grad_norm": 0.12586301565170288, + "learning_rate": 0.0008703367983127642, + "loss": 2.7679, + "step": 8168 + }, + { + "epoch": 0.2422382350324704, + "grad_norm": 0.15001367032527924, + "learning_rate": 0.0008703051847777482, + "loss": 2.753, + "step": 8169 + }, + { + "epoch": 0.2422678883847819, + "grad_norm": 0.15341798961162567, + "learning_rate": 0.0008702735679636162, + "loss": 2.7527, + "step": 8170 + }, + { + "epoch": 0.2422975417370934, + "grad_norm": 0.1717713624238968, + "learning_rate": 0.0008702419478706483, + "loss": 2.8198, + "step": 8171 + }, + { + "epoch": 0.24232719508940487, + "grad_norm": 0.16097131371498108, + "learning_rate": 0.0008702103244991242, + "loss": 2.7973, + "step": 8172 + }, + { + "epoch": 0.24235684844171634, + "grad_norm": 0.16774219274520874, + "learning_rate": 0.0008701786978493243, + "loss": 2.7539, + "step": 8173 + }, + { + "epoch": 0.24238650179402782, + "grad_norm": 0.1660245954990387, + "learning_rate": 0.0008701470679215286, + "loss": 2.7331, + "step": 8174 + }, + { + "epoch": 0.2424161551463393, + "grad_norm": 0.1658252626657486, + "learning_rate": 0.000870115434716017, + "loss": 2.6996, + "step": 8175 + }, + { + "epoch": 0.24244580849865077, + "grad_norm": 0.15983673930168152, + "learning_rate": 0.0008700837982330696, + "loss": 2.7708, + "step": 8176 + }, + { + "epoch": 0.24247546185096225, + "grad_norm": 0.12172868102788925, + "learning_rate": 0.0008700521584729667, + "loss": 2.7828, + "step": 8177 + }, + { + "epoch": 0.24250511520327372, + "grad_norm": 0.11446640640497208, + "learning_rate": 0.0008700205154359884, + "loss": 2.7729, + "step": 8178 + }, + { + "epoch": 0.2425347685555852, + "grad_norm": 0.11812300980091095, + "learning_rate": 0.0008699888691224149, + "loss": 2.7345, + "step": 8179 + }, + { + "epoch": 0.24256442190789668, + "grad_norm": 0.10909309983253479, + "learning_rate": 0.0008699572195325265, + "loss": 2.7494, + "step": 8180 + }, + { + "epoch": 0.24259407526020815, + "grad_norm": 0.12092798203229904, + "learning_rate": 0.0008699255666666035, + "loss": 2.7807, + "step": 8181 + }, + { + "epoch": 0.24262372861251966, + "grad_norm": 0.11856841295957565, + "learning_rate": 0.0008698939105249259, + "loss": 2.7447, + "step": 8182 + }, + { + "epoch": 0.24265338196483113, + "grad_norm": 0.1193113625049591, + "learning_rate": 0.0008698622511077744, + "loss": 2.7537, + "step": 8183 + }, + { + "epoch": 0.2426830353171426, + "grad_norm": 0.12119896709918976, + "learning_rate": 0.0008698305884154292, + "loss": 2.7307, + "step": 8184 + }, + { + "epoch": 0.24271268866945409, + "grad_norm": 0.13105061650276184, + "learning_rate": 0.0008697989224481706, + "loss": 2.7618, + "step": 8185 + }, + { + "epoch": 0.24274234202176556, + "grad_norm": 0.13407063484191895, + "learning_rate": 0.000869767253206279, + "loss": 2.7632, + "step": 8186 + }, + { + "epoch": 0.24277199537407704, + "grad_norm": 0.12536883354187012, + "learning_rate": 0.0008697355806900349, + "loss": 2.7346, + "step": 8187 + }, + { + "epoch": 0.2428016487263885, + "grad_norm": 0.15525266528129578, + "learning_rate": 0.0008697039048997188, + "loss": 2.7865, + "step": 8188 + }, + { + "epoch": 0.2428313020787, + "grad_norm": 0.15847966074943542, + "learning_rate": 0.0008696722258356113, + "loss": 2.7754, + "step": 8189 + }, + { + "epoch": 0.24286095543101147, + "grad_norm": 0.14938893914222717, + "learning_rate": 0.0008696405434979926, + "loss": 2.7586, + "step": 8190 + }, + { + "epoch": 0.24289060878332294, + "grad_norm": 0.12892892956733704, + "learning_rate": 0.0008696088578871436, + "loss": 2.7561, + "step": 8191 + }, + { + "epoch": 0.24292026213563445, + "grad_norm": 0.12808360159397125, + "learning_rate": 0.0008695771690033447, + "loss": 2.7682, + "step": 8192 + }, + { + "epoch": 0.24294991548794592, + "grad_norm": 0.11291572451591492, + "learning_rate": 0.0008695454768468764, + "loss": 2.7448, + "step": 8193 + }, + { + "epoch": 0.2429795688402574, + "grad_norm": 0.12801216542720795, + "learning_rate": 0.0008695137814180196, + "loss": 2.7487, + "step": 8194 + }, + { + "epoch": 0.24300922219256887, + "grad_norm": 0.1321478933095932, + "learning_rate": 0.0008694820827170548, + "loss": 2.78, + "step": 8195 + }, + { + "epoch": 0.24303887554488035, + "grad_norm": 0.1396595686674118, + "learning_rate": 0.0008694503807442626, + "loss": 2.7289, + "step": 8196 + }, + { + "epoch": 0.24306852889719183, + "grad_norm": 0.13178198039531708, + "learning_rate": 0.0008694186754999241, + "loss": 2.7377, + "step": 8197 + }, + { + "epoch": 0.2430981822495033, + "grad_norm": 0.1368614137172699, + "learning_rate": 0.0008693869669843198, + "loss": 2.769, + "step": 8198 + }, + { + "epoch": 0.24312783560181478, + "grad_norm": 0.1452128291130066, + "learning_rate": 0.0008693552551977302, + "loss": 2.7261, + "step": 8199 + }, + { + "epoch": 0.24315748895412626, + "grad_norm": 0.16379952430725098, + "learning_rate": 0.0008693235401404367, + "loss": 2.7609, + "step": 8200 + }, + { + "epoch": 0.24318714230643773, + "grad_norm": 0.17901025712490082, + "learning_rate": 0.0008692918218127197, + "loss": 2.7628, + "step": 8201 + }, + { + "epoch": 0.24321679565874924, + "grad_norm": 0.19180113077163696, + "learning_rate": 0.0008692601002148603, + "loss": 2.742, + "step": 8202 + }, + { + "epoch": 0.2432464490110607, + "grad_norm": 0.213668555021286, + "learning_rate": 0.0008692283753471394, + "loss": 2.7631, + "step": 8203 + }, + { + "epoch": 0.2432761023633722, + "grad_norm": 0.2574099004268646, + "learning_rate": 0.0008691966472098378, + "loss": 2.7776, + "step": 8204 + }, + { + "epoch": 0.24330575571568366, + "grad_norm": 0.21441930532455444, + "learning_rate": 0.0008691649158032365, + "loss": 2.7648, + "step": 8205 + }, + { + "epoch": 0.24333540906799514, + "grad_norm": 0.16383785009384155, + "learning_rate": 0.0008691331811276165, + "loss": 2.7639, + "step": 8206 + }, + { + "epoch": 0.24336506242030662, + "grad_norm": 0.17094749212265015, + "learning_rate": 0.0008691014431832589, + "loss": 2.7297, + "step": 8207 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 0.16204726696014404, + "learning_rate": 0.0008690697019704445, + "loss": 2.7144, + "step": 8208 + }, + { + "epoch": 0.24342436912492957, + "grad_norm": 0.1675691306591034, + "learning_rate": 0.0008690379574894547, + "loss": 2.7726, + "step": 8209 + }, + { + "epoch": 0.24345402247724104, + "grad_norm": 0.15835514664649963, + "learning_rate": 0.0008690062097405705, + "loss": 2.7489, + "step": 8210 + }, + { + "epoch": 0.24348367582955252, + "grad_norm": 0.14828982949256897, + "learning_rate": 0.0008689744587240728, + "loss": 2.7652, + "step": 8211 + }, + { + "epoch": 0.243513329181864, + "grad_norm": 0.12525895237922668, + "learning_rate": 0.0008689427044402429, + "loss": 2.7016, + "step": 8212 + }, + { + "epoch": 0.2435429825341755, + "grad_norm": 0.13421393930912018, + "learning_rate": 0.0008689109468893622, + "loss": 2.7624, + "step": 8213 + }, + { + "epoch": 0.24357263588648698, + "grad_norm": 0.13604643940925598, + "learning_rate": 0.0008688791860717117, + "loss": 2.7649, + "step": 8214 + }, + { + "epoch": 0.24360228923879845, + "grad_norm": 0.12611332535743713, + "learning_rate": 0.0008688474219875726, + "loss": 2.7808, + "step": 8215 + }, + { + "epoch": 0.24363194259110993, + "grad_norm": 0.12318737059831619, + "learning_rate": 0.0008688156546372264, + "loss": 2.8102, + "step": 8216 + }, + { + "epoch": 0.2436615959434214, + "grad_norm": 0.13334468007087708, + "learning_rate": 0.0008687838840209541, + "loss": 2.7859, + "step": 8217 + }, + { + "epoch": 0.24369124929573288, + "grad_norm": 0.1279546469449997, + "learning_rate": 0.0008687521101390373, + "loss": 2.7826, + "step": 8218 + }, + { + "epoch": 0.24372090264804436, + "grad_norm": 0.11753199994564056, + "learning_rate": 0.0008687203329917572, + "loss": 2.759, + "step": 8219 + }, + { + "epoch": 0.24375055600035583, + "grad_norm": 0.12956872582435608, + "learning_rate": 0.0008686885525793954, + "loss": 2.7814, + "step": 8220 + }, + { + "epoch": 0.2437802093526673, + "grad_norm": 0.12755326926708221, + "learning_rate": 0.0008686567689022331, + "loss": 2.7345, + "step": 8221 + }, + { + "epoch": 0.24380986270497879, + "grad_norm": 0.11776053160429001, + "learning_rate": 0.0008686249819605518, + "loss": 2.7704, + "step": 8222 + }, + { + "epoch": 0.2438395160572903, + "grad_norm": 0.11637707054615021, + "learning_rate": 0.000868593191754633, + "loss": 2.7662, + "step": 8223 + }, + { + "epoch": 0.24386916940960177, + "grad_norm": 0.12913045287132263, + "learning_rate": 0.0008685613982847585, + "loss": 2.7458, + "step": 8224 + }, + { + "epoch": 0.24389882276191324, + "grad_norm": 0.13520929217338562, + "learning_rate": 0.0008685296015512092, + "loss": 2.7776, + "step": 8225 + }, + { + "epoch": 0.24392847611422472, + "grad_norm": 0.14995035529136658, + "learning_rate": 0.0008684978015542672, + "loss": 2.7754, + "step": 8226 + }, + { + "epoch": 0.2439581294665362, + "grad_norm": 0.16245071589946747, + "learning_rate": 0.0008684659982942138, + "loss": 2.7737, + "step": 8227 + }, + { + "epoch": 0.24398778281884767, + "grad_norm": 0.15780912339687347, + "learning_rate": 0.0008684341917713308, + "loss": 2.7565, + "step": 8228 + }, + { + "epoch": 0.24401743617115915, + "grad_norm": 0.15356232225894928, + "learning_rate": 0.0008684023819858998, + "loss": 2.7492, + "step": 8229 + }, + { + "epoch": 0.24404708952347062, + "grad_norm": 0.1653558909893036, + "learning_rate": 0.0008683705689382025, + "loss": 2.7632, + "step": 8230 + }, + { + "epoch": 0.2440767428757821, + "grad_norm": 0.18332333862781525, + "learning_rate": 0.0008683387526285205, + "loss": 2.7527, + "step": 8231 + }, + { + "epoch": 0.24410639622809358, + "grad_norm": 0.17732501029968262, + "learning_rate": 0.0008683069330571357, + "loss": 2.7867, + "step": 8232 + }, + { + "epoch": 0.24413604958040505, + "grad_norm": 0.14790940284729004, + "learning_rate": 0.0008682751102243298, + "loss": 2.7691, + "step": 8233 + }, + { + "epoch": 0.24416570293271656, + "grad_norm": 0.14594100415706635, + "learning_rate": 0.0008682432841303845, + "loss": 2.7755, + "step": 8234 + }, + { + "epoch": 0.24419535628502803, + "grad_norm": 0.1424471139907837, + "learning_rate": 0.0008682114547755817, + "loss": 2.7587, + "step": 8235 + }, + { + "epoch": 0.2442250096373395, + "grad_norm": 0.15328902006149292, + "learning_rate": 0.0008681796221602034, + "loss": 2.7934, + "step": 8236 + }, + { + "epoch": 0.24425466298965098, + "grad_norm": 0.16037696599960327, + "learning_rate": 0.0008681477862845313, + "loss": 2.7612, + "step": 8237 + }, + { + "epoch": 0.24428431634196246, + "grad_norm": 0.14640936255455017, + "learning_rate": 0.0008681159471488472, + "loss": 2.7774, + "step": 8238 + }, + { + "epoch": 0.24431396969427394, + "grad_norm": 0.14045478403568268, + "learning_rate": 0.0008680841047534333, + "loss": 2.7429, + "step": 8239 + }, + { + "epoch": 0.2443436230465854, + "grad_norm": 0.14355427026748657, + "learning_rate": 0.0008680522590985715, + "loss": 2.7723, + "step": 8240 + }, + { + "epoch": 0.2443732763988969, + "grad_norm": 0.1498190462589264, + "learning_rate": 0.0008680204101845439, + "loss": 2.7597, + "step": 8241 + }, + { + "epoch": 0.24440292975120836, + "grad_norm": 0.1651633083820343, + "learning_rate": 0.0008679885580116322, + "loss": 2.7949, + "step": 8242 + }, + { + "epoch": 0.24443258310351984, + "grad_norm": 0.19539324939250946, + "learning_rate": 0.0008679567025801187, + "loss": 2.7465, + "step": 8243 + }, + { + "epoch": 0.24446223645583134, + "grad_norm": 0.1955375224351883, + "learning_rate": 0.0008679248438902856, + "loss": 2.7541, + "step": 8244 + }, + { + "epoch": 0.24449188980814282, + "grad_norm": 0.18056775629520416, + "learning_rate": 0.0008678929819424146, + "loss": 2.7627, + "step": 8245 + }, + { + "epoch": 0.2445215431604543, + "grad_norm": 0.13366365432739258, + "learning_rate": 0.0008678611167367882, + "loss": 2.7228, + "step": 8246 + }, + { + "epoch": 0.24455119651276577, + "grad_norm": 0.14908882975578308, + "learning_rate": 0.0008678292482736885, + "loss": 2.7962, + "step": 8247 + }, + { + "epoch": 0.24458084986507725, + "grad_norm": 0.1646070033311844, + "learning_rate": 0.0008677973765533977, + "loss": 2.7563, + "step": 8248 + }, + { + "epoch": 0.24461050321738873, + "grad_norm": 0.14226703345775604, + "learning_rate": 0.0008677655015761979, + "loss": 2.7509, + "step": 8249 + }, + { + "epoch": 0.2446401565697002, + "grad_norm": 0.1434483379125595, + "learning_rate": 0.0008677336233423716, + "loss": 2.7595, + "step": 8250 + }, + { + "epoch": 0.24466980992201168, + "grad_norm": 0.14672547578811646, + "learning_rate": 0.0008677017418522009, + "loss": 2.7917, + "step": 8251 + }, + { + "epoch": 0.24469946327432315, + "grad_norm": 0.148855060338974, + "learning_rate": 0.0008676698571059681, + "loss": 2.765, + "step": 8252 + }, + { + "epoch": 0.24472911662663463, + "grad_norm": 0.1300908625125885, + "learning_rate": 0.0008676379691039555, + "loss": 2.7613, + "step": 8253 + }, + { + "epoch": 0.24475876997894613, + "grad_norm": 0.13338710367679596, + "learning_rate": 0.0008676060778464457, + "loss": 2.7463, + "step": 8254 + }, + { + "epoch": 0.2447884233312576, + "grad_norm": 0.14773690700531006, + "learning_rate": 0.000867574183333721, + "loss": 2.7287, + "step": 8255 + }, + { + "epoch": 0.24481807668356909, + "grad_norm": 0.1340198665857315, + "learning_rate": 0.0008675422855660638, + "loss": 2.7546, + "step": 8256 + }, + { + "epoch": 0.24484773003588056, + "grad_norm": 0.11994274705648422, + "learning_rate": 0.0008675103845437565, + "loss": 2.7823, + "step": 8257 + }, + { + "epoch": 0.24487738338819204, + "grad_norm": 0.12349435687065125, + "learning_rate": 0.0008674784802670817, + "loss": 2.7342, + "step": 8258 + }, + { + "epoch": 0.24490703674050351, + "grad_norm": 0.13571549952030182, + "learning_rate": 0.0008674465727363221, + "loss": 2.7417, + "step": 8259 + }, + { + "epoch": 0.244936690092815, + "grad_norm": 0.13425973057746887, + "learning_rate": 0.0008674146619517597, + "loss": 2.7767, + "step": 8260 + }, + { + "epoch": 0.24496634344512647, + "grad_norm": 0.1445121318101883, + "learning_rate": 0.0008673827479136776, + "loss": 2.7364, + "step": 8261 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 0.1567063331604004, + "learning_rate": 0.0008673508306223581, + "loss": 2.7412, + "step": 8262 + }, + { + "epoch": 0.24502565014974942, + "grad_norm": 0.15684619545936584, + "learning_rate": 0.000867318910078084, + "loss": 2.7815, + "step": 8263 + }, + { + "epoch": 0.2450553035020609, + "grad_norm": 0.15827299654483795, + "learning_rate": 0.0008672869862811379, + "loss": 2.7555, + "step": 8264 + }, + { + "epoch": 0.2450849568543724, + "grad_norm": 0.15206573903560638, + "learning_rate": 0.0008672550592318024, + "loss": 2.7276, + "step": 8265 + }, + { + "epoch": 0.24511461020668388, + "grad_norm": 0.1329347938299179, + "learning_rate": 0.0008672231289303605, + "loss": 2.7754, + "step": 8266 + }, + { + "epoch": 0.24514426355899535, + "grad_norm": 0.13389040529727936, + "learning_rate": 0.0008671911953770946, + "loss": 2.7057, + "step": 8267 + }, + { + "epoch": 0.24517391691130683, + "grad_norm": 0.15127821266651154, + "learning_rate": 0.0008671592585722878, + "loss": 2.7333, + "step": 8268 + }, + { + "epoch": 0.2452035702636183, + "grad_norm": 0.1736467033624649, + "learning_rate": 0.0008671273185162225, + "loss": 2.7579, + "step": 8269 + }, + { + "epoch": 0.24523322361592978, + "grad_norm": 0.2057371884584427, + "learning_rate": 0.0008670953752091819, + "loss": 2.7298, + "step": 8270 + }, + { + "epoch": 0.24526287696824126, + "grad_norm": 0.1983376294374466, + "learning_rate": 0.0008670634286514488, + "loss": 2.7541, + "step": 8271 + }, + { + "epoch": 0.24529253032055273, + "grad_norm": 0.15803933143615723, + "learning_rate": 0.000867031478843306, + "loss": 2.7234, + "step": 8272 + }, + { + "epoch": 0.2453221836728642, + "grad_norm": 0.14798370003700256, + "learning_rate": 0.0008669995257850365, + "loss": 2.7517, + "step": 8273 + }, + { + "epoch": 0.24535183702517568, + "grad_norm": 0.1493455469608307, + "learning_rate": 0.000866967569476923, + "loss": 2.7214, + "step": 8274 + }, + { + "epoch": 0.2453814903774872, + "grad_norm": 0.15751464664936066, + "learning_rate": 0.0008669356099192489, + "loss": 2.7687, + "step": 8275 + }, + { + "epoch": 0.24541114372979866, + "grad_norm": 0.15622615814208984, + "learning_rate": 0.0008669036471122969, + "loss": 2.7426, + "step": 8276 + }, + { + "epoch": 0.24544079708211014, + "grad_norm": 0.14517255127429962, + "learning_rate": 0.0008668716810563502, + "loss": 2.7477, + "step": 8277 + }, + { + "epoch": 0.24547045043442162, + "grad_norm": 0.1538090705871582, + "learning_rate": 0.0008668397117516918, + "loss": 2.7688, + "step": 8278 + }, + { + "epoch": 0.2455001037867331, + "grad_norm": 0.15014252066612244, + "learning_rate": 0.0008668077391986047, + "loss": 2.7482, + "step": 8279 + }, + { + "epoch": 0.24552975713904457, + "grad_norm": 0.15174993872642517, + "learning_rate": 0.0008667757633973721, + "loss": 2.7554, + "step": 8280 + }, + { + "epoch": 0.24555941049135604, + "grad_norm": 0.14758966863155365, + "learning_rate": 0.0008667437843482772, + "loss": 2.8054, + "step": 8281 + }, + { + "epoch": 0.24558906384366752, + "grad_norm": 0.1474243402481079, + "learning_rate": 0.0008667118020516031, + "loss": 2.7768, + "step": 8282 + }, + { + "epoch": 0.245618717195979, + "grad_norm": 0.17211830615997314, + "learning_rate": 0.0008666798165076331, + "loss": 2.7958, + "step": 8283 + }, + { + "epoch": 0.24564837054829047, + "grad_norm": 0.19481723010540009, + "learning_rate": 0.0008666478277166503, + "loss": 2.7449, + "step": 8284 + }, + { + "epoch": 0.24567802390060195, + "grad_norm": 0.1743735373020172, + "learning_rate": 0.0008666158356789382, + "loss": 2.7758, + "step": 8285 + }, + { + "epoch": 0.24570767725291345, + "grad_norm": 0.16685053706169128, + "learning_rate": 0.0008665838403947799, + "loss": 2.7731, + "step": 8286 + }, + { + "epoch": 0.24573733060522493, + "grad_norm": 0.17670725286006927, + "learning_rate": 0.0008665518418644587, + "loss": 2.7651, + "step": 8287 + }, + { + "epoch": 0.2457669839575364, + "grad_norm": 0.14751951396465302, + "learning_rate": 0.0008665198400882579, + "loss": 2.7729, + "step": 8288 + }, + { + "epoch": 0.24579663730984788, + "grad_norm": 0.13954268395900726, + "learning_rate": 0.0008664878350664614, + "loss": 2.7544, + "step": 8289 + }, + { + "epoch": 0.24582629066215936, + "grad_norm": 0.1277262568473816, + "learning_rate": 0.0008664558267993519, + "loss": 2.7469, + "step": 8290 + }, + { + "epoch": 0.24585594401447083, + "grad_norm": 0.15061216056346893, + "learning_rate": 0.0008664238152872131, + "loss": 2.7788, + "step": 8291 + }, + { + "epoch": 0.2458855973667823, + "grad_norm": 0.13725794851779938, + "learning_rate": 0.0008663918005303287, + "loss": 2.7736, + "step": 8292 + }, + { + "epoch": 0.2459152507190938, + "grad_norm": 0.14178118109703064, + "learning_rate": 0.000866359782528982, + "loss": 2.759, + "step": 8293 + }, + { + "epoch": 0.24594490407140526, + "grad_norm": 0.12578535079956055, + "learning_rate": 0.0008663277612834564, + "loss": 2.7806, + "step": 8294 + }, + { + "epoch": 0.24597455742371674, + "grad_norm": 0.12495650351047516, + "learning_rate": 0.0008662957367940357, + "loss": 2.7581, + "step": 8295 + }, + { + "epoch": 0.24600421077602824, + "grad_norm": 0.1471804827451706, + "learning_rate": 0.0008662637090610034, + "loss": 2.7747, + "step": 8296 + }, + { + "epoch": 0.24603386412833972, + "grad_norm": 0.14817532896995544, + "learning_rate": 0.0008662316780846431, + "loss": 2.757, + "step": 8297 + }, + { + "epoch": 0.2460635174806512, + "grad_norm": 0.14732332527637482, + "learning_rate": 0.0008661996438652384, + "loss": 2.7809, + "step": 8298 + }, + { + "epoch": 0.24609317083296267, + "grad_norm": 0.14675982296466827, + "learning_rate": 0.0008661676064030729, + "loss": 2.7444, + "step": 8299 + }, + { + "epoch": 0.24612282418527415, + "grad_norm": 0.14409561455249786, + "learning_rate": 0.0008661355656984305, + "loss": 2.7478, + "step": 8300 + }, + { + "epoch": 0.24615247753758562, + "grad_norm": 0.13794395327568054, + "learning_rate": 0.0008661035217515947, + "loss": 2.7866, + "step": 8301 + }, + { + "epoch": 0.2461821308898971, + "grad_norm": 0.1296362280845642, + "learning_rate": 0.0008660714745628495, + "loss": 2.7388, + "step": 8302 + }, + { + "epoch": 0.24621178424220858, + "grad_norm": 0.12321338802576065, + "learning_rate": 0.0008660394241324785, + "loss": 2.7682, + "step": 8303 + }, + { + "epoch": 0.24624143759452005, + "grad_norm": 0.13631267845630646, + "learning_rate": 0.0008660073704607656, + "loss": 2.7679, + "step": 8304 + }, + { + "epoch": 0.24627109094683153, + "grad_norm": 0.12305500358343124, + "learning_rate": 0.0008659753135479946, + "loss": 2.7464, + "step": 8305 + }, + { + "epoch": 0.24630074429914303, + "grad_norm": 0.14654748141765594, + "learning_rate": 0.0008659432533944495, + "loss": 2.7644, + "step": 8306 + }, + { + "epoch": 0.2463303976514545, + "grad_norm": 0.15743140876293182, + "learning_rate": 0.000865911190000414, + "loss": 2.7554, + "step": 8307 + }, + { + "epoch": 0.24636005100376598, + "grad_norm": 0.1854938566684723, + "learning_rate": 0.000865879123366172, + "loss": 2.7629, + "step": 8308 + }, + { + "epoch": 0.24638970435607746, + "grad_norm": 0.21523308753967285, + "learning_rate": 0.0008658470534920076, + "loss": 2.7098, + "step": 8309 + }, + { + "epoch": 0.24641935770838894, + "grad_norm": 0.21031437814235687, + "learning_rate": 0.0008658149803782047, + "loss": 2.7689, + "step": 8310 + }, + { + "epoch": 0.2464490110607004, + "grad_norm": 0.14455829560756683, + "learning_rate": 0.0008657829040250476, + "loss": 2.7358, + "step": 8311 + }, + { + "epoch": 0.2464786644130119, + "grad_norm": 0.14439038932323456, + "learning_rate": 0.0008657508244328198, + "loss": 2.7606, + "step": 8312 + }, + { + "epoch": 0.24650831776532336, + "grad_norm": 0.17652857303619385, + "learning_rate": 0.0008657187416018057, + "loss": 2.7679, + "step": 8313 + }, + { + "epoch": 0.24653797111763484, + "grad_norm": 0.1756753772497177, + "learning_rate": 0.0008656866555322895, + "loss": 2.8117, + "step": 8314 + }, + { + "epoch": 0.24656762446994632, + "grad_norm": 0.159694641828537, + "learning_rate": 0.0008656545662245553, + "loss": 2.7407, + "step": 8315 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 0.2080194056034088, + "learning_rate": 0.0008656224736788869, + "loss": 2.7426, + "step": 8316 + }, + { + "epoch": 0.2466269311745693, + "grad_norm": 0.19865018129348755, + "learning_rate": 0.000865590377895569, + "loss": 2.772, + "step": 8317 + }, + { + "epoch": 0.24665658452688077, + "grad_norm": 0.1712205410003662, + "learning_rate": 0.0008655582788748852, + "loss": 2.7469, + "step": 8318 + }, + { + "epoch": 0.24668623787919225, + "grad_norm": 0.2102566361427307, + "learning_rate": 0.0008655261766171204, + "loss": 2.7793, + "step": 8319 + }, + { + "epoch": 0.24671589123150373, + "grad_norm": 0.15616844594478607, + "learning_rate": 0.0008654940711225585, + "loss": 2.7478, + "step": 8320 + }, + { + "epoch": 0.2467455445838152, + "grad_norm": 0.14039470255374908, + "learning_rate": 0.0008654619623914838, + "loss": 2.762, + "step": 8321 + }, + { + "epoch": 0.24677519793612668, + "grad_norm": 0.14365515112876892, + "learning_rate": 0.0008654298504241806, + "loss": 2.7675, + "step": 8322 + }, + { + "epoch": 0.24680485128843815, + "grad_norm": 0.16517756879329681, + "learning_rate": 0.0008653977352209336, + "loss": 2.7877, + "step": 8323 + }, + { + "epoch": 0.24683450464074963, + "grad_norm": 0.16547399759292603, + "learning_rate": 0.0008653656167820267, + "loss": 2.7498, + "step": 8324 + }, + { + "epoch": 0.2468641579930611, + "grad_norm": 0.14111553132534027, + "learning_rate": 0.0008653334951077448, + "loss": 2.7509, + "step": 8325 + }, + { + "epoch": 0.24689381134537258, + "grad_norm": 0.1608186662197113, + "learning_rate": 0.0008653013701983718, + "loss": 2.7645, + "step": 8326 + }, + { + "epoch": 0.2469234646976841, + "grad_norm": 0.14166057109832764, + "learning_rate": 0.0008652692420541928, + "loss": 2.7191, + "step": 8327 + }, + { + "epoch": 0.24695311804999556, + "grad_norm": 0.1320667564868927, + "learning_rate": 0.0008652371106754917, + "loss": 2.733, + "step": 8328 + }, + { + "epoch": 0.24698277140230704, + "grad_norm": 0.12783192098140717, + "learning_rate": 0.0008652049760625533, + "loss": 2.7641, + "step": 8329 + }, + { + "epoch": 0.24701242475461851, + "grad_norm": 0.14309565722942352, + "learning_rate": 0.0008651728382156622, + "loss": 2.7095, + "step": 8330 + }, + { + "epoch": 0.24704207810693, + "grad_norm": 0.13636597990989685, + "learning_rate": 0.000865140697135103, + "loss": 2.7639, + "step": 8331 + }, + { + "epoch": 0.24707173145924147, + "grad_norm": 0.12971900403499603, + "learning_rate": 0.0008651085528211602, + "loss": 2.7421, + "step": 8332 + }, + { + "epoch": 0.24710138481155294, + "grad_norm": 0.13130638003349304, + "learning_rate": 0.0008650764052741185, + "loss": 2.7466, + "step": 8333 + }, + { + "epoch": 0.24713103816386442, + "grad_norm": 0.12795791029930115, + "learning_rate": 0.0008650442544942625, + "loss": 2.724, + "step": 8334 + }, + { + "epoch": 0.2471606915161759, + "grad_norm": 0.13251322507858276, + "learning_rate": 0.000865012100481877, + "loss": 2.7299, + "step": 8335 + }, + { + "epoch": 0.24719034486848737, + "grad_norm": 0.1513678878545761, + "learning_rate": 0.0008649799432372468, + "loss": 2.7161, + "step": 8336 + }, + { + "epoch": 0.24721999822079885, + "grad_norm": 0.14696331322193146, + "learning_rate": 0.0008649477827606564, + "loss": 2.7369, + "step": 8337 + }, + { + "epoch": 0.24724965157311035, + "grad_norm": 0.1434537172317505, + "learning_rate": 0.0008649156190523909, + "loss": 2.7697, + "step": 8338 + }, + { + "epoch": 0.24727930492542183, + "grad_norm": 0.14275309443473816, + "learning_rate": 0.0008648834521127349, + "loss": 2.7627, + "step": 8339 + }, + { + "epoch": 0.2473089582777333, + "grad_norm": 0.16061195731163025, + "learning_rate": 0.0008648512819419733, + "loss": 2.7245, + "step": 8340 + }, + { + "epoch": 0.24733861163004478, + "grad_norm": 0.18128304183483124, + "learning_rate": 0.0008648191085403909, + "loss": 2.788, + "step": 8341 + }, + { + "epoch": 0.24736826498235626, + "grad_norm": 0.17358332872390747, + "learning_rate": 0.0008647869319082728, + "loss": 2.7448, + "step": 8342 + }, + { + "epoch": 0.24739791833466773, + "grad_norm": 0.1475282907485962, + "learning_rate": 0.0008647547520459035, + "loss": 2.7751, + "step": 8343 + }, + { + "epoch": 0.2474275716869792, + "grad_norm": 0.13955600559711456, + "learning_rate": 0.0008647225689535687, + "loss": 2.7922, + "step": 8344 + }, + { + "epoch": 0.24745722503929068, + "grad_norm": 0.13881815969944, + "learning_rate": 0.0008646903826315526, + "loss": 2.7712, + "step": 8345 + }, + { + "epoch": 0.24748687839160216, + "grad_norm": 0.17666879296302795, + "learning_rate": 0.0008646581930801408, + "loss": 2.7475, + "step": 8346 + }, + { + "epoch": 0.24751653174391364, + "grad_norm": 0.19091854989528656, + "learning_rate": 0.000864626000299618, + "loss": 2.7533, + "step": 8347 + }, + { + "epoch": 0.24754618509622514, + "grad_norm": 0.16061809659004211, + "learning_rate": 0.0008645938042902693, + "loss": 2.7356, + "step": 8348 + }, + { + "epoch": 0.24757583844853662, + "grad_norm": 0.13536503911018372, + "learning_rate": 0.0008645616050523802, + "loss": 2.7236, + "step": 8349 + }, + { + "epoch": 0.2476054918008481, + "grad_norm": 0.13446882367134094, + "learning_rate": 0.0008645294025862351, + "loss": 2.7281, + "step": 8350 + }, + { + "epoch": 0.24763514515315957, + "grad_norm": 0.13493365049362183, + "learning_rate": 0.0008644971968921198, + "loss": 2.7552, + "step": 8351 + }, + { + "epoch": 0.24766479850547105, + "grad_norm": 0.15886667370796204, + "learning_rate": 0.0008644649879703193, + "loss": 2.7628, + "step": 8352 + }, + { + "epoch": 0.24769445185778252, + "grad_norm": 0.17139458656311035, + "learning_rate": 0.0008644327758211186, + "loss": 2.7694, + "step": 8353 + }, + { + "epoch": 0.247724105210094, + "grad_norm": 0.16277030110359192, + "learning_rate": 0.0008644005604448031, + "loss": 2.7574, + "step": 8354 + }, + { + "epoch": 0.24775375856240547, + "grad_norm": 0.15486854314804077, + "learning_rate": 0.0008643683418416583, + "loss": 2.747, + "step": 8355 + }, + { + "epoch": 0.24778341191471695, + "grad_norm": 0.1371971070766449, + "learning_rate": 0.0008643361200119691, + "loss": 2.7484, + "step": 8356 + }, + { + "epoch": 0.24781306526702843, + "grad_norm": 0.14064840972423553, + "learning_rate": 0.0008643038949560212, + "loss": 2.7435, + "step": 8357 + }, + { + "epoch": 0.24784271861933993, + "grad_norm": 0.14734722673892975, + "learning_rate": 0.0008642716666740995, + "loss": 2.7648, + "step": 8358 + }, + { + "epoch": 0.2478723719716514, + "grad_norm": 0.15463712811470032, + "learning_rate": 0.0008642394351664899, + "loss": 2.7589, + "step": 8359 + }, + { + "epoch": 0.24790202532396288, + "grad_norm": 0.1401699036359787, + "learning_rate": 0.0008642072004334775, + "loss": 2.7078, + "step": 8360 + }, + { + "epoch": 0.24793167867627436, + "grad_norm": 0.12351908534765244, + "learning_rate": 0.0008641749624753479, + "loss": 2.7525, + "step": 8361 + }, + { + "epoch": 0.24796133202858583, + "grad_norm": 0.12741439044475555, + "learning_rate": 0.0008641427212923863, + "loss": 2.7493, + "step": 8362 + }, + { + "epoch": 0.2479909853808973, + "grad_norm": 0.1355377733707428, + "learning_rate": 0.0008641104768848787, + "loss": 2.7634, + "step": 8363 + }, + { + "epoch": 0.2480206387332088, + "grad_norm": 0.15737833082675934, + "learning_rate": 0.0008640782292531101, + "loss": 2.7731, + "step": 8364 + }, + { + "epoch": 0.24805029208552026, + "grad_norm": 0.16324210166931152, + "learning_rate": 0.0008640459783973664, + "loss": 2.7585, + "step": 8365 + }, + { + "epoch": 0.24807994543783174, + "grad_norm": 0.15439999103546143, + "learning_rate": 0.0008640137243179331, + "loss": 2.7511, + "step": 8366 + }, + { + "epoch": 0.24810959879014322, + "grad_norm": 0.12451609969139099, + "learning_rate": 0.0008639814670150956, + "loss": 2.7531, + "step": 8367 + }, + { + "epoch": 0.2481392521424547, + "grad_norm": 0.12205979228019714, + "learning_rate": 0.0008639492064891398, + "loss": 2.7541, + "step": 8368 + }, + { + "epoch": 0.2481689054947662, + "grad_norm": 0.17314356565475464, + "learning_rate": 0.0008639169427403514, + "loss": 2.7481, + "step": 8369 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 0.17975787818431854, + "learning_rate": 0.0008638846757690159, + "loss": 2.804, + "step": 8370 + }, + { + "epoch": 0.24822821219938915, + "grad_norm": 0.151776522397995, + "learning_rate": 0.0008638524055754193, + "loss": 2.7577, + "step": 8371 + }, + { + "epoch": 0.24825786555170062, + "grad_norm": 0.17020875215530396, + "learning_rate": 0.0008638201321598471, + "loss": 2.7789, + "step": 8372 + }, + { + "epoch": 0.2482875189040121, + "grad_norm": 0.15922684967517853, + "learning_rate": 0.0008637878555225851, + "loss": 2.7616, + "step": 8373 + }, + { + "epoch": 0.24831717225632358, + "grad_norm": 0.15115399658679962, + "learning_rate": 0.0008637555756639192, + "loss": 2.7597, + "step": 8374 + }, + { + "epoch": 0.24834682560863505, + "grad_norm": 0.13820675015449524, + "learning_rate": 0.0008637232925841354, + "loss": 2.7823, + "step": 8375 + }, + { + "epoch": 0.24837647896094653, + "grad_norm": 0.12291789054870605, + "learning_rate": 0.0008636910062835193, + "loss": 2.7814, + "step": 8376 + }, + { + "epoch": 0.248406132313258, + "grad_norm": 0.14169611036777496, + "learning_rate": 0.0008636587167623568, + "loss": 2.7576, + "step": 8377 + }, + { + "epoch": 0.24843578566556948, + "grad_norm": 0.14807048439979553, + "learning_rate": 0.0008636264240209342, + "loss": 2.7859, + "step": 8378 + }, + { + "epoch": 0.24846543901788098, + "grad_norm": 0.15986742079257965, + "learning_rate": 0.0008635941280595372, + "loss": 2.8046, + "step": 8379 + }, + { + "epoch": 0.24849509237019246, + "grad_norm": 0.14800460636615753, + "learning_rate": 0.0008635618288784514, + "loss": 2.779, + "step": 8380 + }, + { + "epoch": 0.24852474572250394, + "grad_norm": 0.13790063560009003, + "learning_rate": 0.0008635295264779636, + "loss": 2.7896, + "step": 8381 + }, + { + "epoch": 0.2485543990748154, + "grad_norm": 0.12203879654407501, + "learning_rate": 0.0008634972208583593, + "loss": 2.7428, + "step": 8382 + }, + { + "epoch": 0.2485840524271269, + "grad_norm": 0.14408956468105316, + "learning_rate": 0.0008634649120199247, + "loss": 2.7481, + "step": 8383 + }, + { + "epoch": 0.24861370577943837, + "grad_norm": 0.14473778009414673, + "learning_rate": 0.000863432599962946, + "loss": 2.7629, + "step": 8384 + }, + { + "epoch": 0.24864335913174984, + "grad_norm": 0.14205922186374664, + "learning_rate": 0.0008634002846877091, + "loss": 2.7593, + "step": 8385 + }, + { + "epoch": 0.24867301248406132, + "grad_norm": 0.14545224606990814, + "learning_rate": 0.0008633679661945005, + "loss": 2.7482, + "step": 8386 + }, + { + "epoch": 0.2487026658363728, + "grad_norm": 0.1703212708234787, + "learning_rate": 0.000863335644483606, + "loss": 2.7488, + "step": 8387 + }, + { + "epoch": 0.24873231918868427, + "grad_norm": 0.20121058821678162, + "learning_rate": 0.0008633033195553121, + "loss": 2.7116, + "step": 8388 + }, + { + "epoch": 0.24876197254099575, + "grad_norm": 0.17786137759685516, + "learning_rate": 0.0008632709914099049, + "loss": 2.7054, + "step": 8389 + }, + { + "epoch": 0.24879162589330725, + "grad_norm": 0.13968904316425323, + "learning_rate": 0.0008632386600476707, + "loss": 2.7538, + "step": 8390 + }, + { + "epoch": 0.24882127924561873, + "grad_norm": 0.15870733559131622, + "learning_rate": 0.0008632063254688959, + "loss": 2.7424, + "step": 8391 + }, + { + "epoch": 0.2488509325979302, + "grad_norm": 0.16133615374565125, + "learning_rate": 0.0008631739876738667, + "loss": 2.7473, + "step": 8392 + }, + { + "epoch": 0.24888058595024168, + "grad_norm": 0.1811702996492386, + "learning_rate": 0.0008631416466628694, + "loss": 2.7656, + "step": 8393 + }, + { + "epoch": 0.24891023930255315, + "grad_norm": 0.1656372845172882, + "learning_rate": 0.0008631093024361907, + "loss": 2.7746, + "step": 8394 + }, + { + "epoch": 0.24893989265486463, + "grad_norm": 0.1346045732498169, + "learning_rate": 0.0008630769549941166, + "loss": 2.7427, + "step": 8395 + }, + { + "epoch": 0.2489695460071761, + "grad_norm": 0.13207124173641205, + "learning_rate": 0.0008630446043369338, + "loss": 2.7657, + "step": 8396 + }, + { + "epoch": 0.24899919935948758, + "grad_norm": 0.14272746443748474, + "learning_rate": 0.0008630122504649287, + "loss": 2.7742, + "step": 8397 + }, + { + "epoch": 0.24902885271179906, + "grad_norm": 0.14765654504299164, + "learning_rate": 0.0008629798933783879, + "loss": 2.7454, + "step": 8398 + }, + { + "epoch": 0.24905850606411054, + "grad_norm": 0.13601557910442352, + "learning_rate": 0.0008629475330775978, + "loss": 2.7543, + "step": 8399 + }, + { + "epoch": 0.24908815941642204, + "grad_norm": 0.12410370260477066, + "learning_rate": 0.000862915169562845, + "loss": 2.7384, + "step": 8400 + }, + { + "epoch": 0.24911781276873352, + "grad_norm": 0.15826839208602905, + "learning_rate": 0.0008628828028344161, + "loss": 2.7524, + "step": 8401 + }, + { + "epoch": 0.249147466121045, + "grad_norm": 0.1731422394514084, + "learning_rate": 0.0008628504328925977, + "loss": 2.7533, + "step": 8402 + }, + { + "epoch": 0.24917711947335647, + "grad_norm": 0.16205225884914398, + "learning_rate": 0.0008628180597376764, + "loss": 2.7599, + "step": 8403 + }, + { + "epoch": 0.24920677282566794, + "grad_norm": 0.1536986082792282, + "learning_rate": 0.0008627856833699388, + "loss": 2.7875, + "step": 8404 + }, + { + "epoch": 0.24923642617797942, + "grad_norm": 0.16435439884662628, + "learning_rate": 0.0008627533037896718, + "loss": 2.7655, + "step": 8405 + }, + { + "epoch": 0.2492660795302909, + "grad_norm": 0.1603132039308548, + "learning_rate": 0.0008627209209971621, + "loss": 2.7868, + "step": 8406 + }, + { + "epoch": 0.24929573288260237, + "grad_norm": 0.13714009523391724, + "learning_rate": 0.0008626885349926963, + "loss": 2.7413, + "step": 8407 + }, + { + "epoch": 0.24932538623491385, + "grad_norm": 0.13321459293365479, + "learning_rate": 0.0008626561457765612, + "loss": 2.7728, + "step": 8408 + }, + { + "epoch": 0.24935503958722532, + "grad_norm": 0.12616994976997375, + "learning_rate": 0.0008626237533490437, + "loss": 2.7753, + "step": 8409 + }, + { + "epoch": 0.24938469293953683, + "grad_norm": 0.131476491689682, + "learning_rate": 0.0008625913577104307, + "loss": 2.7754, + "step": 8410 + }, + { + "epoch": 0.2494143462918483, + "grad_norm": 0.1426507532596588, + "learning_rate": 0.000862558958861009, + "loss": 2.7785, + "step": 8411 + }, + { + "epoch": 0.24944399964415978, + "grad_norm": 0.15362662076950073, + "learning_rate": 0.0008625265568010655, + "loss": 2.7334, + "step": 8412 + }, + { + "epoch": 0.24947365299647126, + "grad_norm": 0.1420498490333557, + "learning_rate": 0.000862494151530887, + "loss": 2.7352, + "step": 8413 + }, + { + "epoch": 0.24950330634878273, + "grad_norm": 0.13615304231643677, + "learning_rate": 0.0008624617430507606, + "loss": 2.7434, + "step": 8414 + }, + { + "epoch": 0.2495329597010942, + "grad_norm": 0.14518921077251434, + "learning_rate": 0.0008624293313609734, + "loss": 2.7696, + "step": 8415 + }, + { + "epoch": 0.24956261305340569, + "grad_norm": 0.14161087572574615, + "learning_rate": 0.0008623969164618122, + "loss": 2.743, + "step": 8416 + }, + { + "epoch": 0.24959226640571716, + "grad_norm": 0.1374136507511139, + "learning_rate": 0.000862364498353564, + "loss": 2.7778, + "step": 8417 + }, + { + "epoch": 0.24962191975802864, + "grad_norm": 0.13341306149959564, + "learning_rate": 0.000862332077036516, + "loss": 2.7194, + "step": 8418 + }, + { + "epoch": 0.2496515731103401, + "grad_norm": 0.1399783194065094, + "learning_rate": 0.0008622996525109552, + "loss": 2.7506, + "step": 8419 + }, + { + "epoch": 0.2496812264626516, + "grad_norm": 0.1664642095565796, + "learning_rate": 0.000862267224777169, + "loss": 2.8002, + "step": 8420 + }, + { + "epoch": 0.2497108798149631, + "grad_norm": 0.18296776711940765, + "learning_rate": 0.0008622347938354442, + "loss": 2.7724, + "step": 8421 + }, + { + "epoch": 0.24974053316727457, + "grad_norm": 0.20804166793823242, + "learning_rate": 0.0008622023596860681, + "loss": 2.7579, + "step": 8422 + }, + { + "epoch": 0.24977018651958605, + "grad_norm": 0.21086573600769043, + "learning_rate": 0.000862169922329328, + "loss": 2.7777, + "step": 8423 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 0.1777319312095642, + "learning_rate": 0.000862137481765511, + "loss": 2.7888, + "step": 8424 + }, + { + "epoch": 0.249829493224209, + "grad_norm": 0.18359969556331635, + "learning_rate": 0.0008621050379949045, + "loss": 2.7462, + "step": 8425 + }, + { + "epoch": 0.24985914657652047, + "grad_norm": 0.19244517385959625, + "learning_rate": 0.0008620725910177957, + "loss": 2.7611, + "step": 8426 + }, + { + "epoch": 0.24988879992883195, + "grad_norm": 0.1871882975101471, + "learning_rate": 0.000862040140834472, + "loss": 2.7146, + "step": 8427 + }, + { + "epoch": 0.24991845328114343, + "grad_norm": 0.1809714138507843, + "learning_rate": 0.0008620076874452208, + "loss": 2.7655, + "step": 8428 + }, + { + "epoch": 0.2499481066334549, + "grad_norm": 0.15913571417331696, + "learning_rate": 0.000861975230850329, + "loss": 2.7347, + "step": 8429 + }, + { + "epoch": 0.24997775998576638, + "grad_norm": 0.1609528362751007, + "learning_rate": 0.0008619427710500848, + "loss": 2.7419, + "step": 8430 + }, + { + "epoch": 0.25000741333807786, + "grad_norm": 0.15600082278251648, + "learning_rate": 0.0008619103080447751, + "loss": 2.7472, + "step": 8431 + }, + { + "epoch": 0.25003706669038933, + "grad_norm": 0.14336073398590088, + "learning_rate": 0.0008618778418346875, + "loss": 2.7816, + "step": 8432 + }, + { + "epoch": 0.2500667200427008, + "grad_norm": 0.13272573053836823, + "learning_rate": 0.0008618453724201094, + "loss": 2.7548, + "step": 8433 + }, + { + "epoch": 0.2500963733950123, + "grad_norm": 0.13295012712478638, + "learning_rate": 0.0008618128998013286, + "loss": 2.8203, + "step": 8434 + }, + { + "epoch": 0.25012602674732376, + "grad_norm": 0.14390796422958374, + "learning_rate": 0.0008617804239786324, + "loss": 2.7499, + "step": 8435 + }, + { + "epoch": 0.2501556800996353, + "grad_norm": 0.14124175906181335, + "learning_rate": 0.0008617479449523085, + "loss": 2.7749, + "step": 8436 + }, + { + "epoch": 0.25018533345194677, + "grad_norm": 0.13254381716251373, + "learning_rate": 0.0008617154627226444, + "loss": 2.7618, + "step": 8437 + }, + { + "epoch": 0.25021498680425824, + "grad_norm": 0.15607210993766785, + "learning_rate": 0.0008616829772899277, + "loss": 2.7621, + "step": 8438 + }, + { + "epoch": 0.2502446401565697, + "grad_norm": 0.18104900419712067, + "learning_rate": 0.0008616504886544463, + "loss": 2.7592, + "step": 8439 + }, + { + "epoch": 0.2502742935088812, + "grad_norm": 0.1644473671913147, + "learning_rate": 0.0008616179968164877, + "loss": 2.776, + "step": 8440 + }, + { + "epoch": 0.25030394686119267, + "grad_norm": 0.14922548830509186, + "learning_rate": 0.0008615855017763396, + "loss": 2.7423, + "step": 8441 + }, + { + "epoch": 0.25033360021350415, + "grad_norm": 0.12592537701129913, + "learning_rate": 0.0008615530035342898, + "loss": 2.7778, + "step": 8442 + }, + { + "epoch": 0.2503632535658156, + "grad_norm": 0.1351630538702011, + "learning_rate": 0.0008615205020906262, + "loss": 2.7451, + "step": 8443 + }, + { + "epoch": 0.2503929069181271, + "grad_norm": 0.1618419736623764, + "learning_rate": 0.0008614879974456365, + "loss": 2.7781, + "step": 8444 + }, + { + "epoch": 0.2504225602704386, + "grad_norm": 0.15003527700901031, + "learning_rate": 0.0008614554895996084, + "loss": 2.7697, + "step": 8445 + }, + { + "epoch": 0.25045221362275005, + "grad_norm": 0.15414546430110931, + "learning_rate": 0.0008614229785528301, + "loss": 2.7854, + "step": 8446 + }, + { + "epoch": 0.25048186697506153, + "grad_norm": 0.14754410088062286, + "learning_rate": 0.0008613904643055891, + "loss": 2.7871, + "step": 8447 + }, + { + "epoch": 0.250511520327373, + "grad_norm": 0.12497599422931671, + "learning_rate": 0.0008613579468581736, + "loss": 2.7541, + "step": 8448 + }, + { + "epoch": 0.2505411736796845, + "grad_norm": 0.13044117391109467, + "learning_rate": 0.0008613254262108714, + "loss": 2.7528, + "step": 8449 + }, + { + "epoch": 0.25057082703199596, + "grad_norm": 0.1325228363275528, + "learning_rate": 0.0008612929023639706, + "loss": 2.7217, + "step": 8450 + }, + { + "epoch": 0.25060048038430743, + "grad_norm": 0.15295393764972687, + "learning_rate": 0.000861260375317759, + "loss": 2.7965, + "step": 8451 + }, + { + "epoch": 0.2506301337366189, + "grad_norm": 0.14261406660079956, + "learning_rate": 0.0008612278450725249, + "loss": 2.7709, + "step": 8452 + }, + { + "epoch": 0.2506597870889304, + "grad_norm": 0.1342848390340805, + "learning_rate": 0.0008611953116285562, + "loss": 2.7284, + "step": 8453 + }, + { + "epoch": 0.25068944044124186, + "grad_norm": 0.12399737536907196, + "learning_rate": 0.0008611627749861411, + "loss": 2.7385, + "step": 8454 + }, + { + "epoch": 0.25071909379355334, + "grad_norm": 0.12294568866491318, + "learning_rate": 0.0008611302351455674, + "loss": 2.7425, + "step": 8455 + }, + { + "epoch": 0.2507487471458648, + "grad_norm": 0.1318843960762024, + "learning_rate": 0.0008610976921071236, + "loss": 2.7854, + "step": 8456 + }, + { + "epoch": 0.25077840049817635, + "grad_norm": 0.12777121365070343, + "learning_rate": 0.0008610651458710978, + "loss": 2.7517, + "step": 8457 + }, + { + "epoch": 0.2508080538504878, + "grad_norm": 0.1329258531332016, + "learning_rate": 0.0008610325964377781, + "loss": 2.786, + "step": 8458 + }, + { + "epoch": 0.2508377072027993, + "grad_norm": 0.13693109154701233, + "learning_rate": 0.0008610000438074529, + "loss": 2.7445, + "step": 8459 + }, + { + "epoch": 0.2508673605551108, + "grad_norm": 0.15878181159496307, + "learning_rate": 0.0008609674879804102, + "loss": 2.7346, + "step": 8460 + }, + { + "epoch": 0.25089701390742225, + "grad_norm": 0.15014676749706268, + "learning_rate": 0.0008609349289569385, + "loss": 2.7284, + "step": 8461 + }, + { + "epoch": 0.2509266672597337, + "grad_norm": 0.1334519386291504, + "learning_rate": 0.0008609023667373261, + "loss": 2.7881, + "step": 8462 + }, + { + "epoch": 0.2509563206120452, + "grad_norm": 0.12024054676294327, + "learning_rate": 0.0008608698013218612, + "loss": 2.7472, + "step": 8463 + }, + { + "epoch": 0.2509859739643567, + "grad_norm": 0.12728209793567657, + "learning_rate": 0.0008608372327108325, + "loss": 2.7413, + "step": 8464 + }, + { + "epoch": 0.25101562731666816, + "grad_norm": 0.17183631658554077, + "learning_rate": 0.0008608046609045279, + "loss": 2.7519, + "step": 8465 + }, + { + "epoch": 0.25104528066897963, + "grad_norm": 0.2081369161605835, + "learning_rate": 0.0008607720859032362, + "loss": 2.7867, + "step": 8466 + }, + { + "epoch": 0.2510749340212911, + "grad_norm": 0.20846791565418243, + "learning_rate": 0.0008607395077072457, + "loss": 2.7625, + "step": 8467 + }, + { + "epoch": 0.2511045873736026, + "grad_norm": 0.1788739114999771, + "learning_rate": 0.000860706926316845, + "loss": 2.7225, + "step": 8468 + }, + { + "epoch": 0.25113424072591406, + "grad_norm": 0.16627150774002075, + "learning_rate": 0.0008606743417323225, + "loss": 2.7753, + "step": 8469 + }, + { + "epoch": 0.25116389407822554, + "grad_norm": 0.1604144424200058, + "learning_rate": 0.0008606417539539668, + "loss": 2.7338, + "step": 8470 + }, + { + "epoch": 0.251193547430537, + "grad_norm": 0.1753745675086975, + "learning_rate": 0.0008606091629820665, + "loss": 2.7724, + "step": 8471 + }, + { + "epoch": 0.2512232007828485, + "grad_norm": 0.15775515139102936, + "learning_rate": 0.0008605765688169103, + "loss": 2.7856, + "step": 8472 + }, + { + "epoch": 0.25125285413515996, + "grad_norm": 0.15610381960868835, + "learning_rate": 0.0008605439714587864, + "loss": 2.7523, + "step": 8473 + }, + { + "epoch": 0.25128250748747144, + "grad_norm": 0.17780163884162903, + "learning_rate": 0.0008605113709079839, + "loss": 2.718, + "step": 8474 + }, + { + "epoch": 0.2513121608397829, + "grad_norm": 0.14942140877246857, + "learning_rate": 0.0008604787671647914, + "loss": 2.7731, + "step": 8475 + }, + { + "epoch": 0.2513418141920944, + "grad_norm": 0.1320739984512329, + "learning_rate": 0.0008604461602294974, + "loss": 2.7791, + "step": 8476 + }, + { + "epoch": 0.25137146754440587, + "grad_norm": 3.0797088146209717, + "learning_rate": 0.0008604135501023909, + "loss": 2.7441, + "step": 8477 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 0.22691988945007324, + "learning_rate": 0.0008603809367837605, + "loss": 2.7946, + "step": 8478 + }, + { + "epoch": 0.2514307742490289, + "grad_norm": 0.3089278042316437, + "learning_rate": 0.000860348320273895, + "loss": 2.7522, + "step": 8479 + }, + { + "epoch": 0.25146042760134035, + "grad_norm": 0.17654769122600555, + "learning_rate": 0.0008603157005730833, + "loss": 2.7748, + "step": 8480 + }, + { + "epoch": 0.25149008095365183, + "grad_norm": 0.17754098773002625, + "learning_rate": 0.0008602830776816142, + "loss": 2.7584, + "step": 8481 + }, + { + "epoch": 0.2515197343059633, + "grad_norm": 0.3702373206615448, + "learning_rate": 0.0008602504515997767, + "loss": 2.7716, + "step": 8482 + }, + { + "epoch": 0.2515493876582748, + "grad_norm": 0.1480545699596405, + "learning_rate": 0.0008602178223278595, + "loss": 2.7205, + "step": 8483 + }, + { + "epoch": 0.25157904101058626, + "grad_norm": 0.17555548250675201, + "learning_rate": 0.0008601851898661517, + "loss": 2.7102, + "step": 8484 + }, + { + "epoch": 0.25160869436289773, + "grad_norm": 0.18012495338916779, + "learning_rate": 0.0008601525542149422, + "loss": 2.788, + "step": 8485 + }, + { + "epoch": 0.2516383477152092, + "grad_norm": 0.16281907260417938, + "learning_rate": 0.00086011991537452, + "loss": 2.709, + "step": 8486 + }, + { + "epoch": 0.2516680010675207, + "grad_norm": 0.15097589790821075, + "learning_rate": 0.0008600872733451742, + "loss": 2.7722, + "step": 8487 + }, + { + "epoch": 0.25169765441983216, + "grad_norm": 0.1480705291032791, + "learning_rate": 0.0008600546281271938, + "loss": 2.7707, + "step": 8488 + }, + { + "epoch": 0.25172730777214364, + "grad_norm": 0.2982954680919647, + "learning_rate": 0.0008600219797208678, + "loss": 2.773, + "step": 8489 + }, + { + "epoch": 0.2517569611244551, + "grad_norm": 0.15998022258281708, + "learning_rate": 0.0008599893281264854, + "loss": 2.7571, + "step": 8490 + }, + { + "epoch": 0.2517866144767666, + "grad_norm": 0.15061524510383606, + "learning_rate": 0.0008599566733443358, + "loss": 2.7622, + "step": 8491 + }, + { + "epoch": 0.25181626782907807, + "grad_norm": 0.12115473300218582, + "learning_rate": 0.0008599240153747079, + "loss": 2.7989, + "step": 8492 + }, + { + "epoch": 0.25184592118138954, + "grad_norm": 0.16369447112083435, + "learning_rate": 0.0008598913542178912, + "loss": 2.7691, + "step": 8493 + }, + { + "epoch": 0.251875574533701, + "grad_norm": 0.13356083631515503, + "learning_rate": 0.0008598586898741747, + "loss": 2.7443, + "step": 8494 + }, + { + "epoch": 0.2519052278860125, + "grad_norm": 0.12737946212291718, + "learning_rate": 0.0008598260223438476, + "loss": 2.7729, + "step": 8495 + }, + { + "epoch": 0.25193488123832397, + "grad_norm": 0.12122455984354019, + "learning_rate": 0.0008597933516271997, + "loss": 2.767, + "step": 8496 + }, + { + "epoch": 0.25196453459063545, + "grad_norm": 0.1247224435210228, + "learning_rate": 0.0008597606777245195, + "loss": 2.7556, + "step": 8497 + }, + { + "epoch": 0.2519941879429469, + "grad_norm": 0.11028178781270981, + "learning_rate": 0.000859728000636097, + "loss": 2.7524, + "step": 8498 + }, + { + "epoch": 0.25202384129525846, + "grad_norm": 0.1156972125172615, + "learning_rate": 0.0008596953203622213, + "loss": 2.7314, + "step": 8499 + }, + { + "epoch": 0.25205349464756993, + "grad_norm": 0.11704494804143906, + "learning_rate": 0.0008596626369031817, + "loss": 2.7646, + "step": 8500 + }, + { + "epoch": 0.2520831479998814, + "grad_norm": 0.10390044748783112, + "learning_rate": 0.0008596299502592677, + "loss": 2.7572, + "step": 8501 + }, + { + "epoch": 0.2521128013521929, + "grad_norm": 0.1201128363609314, + "learning_rate": 0.0008595972604307689, + "loss": 2.7435, + "step": 8502 + }, + { + "epoch": 0.25214245470450436, + "grad_norm": 0.09964454919099808, + "learning_rate": 0.0008595645674179744, + "loss": 2.7802, + "step": 8503 + }, + { + "epoch": 0.25217210805681584, + "grad_norm": 0.6991463899612427, + "learning_rate": 0.0008595318712211742, + "loss": 2.7624, + "step": 8504 + }, + { + "epoch": 0.2522017614091273, + "grad_norm": 0.1545969545841217, + "learning_rate": 0.0008594991718406574, + "loss": 2.7754, + "step": 8505 + }, + { + "epoch": 0.2522314147614388, + "grad_norm": 0.13473084568977356, + "learning_rate": 0.0008594664692767138, + "loss": 2.7597, + "step": 8506 + }, + { + "epoch": 0.25226106811375026, + "grad_norm": 0.3364623486995697, + "learning_rate": 0.0008594337635296329, + "loss": 2.7348, + "step": 8507 + }, + { + "epoch": 0.25229072146606174, + "grad_norm": 0.9186479449272156, + "learning_rate": 0.0008594010545997042, + "loss": 2.8397, + "step": 8508 + }, + { + "epoch": 0.2523203748183732, + "grad_norm": 0.3311084806919098, + "learning_rate": 0.0008593683424872176, + "loss": 2.7841, + "step": 8509 + }, + { + "epoch": 0.2523500281706847, + "grad_norm": 0.22396765649318695, + "learning_rate": 0.0008593356271924626, + "loss": 2.7714, + "step": 8510 + }, + { + "epoch": 0.25237968152299617, + "grad_norm": 0.24467746913433075, + "learning_rate": 0.000859302908715729, + "loss": 2.7748, + "step": 8511 + }, + { + "epoch": 0.25240933487530764, + "grad_norm": 0.19098517298698425, + "learning_rate": 0.0008592701870573066, + "loss": 2.7831, + "step": 8512 + }, + { + "epoch": 0.2524389882276191, + "grad_norm": 0.15400435030460358, + "learning_rate": 0.0008592374622174848, + "loss": 2.7867, + "step": 8513 + }, + { + "epoch": 0.2524686415799306, + "grad_norm": 0.14445710182189941, + "learning_rate": 0.0008592047341965536, + "loss": 2.7354, + "step": 8514 + }, + { + "epoch": 0.2524982949322421, + "grad_norm": 0.15042072534561157, + "learning_rate": 0.0008591720029948029, + "loss": 2.775, + "step": 8515 + }, + { + "epoch": 0.25252794828455355, + "grad_norm": 0.16665004193782806, + "learning_rate": 0.0008591392686125225, + "loss": 2.794, + "step": 8516 + }, + { + "epoch": 0.252557601636865, + "grad_norm": 0.13336831331253052, + "learning_rate": 0.0008591065310500021, + "loss": 2.7999, + "step": 8517 + }, + { + "epoch": 0.2525872549891765, + "grad_norm": 0.13191884756088257, + "learning_rate": 0.0008590737903075319, + "loss": 2.7446, + "step": 8518 + }, + { + "epoch": 0.252616908341488, + "grad_norm": 0.1267249584197998, + "learning_rate": 0.0008590410463854014, + "loss": 2.7411, + "step": 8519 + }, + { + "epoch": 0.2526465616937995, + "grad_norm": 0.11246085166931152, + "learning_rate": 0.0008590082992839011, + "loss": 2.7413, + "step": 8520 + }, + { + "epoch": 0.252676215046111, + "grad_norm": 0.1309717446565628, + "learning_rate": 0.0008589755490033207, + "loss": 2.7528, + "step": 8521 + }, + { + "epoch": 0.25270586839842246, + "grad_norm": 0.12343201786279678, + "learning_rate": 0.00085894279554395, + "loss": 2.7915, + "step": 8522 + }, + { + "epoch": 0.25273552175073394, + "grad_norm": 0.11180511862039566, + "learning_rate": 0.0008589100389060794, + "loss": 2.765, + "step": 8523 + }, + { + "epoch": 0.2527651751030454, + "grad_norm": 0.11769154667854309, + "learning_rate": 0.0008588772790899986, + "loss": 2.7912, + "step": 8524 + }, + { + "epoch": 0.2527948284553569, + "grad_norm": 0.12179121375083923, + "learning_rate": 0.0008588445160959979, + "loss": 2.7742, + "step": 8525 + }, + { + "epoch": 0.25282448180766837, + "grad_norm": 0.10756474733352661, + "learning_rate": 0.0008588117499243675, + "loss": 2.7686, + "step": 8526 + }, + { + "epoch": 0.25285413515997984, + "grad_norm": 0.10470312833786011, + "learning_rate": 0.0008587789805753975, + "loss": 2.7111, + "step": 8527 + }, + { + "epoch": 0.2528837885122913, + "grad_norm": 0.10987240076065063, + "learning_rate": 0.0008587462080493779, + "loss": 2.7234, + "step": 8528 + }, + { + "epoch": 0.2529134418646028, + "grad_norm": 0.10336931049823761, + "learning_rate": 0.0008587134323465993, + "loss": 2.7907, + "step": 8529 + }, + { + "epoch": 0.25294309521691427, + "grad_norm": 0.10188834369182587, + "learning_rate": 0.0008586806534673514, + "loss": 2.7595, + "step": 8530 + }, + { + "epoch": 0.25297274856922575, + "grad_norm": 0.1294534057378769, + "learning_rate": 0.000858647871411925, + "loss": 2.7278, + "step": 8531 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 0.1308017373085022, + "learning_rate": 0.00085861508618061, + "loss": 2.743, + "step": 8532 + }, + { + "epoch": 0.2530320552738487, + "grad_norm": 0.12138667702674866, + "learning_rate": 0.0008585822977736969, + "loss": 2.7566, + "step": 8533 + }, + { + "epoch": 0.2530617086261602, + "grad_norm": 0.1033918634057045, + "learning_rate": 0.000858549506191476, + "loss": 2.7114, + "step": 8534 + }, + { + "epoch": 0.25309136197847165, + "grad_norm": 0.12212918698787689, + "learning_rate": 0.0008585167114342376, + "loss": 2.7616, + "step": 8535 + }, + { + "epoch": 0.25312101533078313, + "grad_norm": 0.11714168637990952, + "learning_rate": 0.0008584839135022723, + "loss": 2.7607, + "step": 8536 + }, + { + "epoch": 0.2531506686830946, + "grad_norm": 0.10241804271936417, + "learning_rate": 0.0008584511123958704, + "loss": 2.7361, + "step": 8537 + }, + { + "epoch": 0.2531803220354061, + "grad_norm": 0.10799199342727661, + "learning_rate": 0.0008584183081153223, + "loss": 2.7372, + "step": 8538 + }, + { + "epoch": 0.25320997538771756, + "grad_norm": 0.1049075797200203, + "learning_rate": 0.0008583855006609186, + "loss": 2.7565, + "step": 8539 + }, + { + "epoch": 0.2532396287400291, + "grad_norm": 0.10606357455253601, + "learning_rate": 0.0008583526900329497, + "loss": 2.7173, + "step": 8540 + }, + { + "epoch": 0.25326928209234056, + "grad_norm": 0.23543059825897217, + "learning_rate": 0.0008583198762317064, + "loss": 2.7477, + "step": 8541 + }, + { + "epoch": 0.25329893544465204, + "grad_norm": 0.13355351984500885, + "learning_rate": 0.0008582870592574789, + "loss": 2.7548, + "step": 8542 + }, + { + "epoch": 0.2533285887969635, + "grad_norm": 0.12342465668916702, + "learning_rate": 0.0008582542391105581, + "loss": 2.7579, + "step": 8543 + }, + { + "epoch": 0.253358242149275, + "grad_norm": 0.11675694584846497, + "learning_rate": 0.0008582214157912345, + "loss": 2.7732, + "step": 8544 + }, + { + "epoch": 0.25338789550158647, + "grad_norm": 0.12530550360679626, + "learning_rate": 0.0008581885892997987, + "loss": 2.7873, + "step": 8545 + }, + { + "epoch": 0.25341754885389794, + "grad_norm": 0.12086319923400879, + "learning_rate": 0.0008581557596365416, + "loss": 2.7475, + "step": 8546 + }, + { + "epoch": 0.2534472022062094, + "grad_norm": 0.11052881181240082, + "learning_rate": 0.0008581229268017536, + "loss": 2.7319, + "step": 8547 + }, + { + "epoch": 0.2534768555585209, + "grad_norm": 0.1164017990231514, + "learning_rate": 0.0008580900907957258, + "loss": 2.7512, + "step": 8548 + }, + { + "epoch": 0.2535065089108324, + "grad_norm": 0.11453898996114731, + "learning_rate": 0.0008580572516187486, + "loss": 2.72, + "step": 8549 + }, + { + "epoch": 0.25353616226314385, + "grad_norm": 0.1238834336400032, + "learning_rate": 0.0008580244092711132, + "loss": 2.7568, + "step": 8550 + }, + { + "epoch": 0.2535658156154553, + "grad_norm": 0.12439718097448349, + "learning_rate": 0.00085799156375311, + "loss": 2.7636, + "step": 8551 + }, + { + "epoch": 0.2535954689677668, + "grad_norm": 0.14917868375778198, + "learning_rate": 0.0008579587150650301, + "loss": 2.7718, + "step": 8552 + }, + { + "epoch": 0.2536251223200783, + "grad_norm": 0.18798036873340607, + "learning_rate": 0.0008579258632071643, + "loss": 2.7638, + "step": 8553 + }, + { + "epoch": 0.25365477567238975, + "grad_norm": 0.1981436312198639, + "learning_rate": 0.0008578930081798037, + "loss": 2.7551, + "step": 8554 + }, + { + "epoch": 0.25368442902470123, + "grad_norm": 0.17895697057247162, + "learning_rate": 0.0008578601499832389, + "loss": 2.7579, + "step": 8555 + }, + { + "epoch": 0.2537140823770127, + "grad_norm": 0.148884117603302, + "learning_rate": 0.0008578272886177611, + "loss": 2.7483, + "step": 8556 + }, + { + "epoch": 0.2537437357293242, + "grad_norm": 0.17273975908756256, + "learning_rate": 0.0008577944240836614, + "loss": 2.7573, + "step": 8557 + }, + { + "epoch": 0.25377338908163566, + "grad_norm": 0.199807807803154, + "learning_rate": 0.0008577615563812304, + "loss": 2.7529, + "step": 8558 + }, + { + "epoch": 0.25380304243394713, + "grad_norm": 0.17566035687923431, + "learning_rate": 0.0008577286855107596, + "loss": 2.7605, + "step": 8559 + }, + { + "epoch": 0.2538326957862586, + "grad_norm": 0.17045611143112183, + "learning_rate": 0.0008576958114725399, + "loss": 2.8191, + "step": 8560 + }, + { + "epoch": 0.25386234913857014, + "grad_norm": 0.16774329543113708, + "learning_rate": 0.0008576629342668623, + "loss": 2.7792, + "step": 8561 + }, + { + "epoch": 0.2538920024908816, + "grad_norm": 0.15623436868190765, + "learning_rate": 0.000857630053894018, + "loss": 2.7615, + "step": 8562 + }, + { + "epoch": 0.2539216558431931, + "grad_norm": 0.1461210548877716, + "learning_rate": 0.0008575971703542981, + "loss": 2.7573, + "step": 8563 + }, + { + "epoch": 0.25395130919550457, + "grad_norm": 0.14518620073795319, + "learning_rate": 0.0008575642836479941, + "loss": 2.7542, + "step": 8564 + }, + { + "epoch": 0.25398096254781605, + "grad_norm": 0.16276799142360687, + "learning_rate": 0.0008575313937753969, + "loss": 2.7696, + "step": 8565 + }, + { + "epoch": 0.2540106159001275, + "grad_norm": 0.1371157020330429, + "learning_rate": 0.0008574985007367979, + "loss": 2.7669, + "step": 8566 + }, + { + "epoch": 0.254040269252439, + "grad_norm": 0.13030436635017395, + "learning_rate": 0.0008574656045324883, + "loss": 2.7423, + "step": 8567 + }, + { + "epoch": 0.2540699226047505, + "grad_norm": 0.20388780534267426, + "learning_rate": 0.0008574327051627593, + "loss": 2.766, + "step": 8568 + }, + { + "epoch": 0.25409957595706195, + "grad_norm": 0.1606322079896927, + "learning_rate": 0.0008573998026279024, + "loss": 2.753, + "step": 8569 + }, + { + "epoch": 0.25412922930937343, + "grad_norm": 0.23821936547756195, + "learning_rate": 0.0008573668969282089, + "loss": 2.7689, + "step": 8570 + }, + { + "epoch": 0.2541588826616849, + "grad_norm": 0.14622719585895538, + "learning_rate": 0.0008573339880639701, + "loss": 2.7639, + "step": 8571 + }, + { + "epoch": 0.2541885360139964, + "grad_norm": 0.16295669972896576, + "learning_rate": 0.0008573010760354776, + "loss": 2.7842, + "step": 8572 + }, + { + "epoch": 0.25421818936630786, + "grad_norm": 0.15619033575057983, + "learning_rate": 0.0008572681608430228, + "loss": 2.7599, + "step": 8573 + }, + { + "epoch": 0.25424784271861933, + "grad_norm": 0.17463433742523193, + "learning_rate": 0.000857235242486897, + "loss": 2.7246, + "step": 8574 + }, + { + "epoch": 0.2542774960709308, + "grad_norm": 0.17290060222148895, + "learning_rate": 0.0008572023209673918, + "loss": 2.7715, + "step": 8575 + }, + { + "epoch": 0.2543071494232423, + "grad_norm": 0.1794765740633011, + "learning_rate": 0.0008571693962847989, + "loss": 2.7207, + "step": 8576 + }, + { + "epoch": 0.25433680277555376, + "grad_norm": 0.17001883685588837, + "learning_rate": 0.0008571364684394095, + "loss": 2.7723, + "step": 8577 + }, + { + "epoch": 0.25436645612786524, + "grad_norm": 0.14738771319389343, + "learning_rate": 0.0008571035374315155, + "loss": 2.7398, + "step": 8578 + }, + { + "epoch": 0.2543961094801767, + "grad_norm": 0.15522436797618866, + "learning_rate": 0.0008570706032614083, + "loss": 2.7288, + "step": 8579 + }, + { + "epoch": 0.2544257628324882, + "grad_norm": 0.13940738141536713, + "learning_rate": 0.0008570376659293797, + "loss": 2.7436, + "step": 8580 + }, + { + "epoch": 0.25445541618479967, + "grad_norm": 0.1331658959388733, + "learning_rate": 0.0008570047254357211, + "loss": 2.7893, + "step": 8581 + }, + { + "epoch": 0.2544850695371112, + "grad_norm": 0.11864031851291656, + "learning_rate": 0.0008569717817807246, + "loss": 2.7462, + "step": 8582 + }, + { + "epoch": 0.2545147228894227, + "grad_norm": 0.11197254806756973, + "learning_rate": 0.0008569388349646816, + "loss": 2.7541, + "step": 8583 + }, + { + "epoch": 0.25454437624173415, + "grad_norm": 0.11223531514406204, + "learning_rate": 0.0008569058849878839, + "loss": 2.7966, + "step": 8584 + }, + { + "epoch": 0.2545740295940456, + "grad_norm": 0.11772023141384125, + "learning_rate": 0.0008568729318506234, + "loss": 2.7304, + "step": 8585 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 0.10672507435083389, + "learning_rate": 0.0008568399755531919, + "loss": 2.7329, + "step": 8586 + }, + { + "epoch": 0.2546333362986686, + "grad_norm": 0.11985470354557037, + "learning_rate": 0.0008568070160958809, + "loss": 2.7839, + "step": 8587 + }, + { + "epoch": 0.25466298965098005, + "grad_norm": 0.11825823038816452, + "learning_rate": 0.0008567740534789828, + "loss": 2.7814, + "step": 8588 + }, + { + "epoch": 0.25469264300329153, + "grad_norm": 0.13237527012825012, + "learning_rate": 0.0008567410877027891, + "loss": 2.7576, + "step": 8589 + }, + { + "epoch": 0.254722296355603, + "grad_norm": 0.1245674341917038, + "learning_rate": 0.0008567081187675918, + "loss": 2.7279, + "step": 8590 + }, + { + "epoch": 0.2547519497079145, + "grad_norm": 0.11815883219242096, + "learning_rate": 0.0008566751466736831, + "loss": 2.773, + "step": 8591 + }, + { + "epoch": 0.25478160306022596, + "grad_norm": 0.13470034301280975, + "learning_rate": 0.0008566421714213546, + "loss": 2.7055, + "step": 8592 + }, + { + "epoch": 0.25481125641253743, + "grad_norm": 0.16423824429512024, + "learning_rate": 0.0008566091930108983, + "loss": 2.771, + "step": 8593 + }, + { + "epoch": 0.2548409097648489, + "grad_norm": 0.15458112955093384, + "learning_rate": 0.0008565762114426065, + "loss": 2.7514, + "step": 8594 + }, + { + "epoch": 0.2548705631171604, + "grad_norm": 0.12783795595169067, + "learning_rate": 0.0008565432267167712, + "loss": 2.7803, + "step": 8595 + }, + { + "epoch": 0.25490021646947186, + "grad_norm": 0.14539027214050293, + "learning_rate": 0.0008565102388336845, + "loss": 2.7348, + "step": 8596 + }, + { + "epoch": 0.25492986982178334, + "grad_norm": 0.14408336579799652, + "learning_rate": 0.0008564772477936383, + "loss": 2.8034, + "step": 8597 + }, + { + "epoch": 0.2549595231740948, + "grad_norm": 0.14916810393333435, + "learning_rate": 0.0008564442535969249, + "loss": 2.751, + "step": 8598 + }, + { + "epoch": 0.2549891765264063, + "grad_norm": 0.18814963102340698, + "learning_rate": 0.0008564112562438364, + "loss": 2.7738, + "step": 8599 + }, + { + "epoch": 0.25501882987871777, + "grad_norm": 0.22905980050563812, + "learning_rate": 0.0008563782557346652, + "loss": 2.778, + "step": 8600 + }, + { + "epoch": 0.25504848323102924, + "grad_norm": 0.22018899023532867, + "learning_rate": 0.0008563452520697032, + "loss": 2.7273, + "step": 8601 + }, + { + "epoch": 0.2550781365833407, + "grad_norm": 0.19537603855133057, + "learning_rate": 0.000856312245249243, + "loss": 2.7848, + "step": 8602 + }, + { + "epoch": 0.25510778993565225, + "grad_norm": 0.20069408416748047, + "learning_rate": 0.0008562792352735766, + "loss": 2.7781, + "step": 8603 + }, + { + "epoch": 0.2551374432879637, + "grad_norm": 0.1951730102300644, + "learning_rate": 0.0008562462221429966, + "loss": 2.7598, + "step": 8604 + }, + { + "epoch": 0.2551670966402752, + "grad_norm": 0.18415166437625885, + "learning_rate": 0.000856213205857795, + "loss": 2.7753, + "step": 8605 + }, + { + "epoch": 0.2551967499925867, + "grad_norm": 0.1743001639842987, + "learning_rate": 0.0008561801864182642, + "loss": 2.7496, + "step": 8606 + }, + { + "epoch": 0.25522640334489816, + "grad_norm": 0.14371541142463684, + "learning_rate": 0.0008561471638246968, + "loss": 2.7531, + "step": 8607 + }, + { + "epoch": 0.25525605669720963, + "grad_norm": 0.15905502438545227, + "learning_rate": 0.0008561141380773853, + "loss": 2.7679, + "step": 8608 + }, + { + "epoch": 0.2552857100495211, + "grad_norm": 0.167242631316185, + "learning_rate": 0.0008560811091766218, + "loss": 2.7645, + "step": 8609 + }, + { + "epoch": 0.2553153634018326, + "grad_norm": 0.1357765793800354, + "learning_rate": 0.000856048077122699, + "loss": 2.7506, + "step": 8610 + }, + { + "epoch": 0.25534501675414406, + "grad_norm": 0.13510020077228546, + "learning_rate": 0.0008560150419159094, + "loss": 2.7259, + "step": 8611 + }, + { + "epoch": 0.25537467010645554, + "grad_norm": 0.1374228298664093, + "learning_rate": 0.0008559820035565454, + "loss": 2.7522, + "step": 8612 + }, + { + "epoch": 0.255404323458767, + "grad_norm": 0.15931759774684906, + "learning_rate": 0.0008559489620448997, + "loss": 2.7549, + "step": 8613 + }, + { + "epoch": 0.2554339768110785, + "grad_norm": 0.13333792984485626, + "learning_rate": 0.000855915917381265, + "loss": 2.754, + "step": 8614 + }, + { + "epoch": 0.25546363016338997, + "grad_norm": 0.12705440819263458, + "learning_rate": 0.0008558828695659336, + "loss": 2.7596, + "step": 8615 + }, + { + "epoch": 0.25549328351570144, + "grad_norm": 0.1450764238834381, + "learning_rate": 0.0008558498185991983, + "loss": 2.7668, + "step": 8616 + }, + { + "epoch": 0.2555229368680129, + "grad_norm": 0.16920851171016693, + "learning_rate": 0.0008558167644813517, + "loss": 2.762, + "step": 8617 + }, + { + "epoch": 0.2555525902203244, + "grad_norm": 0.17953628301620483, + "learning_rate": 0.0008557837072126866, + "loss": 2.7298, + "step": 8618 + }, + { + "epoch": 0.25558224357263587, + "grad_norm": 0.1650027334690094, + "learning_rate": 0.0008557506467934959, + "loss": 2.7508, + "step": 8619 + }, + { + "epoch": 0.25561189692494735, + "grad_norm": 0.15931613743305206, + "learning_rate": 0.000855717583224072, + "loss": 2.7516, + "step": 8620 + }, + { + "epoch": 0.2556415502772588, + "grad_norm": 0.13999000191688538, + "learning_rate": 0.0008556845165047078, + "loss": 2.7136, + "step": 8621 + }, + { + "epoch": 0.2556712036295703, + "grad_norm": 0.12297897785902023, + "learning_rate": 0.0008556514466356963, + "loss": 2.7377, + "step": 8622 + }, + { + "epoch": 0.2557008569818818, + "grad_norm": 0.1255943477153778, + "learning_rate": 0.0008556183736173302, + "loss": 2.7801, + "step": 8623 + }, + { + "epoch": 0.2557305103341933, + "grad_norm": 0.12434910982847214, + "learning_rate": 0.0008555852974499023, + "loss": 2.7671, + "step": 8624 + }, + { + "epoch": 0.2557601636865048, + "grad_norm": 0.1157090812921524, + "learning_rate": 0.0008555522181337054, + "loss": 2.7697, + "step": 8625 + }, + { + "epoch": 0.25578981703881626, + "grad_norm": 0.12249185889959335, + "learning_rate": 0.0008555191356690329, + "loss": 2.7737, + "step": 8626 + }, + { + "epoch": 0.25581947039112773, + "grad_norm": 0.12926913797855377, + "learning_rate": 0.0008554860500561772, + "loss": 2.7617, + "step": 8627 + }, + { + "epoch": 0.2558491237434392, + "grad_norm": 0.13629339635372162, + "learning_rate": 0.0008554529612954315, + "loss": 2.7581, + "step": 8628 + }, + { + "epoch": 0.2558787770957507, + "grad_norm": 0.1412501484155655, + "learning_rate": 0.0008554198693870889, + "loss": 2.7862, + "step": 8629 + }, + { + "epoch": 0.25590843044806216, + "grad_norm": 0.14652420580387115, + "learning_rate": 0.0008553867743314423, + "loss": 2.7469, + "step": 8630 + }, + { + "epoch": 0.25593808380037364, + "grad_norm": 0.14206022024154663, + "learning_rate": 0.0008553536761287848, + "loss": 2.7699, + "step": 8631 + }, + { + "epoch": 0.2559677371526851, + "grad_norm": 0.12785795331001282, + "learning_rate": 0.0008553205747794095, + "loss": 2.7165, + "step": 8632 + }, + { + "epoch": 0.2559973905049966, + "grad_norm": 0.13940513134002686, + "learning_rate": 0.0008552874702836096, + "loss": 2.7502, + "step": 8633 + }, + { + "epoch": 0.25602704385730807, + "grad_norm": 0.1601562201976776, + "learning_rate": 0.0008552543626416783, + "loss": 2.705, + "step": 8634 + }, + { + "epoch": 0.25605669720961954, + "grad_norm": 0.18072983622550964, + "learning_rate": 0.0008552212518539084, + "loss": 2.7653, + "step": 8635 + }, + { + "epoch": 0.256086350561931, + "grad_norm": 0.16938835382461548, + "learning_rate": 0.0008551881379205932, + "loss": 2.752, + "step": 8636 + }, + { + "epoch": 0.2561160039142425, + "grad_norm": 0.1566033661365509, + "learning_rate": 0.0008551550208420265, + "loss": 2.7528, + "step": 8637 + }, + { + "epoch": 0.25614565726655397, + "grad_norm": 0.18272434175014496, + "learning_rate": 0.0008551219006185008, + "loss": 2.7677, + "step": 8638 + }, + { + "epoch": 0.25617531061886545, + "grad_norm": 0.19925104081630707, + "learning_rate": 0.0008550887772503097, + "loss": 2.7317, + "step": 8639 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 0.1951032429933548, + "learning_rate": 0.0008550556507377467, + "loss": 2.7716, + "step": 8640 + }, + { + "epoch": 0.2562346173234884, + "grad_norm": 0.16671843826770782, + "learning_rate": 0.000855022521081105, + "loss": 2.7456, + "step": 8641 + }, + { + "epoch": 0.2562642706757999, + "grad_norm": 0.1340288668870926, + "learning_rate": 0.0008549893882806778, + "loss": 2.7384, + "step": 8642 + }, + { + "epoch": 0.25629392402811135, + "grad_norm": 0.14495578408241272, + "learning_rate": 0.0008549562523367586, + "loss": 2.7465, + "step": 8643 + }, + { + "epoch": 0.2563235773804229, + "grad_norm": 0.15654481947422028, + "learning_rate": 0.0008549231132496412, + "loss": 2.729, + "step": 8644 + }, + { + "epoch": 0.25635323073273436, + "grad_norm": 0.17586225271224976, + "learning_rate": 0.0008548899710196183, + "loss": 2.7722, + "step": 8645 + }, + { + "epoch": 0.25638288408504584, + "grad_norm": 0.17180612683296204, + "learning_rate": 0.000854856825646984, + "loss": 2.7904, + "step": 8646 + }, + { + "epoch": 0.2564125374373573, + "grad_norm": 0.15897740423679352, + "learning_rate": 0.0008548236771320314, + "loss": 2.7509, + "step": 8647 + }, + { + "epoch": 0.2564421907896688, + "grad_norm": 0.14749954640865326, + "learning_rate": 0.0008547905254750545, + "loss": 2.7824, + "step": 8648 + }, + { + "epoch": 0.25647184414198027, + "grad_norm": 0.17345088720321655, + "learning_rate": 0.0008547573706763465, + "loss": 2.7732, + "step": 8649 + }, + { + "epoch": 0.25650149749429174, + "grad_norm": 0.17090892791748047, + "learning_rate": 0.000854724212736201, + "loss": 2.6991, + "step": 8650 + }, + { + "epoch": 0.2565311508466032, + "grad_norm": 0.14595426619052887, + "learning_rate": 0.0008546910516549118, + "loss": 2.7284, + "step": 8651 + }, + { + "epoch": 0.2565608041989147, + "grad_norm": 0.14331519603729248, + "learning_rate": 0.0008546578874327724, + "loss": 2.7703, + "step": 8652 + }, + { + "epoch": 0.25659045755122617, + "grad_norm": 0.14575211703777313, + "learning_rate": 0.0008546247200700765, + "loss": 2.7658, + "step": 8653 + }, + { + "epoch": 0.25662011090353765, + "grad_norm": 0.15558242797851562, + "learning_rate": 0.000854591549567118, + "loss": 2.7601, + "step": 8654 + }, + { + "epoch": 0.2566497642558491, + "grad_norm": 0.14064805209636688, + "learning_rate": 0.0008545583759241904, + "loss": 2.7762, + "step": 8655 + }, + { + "epoch": 0.2566794176081606, + "grad_norm": 0.14427584409713745, + "learning_rate": 0.0008545251991415874, + "loss": 2.7581, + "step": 8656 + }, + { + "epoch": 0.2567090709604721, + "grad_norm": 0.14705848693847656, + "learning_rate": 0.0008544920192196031, + "loss": 2.7751, + "step": 8657 + }, + { + "epoch": 0.25673872431278355, + "grad_norm": 0.1579429805278778, + "learning_rate": 0.0008544588361585309, + "loss": 2.7456, + "step": 8658 + }, + { + "epoch": 0.256768377665095, + "grad_norm": 0.18001583218574524, + "learning_rate": 0.0008544256499586649, + "loss": 2.7538, + "step": 8659 + }, + { + "epoch": 0.2567980310174065, + "grad_norm": 0.1877259612083435, + "learning_rate": 0.0008543924606202991, + "loss": 2.7824, + "step": 8660 + }, + { + "epoch": 0.256827684369718, + "grad_norm": 0.18952256441116333, + "learning_rate": 0.0008543592681437271, + "loss": 2.7541, + "step": 8661 + }, + { + "epoch": 0.25685733772202946, + "grad_norm": 0.20334574580192566, + "learning_rate": 0.000854326072529243, + "loss": 2.7617, + "step": 8662 + }, + { + "epoch": 0.25688699107434093, + "grad_norm": 0.21008117496967316, + "learning_rate": 0.0008542928737771407, + "loss": 2.7768, + "step": 8663 + }, + { + "epoch": 0.2569166444266524, + "grad_norm": 0.14855870604515076, + "learning_rate": 0.0008542596718877142, + "loss": 2.7673, + "step": 8664 + }, + { + "epoch": 0.25694629777896394, + "grad_norm": 0.14701786637306213, + "learning_rate": 0.0008542264668612575, + "loss": 2.7622, + "step": 8665 + }, + { + "epoch": 0.2569759511312754, + "grad_norm": 0.1447966992855072, + "learning_rate": 0.0008541932586980647, + "loss": 2.7476, + "step": 8666 + }, + { + "epoch": 0.2570056044835869, + "grad_norm": 0.1348574459552765, + "learning_rate": 0.0008541600473984297, + "loss": 2.747, + "step": 8667 + }, + { + "epoch": 0.25703525783589837, + "grad_norm": 0.14253011345863342, + "learning_rate": 0.0008541268329626466, + "loss": 2.757, + "step": 8668 + }, + { + "epoch": 0.25706491118820984, + "grad_norm": 0.12918700277805328, + "learning_rate": 0.0008540936153910097, + "loss": 2.7487, + "step": 8669 + }, + { + "epoch": 0.2570945645405213, + "grad_norm": 0.140305295586586, + "learning_rate": 0.0008540603946838131, + "loss": 2.7373, + "step": 8670 + }, + { + "epoch": 0.2571242178928328, + "grad_norm": 0.1530754119157791, + "learning_rate": 0.000854027170841351, + "loss": 2.7367, + "step": 8671 + }, + { + "epoch": 0.25715387124514427, + "grad_norm": 0.1488233506679535, + "learning_rate": 0.0008539939438639174, + "loss": 2.7612, + "step": 8672 + }, + { + "epoch": 0.25718352459745575, + "grad_norm": 0.1401963084936142, + "learning_rate": 0.0008539607137518066, + "loss": 2.7866, + "step": 8673 + }, + { + "epoch": 0.2572131779497672, + "grad_norm": 0.14780327677726746, + "learning_rate": 0.0008539274805053131, + "loss": 2.754, + "step": 8674 + }, + { + "epoch": 0.2572428313020787, + "grad_norm": 0.15525035560131073, + "learning_rate": 0.000853894244124731, + "loss": 2.751, + "step": 8675 + }, + { + "epoch": 0.2572724846543902, + "grad_norm": 0.15905962884426117, + "learning_rate": 0.0008538610046103546, + "loss": 2.767, + "step": 8676 + }, + { + "epoch": 0.25730213800670165, + "grad_norm": 0.15948748588562012, + "learning_rate": 0.0008538277619624782, + "loss": 2.7402, + "step": 8677 + }, + { + "epoch": 0.25733179135901313, + "grad_norm": 0.15179972350597382, + "learning_rate": 0.0008537945161813963, + "loss": 2.7336, + "step": 8678 + }, + { + "epoch": 0.2573614447113246, + "grad_norm": 0.12645339965820312, + "learning_rate": 0.0008537612672674031, + "loss": 2.7493, + "step": 8679 + }, + { + "epoch": 0.2573910980636361, + "grad_norm": 0.12938083708286285, + "learning_rate": 0.0008537280152207933, + "loss": 2.7757, + "step": 8680 + }, + { + "epoch": 0.25742075141594756, + "grad_norm": 0.14785361289978027, + "learning_rate": 0.000853694760041861, + "loss": 2.7784, + "step": 8681 + }, + { + "epoch": 0.25745040476825903, + "grad_norm": 0.19938309490680695, + "learning_rate": 0.0008536615017309011, + "loss": 2.7366, + "step": 8682 + }, + { + "epoch": 0.2574800581205705, + "grad_norm": 0.23196928203105927, + "learning_rate": 0.0008536282402882079, + "loss": 2.7877, + "step": 8683 + }, + { + "epoch": 0.257509711472882, + "grad_norm": 0.1793021708726883, + "learning_rate": 0.0008535949757140759, + "loss": 2.7633, + "step": 8684 + }, + { + "epoch": 0.25753936482519346, + "grad_norm": 0.13584205508232117, + "learning_rate": 0.0008535617080087997, + "loss": 2.7517, + "step": 8685 + }, + { + "epoch": 0.257569018177505, + "grad_norm": 0.14337675273418427, + "learning_rate": 0.0008535284371726737, + "loss": 2.7421, + "step": 8686 + }, + { + "epoch": 0.25759867152981647, + "grad_norm": 0.15212714672088623, + "learning_rate": 0.000853495163205993, + "loss": 2.7348, + "step": 8687 + }, + { + "epoch": 0.25762832488212795, + "grad_norm": 0.15886428952217102, + "learning_rate": 0.0008534618861090517, + "loss": 2.7665, + "step": 8688 + }, + { + "epoch": 0.2576579782344394, + "grad_norm": 0.15448497235774994, + "learning_rate": 0.0008534286058821448, + "loss": 2.7217, + "step": 8689 + }, + { + "epoch": 0.2576876315867509, + "grad_norm": 0.1578522026538849, + "learning_rate": 0.0008533953225255671, + "loss": 2.736, + "step": 8690 + }, + { + "epoch": 0.2577172849390624, + "grad_norm": 0.14255356788635254, + "learning_rate": 0.000853362036039613, + "loss": 2.7639, + "step": 8691 + }, + { + "epoch": 0.25774693829137385, + "grad_norm": 0.15428189933300018, + "learning_rate": 0.0008533287464245774, + "loss": 2.7628, + "step": 8692 + }, + { + "epoch": 0.2577765916436853, + "grad_norm": 0.1449660062789917, + "learning_rate": 0.0008532954536807552, + "loss": 2.7536, + "step": 8693 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 0.15823394060134888, + "learning_rate": 0.000853262157808441, + "loss": 2.7665, + "step": 8694 + }, + { + "epoch": 0.2578358983483083, + "grad_norm": 0.17102612555027008, + "learning_rate": 0.0008532288588079299, + "loss": 2.7689, + "step": 8695 + }, + { + "epoch": 0.25786555170061976, + "grad_norm": 0.1464645117521286, + "learning_rate": 0.0008531955566795166, + "loss": 2.7226, + "step": 8696 + }, + { + "epoch": 0.25789520505293123, + "grad_norm": 0.14895683526992798, + "learning_rate": 0.0008531622514234959, + "loss": 2.7359, + "step": 8697 + }, + { + "epoch": 0.2579248584052427, + "grad_norm": 0.14260923862457275, + "learning_rate": 0.000853128943040163, + "loss": 2.7561, + "step": 8698 + }, + { + "epoch": 0.2579545117575542, + "grad_norm": 0.13115082681179047, + "learning_rate": 0.0008530956315298125, + "loss": 2.7518, + "step": 8699 + }, + { + "epoch": 0.25798416510986566, + "grad_norm": 0.13643471896648407, + "learning_rate": 0.0008530623168927397, + "loss": 2.7492, + "step": 8700 + }, + { + "epoch": 0.25801381846217714, + "grad_norm": 0.14040076732635498, + "learning_rate": 0.0008530289991292394, + "loss": 2.7657, + "step": 8701 + }, + { + "epoch": 0.2580434718144886, + "grad_norm": 0.13761796057224274, + "learning_rate": 0.0008529956782396069, + "loss": 2.7994, + "step": 8702 + }, + { + "epoch": 0.2580731251668001, + "grad_norm": 0.13840293884277344, + "learning_rate": 0.000852962354224137, + "loss": 2.7576, + "step": 8703 + }, + { + "epoch": 0.25810277851911156, + "grad_norm": 0.13600680232048035, + "learning_rate": 0.0008529290270831247, + "loss": 2.724, + "step": 8704 + }, + { + "epoch": 0.25813243187142304, + "grad_norm": 0.14257535338401794, + "learning_rate": 0.0008528956968168655, + "loss": 2.7323, + "step": 8705 + }, + { + "epoch": 0.2581620852237345, + "grad_norm": 0.14683011174201965, + "learning_rate": 0.0008528623634256543, + "loss": 2.7378, + "step": 8706 + }, + { + "epoch": 0.25819173857604605, + "grad_norm": 0.1378985345363617, + "learning_rate": 0.0008528290269097863, + "loss": 2.7176, + "step": 8707 + }, + { + "epoch": 0.2582213919283575, + "grad_norm": 0.15503914654254913, + "learning_rate": 0.0008527956872695565, + "loss": 2.7248, + "step": 8708 + }, + { + "epoch": 0.258251045280669, + "grad_norm": 0.1596272736787796, + "learning_rate": 0.0008527623445052604, + "loss": 2.7744, + "step": 8709 + }, + { + "epoch": 0.2582806986329805, + "grad_norm": 0.15157131850719452, + "learning_rate": 0.0008527289986171934, + "loss": 2.7892, + "step": 8710 + }, + { + "epoch": 0.25831035198529195, + "grad_norm": 0.14465855062007904, + "learning_rate": 0.0008526956496056504, + "loss": 2.7758, + "step": 8711 + }, + { + "epoch": 0.25834000533760343, + "grad_norm": 0.17347437143325806, + "learning_rate": 0.0008526622974709269, + "loss": 2.7548, + "step": 8712 + }, + { + "epoch": 0.2583696586899149, + "grad_norm": 0.1706840544939041, + "learning_rate": 0.0008526289422133182, + "loss": 2.7632, + "step": 8713 + }, + { + "epoch": 0.2583993120422264, + "grad_norm": 0.16924387216567993, + "learning_rate": 0.0008525955838331198, + "loss": 2.7678, + "step": 8714 + }, + { + "epoch": 0.25842896539453786, + "grad_norm": 0.19039271771907806, + "learning_rate": 0.0008525622223306269, + "loss": 2.7424, + "step": 8715 + }, + { + "epoch": 0.25845861874684933, + "grad_norm": 0.17596790194511414, + "learning_rate": 0.0008525288577061349, + "loss": 2.7758, + "step": 8716 + }, + { + "epoch": 0.2584882720991608, + "grad_norm": 0.159047469496727, + "learning_rate": 0.0008524954899599397, + "loss": 2.7673, + "step": 8717 + }, + { + "epoch": 0.2585179254514723, + "grad_norm": 0.149015873670578, + "learning_rate": 0.0008524621190923362, + "loss": 2.7465, + "step": 8718 + }, + { + "epoch": 0.25854757880378376, + "grad_norm": 0.15447591245174408, + "learning_rate": 0.0008524287451036201, + "loss": 2.7836, + "step": 8719 + }, + { + "epoch": 0.25857723215609524, + "grad_norm": 0.15769678354263306, + "learning_rate": 0.000852395367994087, + "loss": 2.7313, + "step": 8720 + }, + { + "epoch": 0.2586068855084067, + "grad_norm": 0.1463683396577835, + "learning_rate": 0.0008523619877640325, + "loss": 2.7001, + "step": 8721 + }, + { + "epoch": 0.2586365388607182, + "grad_norm": 0.13404059410095215, + "learning_rate": 0.0008523286044137521, + "loss": 2.7767, + "step": 8722 + }, + { + "epoch": 0.25866619221302967, + "grad_norm": 0.12429710477590561, + "learning_rate": 0.0008522952179435412, + "loss": 2.7418, + "step": 8723 + }, + { + "epoch": 0.25869584556534114, + "grad_norm": 0.11177954077720642, + "learning_rate": 0.0008522618283536961, + "loss": 2.7233, + "step": 8724 + }, + { + "epoch": 0.2587254989176526, + "grad_norm": 0.12500077486038208, + "learning_rate": 0.0008522284356445118, + "loss": 2.7575, + "step": 8725 + }, + { + "epoch": 0.2587551522699641, + "grad_norm": 0.13756625354290009, + "learning_rate": 0.0008521950398162842, + "loss": 2.6946, + "step": 8726 + }, + { + "epoch": 0.25878480562227557, + "grad_norm": 0.1385488659143448, + "learning_rate": 0.0008521616408693092, + "loss": 2.7683, + "step": 8727 + }, + { + "epoch": 0.2588144589745871, + "grad_norm": 0.13453301787376404, + "learning_rate": 0.0008521282388038822, + "loss": 2.7372, + "step": 8728 + }, + { + "epoch": 0.2588441123268986, + "grad_norm": 0.14096839725971222, + "learning_rate": 0.0008520948336202994, + "loss": 2.7452, + "step": 8729 + }, + { + "epoch": 0.25887376567921005, + "grad_norm": 0.12617535889148712, + "learning_rate": 0.0008520614253188563, + "loss": 2.7535, + "step": 8730 + }, + { + "epoch": 0.25890341903152153, + "grad_norm": 0.11504589766263962, + "learning_rate": 0.0008520280138998489, + "loss": 2.7374, + "step": 8731 + }, + { + "epoch": 0.258933072383833, + "grad_norm": 0.12258243560791016, + "learning_rate": 0.0008519945993635731, + "loss": 2.7853, + "step": 8732 + }, + { + "epoch": 0.2589627257361445, + "grad_norm": 0.1455676406621933, + "learning_rate": 0.0008519611817103246, + "loss": 2.7312, + "step": 8733 + }, + { + "epoch": 0.25899237908845596, + "grad_norm": 0.16132284700870514, + "learning_rate": 0.0008519277609403995, + "loss": 2.7637, + "step": 8734 + }, + { + "epoch": 0.25902203244076744, + "grad_norm": 0.16294245421886444, + "learning_rate": 0.0008518943370540935, + "loss": 2.7667, + "step": 8735 + }, + { + "epoch": 0.2590516857930789, + "grad_norm": 0.15367856621742249, + "learning_rate": 0.000851860910051703, + "loss": 2.7864, + "step": 8736 + }, + { + "epoch": 0.2590813391453904, + "grad_norm": 0.15926632285118103, + "learning_rate": 0.0008518274799335235, + "loss": 2.7199, + "step": 8737 + }, + { + "epoch": 0.25911099249770186, + "grad_norm": 0.1804599016904831, + "learning_rate": 0.0008517940466998515, + "loss": 2.7505, + "step": 8738 + }, + { + "epoch": 0.25914064585001334, + "grad_norm": 0.19146595895290375, + "learning_rate": 0.0008517606103509828, + "loss": 2.7752, + "step": 8739 + }, + { + "epoch": 0.2591702992023248, + "grad_norm": 0.19507469236850739, + "learning_rate": 0.0008517271708872133, + "loss": 2.758, + "step": 8740 + }, + { + "epoch": 0.2591999525546363, + "grad_norm": 0.17788267135620117, + "learning_rate": 0.0008516937283088394, + "loss": 2.7732, + "step": 8741 + }, + { + "epoch": 0.25922960590694777, + "grad_norm": 0.1589796245098114, + "learning_rate": 0.0008516602826161572, + "loss": 2.7613, + "step": 8742 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.16521379351615906, + "learning_rate": 0.000851626833809463, + "loss": 2.7287, + "step": 8743 + }, + { + "epoch": 0.2592889126115707, + "grad_norm": 0.15613017976284027, + "learning_rate": 0.0008515933818890527, + "loss": 2.7505, + "step": 8744 + }, + { + "epoch": 0.2593185659638822, + "grad_norm": 0.1331799179315567, + "learning_rate": 0.0008515599268552226, + "loss": 2.7543, + "step": 8745 + }, + { + "epoch": 0.2593482193161937, + "grad_norm": 0.16332201659679413, + "learning_rate": 0.0008515264687082692, + "loss": 2.7809, + "step": 8746 + }, + { + "epoch": 0.25937787266850515, + "grad_norm": 0.19181938469409943, + "learning_rate": 0.0008514930074484883, + "loss": 2.7564, + "step": 8747 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 0.19492857158184052, + "learning_rate": 0.0008514595430761764, + "loss": 2.8003, + "step": 8748 + }, + { + "epoch": 0.25943717937312816, + "grad_norm": 0.2018609493970871, + "learning_rate": 0.0008514260755916304, + "loss": 2.7661, + "step": 8749 + }, + { + "epoch": 0.25946683272543963, + "grad_norm": 0.1734100878238678, + "learning_rate": 0.0008513926049951459, + "loss": 2.7427, + "step": 8750 + }, + { + "epoch": 0.2594964860777511, + "grad_norm": 0.1568375676870346, + "learning_rate": 0.0008513591312870194, + "loss": 2.7701, + "step": 8751 + }, + { + "epoch": 0.2595261394300626, + "grad_norm": 0.16479407250881195, + "learning_rate": 0.0008513256544675479, + "loss": 2.7752, + "step": 8752 + }, + { + "epoch": 0.25955579278237406, + "grad_norm": 0.1636567860841751, + "learning_rate": 0.000851292174537027, + "loss": 2.7844, + "step": 8753 + }, + { + "epoch": 0.25958544613468554, + "grad_norm": 0.1608579307794571, + "learning_rate": 0.0008512586914957538, + "loss": 2.7813, + "step": 8754 + }, + { + "epoch": 0.259615099486997, + "grad_norm": 0.16480682790279388, + "learning_rate": 0.0008512252053440246, + "loss": 2.7493, + "step": 8755 + }, + { + "epoch": 0.2596447528393085, + "grad_norm": 0.1748708337545395, + "learning_rate": 0.0008511917160821358, + "loss": 2.779, + "step": 8756 + }, + { + "epoch": 0.25967440619161997, + "grad_norm": 0.16269280016422272, + "learning_rate": 0.0008511582237103843, + "loss": 2.768, + "step": 8757 + }, + { + "epoch": 0.25970405954393144, + "grad_norm": 0.1454765349626541, + "learning_rate": 0.0008511247282290664, + "loss": 2.7799, + "step": 8758 + }, + { + "epoch": 0.2597337128962429, + "grad_norm": 0.1384330689907074, + "learning_rate": 0.0008510912296384786, + "loss": 2.7809, + "step": 8759 + }, + { + "epoch": 0.2597633662485544, + "grad_norm": 0.13884486258029938, + "learning_rate": 0.0008510577279389178, + "loss": 2.7623, + "step": 8760 + }, + { + "epoch": 0.25979301960086587, + "grad_norm": 0.13268418610095978, + "learning_rate": 0.0008510242231306805, + "loss": 2.7465, + "step": 8761 + }, + { + "epoch": 0.25982267295317735, + "grad_norm": 0.13674215972423553, + "learning_rate": 0.0008509907152140635, + "loss": 2.7357, + "step": 8762 + }, + { + "epoch": 0.2598523263054888, + "grad_norm": 0.13682834804058075, + "learning_rate": 0.0008509572041893635, + "loss": 2.7717, + "step": 8763 + }, + { + "epoch": 0.2598819796578003, + "grad_norm": 0.13532768189907074, + "learning_rate": 0.0008509236900568772, + "loss": 2.7347, + "step": 8764 + }, + { + "epoch": 0.2599116330101118, + "grad_norm": 0.1264514923095703, + "learning_rate": 0.0008508901728169014, + "loss": 2.7345, + "step": 8765 + }, + { + "epoch": 0.25994128636242325, + "grad_norm": 0.12897242605686188, + "learning_rate": 0.0008508566524697327, + "loss": 2.6994, + "step": 8766 + }, + { + "epoch": 0.25997093971473473, + "grad_norm": 0.1421547681093216, + "learning_rate": 0.0008508231290156684, + "loss": 2.7873, + "step": 8767 + }, + { + "epoch": 0.2600005930670462, + "grad_norm": 0.14685434103012085, + "learning_rate": 0.000850789602455005, + "loss": 2.7468, + "step": 8768 + }, + { + "epoch": 0.26003024641935774, + "grad_norm": 0.13003773987293243, + "learning_rate": 0.0008507560727880393, + "loss": 2.7722, + "step": 8769 + }, + { + "epoch": 0.2600598997716692, + "grad_norm": 0.1242729052901268, + "learning_rate": 0.0008507225400150685, + "loss": 2.7407, + "step": 8770 + }, + { + "epoch": 0.2600895531239807, + "grad_norm": 0.13711464405059814, + "learning_rate": 0.0008506890041363895, + "loss": 2.7286, + "step": 8771 + }, + { + "epoch": 0.26011920647629216, + "grad_norm": 0.13559894263744354, + "learning_rate": 0.0008506554651522991, + "loss": 2.7492, + "step": 8772 + }, + { + "epoch": 0.26014885982860364, + "grad_norm": 0.1229100152850151, + "learning_rate": 0.0008506219230630941, + "loss": 2.749, + "step": 8773 + }, + { + "epoch": 0.2601785131809151, + "grad_norm": 0.14489230513572693, + "learning_rate": 0.0008505883778690722, + "loss": 2.7619, + "step": 8774 + }, + { + "epoch": 0.2602081665332266, + "grad_norm": 0.15839558839797974, + "learning_rate": 0.0008505548295705298, + "loss": 2.7667, + "step": 8775 + }, + { + "epoch": 0.26023781988553807, + "grad_norm": 0.17778794467449188, + "learning_rate": 0.0008505212781677642, + "loss": 2.7661, + "step": 8776 + }, + { + "epoch": 0.26026747323784954, + "grad_norm": 0.18634359538555145, + "learning_rate": 0.0008504877236610726, + "loss": 2.7289, + "step": 8777 + }, + { + "epoch": 0.260297126590161, + "grad_norm": 0.1815323829650879, + "learning_rate": 0.0008504541660507521, + "loss": 2.7372, + "step": 8778 + }, + { + "epoch": 0.2603267799424725, + "grad_norm": 0.1917445957660675, + "learning_rate": 0.0008504206053370997, + "loss": 2.7532, + "step": 8779 + }, + { + "epoch": 0.260356433294784, + "grad_norm": 0.19375798106193542, + "learning_rate": 0.0008503870415204127, + "loss": 2.7614, + "step": 8780 + }, + { + "epoch": 0.26038608664709545, + "grad_norm": 0.18580715358257294, + "learning_rate": 0.0008503534746009884, + "loss": 2.75, + "step": 8781 + }, + { + "epoch": 0.2604157399994069, + "grad_norm": 0.16833768784999847, + "learning_rate": 0.0008503199045791239, + "loss": 2.7081, + "step": 8782 + }, + { + "epoch": 0.2604453933517184, + "grad_norm": 0.1513129472732544, + "learning_rate": 0.0008502863314551164, + "loss": 2.7382, + "step": 8783 + }, + { + "epoch": 0.2604750467040299, + "grad_norm": 0.1592566817998886, + "learning_rate": 0.0008502527552292634, + "loss": 2.7522, + "step": 8784 + }, + { + "epoch": 0.26050470005634135, + "grad_norm": 0.1706959307193756, + "learning_rate": 0.0008502191759018621, + "loss": 2.754, + "step": 8785 + }, + { + "epoch": 0.26053435340865283, + "grad_norm": 0.14716286957263947, + "learning_rate": 0.0008501855934732099, + "loss": 2.7716, + "step": 8786 + }, + { + "epoch": 0.2605640067609643, + "grad_norm": 0.13362012803554535, + "learning_rate": 0.0008501520079436043, + "loss": 2.7632, + "step": 8787 + }, + { + "epoch": 0.2605936601132758, + "grad_norm": 0.11004379391670227, + "learning_rate": 0.0008501184193133425, + "loss": 2.6875, + "step": 8788 + }, + { + "epoch": 0.26062331346558726, + "grad_norm": 0.1381637305021286, + "learning_rate": 0.0008500848275827217, + "loss": 2.7862, + "step": 8789 + }, + { + "epoch": 0.2606529668178988, + "grad_norm": 0.17515158653259277, + "learning_rate": 0.00085005123275204, + "loss": 2.7797, + "step": 8790 + }, + { + "epoch": 0.26068262017021027, + "grad_norm": 0.1879468560218811, + "learning_rate": 0.0008500176348215945, + "loss": 2.7902, + "step": 8791 + }, + { + "epoch": 0.26071227352252174, + "grad_norm": 0.17687612771987915, + "learning_rate": 0.0008499840337916827, + "loss": 2.7657, + "step": 8792 + }, + { + "epoch": 0.2607419268748332, + "grad_norm": 0.17313621938228607, + "learning_rate": 0.0008499504296626022, + "loss": 2.7528, + "step": 8793 + }, + { + "epoch": 0.2607715802271447, + "grad_norm": 0.16214533150196075, + "learning_rate": 0.0008499168224346505, + "loss": 2.7479, + "step": 8794 + }, + { + "epoch": 0.26080123357945617, + "grad_norm": 0.15795116126537323, + "learning_rate": 0.0008498832121081255, + "loss": 2.7868, + "step": 8795 + }, + { + "epoch": 0.26083088693176765, + "grad_norm": 0.14706562459468842, + "learning_rate": 0.0008498495986833244, + "loss": 2.7388, + "step": 8796 + }, + { + "epoch": 0.2608605402840791, + "grad_norm": 0.13630947470664978, + "learning_rate": 0.0008498159821605451, + "loss": 2.7542, + "step": 8797 + }, + { + "epoch": 0.2608901936363906, + "grad_norm": 0.14070068299770355, + "learning_rate": 0.0008497823625400853, + "loss": 2.7432, + "step": 8798 + }, + { + "epoch": 0.2609198469887021, + "grad_norm": 0.12387733906507492, + "learning_rate": 0.0008497487398222425, + "loss": 2.7948, + "step": 8799 + }, + { + "epoch": 0.26094950034101355, + "grad_norm": 0.14608685672283173, + "learning_rate": 0.0008497151140073147, + "loss": 2.7667, + "step": 8800 + }, + { + "epoch": 0.26097915369332503, + "grad_norm": 0.1480807363986969, + "learning_rate": 0.0008496814850955996, + "loss": 2.7684, + "step": 8801 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 0.12662282586097717, + "learning_rate": 0.0008496478530873948, + "loss": 2.7171, + "step": 8802 + }, + { + "epoch": 0.261038460397948, + "grad_norm": 0.14106471836566925, + "learning_rate": 0.0008496142179829984, + "loss": 2.7614, + "step": 8803 + }, + { + "epoch": 0.26106811375025946, + "grad_norm": 0.13865812122821808, + "learning_rate": 0.0008495805797827079, + "loss": 2.7134, + "step": 8804 + }, + { + "epoch": 0.26109776710257093, + "grad_norm": 0.15840384364128113, + "learning_rate": 0.0008495469384868214, + "loss": 2.7776, + "step": 8805 + }, + { + "epoch": 0.2611274204548824, + "grad_norm": 0.14112670719623566, + "learning_rate": 0.0008495132940956367, + "loss": 2.8216, + "step": 8806 + }, + { + "epoch": 0.2611570738071939, + "grad_norm": 0.13556092977523804, + "learning_rate": 0.000849479646609452, + "loss": 2.755, + "step": 8807 + }, + { + "epoch": 0.26118672715950536, + "grad_norm": 0.15763431787490845, + "learning_rate": 0.000849445996028565, + "loss": 2.7379, + "step": 8808 + }, + { + "epoch": 0.26121638051181684, + "grad_norm": 0.16703541576862335, + "learning_rate": 0.0008494123423532736, + "loss": 2.7659, + "step": 8809 + }, + { + "epoch": 0.2612460338641283, + "grad_norm": 0.14762872457504272, + "learning_rate": 0.0008493786855838759, + "loss": 2.7393, + "step": 8810 + }, + { + "epoch": 0.26127568721643984, + "grad_norm": 0.15527036786079407, + "learning_rate": 0.0008493450257206701, + "loss": 2.7812, + "step": 8811 + }, + { + "epoch": 0.2613053405687513, + "grad_norm": 0.18377338349819183, + "learning_rate": 0.000849311362763954, + "loss": 2.7908, + "step": 8812 + }, + { + "epoch": 0.2613349939210628, + "grad_norm": 0.188493549823761, + "learning_rate": 0.0008492776967140259, + "loss": 2.7475, + "step": 8813 + }, + { + "epoch": 0.2613646472733743, + "grad_norm": 0.15240377187728882, + "learning_rate": 0.0008492440275711839, + "loss": 2.7684, + "step": 8814 + }, + { + "epoch": 0.26139430062568575, + "grad_norm": 0.13498568534851074, + "learning_rate": 0.0008492103553357261, + "loss": 2.7741, + "step": 8815 + }, + { + "epoch": 0.2614239539779972, + "grad_norm": 0.13665862381458282, + "learning_rate": 0.0008491766800079505, + "loss": 2.7339, + "step": 8816 + }, + { + "epoch": 0.2614536073303087, + "grad_norm": 0.14457933604717255, + "learning_rate": 0.0008491430015881556, + "loss": 2.7894, + "step": 8817 + }, + { + "epoch": 0.2614832606826202, + "grad_norm": 0.15756893157958984, + "learning_rate": 0.0008491093200766395, + "loss": 2.7833, + "step": 8818 + }, + { + "epoch": 0.26151291403493165, + "grad_norm": 0.15249699354171753, + "learning_rate": 0.0008490756354737004, + "loss": 2.7305, + "step": 8819 + }, + { + "epoch": 0.26154256738724313, + "grad_norm": 0.15896065533161163, + "learning_rate": 0.0008490419477796366, + "loss": 2.7449, + "step": 8820 + }, + { + "epoch": 0.2615722207395546, + "grad_norm": 0.15993525087833405, + "learning_rate": 0.0008490082569947465, + "loss": 2.7715, + "step": 8821 + }, + { + "epoch": 0.2616018740918661, + "grad_norm": 0.13306443393230438, + "learning_rate": 0.0008489745631193285, + "loss": 2.7846, + "step": 8822 + }, + { + "epoch": 0.26163152744417756, + "grad_norm": 0.14025835692882538, + "learning_rate": 0.0008489408661536806, + "loss": 2.7832, + "step": 8823 + }, + { + "epoch": 0.26166118079648903, + "grad_norm": 0.14194533228874207, + "learning_rate": 0.0008489071660981015, + "loss": 2.7467, + "step": 8824 + }, + { + "epoch": 0.2616908341488005, + "grad_norm": 0.12684720754623413, + "learning_rate": 0.0008488734629528894, + "loss": 2.7456, + "step": 8825 + }, + { + "epoch": 0.261720487501112, + "grad_norm": 0.131218820810318, + "learning_rate": 0.0008488397567183433, + "loss": 2.7577, + "step": 8826 + }, + { + "epoch": 0.26175014085342346, + "grad_norm": 0.1378381997346878, + "learning_rate": 0.000848806047394761, + "loss": 2.7193, + "step": 8827 + }, + { + "epoch": 0.26177979420573494, + "grad_norm": 0.1554490625858307, + "learning_rate": 0.0008487723349824413, + "loss": 2.739, + "step": 8828 + }, + { + "epoch": 0.2618094475580464, + "grad_norm": 0.14937405288219452, + "learning_rate": 0.0008487386194816829, + "loss": 2.7283, + "step": 8829 + }, + { + "epoch": 0.2618391009103579, + "grad_norm": 0.16088064014911652, + "learning_rate": 0.0008487049008927838, + "loss": 2.747, + "step": 8830 + }, + { + "epoch": 0.26186875426266937, + "grad_norm": 0.17502690851688385, + "learning_rate": 0.0008486711792160432, + "loss": 2.7165, + "step": 8831 + }, + { + "epoch": 0.2618984076149809, + "grad_norm": 0.19193468987941742, + "learning_rate": 0.0008486374544517594, + "loss": 2.7663, + "step": 8832 + }, + { + "epoch": 0.2619280609672924, + "grad_norm": 0.1790933907032013, + "learning_rate": 0.0008486037266002311, + "loss": 2.7575, + "step": 8833 + }, + { + "epoch": 0.26195771431960385, + "grad_norm": 0.1564294546842575, + "learning_rate": 0.0008485699956617571, + "loss": 2.7714, + "step": 8834 + }, + { + "epoch": 0.2619873676719153, + "grad_norm": 0.14712652564048767, + "learning_rate": 0.0008485362616366359, + "loss": 2.7166, + "step": 8835 + }, + { + "epoch": 0.2620170210242268, + "grad_norm": 0.15133334696292877, + "learning_rate": 0.000848502524525166, + "loss": 2.7353, + "step": 8836 + }, + { + "epoch": 0.2620466743765383, + "grad_norm": 0.13953252136707306, + "learning_rate": 0.0008484687843276469, + "loss": 2.7495, + "step": 8837 + }, + { + "epoch": 0.26207632772884976, + "grad_norm": 0.13921058177947998, + "learning_rate": 0.0008484350410443764, + "loss": 2.7719, + "step": 8838 + }, + { + "epoch": 0.26210598108116123, + "grad_norm": 0.14418797194957733, + "learning_rate": 0.000848401294675654, + "loss": 2.7238, + "step": 8839 + }, + { + "epoch": 0.2621356344334727, + "grad_norm": 0.1338307112455368, + "learning_rate": 0.0008483675452217785, + "loss": 2.7441, + "step": 8840 + }, + { + "epoch": 0.2621652877857842, + "grad_norm": 0.1332729458808899, + "learning_rate": 0.0008483337926830486, + "loss": 2.6972, + "step": 8841 + }, + { + "epoch": 0.26219494113809566, + "grad_norm": 0.14220979809761047, + "learning_rate": 0.0008483000370597629, + "loss": 2.7416, + "step": 8842 + }, + { + "epoch": 0.26222459449040714, + "grad_norm": 0.1577109396457672, + "learning_rate": 0.0008482662783522208, + "loss": 2.7231, + "step": 8843 + }, + { + "epoch": 0.2622542478427186, + "grad_norm": 0.13459879159927368, + "learning_rate": 0.0008482325165607208, + "loss": 2.7698, + "step": 8844 + }, + { + "epoch": 0.2622839011950301, + "grad_norm": 0.13608068227767944, + "learning_rate": 0.0008481987516855624, + "loss": 2.7448, + "step": 8845 + }, + { + "epoch": 0.26231355454734157, + "grad_norm": 0.1202581524848938, + "learning_rate": 0.000848164983727044, + "loss": 2.7408, + "step": 8846 + }, + { + "epoch": 0.26234320789965304, + "grad_norm": 0.12817223370075226, + "learning_rate": 0.0008481312126854652, + "loss": 2.7217, + "step": 8847 + }, + { + "epoch": 0.2623728612519645, + "grad_norm": 0.14261242747306824, + "learning_rate": 0.0008480974385611246, + "loss": 2.73, + "step": 8848 + }, + { + "epoch": 0.262402514604276, + "grad_norm": 0.1513056606054306, + "learning_rate": 0.0008480636613543214, + "loss": 2.7524, + "step": 8849 + }, + { + "epoch": 0.26243216795658747, + "grad_norm": 0.16696834564208984, + "learning_rate": 0.0008480298810653548, + "loss": 2.7578, + "step": 8850 + }, + { + "epoch": 0.26246182130889895, + "grad_norm": 0.18447284400463104, + "learning_rate": 0.0008479960976945238, + "loss": 2.7561, + "step": 8851 + }, + { + "epoch": 0.2624914746612104, + "grad_norm": 0.17839105427265167, + "learning_rate": 0.0008479623112421276, + "loss": 2.7351, + "step": 8852 + }, + { + "epoch": 0.26252112801352195, + "grad_norm": 0.171925887465477, + "learning_rate": 0.0008479285217084656, + "loss": 2.7667, + "step": 8853 + }, + { + "epoch": 0.26255078136583343, + "grad_norm": 0.2110324203968048, + "learning_rate": 0.0008478947290938366, + "loss": 2.7425, + "step": 8854 + }, + { + "epoch": 0.2625804347181449, + "grad_norm": 0.21319657564163208, + "learning_rate": 0.0008478609333985401, + "loss": 2.7494, + "step": 8855 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 0.16917851567268372, + "learning_rate": 0.0008478271346228755, + "loss": 2.729, + "step": 8856 + }, + { + "epoch": 0.26263974142276786, + "grad_norm": 0.14771956205368042, + "learning_rate": 0.0008477933327671416, + "loss": 2.7449, + "step": 8857 + }, + { + "epoch": 0.26266939477507933, + "grad_norm": 0.1524410992860794, + "learning_rate": 0.0008477595278316382, + "loss": 2.7271, + "step": 8858 + }, + { + "epoch": 0.2626990481273908, + "grad_norm": 0.13893581926822662, + "learning_rate": 0.0008477257198166646, + "loss": 2.7481, + "step": 8859 + }, + { + "epoch": 0.2627287014797023, + "grad_norm": 0.15133719146251678, + "learning_rate": 0.0008476919087225199, + "loss": 2.7806, + "step": 8860 + }, + { + "epoch": 0.26275835483201376, + "grad_norm": 0.16074946522712708, + "learning_rate": 0.0008476580945495038, + "loss": 2.7894, + "step": 8861 + }, + { + "epoch": 0.26278800818432524, + "grad_norm": 0.16487745940685272, + "learning_rate": 0.0008476242772979156, + "loss": 2.7705, + "step": 8862 + }, + { + "epoch": 0.2628176615366367, + "grad_norm": 0.1632668375968933, + "learning_rate": 0.0008475904569680547, + "loss": 2.7682, + "step": 8863 + }, + { + "epoch": 0.2628473148889482, + "grad_norm": 0.15773174166679382, + "learning_rate": 0.0008475566335602205, + "loss": 2.7567, + "step": 8864 + }, + { + "epoch": 0.26287696824125967, + "grad_norm": 0.16096605360507965, + "learning_rate": 0.0008475228070747128, + "loss": 2.7177, + "step": 8865 + }, + { + "epoch": 0.26290662159357114, + "grad_norm": 0.15379363298416138, + "learning_rate": 0.0008474889775118311, + "loss": 2.7791, + "step": 8866 + }, + { + "epoch": 0.2629362749458826, + "grad_norm": 0.15444344282150269, + "learning_rate": 0.0008474551448718747, + "loss": 2.7494, + "step": 8867 + }, + { + "epoch": 0.2629659282981941, + "grad_norm": 0.1560405045747757, + "learning_rate": 0.0008474213091551434, + "loss": 2.7603, + "step": 8868 + }, + { + "epoch": 0.26299558165050557, + "grad_norm": 0.14186006784439087, + "learning_rate": 0.0008473874703619368, + "loss": 2.7569, + "step": 8869 + }, + { + "epoch": 0.26302523500281705, + "grad_norm": 0.1373831033706665, + "learning_rate": 0.0008473536284925545, + "loss": 2.7371, + "step": 8870 + }, + { + "epoch": 0.2630548883551285, + "grad_norm": 0.13559696078300476, + "learning_rate": 0.0008473197835472961, + "loss": 2.7358, + "step": 8871 + }, + { + "epoch": 0.26308454170744, + "grad_norm": 0.1443263590335846, + "learning_rate": 0.0008472859355264615, + "loss": 2.739, + "step": 8872 + }, + { + "epoch": 0.26311419505975153, + "grad_norm": 0.13422152400016785, + "learning_rate": 0.0008472520844303504, + "loss": 2.7573, + "step": 8873 + }, + { + "epoch": 0.263143848412063, + "grad_norm": 0.12190810590982437, + "learning_rate": 0.0008472182302592623, + "loss": 2.7621, + "step": 8874 + }, + { + "epoch": 0.2631735017643745, + "grad_norm": 0.1423460692167282, + "learning_rate": 0.0008471843730134973, + "loss": 2.7811, + "step": 8875 + }, + { + "epoch": 0.26320315511668596, + "grad_norm": 0.14941257238388062, + "learning_rate": 0.000847150512693355, + "loss": 2.7494, + "step": 8876 + }, + { + "epoch": 0.26323280846899744, + "grad_norm": 0.13556118309497833, + "learning_rate": 0.0008471166492991354, + "loss": 2.7279, + "step": 8877 + }, + { + "epoch": 0.2632624618213089, + "grad_norm": 0.1389911025762558, + "learning_rate": 0.0008470827828311382, + "loss": 2.758, + "step": 8878 + }, + { + "epoch": 0.2632921151736204, + "grad_norm": 0.15436440706253052, + "learning_rate": 0.0008470489132896635, + "loss": 2.7214, + "step": 8879 + }, + { + "epoch": 0.26332176852593187, + "grad_norm": 0.1579187959432602, + "learning_rate": 0.0008470150406750111, + "loss": 2.7482, + "step": 8880 + }, + { + "epoch": 0.26335142187824334, + "grad_norm": 0.18332922458648682, + "learning_rate": 0.000846981164987481, + "loss": 2.7549, + "step": 8881 + }, + { + "epoch": 0.2633810752305548, + "grad_norm": 0.14782531559467316, + "learning_rate": 0.000846947286227373, + "loss": 2.7044, + "step": 8882 + }, + { + "epoch": 0.2634107285828663, + "grad_norm": 0.13069386780261993, + "learning_rate": 0.0008469134043949871, + "loss": 2.7464, + "step": 8883 + }, + { + "epoch": 0.26344038193517777, + "grad_norm": 0.1620127260684967, + "learning_rate": 0.0008468795194906237, + "loss": 2.7133, + "step": 8884 + }, + { + "epoch": 0.26347003528748925, + "grad_norm": 0.16366474330425262, + "learning_rate": 0.0008468456315145825, + "loss": 2.7564, + "step": 8885 + }, + { + "epoch": 0.2634996886398007, + "grad_norm": 0.15789417922496796, + "learning_rate": 0.0008468117404671638, + "loss": 2.729, + "step": 8886 + }, + { + "epoch": 0.2635293419921122, + "grad_norm": 0.14392618834972382, + "learning_rate": 0.0008467778463486675, + "loss": 2.7778, + "step": 8887 + }, + { + "epoch": 0.2635589953444237, + "grad_norm": 0.14093919098377228, + "learning_rate": 0.0008467439491593939, + "loss": 2.7349, + "step": 8888 + }, + { + "epoch": 0.26358864869673515, + "grad_norm": 0.1353493630886078, + "learning_rate": 0.0008467100488996431, + "loss": 2.739, + "step": 8889 + }, + { + "epoch": 0.2636183020490466, + "grad_norm": 0.13019561767578125, + "learning_rate": 0.0008466761455697151, + "loss": 2.7207, + "step": 8890 + }, + { + "epoch": 0.2636479554013581, + "grad_norm": 0.13293242454528809, + "learning_rate": 0.0008466422391699106, + "loss": 2.7395, + "step": 8891 + }, + { + "epoch": 0.2636776087536696, + "grad_norm": 0.14430956542491913, + "learning_rate": 0.0008466083297005296, + "loss": 2.7956, + "step": 8892 + }, + { + "epoch": 0.26370726210598106, + "grad_norm": 0.14399990439414978, + "learning_rate": 0.0008465744171618722, + "loss": 2.7601, + "step": 8893 + }, + { + "epoch": 0.2637369154582926, + "grad_norm": 0.13547326624393463, + "learning_rate": 0.0008465405015542389, + "loss": 2.7253, + "step": 8894 + }, + { + "epoch": 0.26376656881060406, + "grad_norm": 0.13690713047981262, + "learning_rate": 0.0008465065828779301, + "loss": 2.7455, + "step": 8895 + }, + { + "epoch": 0.26379622216291554, + "grad_norm": 0.15113544464111328, + "learning_rate": 0.0008464726611332457, + "loss": 2.7277, + "step": 8896 + }, + { + "epoch": 0.263825875515227, + "grad_norm": 0.1480884850025177, + "learning_rate": 0.0008464387363204866, + "loss": 2.7768, + "step": 8897 + }, + { + "epoch": 0.2638555288675385, + "grad_norm": 0.14291499555110931, + "learning_rate": 0.0008464048084399531, + "loss": 2.7333, + "step": 8898 + }, + { + "epoch": 0.26388518221984997, + "grad_norm": 0.14857199788093567, + "learning_rate": 0.0008463708774919456, + "loss": 2.6787, + "step": 8899 + }, + { + "epoch": 0.26391483557216144, + "grad_norm": 0.1336888074874878, + "learning_rate": 0.0008463369434767644, + "loss": 2.7394, + "step": 8900 + }, + { + "epoch": 0.2639444889244729, + "grad_norm": 0.1466742306947708, + "learning_rate": 0.0008463030063947101, + "loss": 2.7682, + "step": 8901 + }, + { + "epoch": 0.2639741422767844, + "grad_norm": 0.1415032297372818, + "learning_rate": 0.0008462690662460832, + "loss": 2.7514, + "step": 8902 + }, + { + "epoch": 0.26400379562909587, + "grad_norm": 0.1377275139093399, + "learning_rate": 0.0008462351230311844, + "loss": 2.7425, + "step": 8903 + }, + { + "epoch": 0.26403344898140735, + "grad_norm": 0.13149265944957733, + "learning_rate": 0.0008462011767503141, + "loss": 2.7449, + "step": 8904 + }, + { + "epoch": 0.2640631023337188, + "grad_norm": 0.13729332387447357, + "learning_rate": 0.0008461672274037731, + "loss": 2.7605, + "step": 8905 + }, + { + "epoch": 0.2640927556860303, + "grad_norm": 0.13702328503131866, + "learning_rate": 0.0008461332749918616, + "loss": 2.7761, + "step": 8906 + }, + { + "epoch": 0.2641224090383418, + "grad_norm": 0.1544608771800995, + "learning_rate": 0.0008460993195148807, + "loss": 2.7567, + "step": 8907 + }, + { + "epoch": 0.26415206239065325, + "grad_norm": 0.15203431248664856, + "learning_rate": 0.0008460653609731311, + "loss": 2.7494, + "step": 8908 + }, + { + "epoch": 0.26418171574296473, + "grad_norm": 0.1340070515871048, + "learning_rate": 0.0008460313993669128, + "loss": 2.7572, + "step": 8909 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 0.1401306539773941, + "learning_rate": 0.0008459974346965276, + "loss": 2.7207, + "step": 8910 + }, + { + "epoch": 0.2642410224475877, + "grad_norm": 0.16012465953826904, + "learning_rate": 0.0008459634669622755, + "loss": 2.759, + "step": 8911 + }, + { + "epoch": 0.26427067579989916, + "grad_norm": 0.18804317712783813, + "learning_rate": 0.0008459294961644574, + "loss": 2.7764, + "step": 8912 + }, + { + "epoch": 0.26430032915221063, + "grad_norm": 0.2019786834716797, + "learning_rate": 0.0008458955223033744, + "loss": 2.779, + "step": 8913 + }, + { + "epoch": 0.2643299825045221, + "grad_norm": 0.18909558653831482, + "learning_rate": 0.0008458615453793273, + "loss": 2.7862, + "step": 8914 + }, + { + "epoch": 0.26435963585683364, + "grad_norm": 0.1522672474384308, + "learning_rate": 0.0008458275653926166, + "loss": 2.7713, + "step": 8915 + }, + { + "epoch": 0.2643892892091451, + "grad_norm": 0.16138474643230438, + "learning_rate": 0.0008457935823435437, + "loss": 2.7776, + "step": 8916 + }, + { + "epoch": 0.2644189425614566, + "grad_norm": 0.1648871898651123, + "learning_rate": 0.0008457595962324089, + "loss": 2.7655, + "step": 8917 + }, + { + "epoch": 0.26444859591376807, + "grad_norm": 0.15271688997745514, + "learning_rate": 0.0008457256070595138, + "loss": 2.7325, + "step": 8918 + }, + { + "epoch": 0.26447824926607955, + "grad_norm": 0.133914977312088, + "learning_rate": 0.0008456916148251592, + "loss": 2.7399, + "step": 8919 + }, + { + "epoch": 0.264507902618391, + "grad_norm": 0.1241016685962677, + "learning_rate": 0.0008456576195296458, + "loss": 2.7716, + "step": 8920 + }, + { + "epoch": 0.2645375559707025, + "grad_norm": 0.1324356645345688, + "learning_rate": 0.0008456236211732748, + "loss": 2.7684, + "step": 8921 + }, + { + "epoch": 0.264567209323014, + "grad_norm": 0.1545688956975937, + "learning_rate": 0.0008455896197563475, + "loss": 2.7624, + "step": 8922 + }, + { + "epoch": 0.26459686267532545, + "grad_norm": 0.1649443656206131, + "learning_rate": 0.0008455556152791646, + "loss": 2.7207, + "step": 8923 + }, + { + "epoch": 0.2646265160276369, + "grad_norm": 0.1445784717798233, + "learning_rate": 0.0008455216077420277, + "loss": 2.7416, + "step": 8924 + }, + { + "epoch": 0.2646561693799484, + "grad_norm": 0.12840378284454346, + "learning_rate": 0.0008454875971452375, + "loss": 2.7272, + "step": 8925 + }, + { + "epoch": 0.2646858227322599, + "grad_norm": 0.13980749249458313, + "learning_rate": 0.0008454535834890953, + "loss": 2.7571, + "step": 8926 + }, + { + "epoch": 0.26471547608457136, + "grad_norm": 0.14866413176059723, + "learning_rate": 0.0008454195667739024, + "loss": 2.7015, + "step": 8927 + }, + { + "epoch": 0.26474512943688283, + "grad_norm": 0.1563718467950821, + "learning_rate": 0.0008453855469999597, + "loss": 2.7344, + "step": 8928 + }, + { + "epoch": 0.2647747827891943, + "grad_norm": 0.15124377608299255, + "learning_rate": 0.000845351524167569, + "loss": 2.7896, + "step": 8929 + }, + { + "epoch": 0.2648044361415058, + "grad_norm": 0.15314780175685883, + "learning_rate": 0.0008453174982770311, + "loss": 2.7439, + "step": 8930 + }, + { + "epoch": 0.26483408949381726, + "grad_norm": 0.16810430586338043, + "learning_rate": 0.0008452834693286475, + "loss": 2.7513, + "step": 8931 + }, + { + "epoch": 0.26486374284612874, + "grad_norm": 0.16014555096626282, + "learning_rate": 0.0008452494373227196, + "loss": 2.7374, + "step": 8932 + }, + { + "epoch": 0.2648933961984402, + "grad_norm": 0.1212165579199791, + "learning_rate": 0.0008452154022595487, + "loss": 2.7163, + "step": 8933 + }, + { + "epoch": 0.2649230495507517, + "grad_norm": 0.13283738493919373, + "learning_rate": 0.000845181364139436, + "loss": 2.7406, + "step": 8934 + }, + { + "epoch": 0.26495270290306316, + "grad_norm": 0.14369982481002808, + "learning_rate": 0.0008451473229626832, + "loss": 2.7355, + "step": 8935 + }, + { + "epoch": 0.2649823562553747, + "grad_norm": 0.13869231939315796, + "learning_rate": 0.0008451132787295915, + "loss": 2.7275, + "step": 8936 + }, + { + "epoch": 0.26501200960768617, + "grad_norm": 0.13975103199481964, + "learning_rate": 0.0008450792314404625, + "loss": 2.7367, + "step": 8937 + }, + { + "epoch": 0.26504166295999765, + "grad_norm": 0.13517199456691742, + "learning_rate": 0.0008450451810955977, + "loss": 2.7612, + "step": 8938 + }, + { + "epoch": 0.2650713163123091, + "grad_norm": 0.13807420432567596, + "learning_rate": 0.0008450111276952987, + "loss": 2.7269, + "step": 8939 + }, + { + "epoch": 0.2651009696646206, + "grad_norm": 0.144160658121109, + "learning_rate": 0.0008449770712398668, + "loss": 2.7335, + "step": 8940 + }, + { + "epoch": 0.2651306230169321, + "grad_norm": 0.13175906240940094, + "learning_rate": 0.0008449430117296037, + "loss": 2.7548, + "step": 8941 + }, + { + "epoch": 0.26516027636924355, + "grad_norm": 0.13579824566841125, + "learning_rate": 0.0008449089491648112, + "loss": 2.7313, + "step": 8942 + }, + { + "epoch": 0.26518992972155503, + "grad_norm": 0.13917621970176697, + "learning_rate": 0.0008448748835457907, + "loss": 2.7505, + "step": 8943 + }, + { + "epoch": 0.2652195830738665, + "grad_norm": 0.15390193462371826, + "learning_rate": 0.0008448408148728437, + "loss": 2.7383, + "step": 8944 + }, + { + "epoch": 0.265249236426178, + "grad_norm": 0.169902965426445, + "learning_rate": 0.0008448067431462723, + "loss": 2.7606, + "step": 8945 + }, + { + "epoch": 0.26527888977848946, + "grad_norm": 0.13912659883499146, + "learning_rate": 0.000844772668366378, + "loss": 2.7487, + "step": 8946 + }, + { + "epoch": 0.26530854313080093, + "grad_norm": 0.1437210887670517, + "learning_rate": 0.0008447385905334625, + "loss": 2.7543, + "step": 8947 + }, + { + "epoch": 0.2653381964831124, + "grad_norm": 0.14772702753543854, + "learning_rate": 0.0008447045096478276, + "loss": 2.7558, + "step": 8948 + }, + { + "epoch": 0.2653678498354239, + "grad_norm": 0.13764546811580658, + "learning_rate": 0.000844670425709775, + "loss": 2.7526, + "step": 8949 + }, + { + "epoch": 0.26539750318773536, + "grad_norm": 0.1449236273765564, + "learning_rate": 0.0008446363387196068, + "loss": 2.764, + "step": 8950 + }, + { + "epoch": 0.26542715654004684, + "grad_norm": 0.15760686993598938, + "learning_rate": 0.0008446022486776246, + "loss": 2.72, + "step": 8951 + }, + { + "epoch": 0.2654568098923583, + "grad_norm": 0.18051618337631226, + "learning_rate": 0.0008445681555841303, + "loss": 2.7024, + "step": 8952 + }, + { + "epoch": 0.2654864632446698, + "grad_norm": 0.20281349122524261, + "learning_rate": 0.0008445340594394259, + "loss": 2.7456, + "step": 8953 + }, + { + "epoch": 0.26551611659698127, + "grad_norm": 0.1971968412399292, + "learning_rate": 0.000844499960243813, + "loss": 2.7251, + "step": 8954 + }, + { + "epoch": 0.26554576994929274, + "grad_norm": 0.16166016459465027, + "learning_rate": 0.0008444658579975942, + "loss": 2.7507, + "step": 8955 + }, + { + "epoch": 0.2655754233016042, + "grad_norm": 0.17451496422290802, + "learning_rate": 0.0008444317527010708, + "loss": 2.7343, + "step": 8956 + }, + { + "epoch": 0.26560507665391575, + "grad_norm": 0.1435411274433136, + "learning_rate": 0.0008443976443545454, + "loss": 2.7282, + "step": 8957 + }, + { + "epoch": 0.2656347300062272, + "grad_norm": 0.12425043433904648, + "learning_rate": 0.0008443635329583196, + "loss": 2.7501, + "step": 8958 + }, + { + "epoch": 0.2656643833585387, + "grad_norm": 0.13527938723564148, + "learning_rate": 0.0008443294185126955, + "loss": 2.773, + "step": 8959 + }, + { + "epoch": 0.2656940367108502, + "grad_norm": 0.13537795841693878, + "learning_rate": 0.0008442953010179754, + "loss": 2.7889, + "step": 8960 + }, + { + "epoch": 0.26572369006316165, + "grad_norm": 0.13593913614749908, + "learning_rate": 0.0008442611804744613, + "loss": 2.7542, + "step": 8961 + }, + { + "epoch": 0.26575334341547313, + "grad_norm": 0.12891022861003876, + "learning_rate": 0.0008442270568824555, + "loss": 2.7477, + "step": 8962 + }, + { + "epoch": 0.2657829967677846, + "grad_norm": 0.12809191644191742, + "learning_rate": 0.0008441929302422598, + "loss": 2.7451, + "step": 8963 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 0.1483277529478073, + "learning_rate": 0.0008441588005541767, + "loss": 2.7729, + "step": 8964 + }, + { + "epoch": 0.26584230347240756, + "grad_norm": 0.14924520254135132, + "learning_rate": 0.0008441246678185084, + "loss": 2.7229, + "step": 8965 + }, + { + "epoch": 0.26587195682471904, + "grad_norm": 0.14406439661979675, + "learning_rate": 0.000844090532035557, + "loss": 2.7311, + "step": 8966 + }, + { + "epoch": 0.2659016101770305, + "grad_norm": 0.13936342298984528, + "learning_rate": 0.0008440563932056249, + "loss": 2.7571, + "step": 8967 + }, + { + "epoch": 0.265931263529342, + "grad_norm": 0.12443601340055466, + "learning_rate": 0.0008440222513290145, + "loss": 2.7512, + "step": 8968 + }, + { + "epoch": 0.26596091688165346, + "grad_norm": 0.12584945559501648, + "learning_rate": 0.0008439881064060279, + "loss": 2.7489, + "step": 8969 + }, + { + "epoch": 0.26599057023396494, + "grad_norm": 0.13805536925792694, + "learning_rate": 0.0008439539584369675, + "loss": 2.7674, + "step": 8970 + }, + { + "epoch": 0.2660202235862764, + "grad_norm": 0.13354364037513733, + "learning_rate": 0.0008439198074221359, + "loss": 2.7485, + "step": 8971 + }, + { + "epoch": 0.2660498769385879, + "grad_norm": 0.15452070534229279, + "learning_rate": 0.0008438856533618354, + "loss": 2.7384, + "step": 8972 + }, + { + "epoch": 0.26607953029089937, + "grad_norm": 0.1775115728378296, + "learning_rate": 0.0008438514962563684, + "loss": 2.7169, + "step": 8973 + }, + { + "epoch": 0.26610918364321084, + "grad_norm": 0.15861766040325165, + "learning_rate": 0.0008438173361060373, + "loss": 2.7674, + "step": 8974 + }, + { + "epoch": 0.2661388369955223, + "grad_norm": 0.14657719433307648, + "learning_rate": 0.0008437831729111447, + "loss": 2.7545, + "step": 8975 + }, + { + "epoch": 0.2661684903478338, + "grad_norm": 0.1611362248659134, + "learning_rate": 0.000843749006671993, + "loss": 2.736, + "step": 8976 + }, + { + "epoch": 0.26619814370014533, + "grad_norm": 0.17133229970932007, + "learning_rate": 0.000843714837388885, + "loss": 2.7379, + "step": 8977 + }, + { + "epoch": 0.2662277970524568, + "grad_norm": 0.16761153936386108, + "learning_rate": 0.0008436806650621231, + "loss": 2.7436, + "step": 8978 + }, + { + "epoch": 0.2662574504047683, + "grad_norm": 0.17582528293132782, + "learning_rate": 0.0008436464896920099, + "loss": 2.7573, + "step": 8979 + }, + { + "epoch": 0.26628710375707976, + "grad_norm": 0.1685054749250412, + "learning_rate": 0.0008436123112788478, + "loss": 2.7919, + "step": 8980 + }, + { + "epoch": 0.26631675710939123, + "grad_norm": 0.16249382495880127, + "learning_rate": 0.0008435781298229402, + "loss": 2.7534, + "step": 8981 + }, + { + "epoch": 0.2663464104617027, + "grad_norm": 0.17472591996192932, + "learning_rate": 0.0008435439453245889, + "loss": 2.7875, + "step": 8982 + }, + { + "epoch": 0.2663760638140142, + "grad_norm": 0.1371847689151764, + "learning_rate": 0.0008435097577840971, + "loss": 2.764, + "step": 8983 + }, + { + "epoch": 0.26640571716632566, + "grad_norm": 0.11940235644578934, + "learning_rate": 0.0008434755672017674, + "loss": 2.7464, + "step": 8984 + }, + { + "epoch": 0.26643537051863714, + "grad_norm": 0.14128625392913818, + "learning_rate": 0.0008434413735779028, + "loss": 2.7892, + "step": 8985 + }, + { + "epoch": 0.2664650238709486, + "grad_norm": 0.140574112534523, + "learning_rate": 0.0008434071769128056, + "loss": 2.7242, + "step": 8986 + }, + { + "epoch": 0.2664946772232601, + "grad_norm": 0.1459510326385498, + "learning_rate": 0.0008433729772067789, + "loss": 2.7169, + "step": 8987 + }, + { + "epoch": 0.26652433057557157, + "grad_norm": 0.14776715636253357, + "learning_rate": 0.0008433387744601257, + "loss": 2.7667, + "step": 8988 + }, + { + "epoch": 0.26655398392788304, + "grad_norm": 0.16926871240139008, + "learning_rate": 0.0008433045686731486, + "loss": 2.7404, + "step": 8989 + }, + { + "epoch": 0.2665836372801945, + "grad_norm": 0.18641792237758636, + "learning_rate": 0.0008432703598461508, + "loss": 2.7604, + "step": 8990 + }, + { + "epoch": 0.266613290632506, + "grad_norm": 0.1774597316980362, + "learning_rate": 0.000843236147979435, + "loss": 2.7432, + "step": 8991 + }, + { + "epoch": 0.26664294398481747, + "grad_norm": 0.14337721467018127, + "learning_rate": 0.0008432019330733041, + "loss": 2.7621, + "step": 8992 + }, + { + "epoch": 0.26667259733712895, + "grad_norm": 0.15981219708919525, + "learning_rate": 0.0008431677151280612, + "loss": 2.7215, + "step": 8993 + }, + { + "epoch": 0.2667022506894404, + "grad_norm": 0.1531355381011963, + "learning_rate": 0.0008431334941440093, + "loss": 2.7126, + "step": 8994 + }, + { + "epoch": 0.2667319040417519, + "grad_norm": 0.125324085354805, + "learning_rate": 0.0008430992701214515, + "loss": 2.7197, + "step": 8995 + }, + { + "epoch": 0.2667615573940634, + "grad_norm": 0.1561906784772873, + "learning_rate": 0.0008430650430606906, + "loss": 2.727, + "step": 8996 + }, + { + "epoch": 0.26679121074637485, + "grad_norm": 0.13465315103530884, + "learning_rate": 0.0008430308129620299, + "loss": 2.7282, + "step": 8997 + }, + { + "epoch": 0.2668208640986864, + "grad_norm": 0.1291162222623825, + "learning_rate": 0.0008429965798257726, + "loss": 2.7368, + "step": 8998 + }, + { + "epoch": 0.26685051745099786, + "grad_norm": 0.15074263513088226, + "learning_rate": 0.0008429623436522215, + "loss": 2.7439, + "step": 8999 + }, + { + "epoch": 0.26688017080330934, + "grad_norm": 0.1324465572834015, + "learning_rate": 0.0008429281044416801, + "loss": 2.7621, + "step": 9000 + }, + { + "epoch": 0.2669098241556208, + "grad_norm": 0.13818643987178802, + "learning_rate": 0.0008428938621944515, + "loss": 2.7467, + "step": 9001 + }, + { + "epoch": 0.2669394775079323, + "grad_norm": 0.15349480509757996, + "learning_rate": 0.0008428596169108389, + "loss": 2.7281, + "step": 9002 + }, + { + "epoch": 0.26696913086024376, + "grad_norm": 0.14500150084495544, + "learning_rate": 0.0008428253685911455, + "loss": 2.7248, + "step": 9003 + }, + { + "epoch": 0.26699878421255524, + "grad_norm": 0.12521512806415558, + "learning_rate": 0.0008427911172356746, + "loss": 2.7834, + "step": 9004 + }, + { + "epoch": 0.2670284375648667, + "grad_norm": 0.13964912295341492, + "learning_rate": 0.0008427568628447295, + "loss": 2.7176, + "step": 9005 + }, + { + "epoch": 0.2670580909171782, + "grad_norm": 0.1517937332391739, + "learning_rate": 0.0008427226054186135, + "loss": 2.722, + "step": 9006 + }, + { + "epoch": 0.26708774426948967, + "grad_norm": 0.16758008301258087, + "learning_rate": 0.0008426883449576301, + "loss": 2.724, + "step": 9007 + }, + { + "epoch": 0.26711739762180114, + "grad_norm": 0.15130016207695007, + "learning_rate": 0.0008426540814620827, + "loss": 2.7392, + "step": 9008 + }, + { + "epoch": 0.2671470509741126, + "grad_norm": 0.15784966945648193, + "learning_rate": 0.0008426198149322744, + "loss": 2.7884, + "step": 9009 + }, + { + "epoch": 0.2671767043264241, + "grad_norm": 0.1698497086763382, + "learning_rate": 0.0008425855453685089, + "loss": 2.7395, + "step": 9010 + }, + { + "epoch": 0.2672063576787356, + "grad_norm": 0.1494881510734558, + "learning_rate": 0.0008425512727710895, + "loss": 2.7162, + "step": 9011 + }, + { + "epoch": 0.26723601103104705, + "grad_norm": 0.14349131286144257, + "learning_rate": 0.0008425169971403199, + "loss": 2.7733, + "step": 9012 + }, + { + "epoch": 0.2672656643833585, + "grad_norm": 0.14426620304584503, + "learning_rate": 0.0008424827184765034, + "loss": 2.7551, + "step": 9013 + }, + { + "epoch": 0.26729531773567, + "grad_norm": 0.13577838242053986, + "learning_rate": 0.0008424484367799438, + "loss": 2.7487, + "step": 9014 + }, + { + "epoch": 0.2673249710879815, + "grad_norm": 0.15047086775302887, + "learning_rate": 0.0008424141520509443, + "loss": 2.7573, + "step": 9015 + }, + { + "epoch": 0.26735462444029295, + "grad_norm": 0.15170547366142273, + "learning_rate": 0.0008423798642898089, + "loss": 2.774, + "step": 9016 + }, + { + "epoch": 0.26738427779260443, + "grad_norm": 0.1807326227426529, + "learning_rate": 0.0008423455734968409, + "loss": 2.746, + "step": 9017 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 0.1421457678079605, + "learning_rate": 0.0008423112796723442, + "loss": 2.7921, + "step": 9018 + }, + { + "epoch": 0.26744358449722744, + "grad_norm": 0.12826362252235413, + "learning_rate": 0.0008422769828166222, + "loss": 2.7619, + "step": 9019 + }, + { + "epoch": 0.2674732378495389, + "grad_norm": 0.11985941231250763, + "learning_rate": 0.0008422426829299789, + "loss": 2.7751, + "step": 9020 + }, + { + "epoch": 0.2675028912018504, + "grad_norm": 0.12654606997966766, + "learning_rate": 0.0008422083800127178, + "loss": 2.7766, + "step": 9021 + }, + { + "epoch": 0.26753254455416187, + "grad_norm": 0.1407603770494461, + "learning_rate": 0.0008421740740651429, + "loss": 2.7432, + "step": 9022 + }, + { + "epoch": 0.26756219790647334, + "grad_norm": 0.13868360221385956, + "learning_rate": 0.0008421397650875578, + "loss": 2.7087, + "step": 9023 + }, + { + "epoch": 0.2675918512587848, + "grad_norm": 0.12093349546194077, + "learning_rate": 0.0008421054530802663, + "loss": 2.7627, + "step": 9024 + }, + { + "epoch": 0.2676215046110963, + "grad_norm": 0.11825554072856903, + "learning_rate": 0.0008420711380435721, + "loss": 2.763, + "step": 9025 + }, + { + "epoch": 0.26765115796340777, + "grad_norm": 0.12924762070178986, + "learning_rate": 0.0008420368199777796, + "loss": 2.7578, + "step": 9026 + }, + { + "epoch": 0.26768081131571925, + "grad_norm": 0.15076906979084015, + "learning_rate": 0.0008420024988831923, + "loss": 2.7332, + "step": 9027 + }, + { + "epoch": 0.2677104646680307, + "grad_norm": 0.14123426377773285, + "learning_rate": 0.000841968174760114, + "loss": 2.7419, + "step": 9028 + }, + { + "epoch": 0.2677401180203422, + "grad_norm": 0.12485407292842865, + "learning_rate": 0.000841933847608849, + "loss": 2.7802, + "step": 9029 + }, + { + "epoch": 0.2677697713726537, + "grad_norm": 0.1298147737979889, + "learning_rate": 0.0008418995174297009, + "loss": 2.7708, + "step": 9030 + }, + { + "epoch": 0.26779942472496515, + "grad_norm": 0.14639301598072052, + "learning_rate": 0.000841865184222974, + "loss": 2.7446, + "step": 9031 + }, + { + "epoch": 0.2678290780772766, + "grad_norm": 0.15533263981342316, + "learning_rate": 0.000841830847988972, + "loss": 2.7358, + "step": 9032 + }, + { + "epoch": 0.2678587314295881, + "grad_norm": 0.17403846979141235, + "learning_rate": 0.0008417965087279994, + "loss": 2.7408, + "step": 9033 + }, + { + "epoch": 0.2678883847818996, + "grad_norm": 0.18791571259498596, + "learning_rate": 0.0008417621664403601, + "loss": 2.7459, + "step": 9034 + }, + { + "epoch": 0.26791803813421106, + "grad_norm": 0.1614425629377365, + "learning_rate": 0.0008417278211263579, + "loss": 2.7836, + "step": 9035 + }, + { + "epoch": 0.26794769148652253, + "grad_norm": 0.12899446487426758, + "learning_rate": 0.0008416934727862974, + "loss": 2.7338, + "step": 9036 + }, + { + "epoch": 0.267977344838834, + "grad_norm": 0.14882522821426392, + "learning_rate": 0.0008416591214204825, + "loss": 2.7431, + "step": 9037 + }, + { + "epoch": 0.2680069981911455, + "grad_norm": 0.15385523438453674, + "learning_rate": 0.0008416247670292174, + "loss": 2.7126, + "step": 9038 + }, + { + "epoch": 0.26803665154345696, + "grad_norm": 0.16516031324863434, + "learning_rate": 0.0008415904096128063, + "loss": 2.7529, + "step": 9039 + }, + { + "epoch": 0.2680663048957685, + "grad_norm": 0.17487956583499908, + "learning_rate": 0.0008415560491715536, + "loss": 2.7823, + "step": 9040 + }, + { + "epoch": 0.26809595824807997, + "grad_norm": 0.18192444741725922, + "learning_rate": 0.0008415216857057635, + "loss": 2.7602, + "step": 9041 + }, + { + "epoch": 0.26812561160039144, + "grad_norm": 0.17777669429779053, + "learning_rate": 0.00084148731921574, + "loss": 2.7414, + "step": 9042 + }, + { + "epoch": 0.2681552649527029, + "grad_norm": 0.1609233170747757, + "learning_rate": 0.0008414529497017879, + "loss": 2.7391, + "step": 9043 + }, + { + "epoch": 0.2681849183050144, + "grad_norm": 0.12934695184230804, + "learning_rate": 0.0008414185771642113, + "loss": 2.7321, + "step": 9044 + }, + { + "epoch": 0.2682145716573259, + "grad_norm": 0.1455708146095276, + "learning_rate": 0.0008413842016033144, + "loss": 2.7638, + "step": 9045 + }, + { + "epoch": 0.26824422500963735, + "grad_norm": 0.16548945009708405, + "learning_rate": 0.000841349823019402, + "loss": 2.7483, + "step": 9046 + }, + { + "epoch": 0.2682738783619488, + "grad_norm": 0.14119888842105865, + "learning_rate": 0.0008413154414127784, + "loss": 2.7274, + "step": 9047 + }, + { + "epoch": 0.2683035317142603, + "grad_norm": 0.14010483026504517, + "learning_rate": 0.0008412810567837478, + "loss": 2.7569, + "step": 9048 + }, + { + "epoch": 0.2683331850665718, + "grad_norm": 0.1492466926574707, + "learning_rate": 0.0008412466691326148, + "loss": 2.749, + "step": 9049 + }, + { + "epoch": 0.26836283841888325, + "grad_norm": 0.15744549036026, + "learning_rate": 0.0008412122784596842, + "loss": 2.7264, + "step": 9050 + }, + { + "epoch": 0.26839249177119473, + "grad_norm": 0.16518932580947876, + "learning_rate": 0.00084117788476526, + "loss": 2.7691, + "step": 9051 + }, + { + "epoch": 0.2684221451235062, + "grad_norm": 0.1770678609609604, + "learning_rate": 0.0008411434880496474, + "loss": 2.752, + "step": 9052 + }, + { + "epoch": 0.2684517984758177, + "grad_norm": 0.16525393724441528, + "learning_rate": 0.0008411090883131505, + "loss": 2.7393, + "step": 9053 + }, + { + "epoch": 0.26848145182812916, + "grad_norm": 0.14400899410247803, + "learning_rate": 0.0008410746855560741, + "loss": 2.7678, + "step": 9054 + }, + { + "epoch": 0.26851110518044063, + "grad_norm": 0.15271520614624023, + "learning_rate": 0.0008410402797787229, + "loss": 2.7739, + "step": 9055 + }, + { + "epoch": 0.2685407585327521, + "grad_norm": 0.13750618696212769, + "learning_rate": 0.0008410058709814013, + "loss": 2.7559, + "step": 9056 + }, + { + "epoch": 0.2685704118850636, + "grad_norm": 0.1388920098543167, + "learning_rate": 0.0008409714591644142, + "loss": 2.743, + "step": 9057 + }, + { + "epoch": 0.26860006523737506, + "grad_norm": 0.1485164314508438, + "learning_rate": 0.0008409370443280664, + "loss": 2.7274, + "step": 9058 + }, + { + "epoch": 0.26862971858968654, + "grad_norm": 0.15284056961536407, + "learning_rate": 0.0008409026264726625, + "loss": 2.7868, + "step": 9059 + }, + { + "epoch": 0.268659371941998, + "grad_norm": 0.1469743549823761, + "learning_rate": 0.0008408682055985073, + "loss": 2.7568, + "step": 9060 + }, + { + "epoch": 0.26868902529430955, + "grad_norm": 0.14564825594425201, + "learning_rate": 0.0008408337817059058, + "loss": 2.7059, + "step": 9061 + }, + { + "epoch": 0.268718678646621, + "grad_norm": 0.15327660739421844, + "learning_rate": 0.0008407993547951625, + "loss": 2.7291, + "step": 9062 + }, + { + "epoch": 0.2687483319989325, + "grad_norm": 0.1323070526123047, + "learning_rate": 0.0008407649248665825, + "loss": 2.7339, + "step": 9063 + }, + { + "epoch": 0.268777985351244, + "grad_norm": 0.11702514439821243, + "learning_rate": 0.0008407304919204704, + "loss": 2.7505, + "step": 9064 + }, + { + "epoch": 0.26880763870355545, + "grad_norm": 0.12776325643062592, + "learning_rate": 0.0008406960559571315, + "loss": 2.7278, + "step": 9065 + }, + { + "epoch": 0.2688372920558669, + "grad_norm": 0.1517653465270996, + "learning_rate": 0.0008406616169768706, + "loss": 2.765, + "step": 9066 + }, + { + "epoch": 0.2688669454081784, + "grad_norm": 0.16098523139953613, + "learning_rate": 0.0008406271749799926, + "loss": 2.7373, + "step": 9067 + }, + { + "epoch": 0.2688965987604899, + "grad_norm": 0.1851942390203476, + "learning_rate": 0.0008405927299668022, + "loss": 2.7381, + "step": 9068 + }, + { + "epoch": 0.26892625211280136, + "grad_norm": 0.17839057743549347, + "learning_rate": 0.0008405582819376049, + "loss": 2.7539, + "step": 9069 + }, + { + "epoch": 0.26895590546511283, + "grad_norm": 0.15074194967746735, + "learning_rate": 0.0008405238308927057, + "loss": 2.7404, + "step": 9070 + }, + { + "epoch": 0.2689855588174243, + "grad_norm": 0.16969959437847137, + "learning_rate": 0.0008404893768324094, + "loss": 2.7499, + "step": 9071 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 0.1597863733768463, + "learning_rate": 0.0008404549197570212, + "loss": 2.77, + "step": 9072 + }, + { + "epoch": 0.26904486552204726, + "grad_norm": 0.1356612592935562, + "learning_rate": 0.0008404204596668463, + "loss": 2.7336, + "step": 9073 + }, + { + "epoch": 0.26907451887435874, + "grad_norm": 0.149150550365448, + "learning_rate": 0.0008403859965621898, + "loss": 2.7494, + "step": 9074 + }, + { + "epoch": 0.2691041722266702, + "grad_norm": 0.14758753776550293, + "learning_rate": 0.0008403515304433569, + "loss": 2.7763, + "step": 9075 + }, + { + "epoch": 0.2691338255789817, + "grad_norm": 0.1295236200094223, + "learning_rate": 0.0008403170613106527, + "loss": 2.7588, + "step": 9076 + }, + { + "epoch": 0.26916347893129317, + "grad_norm": 0.1588318943977356, + "learning_rate": 0.0008402825891643823, + "loss": 2.7144, + "step": 9077 + }, + { + "epoch": 0.26919313228360464, + "grad_norm": 0.16348694264888763, + "learning_rate": 0.0008402481140048516, + "loss": 2.7387, + "step": 9078 + }, + { + "epoch": 0.2692227856359161, + "grad_norm": 0.1833367645740509, + "learning_rate": 0.0008402136358323652, + "loss": 2.7533, + "step": 9079 + }, + { + "epoch": 0.2692524389882276, + "grad_norm": 0.16726601123809814, + "learning_rate": 0.0008401791546472288, + "loss": 2.6983, + "step": 9080 + }, + { + "epoch": 0.2692820923405391, + "grad_norm": 0.1519058495759964, + "learning_rate": 0.0008401446704497475, + "loss": 2.7485, + "step": 9081 + }, + { + "epoch": 0.2693117456928506, + "grad_norm": 0.17154167592525482, + "learning_rate": 0.0008401101832402267, + "loss": 2.7574, + "step": 9082 + }, + { + "epoch": 0.2693413990451621, + "grad_norm": 0.16336213052272797, + "learning_rate": 0.0008400756930189719, + "loss": 2.7624, + "step": 9083 + }, + { + "epoch": 0.26937105239747355, + "grad_norm": 0.1339191198348999, + "learning_rate": 0.0008400411997862885, + "loss": 2.7373, + "step": 9084 + }, + { + "epoch": 0.26940070574978503, + "grad_norm": 0.12036995589733124, + "learning_rate": 0.0008400067035424819, + "loss": 2.7433, + "step": 9085 + }, + { + "epoch": 0.2694303591020965, + "grad_norm": 0.129272922873497, + "learning_rate": 0.0008399722042878575, + "loss": 2.725, + "step": 9086 + }, + { + "epoch": 0.269460012454408, + "grad_norm": 0.13392364978790283, + "learning_rate": 0.0008399377020227209, + "loss": 2.7515, + "step": 9087 + }, + { + "epoch": 0.26948966580671946, + "grad_norm": 0.12324144691228867, + "learning_rate": 0.0008399031967473777, + "loss": 2.7501, + "step": 9088 + }, + { + "epoch": 0.26951931915903093, + "grad_norm": 0.12222085893154144, + "learning_rate": 0.0008398686884621332, + "loss": 2.7519, + "step": 9089 + }, + { + "epoch": 0.2695489725113424, + "grad_norm": 0.12267465144395828, + "learning_rate": 0.0008398341771672932, + "loss": 2.7577, + "step": 9090 + }, + { + "epoch": 0.2695786258636539, + "grad_norm": 0.13318178057670593, + "learning_rate": 0.0008397996628631632, + "loss": 2.7826, + "step": 9091 + }, + { + "epoch": 0.26960827921596536, + "grad_norm": 0.1530788391828537, + "learning_rate": 0.000839765145550049, + "loss": 2.7373, + "step": 9092 + }, + { + "epoch": 0.26963793256827684, + "grad_norm": 0.1594625860452652, + "learning_rate": 0.0008397306252282559, + "loss": 2.7597, + "step": 9093 + }, + { + "epoch": 0.2696675859205883, + "grad_norm": 0.16081342101097107, + "learning_rate": 0.00083969610189809, + "loss": 2.7339, + "step": 9094 + }, + { + "epoch": 0.2696972392728998, + "grad_norm": 0.12407468259334564, + "learning_rate": 0.0008396615755598566, + "loss": 2.7736, + "step": 9095 + }, + { + "epoch": 0.26972689262521127, + "grad_norm": 0.1394646167755127, + "learning_rate": 0.0008396270462138619, + "loss": 2.7511, + "step": 9096 + }, + { + "epoch": 0.26975654597752274, + "grad_norm": 0.14087507128715515, + "learning_rate": 0.0008395925138604113, + "loss": 2.7385, + "step": 9097 + }, + { + "epoch": 0.2697861993298342, + "grad_norm": 0.1429409384727478, + "learning_rate": 0.0008395579784998107, + "loss": 2.7378, + "step": 9098 + }, + { + "epoch": 0.2698158526821457, + "grad_norm": 0.15404000878334045, + "learning_rate": 0.0008395234401323659, + "loss": 2.7376, + "step": 9099 + }, + { + "epoch": 0.26984550603445717, + "grad_norm": 0.16876272857189178, + "learning_rate": 0.0008394888987583826, + "loss": 2.7388, + "step": 9100 + }, + { + "epoch": 0.26987515938676865, + "grad_norm": 0.1965043991804123, + "learning_rate": 0.0008394543543781671, + "loss": 2.7444, + "step": 9101 + }, + { + "epoch": 0.2699048127390802, + "grad_norm": 0.2058243751525879, + "learning_rate": 0.000839419806992025, + "loss": 2.7247, + "step": 9102 + }, + { + "epoch": 0.26993446609139166, + "grad_norm": 0.19329135119915009, + "learning_rate": 0.000839385256600262, + "loss": 2.7035, + "step": 9103 + }, + { + "epoch": 0.26996411944370313, + "grad_norm": 0.1577562391757965, + "learning_rate": 0.0008393507032031844, + "loss": 2.7251, + "step": 9104 + }, + { + "epoch": 0.2699937727960146, + "grad_norm": 0.16734366118907928, + "learning_rate": 0.0008393161468010982, + "loss": 2.7856, + "step": 9105 + }, + { + "epoch": 0.2700234261483261, + "grad_norm": 0.1762562394142151, + "learning_rate": 0.0008392815873943092, + "loss": 2.7557, + "step": 9106 + }, + { + "epoch": 0.27005307950063756, + "grad_norm": 0.16395504772663116, + "learning_rate": 0.0008392470249831235, + "loss": 2.7268, + "step": 9107 + }, + { + "epoch": 0.27008273285294904, + "grad_norm": 0.13818730413913727, + "learning_rate": 0.0008392124595678472, + "loss": 2.7449, + "step": 9108 + }, + { + "epoch": 0.2701123862052605, + "grad_norm": 0.12615084648132324, + "learning_rate": 0.0008391778911487863, + "loss": 2.6842, + "step": 9109 + }, + { + "epoch": 0.270142039557572, + "grad_norm": 0.12340845912694931, + "learning_rate": 0.000839143319726247, + "loss": 2.7479, + "step": 9110 + }, + { + "epoch": 0.27017169290988347, + "grad_norm": 0.131648987531662, + "learning_rate": 0.0008391087453005352, + "loss": 2.7073, + "step": 9111 + }, + { + "epoch": 0.27020134626219494, + "grad_norm": 0.13539256155490875, + "learning_rate": 0.0008390741678719575, + "loss": 2.7525, + "step": 9112 + }, + { + "epoch": 0.2702309996145064, + "grad_norm": 0.14172372221946716, + "learning_rate": 0.0008390395874408199, + "loss": 2.7081, + "step": 9113 + }, + { + "epoch": 0.2702606529668179, + "grad_norm": 0.1568879634141922, + "learning_rate": 0.0008390050040074284, + "loss": 2.735, + "step": 9114 + }, + { + "epoch": 0.27029030631912937, + "grad_norm": 0.13349902629852295, + "learning_rate": 0.0008389704175720894, + "loss": 2.737, + "step": 9115 + }, + { + "epoch": 0.27031995967144085, + "grad_norm": 0.12008749693632126, + "learning_rate": 0.0008389358281351092, + "loss": 2.7086, + "step": 9116 + }, + { + "epoch": 0.2703496130237523, + "grad_norm": 0.10905669629573822, + "learning_rate": 0.000838901235696794, + "loss": 2.7388, + "step": 9117 + }, + { + "epoch": 0.2703792663760638, + "grad_norm": 0.12385600060224533, + "learning_rate": 0.0008388666402574503, + "loss": 2.7136, + "step": 9118 + }, + { + "epoch": 0.2704089197283753, + "grad_norm": 0.14539484679698944, + "learning_rate": 0.0008388320418173843, + "loss": 2.7675, + "step": 9119 + }, + { + "epoch": 0.27043857308068675, + "grad_norm": 0.1619938462972641, + "learning_rate": 0.0008387974403769023, + "loss": 2.7266, + "step": 9120 + }, + { + "epoch": 0.2704682264329982, + "grad_norm": 0.18528605997562408, + "learning_rate": 0.0008387628359363109, + "loss": 2.7504, + "step": 9121 + }, + { + "epoch": 0.2704978797853097, + "grad_norm": 0.17582036554813385, + "learning_rate": 0.0008387282284959164, + "loss": 2.7842, + "step": 9122 + }, + { + "epoch": 0.27052753313762123, + "grad_norm": 0.1669946312904358, + "learning_rate": 0.0008386936180560254, + "loss": 2.76, + "step": 9123 + }, + { + "epoch": 0.2705571864899327, + "grad_norm": 0.18774117529392242, + "learning_rate": 0.0008386590046169443, + "loss": 2.7681, + "step": 9124 + }, + { + "epoch": 0.2705868398422442, + "grad_norm": 0.19041773676872253, + "learning_rate": 0.0008386243881789794, + "loss": 2.7056, + "step": 9125 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 0.12876860797405243, + "learning_rate": 0.0008385897687424375, + "loss": 2.7466, + "step": 9126 + }, + { + "epoch": 0.27064614654686714, + "grad_norm": 0.14419794082641602, + "learning_rate": 0.000838555146307625, + "loss": 2.7438, + "step": 9127 + }, + { + "epoch": 0.2706757998991786, + "grad_norm": 0.17124418914318085, + "learning_rate": 0.0008385205208748487, + "loss": 2.7648, + "step": 9128 + }, + { + "epoch": 0.2707054532514901, + "grad_norm": 0.15092937648296356, + "learning_rate": 0.000838485892444415, + "loss": 2.7305, + "step": 9129 + }, + { + "epoch": 0.27073510660380157, + "grad_norm": 0.15362700819969177, + "learning_rate": 0.0008384512610166307, + "loss": 2.7346, + "step": 9130 + }, + { + "epoch": 0.27076475995611304, + "grad_norm": 0.13855858147144318, + "learning_rate": 0.0008384166265918022, + "loss": 2.7036, + "step": 9131 + }, + { + "epoch": 0.2707944133084245, + "grad_norm": 0.16668349504470825, + "learning_rate": 0.0008383819891702366, + "loss": 2.7147, + "step": 9132 + }, + { + "epoch": 0.270824066660736, + "grad_norm": 0.16730734705924988, + "learning_rate": 0.0008383473487522404, + "loss": 2.7681, + "step": 9133 + }, + { + "epoch": 0.27085372001304747, + "grad_norm": 0.14904457330703735, + "learning_rate": 0.0008383127053381203, + "loss": 2.7424, + "step": 9134 + }, + { + "epoch": 0.27088337336535895, + "grad_norm": 0.14908969402313232, + "learning_rate": 0.0008382780589281831, + "loss": 2.7467, + "step": 9135 + }, + { + "epoch": 0.2709130267176704, + "grad_norm": 0.1697569042444229, + "learning_rate": 0.0008382434095227356, + "loss": 2.7531, + "step": 9136 + }, + { + "epoch": 0.2709426800699819, + "grad_norm": 0.1891573667526245, + "learning_rate": 0.0008382087571220847, + "loss": 2.7589, + "step": 9137 + }, + { + "epoch": 0.2709723334222934, + "grad_norm": 0.1790931522846222, + "learning_rate": 0.0008381741017265371, + "loss": 2.7291, + "step": 9138 + }, + { + "epoch": 0.27100198677460485, + "grad_norm": 0.16594508290290833, + "learning_rate": 0.0008381394433364, + "loss": 2.757, + "step": 9139 + }, + { + "epoch": 0.27103164012691633, + "grad_norm": 0.14199590682983398, + "learning_rate": 0.0008381047819519799, + "loss": 2.7626, + "step": 9140 + }, + { + "epoch": 0.2710612934792278, + "grad_norm": 0.12941622734069824, + "learning_rate": 0.0008380701175735841, + "loss": 2.7668, + "step": 9141 + }, + { + "epoch": 0.2710909468315393, + "grad_norm": 0.14228858053684235, + "learning_rate": 0.0008380354502015191, + "loss": 2.7559, + "step": 9142 + }, + { + "epoch": 0.27112060018385076, + "grad_norm": 0.16527067124843597, + "learning_rate": 0.0008380007798360924, + "loss": 2.7802, + "step": 9143 + }, + { + "epoch": 0.2711502535361623, + "grad_norm": 0.14398731291294098, + "learning_rate": 0.0008379661064776106, + "loss": 2.7518, + "step": 9144 + }, + { + "epoch": 0.27117990688847377, + "grad_norm": 0.12014464288949966, + "learning_rate": 0.0008379314301263811, + "loss": 2.7353, + "step": 9145 + }, + { + "epoch": 0.27120956024078524, + "grad_norm": 0.11968258768320084, + "learning_rate": 0.0008378967507827106, + "loss": 2.7137, + "step": 9146 + }, + { + "epoch": 0.2712392135930967, + "grad_norm": 0.12928734719753265, + "learning_rate": 0.0008378620684469064, + "loss": 2.7325, + "step": 9147 + }, + { + "epoch": 0.2712688669454082, + "grad_norm": 0.12798115611076355, + "learning_rate": 0.0008378273831192758, + "loss": 2.7403, + "step": 9148 + }, + { + "epoch": 0.27129852029771967, + "grad_norm": 0.12931913137435913, + "learning_rate": 0.0008377926948001255, + "loss": 2.7614, + "step": 9149 + }, + { + "epoch": 0.27132817365003115, + "grad_norm": 0.12559710443019867, + "learning_rate": 0.0008377580034897631, + "loss": 2.7294, + "step": 9150 + }, + { + "epoch": 0.2713578270023426, + "grad_norm": 0.1248488575220108, + "learning_rate": 0.0008377233091884955, + "loss": 2.7737, + "step": 9151 + }, + { + "epoch": 0.2713874803546541, + "grad_norm": 0.12253263592720032, + "learning_rate": 0.00083768861189663, + "loss": 2.7728, + "step": 9152 + }, + { + "epoch": 0.2714171337069656, + "grad_norm": 0.12628582119941711, + "learning_rate": 0.000837653911614474, + "loss": 2.7421, + "step": 9153 + }, + { + "epoch": 0.27144678705927705, + "grad_norm": 0.13451996445655823, + "learning_rate": 0.0008376192083423344, + "loss": 2.7739, + "step": 9154 + }, + { + "epoch": 0.2714764404115885, + "grad_norm": 0.13383492827415466, + "learning_rate": 0.0008375845020805189, + "loss": 2.7554, + "step": 9155 + }, + { + "epoch": 0.2715060937639, + "grad_norm": 0.1261654794216156, + "learning_rate": 0.0008375497928293348, + "loss": 2.7188, + "step": 9156 + }, + { + "epoch": 0.2715357471162115, + "grad_norm": 0.1308044046163559, + "learning_rate": 0.0008375150805890892, + "loss": 2.7498, + "step": 9157 + }, + { + "epoch": 0.27156540046852296, + "grad_norm": 0.14744718372821808, + "learning_rate": 0.0008374803653600898, + "loss": 2.763, + "step": 9158 + }, + { + "epoch": 0.27159505382083443, + "grad_norm": 0.16233614087104797, + "learning_rate": 0.0008374456471426438, + "loss": 2.7279, + "step": 9159 + }, + { + "epoch": 0.2716247071731459, + "grad_norm": 0.14470212161540985, + "learning_rate": 0.0008374109259370586, + "loss": 2.7759, + "step": 9160 + }, + { + "epoch": 0.2716543605254574, + "grad_norm": 0.12886559963226318, + "learning_rate": 0.0008373762017436416, + "loss": 2.7308, + "step": 9161 + }, + { + "epoch": 0.27168401387776886, + "grad_norm": 0.1327599585056305, + "learning_rate": 0.0008373414745627006, + "loss": 2.6958, + "step": 9162 + }, + { + "epoch": 0.27171366723008034, + "grad_norm": 0.14410561323165894, + "learning_rate": 0.0008373067443945428, + "loss": 2.7405, + "step": 9163 + }, + { + "epoch": 0.2717433205823918, + "grad_norm": 0.15111221373081207, + "learning_rate": 0.0008372720112394761, + "loss": 2.7612, + "step": 9164 + }, + { + "epoch": 0.27177297393470334, + "grad_norm": 0.17626076936721802, + "learning_rate": 0.0008372372750978077, + "loss": 2.7589, + "step": 9165 + }, + { + "epoch": 0.2718026272870148, + "grad_norm": 0.19560283422470093, + "learning_rate": 0.0008372025359698453, + "loss": 2.7445, + "step": 9166 + }, + { + "epoch": 0.2718322806393263, + "grad_norm": 0.1702069640159607, + "learning_rate": 0.0008371677938558966, + "loss": 2.7471, + "step": 9167 + }, + { + "epoch": 0.27186193399163777, + "grad_norm": 0.18381290137767792, + "learning_rate": 0.0008371330487562692, + "loss": 2.7502, + "step": 9168 + }, + { + "epoch": 0.27189158734394925, + "grad_norm": 0.1660575270652771, + "learning_rate": 0.0008370983006712709, + "loss": 2.7419, + "step": 9169 + }, + { + "epoch": 0.2719212406962607, + "grad_norm": 0.1581909954547882, + "learning_rate": 0.0008370635496012092, + "loss": 2.7658, + "step": 9170 + }, + { + "epoch": 0.2719508940485722, + "grad_norm": 0.17097686231136322, + "learning_rate": 0.000837028795546392, + "loss": 2.7563, + "step": 9171 + }, + { + "epoch": 0.2719805474008837, + "grad_norm": 0.17863602936267853, + "learning_rate": 0.0008369940385071268, + "loss": 2.7523, + "step": 9172 + }, + { + "epoch": 0.27201020075319515, + "grad_norm": 0.1353728324174881, + "learning_rate": 0.0008369592784837216, + "loss": 2.7536, + "step": 9173 + }, + { + "epoch": 0.27203985410550663, + "grad_norm": 0.140245720744133, + "learning_rate": 0.0008369245154764842, + "loss": 2.7569, + "step": 9174 + }, + { + "epoch": 0.2720695074578181, + "grad_norm": 0.19355408847332, + "learning_rate": 0.0008368897494857223, + "loss": 2.7528, + "step": 9175 + }, + { + "epoch": 0.2720991608101296, + "grad_norm": 0.20828042924404144, + "learning_rate": 0.000836854980511744, + "loss": 2.7338, + "step": 9176 + }, + { + "epoch": 0.27212881416244106, + "grad_norm": 0.19960413873195648, + "learning_rate": 0.0008368202085548568, + "loss": 2.7419, + "step": 9177 + }, + { + "epoch": 0.27215846751475253, + "grad_norm": 0.1769503355026245, + "learning_rate": 0.000836785433615369, + "loss": 2.6925, + "step": 9178 + }, + { + "epoch": 0.272188120867064, + "grad_norm": 0.16306130588054657, + "learning_rate": 0.0008367506556935884, + "loss": 2.7867, + "step": 9179 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 0.15658651292324066, + "learning_rate": 0.000836715874789823, + "loss": 2.7114, + "step": 9180 + }, + { + "epoch": 0.27224742757168696, + "grad_norm": 0.14099451899528503, + "learning_rate": 0.0008366810909043805, + "loss": 2.7584, + "step": 9181 + }, + { + "epoch": 0.27227708092399844, + "grad_norm": 0.13447421789169312, + "learning_rate": 0.0008366463040375693, + "loss": 2.7617, + "step": 9182 + }, + { + "epoch": 0.2723067342763099, + "grad_norm": 0.1332789957523346, + "learning_rate": 0.0008366115141896972, + "loss": 2.7689, + "step": 9183 + }, + { + "epoch": 0.2723363876286214, + "grad_norm": 0.14410413801670074, + "learning_rate": 0.0008365767213610726, + "loss": 2.7689, + "step": 9184 + }, + { + "epoch": 0.2723660409809329, + "grad_norm": 0.12734897434711456, + "learning_rate": 0.0008365419255520031, + "loss": 2.7407, + "step": 9185 + }, + { + "epoch": 0.2723956943332444, + "grad_norm": 0.14394569396972656, + "learning_rate": 0.0008365071267627973, + "loss": 2.7314, + "step": 9186 + }, + { + "epoch": 0.2724253476855559, + "grad_norm": 0.1367257535457611, + "learning_rate": 0.0008364723249937629, + "loss": 2.7433, + "step": 9187 + }, + { + "epoch": 0.27245500103786735, + "grad_norm": 0.12644200026988983, + "learning_rate": 0.0008364375202452083, + "loss": 2.727, + "step": 9188 + }, + { + "epoch": 0.2724846543901788, + "grad_norm": 0.1404527723789215, + "learning_rate": 0.0008364027125174419, + "loss": 2.7503, + "step": 9189 + }, + { + "epoch": 0.2725143077424903, + "grad_norm": 0.140598326921463, + "learning_rate": 0.0008363679018107718, + "loss": 2.7888, + "step": 9190 + }, + { + "epoch": 0.2725439610948018, + "grad_norm": 0.12640230357646942, + "learning_rate": 0.0008363330881255059, + "loss": 2.732, + "step": 9191 + }, + { + "epoch": 0.27257361444711325, + "grad_norm": 0.127191960811615, + "learning_rate": 0.0008362982714619529, + "loss": 2.7608, + "step": 9192 + }, + { + "epoch": 0.27260326779942473, + "grad_norm": 0.1391022503376007, + "learning_rate": 0.0008362634518204211, + "loss": 2.7612, + "step": 9193 + }, + { + "epoch": 0.2726329211517362, + "grad_norm": 0.15841154754161835, + "learning_rate": 0.0008362286292012185, + "loss": 2.7361, + "step": 9194 + }, + { + "epoch": 0.2726625745040477, + "grad_norm": 0.15764564275741577, + "learning_rate": 0.0008361938036046539, + "loss": 2.7559, + "step": 9195 + }, + { + "epoch": 0.27269222785635916, + "grad_norm": 0.16942737996578217, + "learning_rate": 0.0008361589750310353, + "loss": 2.7299, + "step": 9196 + }, + { + "epoch": 0.27272188120867064, + "grad_norm": 0.1495034247636795, + "learning_rate": 0.0008361241434806714, + "loss": 2.7869, + "step": 9197 + }, + { + "epoch": 0.2727515345609821, + "grad_norm": 0.1369277983903885, + "learning_rate": 0.0008360893089538703, + "loss": 2.7555, + "step": 9198 + }, + { + "epoch": 0.2727811879132936, + "grad_norm": 0.1438184529542923, + "learning_rate": 0.0008360544714509409, + "loss": 2.7347, + "step": 9199 + }, + { + "epoch": 0.27281084126560506, + "grad_norm": 0.17734719812870026, + "learning_rate": 0.0008360196309721915, + "loss": 2.7923, + "step": 9200 + }, + { + "epoch": 0.27284049461791654, + "grad_norm": 0.161447212100029, + "learning_rate": 0.0008359847875179304, + "loss": 2.7483, + "step": 9201 + }, + { + "epoch": 0.272870147970228, + "grad_norm": 0.1656581461429596, + "learning_rate": 0.0008359499410884665, + "loss": 2.6999, + "step": 9202 + }, + { + "epoch": 0.2728998013225395, + "grad_norm": 0.14994803071022034, + "learning_rate": 0.000835915091684108, + "loss": 2.7665, + "step": 9203 + }, + { + "epoch": 0.27292945467485097, + "grad_norm": 0.1568341702222824, + "learning_rate": 0.0008358802393051639, + "loss": 2.7055, + "step": 9204 + }, + { + "epoch": 0.27295910802716244, + "grad_norm": 0.15220150351524353, + "learning_rate": 0.0008358453839519426, + "loss": 2.7369, + "step": 9205 + }, + { + "epoch": 0.272988761379474, + "grad_norm": 0.13827094435691833, + "learning_rate": 0.0008358105256247527, + "loss": 2.7298, + "step": 9206 + }, + { + "epoch": 0.27301841473178545, + "grad_norm": 0.1626301407814026, + "learning_rate": 0.0008357756643239029, + "loss": 2.7189, + "step": 9207 + }, + { + "epoch": 0.27304806808409693, + "grad_norm": 0.17962005734443665, + "learning_rate": 0.0008357408000497022, + "loss": 2.7476, + "step": 9208 + }, + { + "epoch": 0.2730777214364084, + "grad_norm": 0.15718288719654083, + "learning_rate": 0.000835705932802459, + "loss": 2.7908, + "step": 9209 + }, + { + "epoch": 0.2731073747887199, + "grad_norm": 0.13261155784130096, + "learning_rate": 0.0008356710625824819, + "loss": 2.7354, + "step": 9210 + }, + { + "epoch": 0.27313702814103136, + "grad_norm": 0.16286127269268036, + "learning_rate": 0.0008356361893900803, + "loss": 2.7432, + "step": 9211 + }, + { + "epoch": 0.27316668149334283, + "grad_norm": 0.13395284116268158, + "learning_rate": 0.0008356013132255624, + "loss": 2.7352, + "step": 9212 + }, + { + "epoch": 0.2731963348456543, + "grad_norm": 0.1371770054101944, + "learning_rate": 0.0008355664340892373, + "loss": 2.7529, + "step": 9213 + }, + { + "epoch": 0.2732259881979658, + "grad_norm": 0.1455785036087036, + "learning_rate": 0.0008355315519814137, + "loss": 2.7011, + "step": 9214 + }, + { + "epoch": 0.27325564155027726, + "grad_norm": 0.14937232434749603, + "learning_rate": 0.0008354966669024008, + "loss": 2.7772, + "step": 9215 + }, + { + "epoch": 0.27328529490258874, + "grad_norm": 0.1701754629611969, + "learning_rate": 0.0008354617788525074, + "loss": 2.7634, + "step": 9216 + }, + { + "epoch": 0.2733149482549002, + "grad_norm": 0.16626021265983582, + "learning_rate": 0.0008354268878320422, + "loss": 2.7883, + "step": 9217 + }, + { + "epoch": 0.2733446016072117, + "grad_norm": 0.13901670277118683, + "learning_rate": 0.0008353919938413144, + "loss": 2.7661, + "step": 9218 + }, + { + "epoch": 0.27337425495952317, + "grad_norm": 0.15145115554332733, + "learning_rate": 0.0008353570968806328, + "loss": 2.739, + "step": 9219 + }, + { + "epoch": 0.27340390831183464, + "grad_norm": 0.13931414484977722, + "learning_rate": 0.0008353221969503066, + "loss": 2.7311, + "step": 9220 + }, + { + "epoch": 0.2734335616641461, + "grad_norm": 0.15243907272815704, + "learning_rate": 0.0008352872940506448, + "loss": 2.7512, + "step": 9221 + }, + { + "epoch": 0.2734632150164576, + "grad_norm": 0.1969170719385147, + "learning_rate": 0.0008352523881819566, + "loss": 2.7471, + "step": 9222 + }, + { + "epoch": 0.27349286836876907, + "grad_norm": 0.20171144604682922, + "learning_rate": 0.0008352174793445508, + "loss": 2.7737, + "step": 9223 + }, + { + "epoch": 0.27352252172108055, + "grad_norm": 0.15490251779556274, + "learning_rate": 0.0008351825675387368, + "loss": 2.7324, + "step": 9224 + }, + { + "epoch": 0.273552175073392, + "grad_norm": 0.1471242904663086, + "learning_rate": 0.0008351476527648236, + "loss": 2.7256, + "step": 9225 + }, + { + "epoch": 0.2735818284257035, + "grad_norm": 0.15193215012550354, + "learning_rate": 0.0008351127350231202, + "loss": 2.7516, + "step": 9226 + }, + { + "epoch": 0.27361148177801503, + "grad_norm": 0.15664677321910858, + "learning_rate": 0.0008350778143139363, + "loss": 2.7266, + "step": 9227 + }, + { + "epoch": 0.2736411351303265, + "grad_norm": 0.13964784145355225, + "learning_rate": 0.0008350428906375806, + "loss": 2.7092, + "step": 9228 + }, + { + "epoch": 0.273670788482638, + "grad_norm": 0.14119067788124084, + "learning_rate": 0.0008350079639943629, + "loss": 2.7382, + "step": 9229 + }, + { + "epoch": 0.27370044183494946, + "grad_norm": 0.13221898674964905, + "learning_rate": 0.0008349730343845919, + "loss": 2.7268, + "step": 9230 + }, + { + "epoch": 0.27373009518726094, + "grad_norm": 0.13111308217048645, + "learning_rate": 0.0008349381018085773, + "loss": 2.7609, + "step": 9231 + }, + { + "epoch": 0.2737597485395724, + "grad_norm": 0.16738460958003998, + "learning_rate": 0.0008349031662666282, + "loss": 2.7666, + "step": 9232 + }, + { + "epoch": 0.2737894018918839, + "grad_norm": 0.17769305408000946, + "learning_rate": 0.0008348682277590542, + "loss": 2.7121, + "step": 9233 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 0.14255230128765106, + "learning_rate": 0.0008348332862861645, + "loss": 2.7318, + "step": 9234 + }, + { + "epoch": 0.27384870859650684, + "grad_norm": 0.12946641445159912, + "learning_rate": 0.0008347983418482686, + "loss": 2.7418, + "step": 9235 + }, + { + "epoch": 0.2738783619488183, + "grad_norm": 0.17379975318908691, + "learning_rate": 0.000834763394445676, + "loss": 2.7701, + "step": 9236 + }, + { + "epoch": 0.2739080153011298, + "grad_norm": 0.16418863832950592, + "learning_rate": 0.000834728444078696, + "loss": 2.7313, + "step": 9237 + }, + { + "epoch": 0.27393766865344127, + "grad_norm": 0.13892848789691925, + "learning_rate": 0.0008346934907476382, + "loss": 2.7206, + "step": 9238 + }, + { + "epoch": 0.27396732200575274, + "grad_norm": 0.14800474047660828, + "learning_rate": 0.0008346585344528119, + "loss": 2.7774, + "step": 9239 + }, + { + "epoch": 0.2739969753580642, + "grad_norm": 0.15143921971321106, + "learning_rate": 0.000834623575194527, + "loss": 2.7387, + "step": 9240 + }, + { + "epoch": 0.2740266287103757, + "grad_norm": 0.1330939531326294, + "learning_rate": 0.000834588612973093, + "loss": 2.755, + "step": 9241 + }, + { + "epoch": 0.2740562820626872, + "grad_norm": 0.16129480302333832, + "learning_rate": 0.0008345536477888193, + "loss": 2.7623, + "step": 9242 + }, + { + "epoch": 0.27408593541499865, + "grad_norm": 0.18283197283744812, + "learning_rate": 0.0008345186796420156, + "loss": 2.766, + "step": 9243 + }, + { + "epoch": 0.2741155887673101, + "grad_norm": 0.14271967113018036, + "learning_rate": 0.0008344837085329917, + "loss": 2.7414, + "step": 9244 + }, + { + "epoch": 0.2741452421196216, + "grad_norm": 0.14035291969776154, + "learning_rate": 0.0008344487344620569, + "loss": 2.7607, + "step": 9245 + }, + { + "epoch": 0.2741748954719331, + "grad_norm": 0.12711060047149658, + "learning_rate": 0.0008344137574295214, + "loss": 2.7322, + "step": 9246 + }, + { + "epoch": 0.27420454882424455, + "grad_norm": 0.12252748757600784, + "learning_rate": 0.0008343787774356946, + "loss": 2.763, + "step": 9247 + }, + { + "epoch": 0.2742342021765561, + "grad_norm": 0.15449056029319763, + "learning_rate": 0.0008343437944808862, + "loss": 2.7383, + "step": 9248 + }, + { + "epoch": 0.27426385552886756, + "grad_norm": 0.1619720458984375, + "learning_rate": 0.0008343088085654062, + "loss": 2.7549, + "step": 9249 + }, + { + "epoch": 0.27429350888117904, + "grad_norm": 0.16540278494358063, + "learning_rate": 0.0008342738196895644, + "loss": 2.7893, + "step": 9250 + }, + { + "epoch": 0.2743231622334905, + "grad_norm": 0.15526147186756134, + "learning_rate": 0.0008342388278536703, + "loss": 2.749, + "step": 9251 + }, + { + "epoch": 0.274352815585802, + "grad_norm": 0.14659994840621948, + "learning_rate": 0.0008342038330580343, + "loss": 2.7657, + "step": 9252 + }, + { + "epoch": 0.27438246893811347, + "grad_norm": 0.1776318997144699, + "learning_rate": 0.0008341688353029659, + "loss": 2.787, + "step": 9253 + }, + { + "epoch": 0.27441212229042494, + "grad_norm": 0.16866889595985413, + "learning_rate": 0.000834133834588775, + "loss": 2.7228, + "step": 9254 + }, + { + "epoch": 0.2744417756427364, + "grad_norm": 0.12796306610107422, + "learning_rate": 0.0008340988309157718, + "loss": 2.7381, + "step": 9255 + }, + { + "epoch": 0.2744714289950479, + "grad_norm": 0.13607989251613617, + "learning_rate": 0.000834063824284266, + "loss": 2.7731, + "step": 9256 + }, + { + "epoch": 0.27450108234735937, + "grad_norm": 0.18683359026908875, + "learning_rate": 0.0008340288146945678, + "loss": 2.7526, + "step": 9257 + }, + { + "epoch": 0.27453073569967085, + "grad_norm": 0.1917750984430313, + "learning_rate": 0.000833993802146987, + "loss": 2.7619, + "step": 9258 + }, + { + "epoch": 0.2745603890519823, + "grad_norm": 0.14337769150733948, + "learning_rate": 0.0008339587866418338, + "loss": 2.7494, + "step": 9259 + }, + { + "epoch": 0.2745900424042938, + "grad_norm": 0.16443274915218353, + "learning_rate": 0.0008339237681794182, + "loss": 2.7841, + "step": 9260 + }, + { + "epoch": 0.2746196957566053, + "grad_norm": 0.14524635672569275, + "learning_rate": 0.0008338887467600502, + "loss": 2.7389, + "step": 9261 + }, + { + "epoch": 0.27464934910891675, + "grad_norm": 0.13050499558448792, + "learning_rate": 0.0008338537223840403, + "loss": 2.7792, + "step": 9262 + }, + { + "epoch": 0.2746790024612282, + "grad_norm": 0.13585522770881653, + "learning_rate": 0.0008338186950516981, + "loss": 2.7532, + "step": 9263 + }, + { + "epoch": 0.2747086558135397, + "grad_norm": 0.13856610655784607, + "learning_rate": 0.0008337836647633344, + "loss": 2.7535, + "step": 9264 + }, + { + "epoch": 0.2747383091658512, + "grad_norm": 0.13514761626720428, + "learning_rate": 0.0008337486315192587, + "loss": 2.6927, + "step": 9265 + }, + { + "epoch": 0.27476796251816266, + "grad_norm": 0.13286453485488892, + "learning_rate": 0.0008337135953197819, + "loss": 2.7419, + "step": 9266 + }, + { + "epoch": 0.27479761587047413, + "grad_norm": 0.12746527791023254, + "learning_rate": 0.0008336785561652136, + "loss": 2.7294, + "step": 9267 + }, + { + "epoch": 0.2748272692227856, + "grad_norm": 0.11776845157146454, + "learning_rate": 0.0008336435140558647, + "loss": 2.809, + "step": 9268 + }, + { + "epoch": 0.27485692257509714, + "grad_norm": 0.11617592722177505, + "learning_rate": 0.0008336084689920451, + "loss": 2.7614, + "step": 9269 + }, + { + "epoch": 0.2748865759274086, + "grad_norm": 0.12055516242980957, + "learning_rate": 0.0008335734209740652, + "loss": 2.7221, + "step": 9270 + }, + { + "epoch": 0.2749162292797201, + "grad_norm": 0.1228426918387413, + "learning_rate": 0.0008335383700022354, + "loss": 2.7597, + "step": 9271 + }, + { + "epoch": 0.27494588263203157, + "grad_norm": 0.1368369162082672, + "learning_rate": 0.0008335033160768662, + "loss": 2.7118, + "step": 9272 + }, + { + "epoch": 0.27497553598434304, + "grad_norm": 0.15862268209457397, + "learning_rate": 0.0008334682591982677, + "loss": 2.6946, + "step": 9273 + }, + { + "epoch": 0.2750051893366545, + "grad_norm": 0.15049390494823456, + "learning_rate": 0.0008334331993667506, + "loss": 2.7369, + "step": 9274 + }, + { + "epoch": 0.275034842688966, + "grad_norm": 0.15327805280685425, + "learning_rate": 0.0008333981365826253, + "loss": 2.7536, + "step": 9275 + }, + { + "epoch": 0.2750644960412775, + "grad_norm": 0.1448782980442047, + "learning_rate": 0.0008333630708462024, + "loss": 2.7136, + "step": 9276 + }, + { + "epoch": 0.27509414939358895, + "grad_norm": 0.14145547151565552, + "learning_rate": 0.000833328002157792, + "loss": 2.7368, + "step": 9277 + }, + { + "epoch": 0.2751238027459004, + "grad_norm": 0.1319536417722702, + "learning_rate": 0.0008332929305177052, + "loss": 2.7456, + "step": 9278 + }, + { + "epoch": 0.2751534560982119, + "grad_norm": 0.1483922302722931, + "learning_rate": 0.0008332578559262523, + "loss": 2.7285, + "step": 9279 + }, + { + "epoch": 0.2751831094505234, + "grad_norm": 0.13950663805007935, + "learning_rate": 0.0008332227783837437, + "loss": 2.7275, + "step": 9280 + }, + { + "epoch": 0.27521276280283485, + "grad_norm": 0.1422661542892456, + "learning_rate": 0.0008331876978904903, + "loss": 2.7616, + "step": 9281 + }, + { + "epoch": 0.27524241615514633, + "grad_norm": 0.1419362723827362, + "learning_rate": 0.0008331526144468027, + "loss": 2.7445, + "step": 9282 + }, + { + "epoch": 0.2752720695074578, + "grad_norm": 0.11575764417648315, + "learning_rate": 0.0008331175280529915, + "loss": 2.7483, + "step": 9283 + }, + { + "epoch": 0.2753017228597693, + "grad_norm": 0.1073925569653511, + "learning_rate": 0.0008330824387093672, + "loss": 2.7642, + "step": 9284 + }, + { + "epoch": 0.27533137621208076, + "grad_norm": 0.1223365068435669, + "learning_rate": 0.0008330473464162409, + "loss": 2.7349, + "step": 9285 + }, + { + "epoch": 0.27536102956439223, + "grad_norm": 0.14543326199054718, + "learning_rate": 0.0008330122511739231, + "loss": 2.7516, + "step": 9286 + }, + { + "epoch": 0.2753906829167037, + "grad_norm": 0.13712668418884277, + "learning_rate": 0.0008329771529827248, + "loss": 2.76, + "step": 9287 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 0.1587250679731369, + "learning_rate": 0.0008329420518429566, + "loss": 2.7495, + "step": 9288 + }, + { + "epoch": 0.2754499896213267, + "grad_norm": 0.1469295620918274, + "learning_rate": 0.0008329069477549293, + "loss": 2.7727, + "step": 9289 + }, + { + "epoch": 0.2754796429736382, + "grad_norm": 0.13834749162197113, + "learning_rate": 0.000832871840718954, + "loss": 2.7654, + "step": 9290 + }, + { + "epoch": 0.27550929632594967, + "grad_norm": 0.14349110424518585, + "learning_rate": 0.0008328367307353412, + "loss": 2.7734, + "step": 9291 + }, + { + "epoch": 0.27553894967826115, + "grad_norm": 0.15274827182292938, + "learning_rate": 0.0008328016178044022, + "loss": 2.7364, + "step": 9292 + }, + { + "epoch": 0.2755686030305726, + "grad_norm": 0.1690753698348999, + "learning_rate": 0.0008327665019264476, + "loss": 2.7557, + "step": 9293 + }, + { + "epoch": 0.2755982563828841, + "grad_norm": 0.17238539457321167, + "learning_rate": 0.0008327313831017886, + "loss": 2.7324, + "step": 9294 + }, + { + "epoch": 0.2756279097351956, + "grad_norm": 0.18935446441173553, + "learning_rate": 0.0008326962613307361, + "loss": 2.7507, + "step": 9295 + }, + { + "epoch": 0.27565756308750705, + "grad_norm": 0.1642468273639679, + "learning_rate": 0.000832661136613601, + "loss": 2.7046, + "step": 9296 + }, + { + "epoch": 0.2756872164398185, + "grad_norm": 0.1455288678407669, + "learning_rate": 0.0008326260089506945, + "loss": 2.7225, + "step": 9297 + }, + { + "epoch": 0.27571686979213, + "grad_norm": 0.14801174402236938, + "learning_rate": 0.0008325908783423276, + "loss": 2.727, + "step": 9298 + }, + { + "epoch": 0.2757465231444415, + "grad_norm": 0.15355154871940613, + "learning_rate": 0.0008325557447888115, + "loss": 2.7207, + "step": 9299 + }, + { + "epoch": 0.27577617649675296, + "grad_norm": 0.13550390303134918, + "learning_rate": 0.0008325206082904571, + "loss": 2.7285, + "step": 9300 + }, + { + "epoch": 0.27580582984906443, + "grad_norm": 0.1390133649110794, + "learning_rate": 0.0008324854688475756, + "loss": 2.7378, + "step": 9301 + }, + { + "epoch": 0.2758354832013759, + "grad_norm": 0.15738409757614136, + "learning_rate": 0.0008324503264604781, + "loss": 2.7395, + "step": 9302 + }, + { + "epoch": 0.2758651365536874, + "grad_norm": 0.12707127630710602, + "learning_rate": 0.000832415181129476, + "loss": 2.7451, + "step": 9303 + }, + { + "epoch": 0.27589478990599886, + "grad_norm": 0.13763540983200073, + "learning_rate": 0.0008323800328548805, + "loss": 2.7224, + "step": 9304 + }, + { + "epoch": 0.27592444325831034, + "grad_norm": 0.14250388741493225, + "learning_rate": 0.0008323448816370027, + "loss": 2.7504, + "step": 9305 + }, + { + "epoch": 0.2759540966106218, + "grad_norm": 0.13293543457984924, + "learning_rate": 0.000832309727476154, + "loss": 2.719, + "step": 9306 + }, + { + "epoch": 0.2759837499629333, + "grad_norm": 0.1387450098991394, + "learning_rate": 0.0008322745703726454, + "loss": 2.7231, + "step": 9307 + }, + { + "epoch": 0.27601340331524477, + "grad_norm": 0.14562568068504333, + "learning_rate": 0.0008322394103267886, + "loss": 2.7912, + "step": 9308 + }, + { + "epoch": 0.27604305666755624, + "grad_norm": 0.16712261736392975, + "learning_rate": 0.0008322042473388949, + "loss": 2.7095, + "step": 9309 + }, + { + "epoch": 0.2760727100198678, + "grad_norm": 0.1706840544939041, + "learning_rate": 0.0008321690814092753, + "loss": 2.7423, + "step": 9310 + }, + { + "epoch": 0.27610236337217925, + "grad_norm": 0.159153550863266, + "learning_rate": 0.0008321339125382417, + "loss": 2.7137, + "step": 9311 + }, + { + "epoch": 0.2761320167244907, + "grad_norm": 0.16839824616909027, + "learning_rate": 0.0008320987407261051, + "loss": 2.7381, + "step": 9312 + }, + { + "epoch": 0.2761616700768022, + "grad_norm": 0.17551547288894653, + "learning_rate": 0.0008320635659731773, + "loss": 2.7212, + "step": 9313 + }, + { + "epoch": 0.2761913234291137, + "grad_norm": 0.13988511264324188, + "learning_rate": 0.0008320283882797695, + "loss": 2.7128, + "step": 9314 + }, + { + "epoch": 0.27622097678142515, + "grad_norm": 0.13923205435276031, + "learning_rate": 0.0008319932076461936, + "loss": 2.7331, + "step": 9315 + }, + { + "epoch": 0.27625063013373663, + "grad_norm": 0.14410415291786194, + "learning_rate": 0.0008319580240727604, + "loss": 2.7457, + "step": 9316 + }, + { + "epoch": 0.2762802834860481, + "grad_norm": 0.1505494862794876, + "learning_rate": 0.0008319228375597823, + "loss": 2.7458, + "step": 9317 + }, + { + "epoch": 0.2763099368383596, + "grad_norm": 0.17739269137382507, + "learning_rate": 0.0008318876481075703, + "loss": 2.7373, + "step": 9318 + }, + { + "epoch": 0.27633959019067106, + "grad_norm": 0.18261601030826569, + "learning_rate": 0.0008318524557164364, + "loss": 2.7548, + "step": 9319 + }, + { + "epoch": 0.27636924354298253, + "grad_norm": 0.16792497038841248, + "learning_rate": 0.0008318172603866919, + "loss": 2.7496, + "step": 9320 + }, + { + "epoch": 0.276398896895294, + "grad_norm": 0.13914890587329865, + "learning_rate": 0.0008317820621186488, + "loss": 2.7604, + "step": 9321 + }, + { + "epoch": 0.2764285502476055, + "grad_norm": 0.14145702123641968, + "learning_rate": 0.0008317468609126183, + "loss": 2.7891, + "step": 9322 + }, + { + "epoch": 0.27645820359991696, + "grad_norm": 0.14263062179088593, + "learning_rate": 0.0008317116567689124, + "loss": 2.7383, + "step": 9323 + }, + { + "epoch": 0.27648785695222844, + "grad_norm": 0.1399105042219162, + "learning_rate": 0.0008316764496878431, + "loss": 2.7582, + "step": 9324 + }, + { + "epoch": 0.2765175103045399, + "grad_norm": 0.1410304754972458, + "learning_rate": 0.0008316412396697217, + "loss": 2.7554, + "step": 9325 + }, + { + "epoch": 0.2765471636568514, + "grad_norm": 0.1480463743209839, + "learning_rate": 0.0008316060267148604, + "loss": 2.722, + "step": 9326 + }, + { + "epoch": 0.27657681700916287, + "grad_norm": 0.14970599114894867, + "learning_rate": 0.0008315708108235706, + "loss": 2.7401, + "step": 9327 + }, + { + "epoch": 0.27660647036147434, + "grad_norm": 0.15584424138069153, + "learning_rate": 0.0008315355919961644, + "loss": 2.7701, + "step": 9328 + }, + { + "epoch": 0.2766361237137858, + "grad_norm": 0.14018762111663818, + "learning_rate": 0.0008315003702329538, + "loss": 2.7754, + "step": 9329 + }, + { + "epoch": 0.2766657770660973, + "grad_norm": 0.1513686627149582, + "learning_rate": 0.0008314651455342503, + "loss": 2.7693, + "step": 9330 + }, + { + "epoch": 0.2766954304184088, + "grad_norm": 0.16427701711654663, + "learning_rate": 0.0008314299179003661, + "loss": 2.7446, + "step": 9331 + }, + { + "epoch": 0.2767250837707203, + "grad_norm": 0.15156199038028717, + "learning_rate": 0.0008313946873316131, + "loss": 2.7421, + "step": 9332 + }, + { + "epoch": 0.2767547371230318, + "grad_norm": 0.13490267097949982, + "learning_rate": 0.0008313594538283033, + "loss": 2.7603, + "step": 9333 + }, + { + "epoch": 0.27678439047534326, + "grad_norm": 0.12232314795255661, + "learning_rate": 0.0008313242173907487, + "loss": 2.7429, + "step": 9334 + }, + { + "epoch": 0.27681404382765473, + "grad_norm": 0.1365334689617157, + "learning_rate": 0.0008312889780192612, + "loss": 2.6792, + "step": 9335 + }, + { + "epoch": 0.2768436971799662, + "grad_norm": 0.13235531747341156, + "learning_rate": 0.000831253735714153, + "loss": 2.7237, + "step": 9336 + }, + { + "epoch": 0.2768733505322777, + "grad_norm": 0.11337804049253464, + "learning_rate": 0.0008312184904757361, + "loss": 2.7536, + "step": 9337 + }, + { + "epoch": 0.27690300388458916, + "grad_norm": 0.12224958091974258, + "learning_rate": 0.0008311832423043226, + "loss": 2.71, + "step": 9338 + }, + { + "epoch": 0.27693265723690064, + "grad_norm": 0.13053949177265167, + "learning_rate": 0.0008311479912002246, + "loss": 2.7535, + "step": 9339 + }, + { + "epoch": 0.2769623105892121, + "grad_norm": 0.14152535796165466, + "learning_rate": 0.0008311127371637544, + "loss": 2.7295, + "step": 9340 + }, + { + "epoch": 0.2769919639415236, + "grad_norm": 0.15158112347126007, + "learning_rate": 0.0008310774801952239, + "loss": 2.7633, + "step": 9341 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 0.1323096603155136, + "learning_rate": 0.0008310422202949456, + "loss": 2.7667, + "step": 9342 + }, + { + "epoch": 0.27705127064614654, + "grad_norm": 0.12382223457098007, + "learning_rate": 0.0008310069574632315, + "loss": 2.7126, + "step": 9343 + }, + { + "epoch": 0.277080923998458, + "grad_norm": 0.11636456102132797, + "learning_rate": 0.0008309716917003942, + "loss": 2.7405, + "step": 9344 + }, + { + "epoch": 0.2771105773507695, + "grad_norm": 0.12194480001926422, + "learning_rate": 0.0008309364230067456, + "loss": 2.7287, + "step": 9345 + }, + { + "epoch": 0.27714023070308097, + "grad_norm": 0.11622942984104156, + "learning_rate": 0.0008309011513825983, + "loss": 2.7272, + "step": 9346 + }, + { + "epoch": 0.27716988405539245, + "grad_norm": 0.11923094838857651, + "learning_rate": 0.0008308658768282644, + "loss": 2.7456, + "step": 9347 + }, + { + "epoch": 0.2771995374077039, + "grad_norm": 0.13301615417003632, + "learning_rate": 0.0008308305993440563, + "loss": 2.7414, + "step": 9348 + }, + { + "epoch": 0.2772291907600154, + "grad_norm": 0.14846660196781158, + "learning_rate": 0.0008307953189302866, + "loss": 2.7495, + "step": 9349 + }, + { + "epoch": 0.2772588441123269, + "grad_norm": 0.15876086056232452, + "learning_rate": 0.0008307600355872676, + "loss": 2.753, + "step": 9350 + }, + { + "epoch": 0.27728849746463835, + "grad_norm": 0.17288410663604736, + "learning_rate": 0.0008307247493153115, + "loss": 2.7513, + "step": 9351 + }, + { + "epoch": 0.2773181508169499, + "grad_norm": 0.18955951929092407, + "learning_rate": 0.0008306894601147312, + "loss": 2.7601, + "step": 9352 + }, + { + "epoch": 0.27734780416926136, + "grad_norm": 0.20617075264453888, + "learning_rate": 0.0008306541679858388, + "loss": 2.7523, + "step": 9353 + }, + { + "epoch": 0.27737745752157283, + "grad_norm": 0.20675677061080933, + "learning_rate": 0.000830618872928947, + "loss": 2.767, + "step": 9354 + }, + { + "epoch": 0.2774071108738843, + "grad_norm": 0.15760833024978638, + "learning_rate": 0.0008305835749443685, + "loss": 2.7614, + "step": 9355 + }, + { + "epoch": 0.2774367642261958, + "grad_norm": 0.12762980163097382, + "learning_rate": 0.0008305482740324155, + "loss": 2.7456, + "step": 9356 + }, + { + "epoch": 0.27746641757850726, + "grad_norm": 0.1641702950000763, + "learning_rate": 0.0008305129701934009, + "loss": 2.7667, + "step": 9357 + }, + { + "epoch": 0.27749607093081874, + "grad_norm": 0.16227202117443085, + "learning_rate": 0.0008304776634276372, + "loss": 2.7476, + "step": 9358 + }, + { + "epoch": 0.2775257242831302, + "grad_norm": 0.1710244119167328, + "learning_rate": 0.0008304423537354371, + "loss": 2.7679, + "step": 9359 + }, + { + "epoch": 0.2775553776354417, + "grad_norm": 0.15673035383224487, + "learning_rate": 0.0008304070411171132, + "loss": 2.7297, + "step": 9360 + }, + { + "epoch": 0.27758503098775317, + "grad_norm": 0.13394321501255035, + "learning_rate": 0.0008303717255729781, + "loss": 2.755, + "step": 9361 + }, + { + "epoch": 0.27761468434006464, + "grad_norm": 0.14354664087295532, + "learning_rate": 0.0008303364071033448, + "loss": 2.7718, + "step": 9362 + }, + { + "epoch": 0.2776443376923761, + "grad_norm": 0.17031463980674744, + "learning_rate": 0.000830301085708526, + "loss": 2.7227, + "step": 9363 + }, + { + "epoch": 0.2776739910446876, + "grad_norm": 0.16083860397338867, + "learning_rate": 0.0008302657613888342, + "loss": 2.7407, + "step": 9364 + }, + { + "epoch": 0.27770364439699907, + "grad_norm": 0.12044429033994675, + "learning_rate": 0.0008302304341445825, + "loss": 2.7219, + "step": 9365 + }, + { + "epoch": 0.27773329774931055, + "grad_norm": 0.13587647676467896, + "learning_rate": 0.0008301951039760838, + "loss": 2.7472, + "step": 9366 + }, + { + "epoch": 0.277762951101622, + "grad_norm": 0.15841291844844818, + "learning_rate": 0.0008301597708836504, + "loss": 2.7414, + "step": 9367 + }, + { + "epoch": 0.2777926044539335, + "grad_norm": 0.15909184515476227, + "learning_rate": 0.0008301244348675958, + "loss": 2.7382, + "step": 9368 + }, + { + "epoch": 0.277822257806245, + "grad_norm": 0.15614767372608185, + "learning_rate": 0.0008300890959282326, + "loss": 2.7411, + "step": 9369 + }, + { + "epoch": 0.27785191115855645, + "grad_norm": 0.12896442413330078, + "learning_rate": 0.0008300537540658738, + "loss": 2.7055, + "step": 9370 + }, + { + "epoch": 0.27788156451086793, + "grad_norm": 0.14694909751415253, + "learning_rate": 0.0008300184092808325, + "loss": 2.7402, + "step": 9371 + }, + { + "epoch": 0.2779112178631794, + "grad_norm": 0.17062926292419434, + "learning_rate": 0.0008299830615734214, + "loss": 2.7425, + "step": 9372 + }, + { + "epoch": 0.27794087121549094, + "grad_norm": 0.1584906429052353, + "learning_rate": 0.0008299477109439536, + "loss": 2.7404, + "step": 9373 + }, + { + "epoch": 0.2779705245678024, + "grad_norm": 0.13888181746006012, + "learning_rate": 0.0008299123573927422, + "loss": 2.7426, + "step": 9374 + }, + { + "epoch": 0.2780001779201139, + "grad_norm": 0.14144639670848846, + "learning_rate": 0.0008298770009201002, + "loss": 2.7414, + "step": 9375 + }, + { + "epoch": 0.27802983127242537, + "grad_norm": 0.13998766243457794, + "learning_rate": 0.000829841641526341, + "loss": 2.6942, + "step": 9376 + }, + { + "epoch": 0.27805948462473684, + "grad_norm": 0.14628352224826813, + "learning_rate": 0.0008298062792117772, + "loss": 2.7369, + "step": 9377 + }, + { + "epoch": 0.2780891379770483, + "grad_norm": 0.13103340566158295, + "learning_rate": 0.0008297709139767222, + "loss": 2.7493, + "step": 9378 + }, + { + "epoch": 0.2781187913293598, + "grad_norm": 0.13334918022155762, + "learning_rate": 0.0008297355458214892, + "loss": 2.7636, + "step": 9379 + }, + { + "epoch": 0.27814844468167127, + "grad_norm": 0.14023779332637787, + "learning_rate": 0.0008297001747463915, + "loss": 2.728, + "step": 9380 + }, + { + "epoch": 0.27817809803398275, + "grad_norm": 0.14944782853126526, + "learning_rate": 0.0008296648007517417, + "loss": 2.757, + "step": 9381 + }, + { + "epoch": 0.2782077513862942, + "grad_norm": 0.17157432436943054, + "learning_rate": 0.000829629423837854, + "loss": 2.7074, + "step": 9382 + }, + { + "epoch": 0.2782374047386057, + "grad_norm": 0.1637069433927536, + "learning_rate": 0.000829594044005041, + "loss": 2.7575, + "step": 9383 + }, + { + "epoch": 0.2782670580909172, + "grad_norm": 0.1384565830230713, + "learning_rate": 0.0008295586612536161, + "loss": 2.7352, + "step": 9384 + }, + { + "epoch": 0.27829671144322865, + "grad_norm": 0.13684143126010895, + "learning_rate": 0.0008295232755838928, + "loss": 2.7473, + "step": 9385 + }, + { + "epoch": 0.2783263647955401, + "grad_norm": 0.14466312527656555, + "learning_rate": 0.0008294878869961842, + "loss": 2.7404, + "step": 9386 + }, + { + "epoch": 0.2783560181478516, + "grad_norm": 0.16679605841636658, + "learning_rate": 0.0008294524954908039, + "loss": 2.7546, + "step": 9387 + }, + { + "epoch": 0.2783856715001631, + "grad_norm": 0.15056225657463074, + "learning_rate": 0.0008294171010680652, + "loss": 2.7246, + "step": 9388 + }, + { + "epoch": 0.27841532485247455, + "grad_norm": 0.1341378092765808, + "learning_rate": 0.0008293817037282815, + "loss": 2.7438, + "step": 9389 + }, + { + "epoch": 0.27844497820478603, + "grad_norm": 0.133432537317276, + "learning_rate": 0.0008293463034717663, + "loss": 2.7049, + "step": 9390 + }, + { + "epoch": 0.2784746315570975, + "grad_norm": 0.15945294499397278, + "learning_rate": 0.0008293109002988331, + "loss": 2.7105, + "step": 9391 + }, + { + "epoch": 0.278504284909409, + "grad_norm": 0.1273593306541443, + "learning_rate": 0.0008292754942097954, + "loss": 2.7404, + "step": 9392 + }, + { + "epoch": 0.2785339382617205, + "grad_norm": 0.14390021562576294, + "learning_rate": 0.0008292400852049664, + "loss": 2.7222, + "step": 9393 + }, + { + "epoch": 0.278563591614032, + "grad_norm": 0.14191874861717224, + "learning_rate": 0.0008292046732846601, + "loss": 2.7596, + "step": 9394 + }, + { + "epoch": 0.27859324496634347, + "grad_norm": 0.11763321608304977, + "learning_rate": 0.0008291692584491899, + "loss": 2.7519, + "step": 9395 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 0.13058845698833466, + "learning_rate": 0.0008291338406988695, + "loss": 2.7541, + "step": 9396 + }, + { + "epoch": 0.2786525516709664, + "grad_norm": 0.13310062885284424, + "learning_rate": 0.0008290984200340122, + "loss": 2.7557, + "step": 9397 + }, + { + "epoch": 0.2786822050232779, + "grad_norm": 0.13598711788654327, + "learning_rate": 0.0008290629964549321, + "loss": 2.7365, + "step": 9398 + }, + { + "epoch": 0.27871185837558937, + "grad_norm": 0.13667017221450806, + "learning_rate": 0.0008290275699619427, + "loss": 2.7549, + "step": 9399 + }, + { + "epoch": 0.27874151172790085, + "grad_norm": 0.12172625213861465, + "learning_rate": 0.0008289921405553576, + "loss": 2.7273, + "step": 9400 + }, + { + "epoch": 0.2787711650802123, + "grad_norm": 0.14092384278774261, + "learning_rate": 0.0008289567082354907, + "loss": 2.741, + "step": 9401 + }, + { + "epoch": 0.2788008184325238, + "grad_norm": 0.12649981677532196, + "learning_rate": 0.0008289212730026557, + "loss": 2.6948, + "step": 9402 + }, + { + "epoch": 0.2788304717848353, + "grad_norm": 0.1431751549243927, + "learning_rate": 0.0008288858348571663, + "loss": 2.7589, + "step": 9403 + }, + { + "epoch": 0.27886012513714675, + "grad_norm": 0.16231054067611694, + "learning_rate": 0.0008288503937993364, + "loss": 2.7371, + "step": 9404 + }, + { + "epoch": 0.27888977848945823, + "grad_norm": 0.19069334864616394, + "learning_rate": 0.0008288149498294799, + "loss": 2.7497, + "step": 9405 + }, + { + "epoch": 0.2789194318417697, + "grad_norm": 0.19821161031723022, + "learning_rate": 0.0008287795029479104, + "loss": 2.7591, + "step": 9406 + }, + { + "epoch": 0.2789490851940812, + "grad_norm": 0.18347685039043427, + "learning_rate": 0.0008287440531549421, + "loss": 2.7523, + "step": 9407 + }, + { + "epoch": 0.27897873854639266, + "grad_norm": 0.14524178206920624, + "learning_rate": 0.0008287086004508887, + "loss": 2.747, + "step": 9408 + }, + { + "epoch": 0.27900839189870413, + "grad_norm": 0.13997900485992432, + "learning_rate": 0.0008286731448360643, + "loss": 2.7294, + "step": 9409 + }, + { + "epoch": 0.2790380452510156, + "grad_norm": 0.15312474966049194, + "learning_rate": 0.0008286376863107827, + "loss": 2.7506, + "step": 9410 + }, + { + "epoch": 0.2790676986033271, + "grad_norm": 0.1564895212650299, + "learning_rate": 0.000828602224875358, + "loss": 2.7441, + "step": 9411 + }, + { + "epoch": 0.27909735195563856, + "grad_norm": 0.15574581921100616, + "learning_rate": 0.0008285667605301044, + "loss": 2.7294, + "step": 9412 + }, + { + "epoch": 0.27912700530795004, + "grad_norm": 0.15592217445373535, + "learning_rate": 0.0008285312932753355, + "loss": 2.7549, + "step": 9413 + }, + { + "epoch": 0.27915665866026157, + "grad_norm": 0.14231181144714355, + "learning_rate": 0.0008284958231113655, + "loss": 2.7321, + "step": 9414 + }, + { + "epoch": 0.27918631201257305, + "grad_norm": 0.13607031106948853, + "learning_rate": 0.0008284603500385089, + "loss": 2.7405, + "step": 9415 + }, + { + "epoch": 0.2792159653648845, + "grad_norm": 0.1355397254228592, + "learning_rate": 0.0008284248740570794, + "loss": 2.724, + "step": 9416 + }, + { + "epoch": 0.279245618717196, + "grad_norm": 0.13611288368701935, + "learning_rate": 0.0008283893951673913, + "loss": 2.7276, + "step": 9417 + }, + { + "epoch": 0.2792752720695075, + "grad_norm": 0.13011184334754944, + "learning_rate": 0.0008283539133697586, + "loss": 2.7679, + "step": 9418 + }, + { + "epoch": 0.27930492542181895, + "grad_norm": 0.1394832283258438, + "learning_rate": 0.0008283184286644958, + "loss": 2.7304, + "step": 9419 + }, + { + "epoch": 0.2793345787741304, + "grad_norm": 0.12360572814941406, + "learning_rate": 0.0008282829410519168, + "loss": 2.7387, + "step": 9420 + }, + { + "epoch": 0.2793642321264419, + "grad_norm": 0.1480012983083725, + "learning_rate": 0.0008282474505323361, + "loss": 2.7306, + "step": 9421 + }, + { + "epoch": 0.2793938854787534, + "grad_norm": 0.14597289264202118, + "learning_rate": 0.000828211957106068, + "loss": 2.7593, + "step": 9422 + }, + { + "epoch": 0.27942353883106485, + "grad_norm": 0.1375865787267685, + "learning_rate": 0.0008281764607734266, + "loss": 2.727, + "step": 9423 + }, + { + "epoch": 0.27945319218337633, + "grad_norm": 0.15973037481307983, + "learning_rate": 0.0008281409615347262, + "loss": 2.7918, + "step": 9424 + }, + { + "epoch": 0.2794828455356878, + "grad_norm": 0.15709471702575684, + "learning_rate": 0.0008281054593902812, + "loss": 2.7235, + "step": 9425 + }, + { + "epoch": 0.2795124988879993, + "grad_norm": 0.13527542352676392, + "learning_rate": 0.0008280699543404063, + "loss": 2.7434, + "step": 9426 + }, + { + "epoch": 0.27954215224031076, + "grad_norm": 0.12297854572534561, + "learning_rate": 0.0008280344463854155, + "loss": 2.7244, + "step": 9427 + }, + { + "epoch": 0.27957180559262224, + "grad_norm": 0.15656910836696625, + "learning_rate": 0.0008279989355256235, + "loss": 2.7242, + "step": 9428 + }, + { + "epoch": 0.2796014589449337, + "grad_norm": 0.13724663853645325, + "learning_rate": 0.0008279634217613444, + "loss": 2.7606, + "step": 9429 + }, + { + "epoch": 0.2796311122972452, + "grad_norm": 0.13113123178482056, + "learning_rate": 0.000827927905092893, + "loss": 2.7509, + "step": 9430 + }, + { + "epoch": 0.27966076564955666, + "grad_norm": 0.14265751838684082, + "learning_rate": 0.0008278923855205838, + "loss": 2.7523, + "step": 9431 + }, + { + "epoch": 0.27969041900186814, + "grad_norm": 0.1466422826051712, + "learning_rate": 0.000827856863044731, + "loss": 2.7098, + "step": 9432 + }, + { + "epoch": 0.2797200723541796, + "grad_norm": 0.1509770154953003, + "learning_rate": 0.0008278213376656496, + "loss": 2.7516, + "step": 9433 + }, + { + "epoch": 0.2797497257064911, + "grad_norm": 0.15803447365760803, + "learning_rate": 0.0008277858093836541, + "loss": 2.7729, + "step": 9434 + }, + { + "epoch": 0.2797793790588026, + "grad_norm": 0.144461527466774, + "learning_rate": 0.0008277502781990588, + "loss": 2.7435, + "step": 9435 + }, + { + "epoch": 0.2798090324111141, + "grad_norm": 0.13802367448806763, + "learning_rate": 0.0008277147441121786, + "loss": 2.7391, + "step": 9436 + }, + { + "epoch": 0.2798386857634256, + "grad_norm": 0.21590550243854523, + "learning_rate": 0.0008276792071233281, + "loss": 2.769, + "step": 9437 + }, + { + "epoch": 0.27986833911573705, + "grad_norm": 0.18072213232517242, + "learning_rate": 0.000827643667232822, + "loss": 2.7456, + "step": 9438 + }, + { + "epoch": 0.27989799246804853, + "grad_norm": 0.17189174890518188, + "learning_rate": 0.0008276081244409747, + "loss": 2.7465, + "step": 9439 + }, + { + "epoch": 0.27992764582036, + "grad_norm": 0.17179006338119507, + "learning_rate": 0.0008275725787481017, + "loss": 2.7702, + "step": 9440 + }, + { + "epoch": 0.2799572991726715, + "grad_norm": 0.18025226891040802, + "learning_rate": 0.0008275370301545168, + "loss": 2.7543, + "step": 9441 + }, + { + "epoch": 0.27998695252498296, + "grad_norm": 0.16865161061286926, + "learning_rate": 0.0008275014786605357, + "loss": 2.7233, + "step": 9442 + }, + { + "epoch": 0.28001660587729443, + "grad_norm": 0.1739378571510315, + "learning_rate": 0.0008274659242664726, + "loss": 2.7494, + "step": 9443 + }, + { + "epoch": 0.2800462592296059, + "grad_norm": 0.14344896376132965, + "learning_rate": 0.0008274303669726426, + "loss": 2.7023, + "step": 9444 + }, + { + "epoch": 0.2800759125819174, + "grad_norm": 0.15281911194324493, + "learning_rate": 0.0008273948067793604, + "loss": 2.7429, + "step": 9445 + }, + { + "epoch": 0.28010556593422886, + "grad_norm": 0.15558713674545288, + "learning_rate": 0.000827359243686941, + "loss": 2.7274, + "step": 9446 + }, + { + "epoch": 0.28013521928654034, + "grad_norm": 0.13715028762817383, + "learning_rate": 0.0008273236776956994, + "loss": 2.722, + "step": 9447 + }, + { + "epoch": 0.2801648726388518, + "grad_norm": 0.12876154482364655, + "learning_rate": 0.0008272881088059504, + "loss": 2.6819, + "step": 9448 + }, + { + "epoch": 0.2801945259911633, + "grad_norm": 0.12463211268186569, + "learning_rate": 0.0008272525370180091, + "loss": 2.7349, + "step": 9449 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 0.12335114926099777, + "learning_rate": 0.0008272169623321903, + "loss": 2.7492, + "step": 9450 + }, + { + "epoch": 0.28025383269578624, + "grad_norm": 0.15014314651489258, + "learning_rate": 0.0008271813847488091, + "loss": 2.7244, + "step": 9451 + }, + { + "epoch": 0.2802834860480977, + "grad_norm": 0.11209968477487564, + "learning_rate": 0.0008271458042681805, + "loss": 2.7263, + "step": 9452 + }, + { + "epoch": 0.2803131394004092, + "grad_norm": 0.117177315056324, + "learning_rate": 0.0008271102208906199, + "loss": 2.7551, + "step": 9453 + }, + { + "epoch": 0.28034279275272067, + "grad_norm": 0.10865611582994461, + "learning_rate": 0.000827074634616442, + "loss": 2.7434, + "step": 9454 + }, + { + "epoch": 0.28037244610503215, + "grad_norm": 0.1139739528298378, + "learning_rate": 0.0008270390454459621, + "loss": 2.7627, + "step": 9455 + }, + { + "epoch": 0.2804020994573437, + "grad_norm": 0.11134032905101776, + "learning_rate": 0.0008270034533794955, + "loss": 2.741, + "step": 9456 + }, + { + "epoch": 0.28043175280965515, + "grad_norm": 0.1168205514550209, + "learning_rate": 0.0008269678584173569, + "loss": 2.7392, + "step": 9457 + }, + { + "epoch": 0.28046140616196663, + "grad_norm": 0.1289111226797104, + "learning_rate": 0.0008269322605598618, + "loss": 2.7478, + "step": 9458 + }, + { + "epoch": 0.2804910595142781, + "grad_norm": 0.1249934509396553, + "learning_rate": 0.0008268966598073256, + "loss": 2.7201, + "step": 9459 + }, + { + "epoch": 0.2805207128665896, + "grad_norm": 0.12165415287017822, + "learning_rate": 0.0008268610561600633, + "loss": 2.7411, + "step": 9460 + }, + { + "epoch": 0.28055036621890106, + "grad_norm": 0.1313081830739975, + "learning_rate": 0.0008268254496183903, + "loss": 2.7282, + "step": 9461 + }, + { + "epoch": 0.28058001957121254, + "grad_norm": 0.16343867778778076, + "learning_rate": 0.0008267898401826217, + "loss": 2.7255, + "step": 9462 + }, + { + "epoch": 0.280609672923524, + "grad_norm": 0.16264823079109192, + "learning_rate": 0.000826754227853073, + "loss": 2.7396, + "step": 9463 + }, + { + "epoch": 0.2806393262758355, + "grad_norm": 0.1632014364004135, + "learning_rate": 0.0008267186126300597, + "loss": 2.7575, + "step": 9464 + }, + { + "epoch": 0.28066897962814696, + "grad_norm": 0.14479640126228333, + "learning_rate": 0.0008266829945138967, + "loss": 2.7474, + "step": 9465 + }, + { + "epoch": 0.28069863298045844, + "grad_norm": 0.1550682932138443, + "learning_rate": 0.0008266473735048999, + "loss": 2.7252, + "step": 9466 + }, + { + "epoch": 0.2807282863327699, + "grad_norm": 0.1701563596725464, + "learning_rate": 0.0008266117496033845, + "loss": 2.7552, + "step": 9467 + }, + { + "epoch": 0.2807579396850814, + "grad_norm": 0.1747724413871765, + "learning_rate": 0.000826576122809666, + "loss": 2.7651, + "step": 9468 + }, + { + "epoch": 0.28078759303739287, + "grad_norm": 0.17252759635448456, + "learning_rate": 0.0008265404931240599, + "loss": 2.7056, + "step": 9469 + }, + { + "epoch": 0.28081724638970434, + "grad_norm": 0.1419326364994049, + "learning_rate": 0.0008265048605468816, + "loss": 2.7049, + "step": 9470 + }, + { + "epoch": 0.2808468997420158, + "grad_norm": 0.1362723559141159, + "learning_rate": 0.0008264692250784468, + "loss": 2.7059, + "step": 9471 + }, + { + "epoch": 0.2808765530943273, + "grad_norm": 0.14292143285274506, + "learning_rate": 0.0008264335867190711, + "loss": 2.7469, + "step": 9472 + }, + { + "epoch": 0.2809062064466388, + "grad_norm": 0.13826489448547363, + "learning_rate": 0.0008263979454690698, + "loss": 2.709, + "step": 9473 + }, + { + "epoch": 0.28093585979895025, + "grad_norm": 0.14278769493103027, + "learning_rate": 0.0008263623013287587, + "loss": 2.7528, + "step": 9474 + }, + { + "epoch": 0.2809655131512617, + "grad_norm": 0.15400798618793488, + "learning_rate": 0.0008263266542984534, + "loss": 2.7532, + "step": 9475 + }, + { + "epoch": 0.2809951665035732, + "grad_norm": 0.14652682840824127, + "learning_rate": 0.0008262910043784695, + "loss": 2.7557, + "step": 9476 + }, + { + "epoch": 0.28102481985588473, + "grad_norm": 0.16101688146591187, + "learning_rate": 0.0008262553515691228, + "loss": 2.7282, + "step": 9477 + }, + { + "epoch": 0.2810544732081962, + "grad_norm": 0.15896786749362946, + "learning_rate": 0.0008262196958707289, + "loss": 2.7276, + "step": 9478 + }, + { + "epoch": 0.2810841265605077, + "grad_norm": 0.16653546690940857, + "learning_rate": 0.0008261840372836037, + "loss": 2.7482, + "step": 9479 + }, + { + "epoch": 0.28111377991281916, + "grad_norm": 0.144122913479805, + "learning_rate": 0.0008261483758080628, + "loss": 2.7426, + "step": 9480 + }, + { + "epoch": 0.28114343326513064, + "grad_norm": 0.15235209465026855, + "learning_rate": 0.0008261127114444221, + "loss": 2.7403, + "step": 9481 + }, + { + "epoch": 0.2811730866174421, + "grad_norm": 0.14636841416358948, + "learning_rate": 0.0008260770441929974, + "loss": 2.7148, + "step": 9482 + }, + { + "epoch": 0.2812027399697536, + "grad_norm": 0.14392073452472687, + "learning_rate": 0.0008260413740541044, + "loss": 2.7306, + "step": 9483 + }, + { + "epoch": 0.28123239332206507, + "grad_norm": 0.1295301467180252, + "learning_rate": 0.000826005701028059, + "loss": 2.7653, + "step": 9484 + }, + { + "epoch": 0.28126204667437654, + "grad_norm": 0.1522274911403656, + "learning_rate": 0.0008259700251151774, + "loss": 2.7323, + "step": 9485 + }, + { + "epoch": 0.281291700026688, + "grad_norm": 0.15186549723148346, + "learning_rate": 0.0008259343463157752, + "loss": 2.7289, + "step": 9486 + }, + { + "epoch": 0.2813213533789995, + "grad_norm": 0.142185240983963, + "learning_rate": 0.0008258986646301683, + "loss": 2.7361, + "step": 9487 + }, + { + "epoch": 0.28135100673131097, + "grad_norm": 0.18217620253562927, + "learning_rate": 0.0008258629800586728, + "loss": 2.7385, + "step": 9488 + }, + { + "epoch": 0.28138066008362245, + "grad_norm": 0.2014121413230896, + "learning_rate": 0.0008258272926016048, + "loss": 2.7388, + "step": 9489 + }, + { + "epoch": 0.2814103134359339, + "grad_norm": 0.1715058982372284, + "learning_rate": 0.0008257916022592801, + "loss": 2.712, + "step": 9490 + }, + { + "epoch": 0.2814399667882454, + "grad_norm": 0.1770213395357132, + "learning_rate": 0.0008257559090320148, + "loss": 2.6959, + "step": 9491 + }, + { + "epoch": 0.2814696201405569, + "grad_norm": 0.17011143267154694, + "learning_rate": 0.0008257202129201252, + "loss": 2.7556, + "step": 9492 + }, + { + "epoch": 0.28149927349286835, + "grad_norm": 0.16416428983211517, + "learning_rate": 0.000825684513923927, + "loss": 2.7447, + "step": 9493 + }, + { + "epoch": 0.2815289268451798, + "grad_norm": 0.14186464250087738, + "learning_rate": 0.0008256488120437366, + "loss": 2.7583, + "step": 9494 + }, + { + "epoch": 0.2815585801974913, + "grad_norm": 0.15045776963233948, + "learning_rate": 0.0008256131072798701, + "loss": 2.7378, + "step": 9495 + }, + { + "epoch": 0.2815882335498028, + "grad_norm": 0.1374100148677826, + "learning_rate": 0.0008255773996326436, + "loss": 2.7442, + "step": 9496 + }, + { + "epoch": 0.2816178869021143, + "grad_norm": 0.12282941490411758, + "learning_rate": 0.0008255416891023733, + "loss": 2.7261, + "step": 9497 + }, + { + "epoch": 0.2816475402544258, + "grad_norm": 0.1393265724182129, + "learning_rate": 0.0008255059756893755, + "loss": 2.7547, + "step": 9498 + }, + { + "epoch": 0.28167719360673726, + "grad_norm": 0.13039550185203552, + "learning_rate": 0.0008254702593939665, + "loss": 2.7288, + "step": 9499 + }, + { + "epoch": 0.28170684695904874, + "grad_norm": 0.12958720326423645, + "learning_rate": 0.0008254345402164625, + "loss": 2.7343, + "step": 9500 + }, + { + "epoch": 0.2817365003113602, + "grad_norm": 0.12743060290813446, + "learning_rate": 0.0008253988181571797, + "loss": 2.7244, + "step": 9501 + }, + { + "epoch": 0.2817661536636717, + "grad_norm": 0.1290547251701355, + "learning_rate": 0.0008253630932164344, + "loss": 2.7448, + "step": 9502 + }, + { + "epoch": 0.28179580701598317, + "grad_norm": 0.14424584805965424, + "learning_rate": 0.0008253273653945432, + "loss": 2.738, + "step": 9503 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 0.16963952779769897, + "learning_rate": 0.0008252916346918223, + "loss": 2.7212, + "step": 9504 + }, + { + "epoch": 0.2818551137206061, + "grad_norm": 0.1603410392999649, + "learning_rate": 0.0008252559011085882, + "loss": 2.7569, + "step": 9505 + }, + { + "epoch": 0.2818847670729176, + "grad_norm": 0.13273455202579498, + "learning_rate": 0.0008252201646451573, + "loss": 2.7383, + "step": 9506 + }, + { + "epoch": 0.2819144204252291, + "grad_norm": 0.13526804745197296, + "learning_rate": 0.0008251844253018459, + "loss": 2.7736, + "step": 9507 + }, + { + "epoch": 0.28194407377754055, + "grad_norm": 0.13271979987621307, + "learning_rate": 0.0008251486830789707, + "loss": 2.7269, + "step": 9508 + }, + { + "epoch": 0.281973727129852, + "grad_norm": 0.13389863073825836, + "learning_rate": 0.000825112937976848, + "loss": 2.7672, + "step": 9509 + }, + { + "epoch": 0.2820033804821635, + "grad_norm": 0.13982312381267548, + "learning_rate": 0.0008250771899957945, + "loss": 2.7667, + "step": 9510 + }, + { + "epoch": 0.282033033834475, + "grad_norm": 0.1151002049446106, + "learning_rate": 0.0008250414391361265, + "loss": 2.7413, + "step": 9511 + }, + { + "epoch": 0.28206268718678645, + "grad_norm": 0.1441519558429718, + "learning_rate": 0.000825005685398161, + "loss": 2.7889, + "step": 9512 + }, + { + "epoch": 0.28209234053909793, + "grad_norm": 0.13749440014362335, + "learning_rate": 0.0008249699287822144, + "loss": 2.7556, + "step": 9513 + }, + { + "epoch": 0.2821219938914094, + "grad_norm": 0.14358775317668915, + "learning_rate": 0.0008249341692886031, + "loss": 2.7306, + "step": 9514 + }, + { + "epoch": 0.2821516472437209, + "grad_norm": 0.13085579872131348, + "learning_rate": 0.000824898406917644, + "loss": 2.7457, + "step": 9515 + }, + { + "epoch": 0.28218130059603236, + "grad_norm": 0.13058911263942719, + "learning_rate": 0.0008248626416696538, + "loss": 2.7358, + "step": 9516 + }, + { + "epoch": 0.28221095394834383, + "grad_norm": 0.11696333438158035, + "learning_rate": 0.0008248268735449491, + "loss": 2.7236, + "step": 9517 + }, + { + "epoch": 0.28224060730065537, + "grad_norm": 0.13987410068511963, + "learning_rate": 0.0008247911025438469, + "loss": 2.7292, + "step": 9518 + }, + { + "epoch": 0.28227026065296684, + "grad_norm": 0.1430034637451172, + "learning_rate": 0.0008247553286666634, + "loss": 2.7433, + "step": 9519 + }, + { + "epoch": 0.2822999140052783, + "grad_norm": 0.14365914463996887, + "learning_rate": 0.0008247195519137158, + "loss": 2.7179, + "step": 9520 + }, + { + "epoch": 0.2823295673575898, + "grad_norm": 0.13983391225337982, + "learning_rate": 0.000824683772285321, + "loss": 2.7173, + "step": 9521 + }, + { + "epoch": 0.28235922070990127, + "grad_norm": 0.12269478291273117, + "learning_rate": 0.0008246479897817956, + "loss": 2.7424, + "step": 9522 + }, + { + "epoch": 0.28238887406221275, + "grad_norm": 0.11985426396131516, + "learning_rate": 0.0008246122044034564, + "loss": 2.7027, + "step": 9523 + }, + { + "epoch": 0.2824185274145242, + "grad_norm": 0.12872344255447388, + "learning_rate": 0.0008245764161506205, + "loss": 2.7455, + "step": 9524 + }, + { + "epoch": 0.2824481807668357, + "grad_norm": 0.13254167139530182, + "learning_rate": 0.0008245406250236047, + "loss": 2.7455, + "step": 9525 + }, + { + "epoch": 0.2824778341191472, + "grad_norm": 0.12929491698741913, + "learning_rate": 0.0008245048310227261, + "loss": 2.7187, + "step": 9526 + }, + { + "epoch": 0.28250748747145865, + "grad_norm": 0.12600132822990417, + "learning_rate": 0.0008244690341483015, + "loss": 2.7667, + "step": 9527 + }, + { + "epoch": 0.2825371408237701, + "grad_norm": 0.1261732429265976, + "learning_rate": 0.0008244332344006476, + "loss": 2.724, + "step": 9528 + }, + { + "epoch": 0.2825667941760816, + "grad_norm": 0.1322585493326187, + "learning_rate": 0.000824397431780082, + "loss": 2.7038, + "step": 9529 + }, + { + "epoch": 0.2825964475283931, + "grad_norm": 0.1462005078792572, + "learning_rate": 0.0008243616262869213, + "loss": 2.7624, + "step": 9530 + }, + { + "epoch": 0.28262610088070456, + "grad_norm": 0.17940916121006012, + "learning_rate": 0.0008243258179214828, + "loss": 2.7251, + "step": 9531 + }, + { + "epoch": 0.28265575423301603, + "grad_norm": 0.20049461722373962, + "learning_rate": 0.0008242900066840837, + "loss": 2.7681, + "step": 9532 + }, + { + "epoch": 0.2826854075853275, + "grad_norm": 0.19292065501213074, + "learning_rate": 0.0008242541925750406, + "loss": 2.7678, + "step": 9533 + }, + { + "epoch": 0.282715060937639, + "grad_norm": 0.15558955073356628, + "learning_rate": 0.0008242183755946713, + "loss": 2.7023, + "step": 9534 + }, + { + "epoch": 0.28274471428995046, + "grad_norm": 0.1719508022069931, + "learning_rate": 0.0008241825557432924, + "loss": 2.7414, + "step": 9535 + }, + { + "epoch": 0.28277436764226194, + "grad_norm": 0.18187688291072845, + "learning_rate": 0.0008241467330212213, + "loss": 2.7308, + "step": 9536 + }, + { + "epoch": 0.2828040209945734, + "grad_norm": 0.18244658410549164, + "learning_rate": 0.0008241109074287753, + "loss": 2.743, + "step": 9537 + }, + { + "epoch": 0.2828336743468849, + "grad_norm": 0.1669139713048935, + "learning_rate": 0.0008240750789662716, + "loss": 2.7259, + "step": 9538 + }, + { + "epoch": 0.2828633276991964, + "grad_norm": 0.1815773993730545, + "learning_rate": 0.0008240392476340275, + "loss": 2.7457, + "step": 9539 + }, + { + "epoch": 0.2828929810515079, + "grad_norm": 0.17506733536720276, + "learning_rate": 0.0008240034134323602, + "loss": 2.7334, + "step": 9540 + }, + { + "epoch": 0.2829226344038194, + "grad_norm": 0.14955799281597137, + "learning_rate": 0.000823967576361587, + "loss": 2.7233, + "step": 9541 + }, + { + "epoch": 0.28295228775613085, + "grad_norm": 0.12976676225662231, + "learning_rate": 0.0008239317364220253, + "loss": 2.7488, + "step": 9542 + }, + { + "epoch": 0.2829819411084423, + "grad_norm": 0.13593077659606934, + "learning_rate": 0.0008238958936139926, + "loss": 2.7053, + "step": 9543 + }, + { + "epoch": 0.2830115944607538, + "grad_norm": 0.12106840312480927, + "learning_rate": 0.0008238600479378061, + "loss": 2.7302, + "step": 9544 + }, + { + "epoch": 0.2830412478130653, + "grad_norm": 0.14184457063674927, + "learning_rate": 0.0008238241993937833, + "loss": 2.7935, + "step": 9545 + }, + { + "epoch": 0.28307090116537675, + "grad_norm": 0.1504947990179062, + "learning_rate": 0.0008237883479822416, + "loss": 2.7522, + "step": 9546 + }, + { + "epoch": 0.28310055451768823, + "grad_norm": 0.12297342717647552, + "learning_rate": 0.0008237524937034986, + "loss": 2.7496, + "step": 9547 + }, + { + "epoch": 0.2831302078699997, + "grad_norm": 0.12103726714849472, + "learning_rate": 0.0008237166365578716, + "loss": 2.7165, + "step": 9548 + }, + { + "epoch": 0.2831598612223112, + "grad_norm": 0.1284034699201584, + "learning_rate": 0.000823680776545678, + "loss": 2.6995, + "step": 9549 + }, + { + "epoch": 0.28318951457462266, + "grad_norm": 0.14985722303390503, + "learning_rate": 0.000823644913667236, + "loss": 2.7268, + "step": 9550 + }, + { + "epoch": 0.28321916792693413, + "grad_norm": 0.15148523449897766, + "learning_rate": 0.0008236090479228624, + "loss": 2.7347, + "step": 9551 + }, + { + "epoch": 0.2832488212792456, + "grad_norm": 0.13272970914840698, + "learning_rate": 0.0008235731793128754, + "loss": 2.7374, + "step": 9552 + }, + { + "epoch": 0.2832784746315571, + "grad_norm": 0.13256388902664185, + "learning_rate": 0.0008235373078375921, + "loss": 2.7732, + "step": 9553 + }, + { + "epoch": 0.28330812798386856, + "grad_norm": 0.13865253329277039, + "learning_rate": 0.0008235014334973305, + "loss": 2.7439, + "step": 9554 + }, + { + "epoch": 0.28333778133618004, + "grad_norm": 0.16138753294944763, + "learning_rate": 0.0008234655562924082, + "loss": 2.7006, + "step": 9555 + }, + { + "epoch": 0.2833674346884915, + "grad_norm": 0.17049208283424377, + "learning_rate": 0.0008234296762231429, + "loss": 2.7577, + "step": 9556 + }, + { + "epoch": 0.283397088040803, + "grad_norm": 0.17326617240905762, + "learning_rate": 0.0008233937932898523, + "loss": 2.7033, + "step": 9557 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 0.16386331617832184, + "learning_rate": 0.0008233579074928541, + "loss": 2.7368, + "step": 9558 + }, + { + "epoch": 0.28345639474542594, + "grad_norm": 0.176985964179039, + "learning_rate": 0.0008233220188324661, + "loss": 2.7291, + "step": 9559 + }, + { + "epoch": 0.2834860480977375, + "grad_norm": 0.17057515680789948, + "learning_rate": 0.0008232861273090062, + "loss": 2.73, + "step": 9560 + }, + { + "epoch": 0.28351570145004895, + "grad_norm": 0.14551375806331635, + "learning_rate": 0.0008232502329227923, + "loss": 2.7421, + "step": 9561 + }, + { + "epoch": 0.2835453548023604, + "grad_norm": 0.136226087808609, + "learning_rate": 0.0008232143356741418, + "loss": 2.7452, + "step": 9562 + }, + { + "epoch": 0.2835750081546719, + "grad_norm": 0.1386631578207016, + "learning_rate": 0.000823178435563373, + "loss": 2.7164, + "step": 9563 + }, + { + "epoch": 0.2836046615069834, + "grad_norm": 0.14278383553028107, + "learning_rate": 0.0008231425325908037, + "loss": 2.6966, + "step": 9564 + }, + { + "epoch": 0.28363431485929486, + "grad_norm": 0.13003253936767578, + "learning_rate": 0.0008231066267567517, + "loss": 2.7331, + "step": 9565 + }, + { + "epoch": 0.28366396821160633, + "grad_norm": 0.1346689909696579, + "learning_rate": 0.0008230707180615353, + "loss": 2.726, + "step": 9566 + }, + { + "epoch": 0.2836936215639178, + "grad_norm": 0.1325702667236328, + "learning_rate": 0.000823034806505472, + "loss": 2.7529, + "step": 9567 + }, + { + "epoch": 0.2837232749162293, + "grad_norm": 0.12384305894374847, + "learning_rate": 0.0008229988920888801, + "loss": 2.7266, + "step": 9568 + }, + { + "epoch": 0.28375292826854076, + "grad_norm": 0.13866519927978516, + "learning_rate": 0.0008229629748120777, + "loss": 2.7524, + "step": 9569 + }, + { + "epoch": 0.28378258162085224, + "grad_norm": 0.1561243087053299, + "learning_rate": 0.0008229270546753827, + "loss": 2.7201, + "step": 9570 + }, + { + "epoch": 0.2838122349731637, + "grad_norm": 0.16894279420375824, + "learning_rate": 0.000822891131679113, + "loss": 2.749, + "step": 9571 + }, + { + "epoch": 0.2838418883254752, + "grad_norm": 0.17642982304096222, + "learning_rate": 0.000822855205823587, + "loss": 2.7655, + "step": 9572 + }, + { + "epoch": 0.28387154167778667, + "grad_norm": 0.18058674037456512, + "learning_rate": 0.0008228192771091229, + "loss": 2.7109, + "step": 9573 + }, + { + "epoch": 0.28390119503009814, + "grad_norm": 0.15425626933574677, + "learning_rate": 0.0008227833455360385, + "loss": 2.727, + "step": 9574 + }, + { + "epoch": 0.2839308483824096, + "grad_norm": 0.134149432182312, + "learning_rate": 0.0008227474111046522, + "loss": 2.7663, + "step": 9575 + }, + { + "epoch": 0.2839605017347211, + "grad_norm": 0.1300327032804489, + "learning_rate": 0.0008227114738152822, + "loss": 2.7243, + "step": 9576 + }, + { + "epoch": 0.28399015508703257, + "grad_norm": 0.1394234448671341, + "learning_rate": 0.0008226755336682468, + "loss": 2.7511, + "step": 9577 + }, + { + "epoch": 0.28401980843934405, + "grad_norm": 0.12546463310718536, + "learning_rate": 0.000822639590663864, + "loss": 2.7126, + "step": 9578 + }, + { + "epoch": 0.2840494617916555, + "grad_norm": 0.13083580136299133, + "learning_rate": 0.0008226036448024523, + "loss": 2.7279, + "step": 9579 + }, + { + "epoch": 0.284079115143967, + "grad_norm": 0.1295660138130188, + "learning_rate": 0.00082256769608433, + "loss": 2.7661, + "step": 9580 + }, + { + "epoch": 0.28410876849627853, + "grad_norm": 0.1288653314113617, + "learning_rate": 0.0008225317445098153, + "loss": 2.7441, + "step": 9581 + }, + { + "epoch": 0.28413842184859, + "grad_norm": 0.13037405908107758, + "learning_rate": 0.0008224957900792267, + "loss": 2.7068, + "step": 9582 + }, + { + "epoch": 0.2841680752009015, + "grad_norm": 0.11124968528747559, + "learning_rate": 0.0008224598327928825, + "loss": 2.7464, + "step": 9583 + }, + { + "epoch": 0.28419772855321296, + "grad_norm": 0.12308112531900406, + "learning_rate": 0.0008224238726511012, + "loss": 2.7363, + "step": 9584 + }, + { + "epoch": 0.28422738190552443, + "grad_norm": 0.13241788744926453, + "learning_rate": 0.0008223879096542011, + "loss": 2.7345, + "step": 9585 + }, + { + "epoch": 0.2842570352578359, + "grad_norm": 0.11424005031585693, + "learning_rate": 0.0008223519438025007, + "loss": 2.722, + "step": 9586 + }, + { + "epoch": 0.2842866886101474, + "grad_norm": 0.12757356464862823, + "learning_rate": 0.0008223159750963186, + "loss": 2.7286, + "step": 9587 + }, + { + "epoch": 0.28431634196245886, + "grad_norm": 0.14946967363357544, + "learning_rate": 0.0008222800035359729, + "loss": 2.7752, + "step": 9588 + }, + { + "epoch": 0.28434599531477034, + "grad_norm": 0.13074891269207, + "learning_rate": 0.000822244029121783, + "loss": 2.7553, + "step": 9589 + }, + { + "epoch": 0.2843756486670818, + "grad_norm": 0.14592714607715607, + "learning_rate": 0.0008222080518540665, + "loss": 2.7452, + "step": 9590 + }, + { + "epoch": 0.2844053020193933, + "grad_norm": 0.1691220998764038, + "learning_rate": 0.0008221720717331425, + "loss": 2.7271, + "step": 9591 + }, + { + "epoch": 0.28443495537170477, + "grad_norm": 0.17669831216335297, + "learning_rate": 0.0008221360887593296, + "loss": 2.7522, + "step": 9592 + }, + { + "epoch": 0.28446460872401624, + "grad_norm": 0.1842852532863617, + "learning_rate": 0.0008221001029329462, + "loss": 2.7376, + "step": 9593 + }, + { + "epoch": 0.2844942620763277, + "grad_norm": 0.1677079200744629, + "learning_rate": 0.0008220641142543112, + "loss": 2.7416, + "step": 9594 + }, + { + "epoch": 0.2845239154286392, + "grad_norm": 0.16536501049995422, + "learning_rate": 0.0008220281227237431, + "loss": 2.7354, + "step": 9595 + }, + { + "epoch": 0.28455356878095067, + "grad_norm": 0.16576358675956726, + "learning_rate": 0.0008219921283415608, + "loss": 2.7077, + "step": 9596 + }, + { + "epoch": 0.28458322213326215, + "grad_norm": 0.1556086391210556, + "learning_rate": 0.0008219561311080827, + "loss": 2.7565, + "step": 9597 + }, + { + "epoch": 0.2846128754855736, + "grad_norm": 0.13482362031936646, + "learning_rate": 0.0008219201310236282, + "loss": 2.7278, + "step": 9598 + }, + { + "epoch": 0.2846425288378851, + "grad_norm": 0.15599355101585388, + "learning_rate": 0.0008218841280885153, + "loss": 2.7364, + "step": 9599 + }, + { + "epoch": 0.2846721821901966, + "grad_norm": 0.15094849467277527, + "learning_rate": 0.0008218481223030634, + "loss": 2.7697, + "step": 9600 + }, + { + "epoch": 0.2847018355425081, + "grad_norm": 0.12303861975669861, + "learning_rate": 0.000821812113667591, + "loss": 2.7352, + "step": 9601 + }, + { + "epoch": 0.2847314888948196, + "grad_norm": 0.1501454859972, + "learning_rate": 0.0008217761021824172, + "loss": 2.7247, + "step": 9602 + }, + { + "epoch": 0.28476114224713106, + "grad_norm": 0.13980892300605774, + "learning_rate": 0.0008217400878478608, + "loss": 2.7373, + "step": 9603 + }, + { + "epoch": 0.28479079559944254, + "grad_norm": 0.12257712334394455, + "learning_rate": 0.0008217040706642407, + "loss": 2.7199, + "step": 9604 + }, + { + "epoch": 0.284820448951754, + "grad_norm": 0.11841963231563568, + "learning_rate": 0.0008216680506318757, + "loss": 2.7242, + "step": 9605 + }, + { + "epoch": 0.2848501023040655, + "grad_norm": 0.13127164542675018, + "learning_rate": 0.000821632027751085, + "loss": 2.7463, + "step": 9606 + }, + { + "epoch": 0.28487975565637697, + "grad_norm": 0.13794872164726257, + "learning_rate": 0.0008215960020221874, + "loss": 2.7762, + "step": 9607 + }, + { + "epoch": 0.28490940900868844, + "grad_norm": 0.15107154846191406, + "learning_rate": 0.0008215599734455022, + "loss": 2.7485, + "step": 9608 + }, + { + "epoch": 0.2849390623609999, + "grad_norm": 0.17299215495586395, + "learning_rate": 0.0008215239420213481, + "loss": 2.75, + "step": 9609 + }, + { + "epoch": 0.2849687157133114, + "grad_norm": 0.17453722655773163, + "learning_rate": 0.0008214879077500444, + "loss": 2.7252, + "step": 9610 + }, + { + "epoch": 0.28499836906562287, + "grad_norm": 0.16074135899543762, + "learning_rate": 0.0008214518706319099, + "loss": 2.7696, + "step": 9611 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 0.14200493693351746, + "learning_rate": 0.0008214158306672641, + "loss": 2.7454, + "step": 9612 + }, + { + "epoch": 0.2850576757702458, + "grad_norm": 0.12619280815124512, + "learning_rate": 0.0008213797878564257, + "loss": 2.7323, + "step": 9613 + }, + { + "epoch": 0.2850873291225573, + "grad_norm": 0.12928129732608795, + "learning_rate": 0.0008213437421997143, + "loss": 2.7563, + "step": 9614 + }, + { + "epoch": 0.2851169824748688, + "grad_norm": 0.14024299383163452, + "learning_rate": 0.000821307693697449, + "loss": 2.7346, + "step": 9615 + }, + { + "epoch": 0.28514663582718025, + "grad_norm": 0.1608179360628128, + "learning_rate": 0.0008212716423499488, + "loss": 2.7701, + "step": 9616 + }, + { + "epoch": 0.2851762891794917, + "grad_norm": 0.1683962196111679, + "learning_rate": 0.0008212355881575331, + "loss": 2.7536, + "step": 9617 + }, + { + "epoch": 0.2852059425318032, + "grad_norm": 0.13460588455200195, + "learning_rate": 0.0008211995311205211, + "loss": 2.7204, + "step": 9618 + }, + { + "epoch": 0.2852355958841147, + "grad_norm": 0.13871799409389496, + "learning_rate": 0.0008211634712392321, + "loss": 2.7392, + "step": 9619 + }, + { + "epoch": 0.28526524923642615, + "grad_norm": 0.13530893623828888, + "learning_rate": 0.0008211274085139854, + "loss": 2.7083, + "step": 9620 + }, + { + "epoch": 0.28529490258873763, + "grad_norm": 0.13116466999053955, + "learning_rate": 0.0008210913429451003, + "loss": 2.7606, + "step": 9621 + }, + { + "epoch": 0.28532455594104916, + "grad_norm": 0.1161784753203392, + "learning_rate": 0.0008210552745328965, + "loss": 2.7297, + "step": 9622 + }, + { + "epoch": 0.28535420929336064, + "grad_norm": 0.13848569989204407, + "learning_rate": 0.000821019203277693, + "loss": 2.7206, + "step": 9623 + }, + { + "epoch": 0.2853838626456721, + "grad_norm": 0.1293622851371765, + "learning_rate": 0.0008209831291798094, + "loss": 2.7156, + "step": 9624 + }, + { + "epoch": 0.2854135159979836, + "grad_norm": 0.1353253424167633, + "learning_rate": 0.0008209470522395651, + "loss": 2.7706, + "step": 9625 + }, + { + "epoch": 0.28544316935029507, + "grad_norm": 0.14350369572639465, + "learning_rate": 0.0008209109724572794, + "loss": 2.7399, + "step": 9626 + }, + { + "epoch": 0.28547282270260654, + "grad_norm": 0.1571044772863388, + "learning_rate": 0.000820874889833272, + "loss": 2.7609, + "step": 9627 + }, + { + "epoch": 0.285502476054918, + "grad_norm": 0.15242081880569458, + "learning_rate": 0.0008208388043678625, + "loss": 2.7432, + "step": 9628 + }, + { + "epoch": 0.2855321294072295, + "grad_norm": 0.13051418960094452, + "learning_rate": 0.0008208027160613704, + "loss": 2.7446, + "step": 9629 + }, + { + "epoch": 0.28556178275954097, + "grad_norm": 0.13572478294372559, + "learning_rate": 0.000820766624914115, + "loss": 2.7589, + "step": 9630 + }, + { + "epoch": 0.28559143611185245, + "grad_norm": 0.12308996915817261, + "learning_rate": 0.0008207305309264161, + "loss": 2.7392, + "step": 9631 + }, + { + "epoch": 0.2856210894641639, + "grad_norm": 0.11871939152479172, + "learning_rate": 0.0008206944340985933, + "loss": 2.7236, + "step": 9632 + }, + { + "epoch": 0.2856507428164754, + "grad_norm": 0.14233750104904175, + "learning_rate": 0.0008206583344309664, + "loss": 2.7336, + "step": 9633 + }, + { + "epoch": 0.2856803961687869, + "grad_norm": 0.1330062747001648, + "learning_rate": 0.0008206222319238547, + "loss": 2.7456, + "step": 9634 + }, + { + "epoch": 0.28571004952109835, + "grad_norm": 0.153945654630661, + "learning_rate": 0.0008205861265775782, + "loss": 2.7318, + "step": 9635 + }, + { + "epoch": 0.28573970287340983, + "grad_norm": 0.15550477802753448, + "learning_rate": 0.0008205500183924565, + "loss": 2.7183, + "step": 9636 + }, + { + "epoch": 0.2857693562257213, + "grad_norm": 0.1614988148212433, + "learning_rate": 0.0008205139073688092, + "loss": 2.7332, + "step": 9637 + }, + { + "epoch": 0.2857990095780328, + "grad_norm": 0.18064892292022705, + "learning_rate": 0.0008204777935069566, + "loss": 2.7568, + "step": 9638 + }, + { + "epoch": 0.28582866293034426, + "grad_norm": 0.1766478270292282, + "learning_rate": 0.0008204416768072178, + "loss": 2.7234, + "step": 9639 + }, + { + "epoch": 0.28585831628265573, + "grad_norm": 0.16308891773223877, + "learning_rate": 0.000820405557269913, + "loss": 2.7288, + "step": 9640 + }, + { + "epoch": 0.2858879696349672, + "grad_norm": 0.16716517508029938, + "learning_rate": 0.0008203694348953622, + "loss": 2.7371, + "step": 9641 + }, + { + "epoch": 0.2859176229872787, + "grad_norm": 0.1572481095790863, + "learning_rate": 0.000820333309683885, + "loss": 2.7279, + "step": 9642 + }, + { + "epoch": 0.2859472763395902, + "grad_norm": 0.1286490261554718, + "learning_rate": 0.0008202971816358011, + "loss": 2.7583, + "step": 9643 + }, + { + "epoch": 0.2859769296919017, + "grad_norm": 0.1312839537858963, + "learning_rate": 0.000820261050751431, + "loss": 2.7382, + "step": 9644 + }, + { + "epoch": 0.28600658304421317, + "grad_norm": 0.12894761562347412, + "learning_rate": 0.0008202249170310942, + "loss": 2.7079, + "step": 9645 + }, + { + "epoch": 0.28603623639652465, + "grad_norm": 0.13520947098731995, + "learning_rate": 0.0008201887804751107, + "loss": 2.7086, + "step": 9646 + }, + { + "epoch": 0.2860658897488361, + "grad_norm": 0.12670789659023285, + "learning_rate": 0.0008201526410838007, + "loss": 2.7219, + "step": 9647 + }, + { + "epoch": 0.2860955431011476, + "grad_norm": 0.11696305871009827, + "learning_rate": 0.000820116498857484, + "loss": 2.7107, + "step": 9648 + }, + { + "epoch": 0.2861251964534591, + "grad_norm": 0.14708304405212402, + "learning_rate": 0.000820080353796481, + "loss": 2.7165, + "step": 9649 + }, + { + "epoch": 0.28615484980577055, + "grad_norm": 0.16606400907039642, + "learning_rate": 0.0008200442059011113, + "loss": 2.771, + "step": 9650 + }, + { + "epoch": 0.286184503158082, + "grad_norm": 0.22039680182933807, + "learning_rate": 0.0008200080551716953, + "loss": 2.7212, + "step": 9651 + }, + { + "epoch": 0.2862141565103935, + "grad_norm": 0.23158353567123413, + "learning_rate": 0.000819971901608553, + "loss": 2.6962, + "step": 9652 + }, + { + "epoch": 0.286243809862705, + "grad_norm": 0.17174507677555084, + "learning_rate": 0.0008199357452120047, + "loss": 2.7016, + "step": 9653 + }, + { + "epoch": 0.28627346321501645, + "grad_norm": 0.15591353178024292, + "learning_rate": 0.0008198995859823703, + "loss": 2.7541, + "step": 9654 + }, + { + "epoch": 0.28630311656732793, + "grad_norm": 0.17145131528377533, + "learning_rate": 0.0008198634239199703, + "loss": 2.7218, + "step": 9655 + }, + { + "epoch": 0.2863327699196394, + "grad_norm": 0.1467595100402832, + "learning_rate": 0.0008198272590251246, + "loss": 2.7367, + "step": 9656 + }, + { + "epoch": 0.2863624232719509, + "grad_norm": 0.1452733278274536, + "learning_rate": 0.0008197910912981538, + "loss": 2.7331, + "step": 9657 + }, + { + "epoch": 0.28639207662426236, + "grad_norm": 0.14761316776275635, + "learning_rate": 0.000819754920739378, + "loss": 2.7306, + "step": 9658 + }, + { + "epoch": 0.28642172997657384, + "grad_norm": 0.13633272051811218, + "learning_rate": 0.0008197187473491173, + "loss": 2.7646, + "step": 9659 + }, + { + "epoch": 0.2864513833288853, + "grad_norm": 0.12042942643165588, + "learning_rate": 0.0008196825711276923, + "loss": 2.7442, + "step": 9660 + }, + { + "epoch": 0.2864810366811968, + "grad_norm": 0.1252843737602234, + "learning_rate": 0.0008196463920754234, + "loss": 2.7424, + "step": 9661 + }, + { + "epoch": 0.28651069003350826, + "grad_norm": 0.12687596678733826, + "learning_rate": 0.0008196102101926306, + "loss": 2.7213, + "step": 9662 + }, + { + "epoch": 0.28654034338581974, + "grad_norm": 0.1277705281972885, + "learning_rate": 0.0008195740254796347, + "loss": 2.7373, + "step": 9663 + }, + { + "epoch": 0.28656999673813127, + "grad_norm": 0.13661354780197144, + "learning_rate": 0.000819537837936756, + "loss": 2.7375, + "step": 9664 + }, + { + "epoch": 0.28659965009044275, + "grad_norm": 0.15087130665779114, + "learning_rate": 0.0008195016475643147, + "loss": 2.7252, + "step": 9665 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 0.13821913301944733, + "learning_rate": 0.0008194654543626316, + "loss": 2.7339, + "step": 9666 + }, + { + "epoch": 0.2866589567950657, + "grad_norm": 0.1177930012345314, + "learning_rate": 0.000819429258332027, + "loss": 2.6971, + "step": 9667 + }, + { + "epoch": 0.2866886101473772, + "grad_norm": 0.12648169696331024, + "learning_rate": 0.0008193930594728216, + "loss": 2.7354, + "step": 9668 + }, + { + "epoch": 0.28671826349968865, + "grad_norm": 0.12800419330596924, + "learning_rate": 0.0008193568577853356, + "loss": 2.7675, + "step": 9669 + }, + { + "epoch": 0.28674791685200013, + "grad_norm": 0.11666657030582428, + "learning_rate": 0.00081932065326989, + "loss": 2.7312, + "step": 9670 + }, + { + "epoch": 0.2867775702043116, + "grad_norm": 0.13555550575256348, + "learning_rate": 0.0008192844459268052, + "loss": 2.7191, + "step": 9671 + }, + { + "epoch": 0.2868072235566231, + "grad_norm": 0.1391928642988205, + "learning_rate": 0.0008192482357564018, + "loss": 2.7667, + "step": 9672 + }, + { + "epoch": 0.28683687690893456, + "grad_norm": 0.13376562297344208, + "learning_rate": 0.0008192120227590006, + "loss": 2.7438, + "step": 9673 + }, + { + "epoch": 0.28686653026124603, + "grad_norm": 0.15871751308441162, + "learning_rate": 0.000819175806934922, + "loss": 2.7399, + "step": 9674 + }, + { + "epoch": 0.2868961836135575, + "grad_norm": 0.1529640555381775, + "learning_rate": 0.0008191395882844867, + "loss": 2.716, + "step": 9675 + }, + { + "epoch": 0.286925836965869, + "grad_norm": 0.11658649146556854, + "learning_rate": 0.0008191033668080159, + "loss": 2.7376, + "step": 9676 + }, + { + "epoch": 0.28695549031818046, + "grad_norm": 0.11265364289283752, + "learning_rate": 0.0008190671425058298, + "loss": 2.7358, + "step": 9677 + }, + { + "epoch": 0.28698514367049194, + "grad_norm": 0.13116469979286194, + "learning_rate": 0.0008190309153782493, + "loss": 2.7119, + "step": 9678 + }, + { + "epoch": 0.2870147970228034, + "grad_norm": 0.13761892914772034, + "learning_rate": 0.0008189946854255954, + "loss": 2.7335, + "step": 9679 + }, + { + "epoch": 0.2870444503751149, + "grad_norm": 0.12954525649547577, + "learning_rate": 0.000818958452648189, + "loss": 2.7381, + "step": 9680 + }, + { + "epoch": 0.28707410372742637, + "grad_norm": 0.11861580610275269, + "learning_rate": 0.0008189222170463504, + "loss": 2.7395, + "step": 9681 + }, + { + "epoch": 0.28710375707973784, + "grad_norm": 0.11787911504507065, + "learning_rate": 0.0008188859786204011, + "loss": 2.7649, + "step": 9682 + }, + { + "epoch": 0.2871334104320493, + "grad_norm": 0.1430557817220688, + "learning_rate": 0.0008188497373706616, + "loss": 2.7219, + "step": 9683 + }, + { + "epoch": 0.2871630637843608, + "grad_norm": 0.14362527430057526, + "learning_rate": 0.000818813493297453, + "loss": 2.7181, + "step": 9684 + }, + { + "epoch": 0.2871927171366723, + "grad_norm": 0.13333125412464142, + "learning_rate": 0.0008187772464010961, + "loss": 2.7418, + "step": 9685 + }, + { + "epoch": 0.2872223704889838, + "grad_norm": 0.1307772845029831, + "learning_rate": 0.000818740996681912, + "loss": 2.7276, + "step": 9686 + }, + { + "epoch": 0.2872520238412953, + "grad_norm": 0.13438761234283447, + "learning_rate": 0.0008187047441402217, + "loss": 2.7482, + "step": 9687 + }, + { + "epoch": 0.28728167719360675, + "grad_norm": 0.1402485966682434, + "learning_rate": 0.0008186684887763463, + "loss": 2.6938, + "step": 9688 + }, + { + "epoch": 0.28731133054591823, + "grad_norm": 0.1478273868560791, + "learning_rate": 0.0008186322305906065, + "loss": 2.7548, + "step": 9689 + }, + { + "epoch": 0.2873409838982297, + "grad_norm": 0.15230895578861237, + "learning_rate": 0.0008185959695833238, + "loss": 2.7844, + "step": 9690 + }, + { + "epoch": 0.2873706372505412, + "grad_norm": 0.15795297920703888, + "learning_rate": 0.0008185597057548189, + "loss": 2.7514, + "step": 9691 + }, + { + "epoch": 0.28740029060285266, + "grad_norm": 0.16016167402267456, + "learning_rate": 0.0008185234391054133, + "loss": 2.7481, + "step": 9692 + }, + { + "epoch": 0.28742994395516414, + "grad_norm": 0.1476651281118393, + "learning_rate": 0.0008184871696354279, + "loss": 2.7362, + "step": 9693 + }, + { + "epoch": 0.2874595973074756, + "grad_norm": 0.14148230850696564, + "learning_rate": 0.0008184508973451839, + "loss": 2.7138, + "step": 9694 + }, + { + "epoch": 0.2874892506597871, + "grad_norm": 0.15895895659923553, + "learning_rate": 0.0008184146222350026, + "loss": 2.7524, + "step": 9695 + }, + { + "epoch": 0.28751890401209856, + "grad_norm": 0.16222181916236877, + "learning_rate": 0.0008183783443052053, + "loss": 2.672, + "step": 9696 + }, + { + "epoch": 0.28754855736441004, + "grad_norm": 0.1719609498977661, + "learning_rate": 0.000818342063556113, + "loss": 2.7659, + "step": 9697 + }, + { + "epoch": 0.2875782107167215, + "grad_norm": 0.17547444999217987, + "learning_rate": 0.0008183057799880469, + "loss": 2.7081, + "step": 9698 + }, + { + "epoch": 0.287607864069033, + "grad_norm": 0.16087529063224792, + "learning_rate": 0.0008182694936013286, + "loss": 2.7358, + "step": 9699 + }, + { + "epoch": 0.28763751742134447, + "grad_norm": 0.1786809116601944, + "learning_rate": 0.0008182332043962794, + "loss": 2.7346, + "step": 9700 + }, + { + "epoch": 0.28766717077365594, + "grad_norm": 0.15471108257770538, + "learning_rate": 0.0008181969123732206, + "loss": 2.7353, + "step": 9701 + }, + { + "epoch": 0.2876968241259674, + "grad_norm": 0.1427384614944458, + "learning_rate": 0.0008181606175324734, + "loss": 2.7747, + "step": 9702 + }, + { + "epoch": 0.2877264774782789, + "grad_norm": 0.1584879755973816, + "learning_rate": 0.0008181243198743594, + "loss": 2.7383, + "step": 9703 + }, + { + "epoch": 0.2877561308305904, + "grad_norm": 0.14206142723560333, + "learning_rate": 0.0008180880193991997, + "loss": 2.7138, + "step": 9704 + }, + { + "epoch": 0.2877857841829019, + "grad_norm": 0.1514703929424286, + "learning_rate": 0.0008180517161073162, + "loss": 2.7221, + "step": 9705 + }, + { + "epoch": 0.2878154375352134, + "grad_norm": 0.13832367956638336, + "learning_rate": 0.0008180154099990302, + "loss": 2.7263, + "step": 9706 + }, + { + "epoch": 0.28784509088752486, + "grad_norm": 0.17643722891807556, + "learning_rate": 0.0008179791010746631, + "loss": 2.7597, + "step": 9707 + }, + { + "epoch": 0.28787474423983633, + "grad_norm": 0.19995753467082977, + "learning_rate": 0.0008179427893345364, + "loss": 2.7211, + "step": 9708 + }, + { + "epoch": 0.2879043975921478, + "grad_norm": 0.1844813972711563, + "learning_rate": 0.000817906474778972, + "loss": 2.7257, + "step": 9709 + }, + { + "epoch": 0.2879340509444593, + "grad_norm": 0.14451275765895844, + "learning_rate": 0.0008178701574082909, + "loss": 2.7471, + "step": 9710 + }, + { + "epoch": 0.28796370429677076, + "grad_norm": 0.15763525664806366, + "learning_rate": 0.000817833837222815, + "loss": 2.7396, + "step": 9711 + }, + { + "epoch": 0.28799335764908224, + "grad_norm": 0.16824887692928314, + "learning_rate": 0.0008177975142228661, + "loss": 2.7296, + "step": 9712 + }, + { + "epoch": 0.2880230110013937, + "grad_norm": 0.1345958262681961, + "learning_rate": 0.0008177611884087654, + "loss": 2.7313, + "step": 9713 + }, + { + "epoch": 0.2880526643537052, + "grad_norm": 0.1250055432319641, + "learning_rate": 0.0008177248597808351, + "loss": 2.7341, + "step": 9714 + }, + { + "epoch": 0.28808231770601667, + "grad_norm": 0.14327222108840942, + "learning_rate": 0.0008176885283393967, + "loss": 2.7077, + "step": 9715 + }, + { + "epoch": 0.28811197105832814, + "grad_norm": 0.11497865617275238, + "learning_rate": 0.0008176521940847717, + "loss": 2.7422, + "step": 9716 + }, + { + "epoch": 0.2881416244106396, + "grad_norm": 0.12734031677246094, + "learning_rate": 0.0008176158570172818, + "loss": 2.7305, + "step": 9717 + }, + { + "epoch": 0.2881712777629511, + "grad_norm": 0.12269318103790283, + "learning_rate": 0.0008175795171372491, + "loss": 2.7374, + "step": 9718 + }, + { + "epoch": 0.28820093111526257, + "grad_norm": 0.11615309119224548, + "learning_rate": 0.0008175431744449953, + "loss": 2.7333, + "step": 9719 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 0.12088433653116226, + "learning_rate": 0.0008175068289408423, + "loss": 2.7422, + "step": 9720 + }, + { + "epoch": 0.2882602378198855, + "grad_norm": 0.12871384620666504, + "learning_rate": 0.0008174704806251118, + "loss": 2.7215, + "step": 9721 + }, + { + "epoch": 0.288289891172197, + "grad_norm": 0.13363005220890045, + "learning_rate": 0.0008174341294981256, + "loss": 2.7213, + "step": 9722 + }, + { + "epoch": 0.2883195445245085, + "grad_norm": 0.1264152228832245, + "learning_rate": 0.0008173977755602057, + "loss": 2.7337, + "step": 9723 + }, + { + "epoch": 0.28834919787681995, + "grad_norm": 0.1388908177614212, + "learning_rate": 0.000817361418811674, + "loss": 2.7416, + "step": 9724 + }, + { + "epoch": 0.2883788512291314, + "grad_norm": 0.13507723808288574, + "learning_rate": 0.0008173250592528524, + "loss": 2.737, + "step": 9725 + }, + { + "epoch": 0.28840850458144296, + "grad_norm": 0.13670657575130463, + "learning_rate": 0.0008172886968840632, + "loss": 2.7581, + "step": 9726 + }, + { + "epoch": 0.28843815793375444, + "grad_norm": 0.14807993173599243, + "learning_rate": 0.000817252331705628, + "loss": 2.7818, + "step": 9727 + }, + { + "epoch": 0.2884678112860659, + "grad_norm": 0.15194252133369446, + "learning_rate": 0.0008172159637178689, + "loss": 2.7564, + "step": 9728 + }, + { + "epoch": 0.2884974646383774, + "grad_norm": 0.1270727515220642, + "learning_rate": 0.000817179592921108, + "loss": 2.7184, + "step": 9729 + }, + { + "epoch": 0.28852711799068886, + "grad_norm": 0.12097659707069397, + "learning_rate": 0.0008171432193156673, + "loss": 2.7195, + "step": 9730 + }, + { + "epoch": 0.28855677134300034, + "grad_norm": 0.12404943257570267, + "learning_rate": 0.000817106842901869, + "loss": 2.7504, + "step": 9731 + }, + { + "epoch": 0.2885864246953118, + "grad_norm": 0.12977978587150574, + "learning_rate": 0.0008170704636800353, + "loss": 2.7607, + "step": 9732 + }, + { + "epoch": 0.2886160780476233, + "grad_norm": 0.13634169101715088, + "learning_rate": 0.000817034081650488, + "loss": 2.7389, + "step": 9733 + }, + { + "epoch": 0.28864573139993477, + "grad_norm": 0.12139714509248734, + "learning_rate": 0.0008169976968135498, + "loss": 2.735, + "step": 9734 + }, + { + "epoch": 0.28867538475224624, + "grad_norm": 0.14457650482654572, + "learning_rate": 0.0008169613091695422, + "loss": 2.7419, + "step": 9735 + }, + { + "epoch": 0.2887050381045577, + "grad_norm": 0.15196563303470612, + "learning_rate": 0.0008169249187187879, + "loss": 2.7507, + "step": 9736 + }, + { + "epoch": 0.2887346914568692, + "grad_norm": 0.15997783839702606, + "learning_rate": 0.0008168885254616092, + "loss": 2.7411, + "step": 9737 + }, + { + "epoch": 0.2887643448091807, + "grad_norm": 0.15458978712558746, + "learning_rate": 0.0008168521293983282, + "loss": 2.7493, + "step": 9738 + }, + { + "epoch": 0.28879399816149215, + "grad_norm": 0.16012436151504517, + "learning_rate": 0.0008168157305292672, + "loss": 2.7372, + "step": 9739 + }, + { + "epoch": 0.2888236515138036, + "grad_norm": 0.16732177138328552, + "learning_rate": 0.0008167793288547485, + "loss": 2.7454, + "step": 9740 + }, + { + "epoch": 0.2888533048661151, + "grad_norm": 0.16092468798160553, + "learning_rate": 0.0008167429243750943, + "loss": 2.7527, + "step": 9741 + }, + { + "epoch": 0.2888829582184266, + "grad_norm": 0.13513581454753876, + "learning_rate": 0.0008167065170906274, + "loss": 2.7074, + "step": 9742 + }, + { + "epoch": 0.28891261157073805, + "grad_norm": 0.12197552621364594, + "learning_rate": 0.0008166701070016698, + "loss": 2.7137, + "step": 9743 + }, + { + "epoch": 0.28894226492304953, + "grad_norm": 0.16636060178279877, + "learning_rate": 0.0008166336941085441, + "loss": 2.7391, + "step": 9744 + }, + { + "epoch": 0.288971918275361, + "grad_norm": 0.19468000531196594, + "learning_rate": 0.0008165972784115726, + "loss": 2.7388, + "step": 9745 + }, + { + "epoch": 0.2890015716276725, + "grad_norm": 0.19573195278644562, + "learning_rate": 0.0008165608599110779, + "loss": 2.7753, + "step": 9746 + }, + { + "epoch": 0.289031224979984, + "grad_norm": 0.17634941637516022, + "learning_rate": 0.0008165244386073824, + "loss": 2.7366, + "step": 9747 + }, + { + "epoch": 0.2890608783322955, + "grad_norm": 0.19643673300743103, + "learning_rate": 0.0008164880145008087, + "loss": 2.7402, + "step": 9748 + }, + { + "epoch": 0.28909053168460697, + "grad_norm": 0.14787161350250244, + "learning_rate": 0.0008164515875916794, + "loss": 2.7187, + "step": 9749 + }, + { + "epoch": 0.28912018503691844, + "grad_norm": 0.14129650592803955, + "learning_rate": 0.0008164151578803169, + "loss": 2.7169, + "step": 9750 + }, + { + "epoch": 0.2891498383892299, + "grad_norm": 0.15717050433158875, + "learning_rate": 0.0008163787253670439, + "loss": 2.7406, + "step": 9751 + }, + { + "epoch": 0.2891794917415414, + "grad_norm": 0.14109572768211365, + "learning_rate": 0.0008163422900521829, + "loss": 2.7576, + "step": 9752 + }, + { + "epoch": 0.28920914509385287, + "grad_norm": 0.1497490555047989, + "learning_rate": 0.0008163058519360567, + "loss": 2.7428, + "step": 9753 + }, + { + "epoch": 0.28923879844616435, + "grad_norm": 0.1432233601808548, + "learning_rate": 0.0008162694110189878, + "loss": 2.7643, + "step": 9754 + }, + { + "epoch": 0.2892684517984758, + "grad_norm": 0.15032154321670532, + "learning_rate": 0.0008162329673012991, + "loss": 2.7647, + "step": 9755 + }, + { + "epoch": 0.2892981051507873, + "grad_norm": 0.13892225921154022, + "learning_rate": 0.0008161965207833131, + "loss": 2.7556, + "step": 9756 + }, + { + "epoch": 0.2893277585030988, + "grad_norm": 0.13632746040821075, + "learning_rate": 0.0008161600714653526, + "loss": 2.7267, + "step": 9757 + }, + { + "epoch": 0.28935741185541025, + "grad_norm": 0.13295531272888184, + "learning_rate": 0.0008161236193477406, + "loss": 2.7435, + "step": 9758 + }, + { + "epoch": 0.2893870652077217, + "grad_norm": 0.13503941893577576, + "learning_rate": 0.0008160871644307994, + "loss": 2.7527, + "step": 9759 + }, + { + "epoch": 0.2894167185600332, + "grad_norm": 0.1408849060535431, + "learning_rate": 0.0008160507067148524, + "loss": 2.7504, + "step": 9760 + }, + { + "epoch": 0.2894463719123447, + "grad_norm": 0.14159248769283295, + "learning_rate": 0.000816014246200222, + "loss": 2.7598, + "step": 9761 + }, + { + "epoch": 0.28947602526465616, + "grad_norm": 0.14034517109394073, + "learning_rate": 0.0008159777828872311, + "loss": 2.7716, + "step": 9762 + }, + { + "epoch": 0.28950567861696763, + "grad_norm": 0.13745512068271637, + "learning_rate": 0.0008159413167762029, + "loss": 2.7219, + "step": 9763 + }, + { + "epoch": 0.2895353319692791, + "grad_norm": 0.1192997395992279, + "learning_rate": 0.00081590484786746, + "loss": 2.7727, + "step": 9764 + }, + { + "epoch": 0.2895649853215906, + "grad_norm": 0.14562511444091797, + "learning_rate": 0.0008158683761613255, + "loss": 2.721, + "step": 9765 + }, + { + "epoch": 0.28959463867390206, + "grad_norm": 0.16132616996765137, + "learning_rate": 0.0008158319016581221, + "loss": 2.714, + "step": 9766 + }, + { + "epoch": 0.28962429202621354, + "grad_norm": 0.15635348856449127, + "learning_rate": 0.0008157954243581733, + "loss": 2.7379, + "step": 9767 + }, + { + "epoch": 0.28965394537852507, + "grad_norm": 0.15440799295902252, + "learning_rate": 0.0008157589442618016, + "loss": 2.7577, + "step": 9768 + }, + { + "epoch": 0.28968359873083654, + "grad_norm": 0.1569206863641739, + "learning_rate": 0.0008157224613693304, + "loss": 2.7483, + "step": 9769 + }, + { + "epoch": 0.289713252083148, + "grad_norm": 0.15850690007209778, + "learning_rate": 0.0008156859756810825, + "loss": 2.7195, + "step": 9770 + }, + { + "epoch": 0.2897429054354595, + "grad_norm": 0.1683136224746704, + "learning_rate": 0.0008156494871973811, + "loss": 2.7744, + "step": 9771 + }, + { + "epoch": 0.289772558787771, + "grad_norm": 0.20398065447807312, + "learning_rate": 0.0008156129959185494, + "loss": 2.7282, + "step": 9772 + }, + { + "epoch": 0.28980221214008245, + "grad_norm": 0.16863924264907837, + "learning_rate": 0.0008155765018449104, + "loss": 2.7315, + "step": 9773 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 0.14652006328105927, + "learning_rate": 0.0008155400049767872, + "loss": 2.7469, + "step": 9774 + }, + { + "epoch": 0.2898615188447054, + "grad_norm": 0.1465330719947815, + "learning_rate": 0.0008155035053145032, + "loss": 2.722, + "step": 9775 + }, + { + "epoch": 0.2898911721970169, + "grad_norm": 0.15783590078353882, + "learning_rate": 0.0008154670028583814, + "loss": 2.7139, + "step": 9776 + }, + { + "epoch": 0.28992082554932835, + "grad_norm": 0.14049838483333588, + "learning_rate": 0.0008154304976087455, + "loss": 2.7681, + "step": 9777 + }, + { + "epoch": 0.28995047890163983, + "grad_norm": 0.14108319580554962, + "learning_rate": 0.0008153939895659181, + "loss": 2.7488, + "step": 9778 + }, + { + "epoch": 0.2899801322539513, + "grad_norm": 0.1416737288236618, + "learning_rate": 0.0008153574787302228, + "loss": 2.7251, + "step": 9779 + }, + { + "epoch": 0.2900097856062628, + "grad_norm": 0.14702600240707397, + "learning_rate": 0.0008153209651019828, + "loss": 2.7182, + "step": 9780 + }, + { + "epoch": 0.29003943895857426, + "grad_norm": 0.14880937337875366, + "learning_rate": 0.0008152844486815218, + "loss": 2.7532, + "step": 9781 + }, + { + "epoch": 0.29006909231088573, + "grad_norm": 0.1436205506324768, + "learning_rate": 0.0008152479294691627, + "loss": 2.7308, + "step": 9782 + }, + { + "epoch": 0.2900987456631972, + "grad_norm": 0.13431549072265625, + "learning_rate": 0.0008152114074652291, + "loss": 2.7362, + "step": 9783 + }, + { + "epoch": 0.2901283990155087, + "grad_norm": 0.11171537637710571, + "learning_rate": 0.0008151748826700445, + "loss": 2.7338, + "step": 9784 + }, + { + "epoch": 0.29015805236782016, + "grad_norm": 0.12147003412246704, + "learning_rate": 0.000815138355083932, + "loss": 2.7454, + "step": 9785 + }, + { + "epoch": 0.29018770572013164, + "grad_norm": 0.12688229978084564, + "learning_rate": 0.0008151018247072155, + "loss": 2.7583, + "step": 9786 + }, + { + "epoch": 0.2902173590724431, + "grad_norm": 0.14116449654102325, + "learning_rate": 0.0008150652915402181, + "loss": 2.7394, + "step": 9787 + }, + { + "epoch": 0.2902470124247546, + "grad_norm": 0.13272300362586975, + "learning_rate": 0.0008150287555832634, + "loss": 2.7684, + "step": 9788 + }, + { + "epoch": 0.2902766657770661, + "grad_norm": 0.10959358513355255, + "learning_rate": 0.0008149922168366752, + "loss": 2.7565, + "step": 9789 + }, + { + "epoch": 0.2903063191293776, + "grad_norm": 0.1195904016494751, + "learning_rate": 0.0008149556753007768, + "loss": 2.7492, + "step": 9790 + }, + { + "epoch": 0.2903359724816891, + "grad_norm": 0.14551854133605957, + "learning_rate": 0.0008149191309758917, + "loss": 2.7306, + "step": 9791 + }, + { + "epoch": 0.29036562583400055, + "grad_norm": 0.15788161754608154, + "learning_rate": 0.0008148825838623437, + "loss": 2.7178, + "step": 9792 + }, + { + "epoch": 0.290395279186312, + "grad_norm": 0.1493217796087265, + "learning_rate": 0.0008148460339604564, + "loss": 2.7317, + "step": 9793 + }, + { + "epoch": 0.2904249325386235, + "grad_norm": 0.12966416776180267, + "learning_rate": 0.0008148094812705535, + "loss": 2.738, + "step": 9794 + }, + { + "epoch": 0.290454585890935, + "grad_norm": 0.1470741480588913, + "learning_rate": 0.0008147729257929585, + "loss": 2.7289, + "step": 9795 + }, + { + "epoch": 0.29048423924324646, + "grad_norm": 0.15543197095394135, + "learning_rate": 0.0008147363675279953, + "loss": 2.7503, + "step": 9796 + }, + { + "epoch": 0.29051389259555793, + "grad_norm": 0.15622784197330475, + "learning_rate": 0.0008146998064759874, + "loss": 2.7371, + "step": 9797 + }, + { + "epoch": 0.2905435459478694, + "grad_norm": 0.13748033344745636, + "learning_rate": 0.0008146632426372589, + "loss": 2.7484, + "step": 9798 + }, + { + "epoch": 0.2905731993001809, + "grad_norm": 0.12517711520195007, + "learning_rate": 0.0008146266760121331, + "loss": 2.7038, + "step": 9799 + }, + { + "epoch": 0.29060285265249236, + "grad_norm": 0.12670943140983582, + "learning_rate": 0.0008145901066009344, + "loss": 2.7322, + "step": 9800 + }, + { + "epoch": 0.29063250600480384, + "grad_norm": 0.13581934571266174, + "learning_rate": 0.0008145535344039861, + "loss": 2.751, + "step": 9801 + }, + { + "epoch": 0.2906621593571153, + "grad_norm": 0.14714239537715912, + "learning_rate": 0.0008145169594216122, + "loss": 2.7542, + "step": 9802 + }, + { + "epoch": 0.2906918127094268, + "grad_norm": 0.14710773527622223, + "learning_rate": 0.0008144803816541368, + "loss": 2.7015, + "step": 9803 + }, + { + "epoch": 0.29072146606173827, + "grad_norm": 0.12942765653133392, + "learning_rate": 0.0008144438011018836, + "loss": 2.7595, + "step": 9804 + }, + { + "epoch": 0.29075111941404974, + "grad_norm": 0.13264407217502594, + "learning_rate": 0.0008144072177651766, + "loss": 2.7562, + "step": 9805 + }, + { + "epoch": 0.2907807727663612, + "grad_norm": 0.14423596858978271, + "learning_rate": 0.0008143706316443395, + "loss": 2.7673, + "step": 9806 + }, + { + "epoch": 0.2908104261186727, + "grad_norm": 0.13873417675495148, + "learning_rate": 0.0008143340427396968, + "loss": 2.7196, + "step": 9807 + }, + { + "epoch": 0.29084007947098417, + "grad_norm": 0.15025945007801056, + "learning_rate": 0.0008142974510515719, + "loss": 2.742, + "step": 9808 + }, + { + "epoch": 0.29086973282329565, + "grad_norm": 0.19043827056884766, + "learning_rate": 0.0008142608565802894, + "loss": 2.7283, + "step": 9809 + }, + { + "epoch": 0.2908993861756072, + "grad_norm": 0.20420539379119873, + "learning_rate": 0.000814224259326173, + "loss": 2.7584, + "step": 9810 + }, + { + "epoch": 0.29092903952791865, + "grad_norm": 0.1830623596906662, + "learning_rate": 0.0008141876592895467, + "loss": 2.7276, + "step": 9811 + }, + { + "epoch": 0.29095869288023013, + "grad_norm": 0.1457090824842453, + "learning_rate": 0.0008141510564707348, + "loss": 2.7332, + "step": 9812 + }, + { + "epoch": 0.2909883462325416, + "grad_norm": 0.13832832872867584, + "learning_rate": 0.0008141144508700616, + "loss": 2.7472, + "step": 9813 + }, + { + "epoch": 0.2910179995848531, + "grad_norm": 0.15546290576457977, + "learning_rate": 0.0008140778424878508, + "loss": 2.7321, + "step": 9814 + }, + { + "epoch": 0.29104765293716456, + "grad_norm": 0.12507446110248566, + "learning_rate": 0.0008140412313244268, + "loss": 2.7611, + "step": 9815 + }, + { + "epoch": 0.29107730628947603, + "grad_norm": 0.12176544219255447, + "learning_rate": 0.0008140046173801138, + "loss": 2.7368, + "step": 9816 + }, + { + "epoch": 0.2911069596417875, + "grad_norm": 0.12745489180088043, + "learning_rate": 0.0008139680006552362, + "loss": 2.7519, + "step": 9817 + }, + { + "epoch": 0.291136612994099, + "grad_norm": 0.1539175808429718, + "learning_rate": 0.0008139313811501178, + "loss": 2.7477, + "step": 9818 + }, + { + "epoch": 0.29116626634641046, + "grad_norm": 0.1672886610031128, + "learning_rate": 0.0008138947588650833, + "loss": 2.7381, + "step": 9819 + }, + { + "epoch": 0.29119591969872194, + "grad_norm": 0.1625543236732483, + "learning_rate": 0.0008138581338004567, + "loss": 2.737, + "step": 9820 + }, + { + "epoch": 0.2912255730510334, + "grad_norm": 0.1493612825870514, + "learning_rate": 0.0008138215059565626, + "loss": 2.768, + "step": 9821 + }, + { + "epoch": 0.2912552264033449, + "grad_norm": 0.154531329870224, + "learning_rate": 0.000813784875333725, + "loss": 2.7517, + "step": 9822 + }, + { + "epoch": 0.29128487975565637, + "grad_norm": 0.15231642127037048, + "learning_rate": 0.0008137482419322686, + "loss": 2.711, + "step": 9823 + }, + { + "epoch": 0.29131453310796784, + "grad_norm": 0.15206490457057953, + "learning_rate": 0.0008137116057525178, + "loss": 2.7319, + "step": 9824 + }, + { + "epoch": 0.2913441864602793, + "grad_norm": 0.14840121567249298, + "learning_rate": 0.0008136749667947967, + "loss": 2.7399, + "step": 9825 + }, + { + "epoch": 0.2913738398125908, + "grad_norm": 0.13302235305309296, + "learning_rate": 0.0008136383250594299, + "loss": 2.7511, + "step": 9826 + }, + { + "epoch": 0.29140349316490227, + "grad_norm": 0.13543294370174408, + "learning_rate": 0.0008136016805467418, + "loss": 2.7423, + "step": 9827 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 0.12430478632450104, + "learning_rate": 0.0008135650332570572, + "loss": 2.773, + "step": 9828 + }, + { + "epoch": 0.2914627998695252, + "grad_norm": 0.13102902472019196, + "learning_rate": 0.0008135283831907005, + "loss": 2.7433, + "step": 9829 + }, + { + "epoch": 0.29149245322183676, + "grad_norm": 0.14120027422904968, + "learning_rate": 0.000813491730347996, + "loss": 2.7409, + "step": 9830 + }, + { + "epoch": 0.29152210657414823, + "grad_norm": 0.14454379677772522, + "learning_rate": 0.0008134550747292684, + "loss": 2.7149, + "step": 9831 + }, + { + "epoch": 0.2915517599264597, + "grad_norm": 0.1710374653339386, + "learning_rate": 0.0008134184163348424, + "loss": 2.7433, + "step": 9832 + }, + { + "epoch": 0.2915814132787712, + "grad_norm": 0.16784818470478058, + "learning_rate": 0.0008133817551650424, + "loss": 2.7543, + "step": 9833 + }, + { + "epoch": 0.29161106663108266, + "grad_norm": 0.14414995908737183, + "learning_rate": 0.0008133450912201932, + "loss": 2.7221, + "step": 9834 + }, + { + "epoch": 0.29164071998339414, + "grad_norm": 0.13643430173397064, + "learning_rate": 0.0008133084245006194, + "loss": 2.7524, + "step": 9835 + }, + { + "epoch": 0.2916703733357056, + "grad_norm": 0.15642434358596802, + "learning_rate": 0.0008132717550066459, + "loss": 2.7613, + "step": 9836 + }, + { + "epoch": 0.2917000266880171, + "grad_norm": 0.14414209127426147, + "learning_rate": 0.000813235082738597, + "loss": 2.7528, + "step": 9837 + }, + { + "epoch": 0.29172968004032857, + "grad_norm": 0.1475328803062439, + "learning_rate": 0.000813198407696798, + "loss": 2.6928, + "step": 9838 + }, + { + "epoch": 0.29175933339264004, + "grad_norm": 0.1416589468717575, + "learning_rate": 0.000813161729881573, + "loss": 2.7245, + "step": 9839 + }, + { + "epoch": 0.2917889867449515, + "grad_norm": 0.15088757872581482, + "learning_rate": 0.0008131250492932474, + "loss": 2.7567, + "step": 9840 + }, + { + "epoch": 0.291818640097263, + "grad_norm": 0.13992439210414886, + "learning_rate": 0.0008130883659321455, + "loss": 2.7385, + "step": 9841 + }, + { + "epoch": 0.29184829344957447, + "grad_norm": 0.14558088779449463, + "learning_rate": 0.0008130516797985925, + "loss": 2.7446, + "step": 9842 + }, + { + "epoch": 0.29187794680188595, + "grad_norm": 0.14279095828533173, + "learning_rate": 0.0008130149908929132, + "loss": 2.747, + "step": 9843 + }, + { + "epoch": 0.2919076001541974, + "grad_norm": 0.14162851870059967, + "learning_rate": 0.0008129782992154323, + "loss": 2.7272, + "step": 9844 + }, + { + "epoch": 0.2919372535065089, + "grad_norm": 0.12120357900857925, + "learning_rate": 0.0008129416047664748, + "loss": 2.723, + "step": 9845 + }, + { + "epoch": 0.2919669068588204, + "grad_norm": 0.11649978905916214, + "learning_rate": 0.0008129049075463658, + "loss": 2.72, + "step": 9846 + }, + { + "epoch": 0.29199656021113185, + "grad_norm": 0.129825621843338, + "learning_rate": 0.0008128682075554301, + "loss": 2.7047, + "step": 9847 + }, + { + "epoch": 0.2920262135634433, + "grad_norm": 0.1325642615556717, + "learning_rate": 0.0008128315047939927, + "loss": 2.7188, + "step": 9848 + }, + { + "epoch": 0.2920558669157548, + "grad_norm": 0.12517130374908447, + "learning_rate": 0.0008127947992623788, + "loss": 2.745, + "step": 9849 + }, + { + "epoch": 0.2920855202680663, + "grad_norm": 0.1316203624010086, + "learning_rate": 0.000812758090960913, + "loss": 2.7115, + "step": 9850 + }, + { + "epoch": 0.2921151736203778, + "grad_norm": 0.1356639415025711, + "learning_rate": 0.0008127213798899208, + "loss": 2.7577, + "step": 9851 + }, + { + "epoch": 0.2921448269726893, + "grad_norm": 0.14343567192554474, + "learning_rate": 0.000812684666049727, + "loss": 2.7546, + "step": 9852 + }, + { + "epoch": 0.29217448032500076, + "grad_norm": 0.1553821712732315, + "learning_rate": 0.0008126479494406568, + "loss": 2.7139, + "step": 9853 + }, + { + "epoch": 0.29220413367731224, + "grad_norm": 0.15679417550563812, + "learning_rate": 0.0008126112300630354, + "loss": 2.7265, + "step": 9854 + }, + { + "epoch": 0.2922337870296237, + "grad_norm": 0.17631657421588898, + "learning_rate": 0.000812574507917188, + "loss": 2.7294, + "step": 9855 + }, + { + "epoch": 0.2922634403819352, + "grad_norm": 0.19165697693824768, + "learning_rate": 0.0008125377830034395, + "loss": 2.7353, + "step": 9856 + }, + { + "epoch": 0.29229309373424667, + "grad_norm": 0.22477568686008453, + "learning_rate": 0.0008125010553221152, + "loss": 2.7273, + "step": 9857 + }, + { + "epoch": 0.29232274708655814, + "grad_norm": 0.20229248702526093, + "learning_rate": 0.0008124643248735408, + "loss": 2.7458, + "step": 9858 + }, + { + "epoch": 0.2923524004388696, + "grad_norm": 0.15797895193099976, + "learning_rate": 0.0008124275916580408, + "loss": 2.7396, + "step": 9859 + }, + { + "epoch": 0.2923820537911811, + "grad_norm": 0.23019832372665405, + "learning_rate": 0.000812390855675941, + "loss": 2.698, + "step": 9860 + }, + { + "epoch": 0.29241170714349257, + "grad_norm": 0.20510311424732208, + "learning_rate": 0.0008123541169275665, + "loss": 2.7307, + "step": 9861 + }, + { + "epoch": 0.29244136049580405, + "grad_norm": 0.18797366321086884, + "learning_rate": 0.0008123173754132427, + "loss": 2.72, + "step": 9862 + }, + { + "epoch": 0.2924710138481155, + "grad_norm": 0.1786789894104004, + "learning_rate": 0.000812280631133295, + "loss": 2.7104, + "step": 9863 + }, + { + "epoch": 0.292500667200427, + "grad_norm": 0.1589571237564087, + "learning_rate": 0.0008122438840880486, + "loss": 2.7664, + "step": 9864 + }, + { + "epoch": 0.2925303205527385, + "grad_norm": 0.16862300038337708, + "learning_rate": 0.0008122071342778292, + "loss": 2.7476, + "step": 9865 + }, + { + "epoch": 0.29255997390504995, + "grad_norm": 0.15920867025852203, + "learning_rate": 0.0008121703817029617, + "loss": 2.777, + "step": 9866 + }, + { + "epoch": 0.29258962725736143, + "grad_norm": 0.14817216992378235, + "learning_rate": 0.0008121336263637722, + "loss": 2.7155, + "step": 9867 + }, + { + "epoch": 0.2926192806096729, + "grad_norm": 0.15165475010871887, + "learning_rate": 0.0008120968682605858, + "loss": 2.711, + "step": 9868 + }, + { + "epoch": 0.2926489339619844, + "grad_norm": 0.14853869378566742, + "learning_rate": 0.0008120601073937279, + "loss": 2.7096, + "step": 9869 + }, + { + "epoch": 0.29267858731429586, + "grad_norm": 0.1397465616464615, + "learning_rate": 0.0008120233437635244, + "loss": 2.7284, + "step": 9870 + }, + { + "epoch": 0.29270824066660733, + "grad_norm": 0.16068528592586517, + "learning_rate": 0.0008119865773703006, + "loss": 2.7242, + "step": 9871 + }, + { + "epoch": 0.29273789401891886, + "grad_norm": 0.15626348555088043, + "learning_rate": 0.0008119498082143819, + "loss": 2.6632, + "step": 9872 + }, + { + "epoch": 0.29276754737123034, + "grad_norm": 0.1395595222711563, + "learning_rate": 0.0008119130362960942, + "loss": 2.7248, + "step": 9873 + }, + { + "epoch": 0.2927972007235418, + "grad_norm": 0.12674187123775482, + "learning_rate": 0.0008118762616157631, + "loss": 2.6967, + "step": 9874 + }, + { + "epoch": 0.2928268540758533, + "grad_norm": 0.15052516758441925, + "learning_rate": 0.0008118394841737141, + "loss": 2.714, + "step": 9875 + }, + { + "epoch": 0.29285650742816477, + "grad_norm": 0.1522333025932312, + "learning_rate": 0.0008118027039702732, + "loss": 2.7023, + "step": 9876 + }, + { + "epoch": 0.29288616078047625, + "grad_norm": 0.12075161933898926, + "learning_rate": 0.0008117659210057656, + "loss": 2.6979, + "step": 9877 + }, + { + "epoch": 0.2929158141327877, + "grad_norm": 0.10760878771543503, + "learning_rate": 0.0008117291352805172, + "loss": 2.7014, + "step": 9878 + }, + { + "epoch": 0.2929454674850992, + "grad_norm": 0.13703705370426178, + "learning_rate": 0.0008116923467948537, + "loss": 2.7226, + "step": 9879 + }, + { + "epoch": 0.2929751208374107, + "grad_norm": 0.12249117344617844, + "learning_rate": 0.0008116555555491012, + "loss": 2.7274, + "step": 9880 + }, + { + "epoch": 0.29300477418972215, + "grad_norm": 0.1218598484992981, + "learning_rate": 0.0008116187615435852, + "loss": 2.7412, + "step": 9881 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 0.12131982296705246, + "learning_rate": 0.0008115819647786316, + "loss": 2.7378, + "step": 9882 + }, + { + "epoch": 0.2930640808943451, + "grad_norm": 0.12558472156524658, + "learning_rate": 0.0008115451652545661, + "loss": 2.7041, + "step": 9883 + }, + { + "epoch": 0.2930937342466566, + "grad_norm": 0.11222633719444275, + "learning_rate": 0.0008115083629717148, + "loss": 2.7193, + "step": 9884 + }, + { + "epoch": 0.29312338759896805, + "grad_norm": 0.1177629679441452, + "learning_rate": 0.0008114715579304034, + "loss": 2.7173, + "step": 9885 + }, + { + "epoch": 0.29315304095127953, + "grad_norm": 0.1255660206079483, + "learning_rate": 0.000811434750130958, + "loss": 2.7353, + "step": 9886 + }, + { + "epoch": 0.293182694303591, + "grad_norm": 0.13031551241874695, + "learning_rate": 0.0008113979395737044, + "loss": 2.7439, + "step": 9887 + }, + { + "epoch": 0.2932123476559025, + "grad_norm": 0.12762705981731415, + "learning_rate": 0.0008113611262589685, + "loss": 2.7258, + "step": 9888 + }, + { + "epoch": 0.29324200100821396, + "grad_norm": 0.11558263748884201, + "learning_rate": 0.0008113243101870765, + "loss": 2.7307, + "step": 9889 + }, + { + "epoch": 0.29327165436052544, + "grad_norm": 0.13099347054958344, + "learning_rate": 0.0008112874913583543, + "loss": 2.724, + "step": 9890 + }, + { + "epoch": 0.2933013077128369, + "grad_norm": 0.13760247826576233, + "learning_rate": 0.0008112506697731278, + "loss": 2.7565, + "step": 9891 + }, + { + "epoch": 0.2933309610651484, + "grad_norm": 0.153110072016716, + "learning_rate": 0.0008112138454317233, + "loss": 2.7452, + "step": 9892 + }, + { + "epoch": 0.2933606144174599, + "grad_norm": 0.13597118854522705, + "learning_rate": 0.0008111770183344667, + "loss": 2.7706, + "step": 9893 + }, + { + "epoch": 0.2933902677697714, + "grad_norm": 0.11776511371135712, + "learning_rate": 0.0008111401884816843, + "loss": 2.7494, + "step": 9894 + }, + { + "epoch": 0.29341992112208287, + "grad_norm": 0.13220104575157166, + "learning_rate": 0.000811103355873702, + "loss": 2.7484, + "step": 9895 + }, + { + "epoch": 0.29344957447439435, + "grad_norm": 0.13506466150283813, + "learning_rate": 0.0008110665205108463, + "loss": 2.7247, + "step": 9896 + }, + { + "epoch": 0.2934792278267058, + "grad_norm": 0.12590378522872925, + "learning_rate": 0.0008110296823934429, + "loss": 2.7559, + "step": 9897 + }, + { + "epoch": 0.2935088811790173, + "grad_norm": 0.13963106274604797, + "learning_rate": 0.0008109928415218184, + "loss": 2.7421, + "step": 9898 + }, + { + "epoch": 0.2935385345313288, + "grad_norm": 0.14060047268867493, + "learning_rate": 0.0008109559978962988, + "loss": 2.7553, + "step": 9899 + }, + { + "epoch": 0.29356818788364025, + "grad_norm": 0.16341523826122284, + "learning_rate": 0.0008109191515172108, + "loss": 2.7295, + "step": 9900 + }, + { + "epoch": 0.29359784123595173, + "grad_norm": 0.19017845392227173, + "learning_rate": 0.0008108823023848799, + "loss": 2.7525, + "step": 9901 + }, + { + "epoch": 0.2936274945882632, + "grad_norm": 0.18248111009597778, + "learning_rate": 0.0008108454504996331, + "loss": 2.7332, + "step": 9902 + }, + { + "epoch": 0.2936571479405747, + "grad_norm": 0.17261284589767456, + "learning_rate": 0.0008108085958617965, + "loss": 2.7098, + "step": 9903 + }, + { + "epoch": 0.29368680129288616, + "grad_norm": 0.15345196425914764, + "learning_rate": 0.0008107717384716963, + "loss": 2.7482, + "step": 9904 + }, + { + "epoch": 0.29371645464519763, + "grad_norm": 0.13435806334018707, + "learning_rate": 0.0008107348783296591, + "loss": 2.7651, + "step": 9905 + }, + { + "epoch": 0.2937461079975091, + "grad_norm": 0.15924976766109467, + "learning_rate": 0.0008106980154360112, + "loss": 2.7508, + "step": 9906 + }, + { + "epoch": 0.2937757613498206, + "grad_norm": 0.14096347987651825, + "learning_rate": 0.000810661149791079, + "loss": 2.687, + "step": 9907 + }, + { + "epoch": 0.29380541470213206, + "grad_norm": 0.13422216475009918, + "learning_rate": 0.0008106242813951892, + "loss": 2.7219, + "step": 9908 + }, + { + "epoch": 0.29383506805444354, + "grad_norm": 0.16648922860622406, + "learning_rate": 0.0008105874102486679, + "loss": 2.733, + "step": 9909 + }, + { + "epoch": 0.293864721406755, + "grad_norm": 0.1574712097644806, + "learning_rate": 0.0008105505363518417, + "loss": 2.7367, + "step": 9910 + }, + { + "epoch": 0.2938943747590665, + "grad_norm": 0.13450860977172852, + "learning_rate": 0.0008105136597050372, + "loss": 2.7208, + "step": 9911 + }, + { + "epoch": 0.29392402811137797, + "grad_norm": 0.14938603341579437, + "learning_rate": 0.0008104767803085811, + "loss": 2.7113, + "step": 9912 + }, + { + "epoch": 0.29395368146368944, + "grad_norm": 0.14093244075775146, + "learning_rate": 0.0008104398981627996, + "loss": 2.7288, + "step": 9913 + }, + { + "epoch": 0.293983334816001, + "grad_norm": 0.17446187138557434, + "learning_rate": 0.0008104030132680198, + "loss": 2.7357, + "step": 9914 + }, + { + "epoch": 0.29401298816831245, + "grad_norm": 0.21173565089702606, + "learning_rate": 0.0008103661256245678, + "loss": 2.7757, + "step": 9915 + }, + { + "epoch": 0.2940426415206239, + "grad_norm": 0.2081395536661148, + "learning_rate": 0.0008103292352327706, + "loss": 2.7613, + "step": 9916 + }, + { + "epoch": 0.2940722948729354, + "grad_norm": 0.15559867024421692, + "learning_rate": 0.0008102923420929547, + "loss": 2.7619, + "step": 9917 + }, + { + "epoch": 0.2941019482252469, + "grad_norm": 0.15151186287403107, + "learning_rate": 0.0008102554462054468, + "loss": 2.7477, + "step": 9918 + }, + { + "epoch": 0.29413160157755835, + "grad_norm": 0.18204039335250854, + "learning_rate": 0.0008102185475705739, + "loss": 2.7386, + "step": 9919 + }, + { + "epoch": 0.29416125492986983, + "grad_norm": 0.1420099139213562, + "learning_rate": 0.0008101816461886624, + "loss": 2.711, + "step": 9920 + }, + { + "epoch": 0.2941909082821813, + "grad_norm": 0.12975087761878967, + "learning_rate": 0.000810144742060039, + "loss": 2.721, + "step": 9921 + }, + { + "epoch": 0.2942205616344928, + "grad_norm": 0.1405208855867386, + "learning_rate": 0.0008101078351850308, + "loss": 2.7548, + "step": 9922 + }, + { + "epoch": 0.29425021498680426, + "grad_norm": 0.12463702261447906, + "learning_rate": 0.0008100709255639645, + "loss": 2.6902, + "step": 9923 + }, + { + "epoch": 0.29427986833911574, + "grad_norm": 0.14252090454101562, + "learning_rate": 0.0008100340131971669, + "loss": 2.7382, + "step": 9924 + }, + { + "epoch": 0.2943095216914272, + "grad_norm": 0.11929839104413986, + "learning_rate": 0.0008099970980849649, + "loss": 2.7339, + "step": 9925 + }, + { + "epoch": 0.2943391750437387, + "grad_norm": 0.1161637231707573, + "learning_rate": 0.0008099601802276855, + "loss": 2.7574, + "step": 9926 + }, + { + "epoch": 0.29436882839605016, + "grad_norm": 0.13042667508125305, + "learning_rate": 0.0008099232596256554, + "loss": 2.7331, + "step": 9927 + }, + { + "epoch": 0.29439848174836164, + "grad_norm": 0.11943121254444122, + "learning_rate": 0.0008098863362792018, + "loss": 2.7232, + "step": 9928 + }, + { + "epoch": 0.2944281351006731, + "grad_norm": 0.14149747788906097, + "learning_rate": 0.0008098494101886513, + "loss": 2.7731, + "step": 9929 + }, + { + "epoch": 0.2944577884529846, + "grad_norm": 0.1376694291830063, + "learning_rate": 0.0008098124813543311, + "loss": 2.7286, + "step": 9930 + }, + { + "epoch": 0.29448744180529607, + "grad_norm": 0.1312655210494995, + "learning_rate": 0.0008097755497765682, + "loss": 2.76, + "step": 9931 + }, + { + "epoch": 0.29451709515760754, + "grad_norm": 0.11828754842281342, + "learning_rate": 0.0008097386154556896, + "loss": 2.7096, + "step": 9932 + }, + { + "epoch": 0.294546748509919, + "grad_norm": 0.12042224407196045, + "learning_rate": 0.0008097016783920226, + "loss": 2.7532, + "step": 9933 + }, + { + "epoch": 0.29457640186223055, + "grad_norm": 0.12488420307636261, + "learning_rate": 0.0008096647385858939, + "loss": 2.7482, + "step": 9934 + }, + { + "epoch": 0.29460605521454203, + "grad_norm": 0.13978219032287598, + "learning_rate": 0.0008096277960376308, + "loss": 2.7162, + "step": 9935 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 0.13875137269496918, + "learning_rate": 0.0008095908507475605, + "loss": 2.7278, + "step": 9936 + }, + { + "epoch": 0.294665361919165, + "grad_norm": 0.1388368457555771, + "learning_rate": 0.0008095539027160099, + "loss": 2.7386, + "step": 9937 + }, + { + "epoch": 0.29469501527147646, + "grad_norm": 0.14099663496017456, + "learning_rate": 0.0008095169519433066, + "loss": 2.7402, + "step": 9938 + }, + { + "epoch": 0.29472466862378793, + "grad_norm": 0.14691683650016785, + "learning_rate": 0.0008094799984297773, + "loss": 2.7367, + "step": 9939 + }, + { + "epoch": 0.2947543219760994, + "grad_norm": 0.1452813446521759, + "learning_rate": 0.0008094430421757497, + "loss": 2.7073, + "step": 9940 + }, + { + "epoch": 0.2947839753284109, + "grad_norm": 0.1362791806459427, + "learning_rate": 0.0008094060831815509, + "loss": 2.7508, + "step": 9941 + }, + { + "epoch": 0.29481362868072236, + "grad_norm": 0.15424399077892303, + "learning_rate": 0.0008093691214475081, + "loss": 2.765, + "step": 9942 + }, + { + "epoch": 0.29484328203303384, + "grad_norm": 0.14341898262500763, + "learning_rate": 0.0008093321569739484, + "loss": 2.7447, + "step": 9943 + }, + { + "epoch": 0.2948729353853453, + "grad_norm": 0.17326320707798004, + "learning_rate": 0.0008092951897611995, + "loss": 2.7682, + "step": 9944 + }, + { + "epoch": 0.2949025887376568, + "grad_norm": 0.1579640507698059, + "learning_rate": 0.0008092582198095886, + "loss": 2.7535, + "step": 9945 + }, + { + "epoch": 0.29493224208996827, + "grad_norm": 0.12575113773345947, + "learning_rate": 0.0008092212471194431, + "loss": 2.7151, + "step": 9946 + }, + { + "epoch": 0.29496189544227974, + "grad_norm": 0.14371521770954132, + "learning_rate": 0.0008091842716910904, + "loss": 2.7257, + "step": 9947 + }, + { + "epoch": 0.2949915487945912, + "grad_norm": 0.15259471535682678, + "learning_rate": 0.0008091472935248578, + "loss": 2.7368, + "step": 9948 + }, + { + "epoch": 0.2950212021469027, + "grad_norm": 0.14035145938396454, + "learning_rate": 0.000809110312621073, + "loss": 2.7711, + "step": 9949 + }, + { + "epoch": 0.29505085549921417, + "grad_norm": 0.1286451667547226, + "learning_rate": 0.0008090733289800631, + "loss": 2.7323, + "step": 9950 + }, + { + "epoch": 0.29508050885152565, + "grad_norm": 0.12385362386703491, + "learning_rate": 0.0008090363426021561, + "loss": 2.7758, + "step": 9951 + }, + { + "epoch": 0.2951101622038371, + "grad_norm": 0.13384653627872467, + "learning_rate": 0.000808999353487679, + "loss": 2.7625, + "step": 9952 + }, + { + "epoch": 0.2951398155561486, + "grad_norm": 0.13430987298488617, + "learning_rate": 0.0008089623616369597, + "loss": 2.7341, + "step": 9953 + }, + { + "epoch": 0.2951694689084601, + "grad_norm": 0.136207714676857, + "learning_rate": 0.0008089253670503256, + "loss": 2.7003, + "step": 9954 + }, + { + "epoch": 0.2951991222607716, + "grad_norm": 0.14242607355117798, + "learning_rate": 0.0008088883697281044, + "loss": 2.7382, + "step": 9955 + }, + { + "epoch": 0.2952287756130831, + "grad_norm": 0.12668123841285706, + "learning_rate": 0.0008088513696706236, + "loss": 2.7522, + "step": 9956 + }, + { + "epoch": 0.29525842896539456, + "grad_norm": 0.14011479914188385, + "learning_rate": 0.0008088143668782111, + "loss": 2.727, + "step": 9957 + }, + { + "epoch": 0.29528808231770604, + "grad_norm": 0.14901727437973022, + "learning_rate": 0.0008087773613511942, + "loss": 2.7419, + "step": 9958 + }, + { + "epoch": 0.2953177356700175, + "grad_norm": 0.1383744776248932, + "learning_rate": 0.0008087403530899008, + "loss": 2.7534, + "step": 9959 + }, + { + "epoch": 0.295347389022329, + "grad_norm": 0.11490277945995331, + "learning_rate": 0.0008087033420946586, + "loss": 2.7493, + "step": 9960 + }, + { + "epoch": 0.29537704237464046, + "grad_norm": 0.13459570705890656, + "learning_rate": 0.0008086663283657954, + "loss": 2.7358, + "step": 9961 + }, + { + "epoch": 0.29540669572695194, + "grad_norm": 0.1361691653728485, + "learning_rate": 0.0008086293119036386, + "loss": 2.6983, + "step": 9962 + }, + { + "epoch": 0.2954363490792634, + "grad_norm": 0.1455371230840683, + "learning_rate": 0.0008085922927085165, + "loss": 2.7372, + "step": 9963 + }, + { + "epoch": 0.2954660024315749, + "grad_norm": 0.1733814775943756, + "learning_rate": 0.0008085552707807566, + "loss": 2.7389, + "step": 9964 + }, + { + "epoch": 0.29549565578388637, + "grad_norm": 0.1961730420589447, + "learning_rate": 0.0008085182461206868, + "loss": 2.7427, + "step": 9965 + }, + { + "epoch": 0.29552530913619784, + "grad_norm": 0.20194974541664124, + "learning_rate": 0.000808481218728635, + "loss": 2.7065, + "step": 9966 + }, + { + "epoch": 0.2955549624885093, + "grad_norm": 0.1772996485233307, + "learning_rate": 0.0008084441886049292, + "loss": 2.7426, + "step": 9967 + }, + { + "epoch": 0.2955846158408208, + "grad_norm": 0.18086469173431396, + "learning_rate": 0.000808407155749897, + "loss": 2.7201, + "step": 9968 + }, + { + "epoch": 0.2956142691931323, + "grad_norm": 0.17563867568969727, + "learning_rate": 0.0008083701201638665, + "loss": 2.7339, + "step": 9969 + }, + { + "epoch": 0.29564392254544375, + "grad_norm": 0.13698290288448334, + "learning_rate": 0.0008083330818471657, + "loss": 2.7294, + "step": 9970 + }, + { + "epoch": 0.2956735758977552, + "grad_norm": 0.1345651000738144, + "learning_rate": 0.0008082960408001225, + "loss": 2.7383, + "step": 9971 + }, + { + "epoch": 0.2957032292500667, + "grad_norm": 0.14181140065193176, + "learning_rate": 0.000808258997023065, + "loss": 2.7113, + "step": 9972 + }, + { + "epoch": 0.2957328826023782, + "grad_norm": 0.12403280287981033, + "learning_rate": 0.0008082219505163211, + "loss": 2.7461, + "step": 9973 + }, + { + "epoch": 0.29576253595468965, + "grad_norm": 0.13237589597702026, + "learning_rate": 0.000808184901280219, + "loss": 2.7175, + "step": 9974 + }, + { + "epoch": 0.29579218930700113, + "grad_norm": 0.13620556890964508, + "learning_rate": 0.0008081478493150866, + "loss": 2.7208, + "step": 9975 + }, + { + "epoch": 0.29582184265931266, + "grad_norm": 0.14046861231327057, + "learning_rate": 0.0008081107946212522, + "loss": 2.7071, + "step": 9976 + }, + { + "epoch": 0.29585149601162414, + "grad_norm": 0.153669074177742, + "learning_rate": 0.0008080737371990438, + "loss": 2.7431, + "step": 9977 + }, + { + "epoch": 0.2958811493639356, + "grad_norm": 0.12194392085075378, + "learning_rate": 0.0008080366770487895, + "loss": 2.7273, + "step": 9978 + }, + { + "epoch": 0.2959108027162471, + "grad_norm": 0.12435209006071091, + "learning_rate": 0.0008079996141708177, + "loss": 2.7071, + "step": 9979 + }, + { + "epoch": 0.29594045606855857, + "grad_norm": 0.12826956808567047, + "learning_rate": 0.0008079625485654563, + "loss": 2.7262, + "step": 9980 + }, + { + "epoch": 0.29597010942087004, + "grad_norm": 0.11737596988677979, + "learning_rate": 0.0008079254802330338, + "loss": 2.7167, + "step": 9981 + }, + { + "epoch": 0.2959997627731815, + "grad_norm": 0.11992348730564117, + "learning_rate": 0.0008078884091738781, + "loss": 2.7605, + "step": 9982 + }, + { + "epoch": 0.296029416125493, + "grad_norm": 0.12144176661968231, + "learning_rate": 0.0008078513353883179, + "loss": 2.7538, + "step": 9983 + }, + { + "epoch": 0.29605906947780447, + "grad_norm": 0.13622623682022095, + "learning_rate": 0.0008078142588766813, + "loss": 2.7258, + "step": 9984 + }, + { + "epoch": 0.29608872283011595, + "grad_norm": 0.12588627636432648, + "learning_rate": 0.0008077771796392966, + "loss": 2.7638, + "step": 9985 + }, + { + "epoch": 0.2961183761824274, + "grad_norm": 0.131668359041214, + "learning_rate": 0.0008077400976764919, + "loss": 2.7556, + "step": 9986 + }, + { + "epoch": 0.2961480295347389, + "grad_norm": 0.12274011969566345, + "learning_rate": 0.0008077030129885961, + "loss": 2.7321, + "step": 9987 + }, + { + "epoch": 0.2961776828870504, + "grad_norm": 0.1356770545244217, + "learning_rate": 0.0008076659255759371, + "loss": 2.719, + "step": 9988 + }, + { + "epoch": 0.29620733623936185, + "grad_norm": 0.1468125730752945, + "learning_rate": 0.0008076288354388436, + "loss": 2.694, + "step": 9989 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 0.1594550609588623, + "learning_rate": 0.000807591742577644, + "loss": 2.7128, + "step": 9990 + }, + { + "epoch": 0.2962666429439848, + "grad_norm": 0.1650051325559616, + "learning_rate": 0.0008075546469926666, + "loss": 2.7495, + "step": 9991 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.15988074243068695, + "learning_rate": 0.0008075175486842401, + "loss": 2.7172, + "step": 9992 + }, + { + "epoch": 0.29632594964860776, + "grad_norm": 0.16996964812278748, + "learning_rate": 0.000807480447652693, + "loss": 2.7141, + "step": 9993 + }, + { + "epoch": 0.29635560300091923, + "grad_norm": 0.1750069260597229, + "learning_rate": 0.0008074433438983537, + "loss": 2.7179, + "step": 9994 + }, + { + "epoch": 0.2963852563532307, + "grad_norm": 0.16579222679138184, + "learning_rate": 0.0008074062374215507, + "loss": 2.7507, + "step": 9995 + }, + { + "epoch": 0.2964149097055422, + "grad_norm": 0.16441597044467926, + "learning_rate": 0.0008073691282226128, + "loss": 2.7619, + "step": 9996 + }, + { + "epoch": 0.2964445630578537, + "grad_norm": 0.13419710099697113, + "learning_rate": 0.0008073320163018685, + "loss": 2.7537, + "step": 9997 + }, + { + "epoch": 0.2964742164101652, + "grad_norm": 0.16024303436279297, + "learning_rate": 0.0008072949016596464, + "loss": 2.7429, + "step": 9998 + }, + { + "epoch": 0.29650386976247667, + "grad_norm": 0.16233378648757935, + "learning_rate": 0.0008072577842962754, + "loss": 2.7188, + "step": 9999 + }, + { + "epoch": 0.29653352311478814, + "grad_norm": 0.16599565744400024, + "learning_rate": 0.0008072206642120839, + "loss": 2.7542, + "step": 10000 + }, + { + "epoch": 0.2965631764670996, + "grad_norm": 0.14308665692806244, + "learning_rate": 0.0008071835414074005, + "loss": 2.7269, + "step": 10001 + }, + { + "epoch": 0.2965928298194111, + "grad_norm": 0.14327506721019745, + "learning_rate": 0.0008071464158825541, + "loss": 2.7622, + "step": 10002 + }, + { + "epoch": 0.2966224831717226, + "grad_norm": 0.13846783339977264, + "learning_rate": 0.0008071092876378736, + "loss": 2.7271, + "step": 10003 + }, + { + "epoch": 0.29665213652403405, + "grad_norm": 0.12055961042642593, + "learning_rate": 0.0008070721566736877, + "loss": 2.7237, + "step": 10004 + }, + { + "epoch": 0.2966817898763455, + "grad_norm": 0.13144664466381073, + "learning_rate": 0.0008070350229903251, + "loss": 2.7184, + "step": 10005 + }, + { + "epoch": 0.296711443228657, + "grad_norm": 0.12314818799495697, + "learning_rate": 0.0008069978865881147, + "loss": 2.7203, + "step": 10006 + }, + { + "epoch": 0.2967410965809685, + "grad_norm": 0.12160150706768036, + "learning_rate": 0.0008069607474673852, + "loss": 2.7181, + "step": 10007 + }, + { + "epoch": 0.29677074993327995, + "grad_norm": 0.13317565619945526, + "learning_rate": 0.0008069236056284656, + "loss": 2.7547, + "step": 10008 + }, + { + "epoch": 0.29680040328559143, + "grad_norm": 0.1258280724287033, + "learning_rate": 0.0008068864610716849, + "loss": 2.7299, + "step": 10009 + }, + { + "epoch": 0.2968300566379029, + "grad_norm": 0.12525619566440582, + "learning_rate": 0.0008068493137973718, + "loss": 2.7155, + "step": 10010 + }, + { + "epoch": 0.2968597099902144, + "grad_norm": 0.1365371197462082, + "learning_rate": 0.0008068121638058554, + "loss": 2.7449, + "step": 10011 + }, + { + "epoch": 0.29688936334252586, + "grad_norm": 0.1433158814907074, + "learning_rate": 0.0008067750110974648, + "loss": 2.7593, + "step": 10012 + }, + { + "epoch": 0.29691901669483733, + "grad_norm": 0.15196466445922852, + "learning_rate": 0.0008067378556725287, + "loss": 2.7293, + "step": 10013 + }, + { + "epoch": 0.2969486700471488, + "grad_norm": 0.1491209715604782, + "learning_rate": 0.000806700697531376, + "loss": 2.7288, + "step": 10014 + }, + { + "epoch": 0.2969783233994603, + "grad_norm": 0.14008213579654694, + "learning_rate": 0.0008066635366743363, + "loss": 2.7047, + "step": 10015 + }, + { + "epoch": 0.29700797675177176, + "grad_norm": 0.15077044069766998, + "learning_rate": 0.0008066263731017382, + "loss": 2.7339, + "step": 10016 + }, + { + "epoch": 0.29703763010408324, + "grad_norm": 0.14728759229183197, + "learning_rate": 0.0008065892068139109, + "loss": 2.745, + "step": 10017 + }, + { + "epoch": 0.29706728345639477, + "grad_norm": 0.1490802764892578, + "learning_rate": 0.0008065520378111836, + "loss": 2.7229, + "step": 10018 + }, + { + "epoch": 0.29709693680870625, + "grad_norm": 0.12244491279125214, + "learning_rate": 0.0008065148660938854, + "loss": 2.714, + "step": 10019 + }, + { + "epoch": 0.2971265901610177, + "grad_norm": 0.11971496790647507, + "learning_rate": 0.0008064776916623456, + "loss": 2.7381, + "step": 10020 + }, + { + "epoch": 0.2971562435133292, + "grad_norm": 0.13227133452892303, + "learning_rate": 0.0008064405145168929, + "loss": 2.7277, + "step": 10021 + }, + { + "epoch": 0.2971858968656407, + "grad_norm": 0.14995916187763214, + "learning_rate": 0.000806403334657857, + "loss": 2.7526, + "step": 10022 + }, + { + "epoch": 0.29721555021795215, + "grad_norm": 0.15163449943065643, + "learning_rate": 0.0008063661520855671, + "loss": 2.7444, + "step": 10023 + }, + { + "epoch": 0.2972452035702636, + "grad_norm": 0.15602175891399384, + "learning_rate": 0.0008063289668003522, + "loss": 2.7533, + "step": 10024 + }, + { + "epoch": 0.2972748569225751, + "grad_norm": 0.15190169215202332, + "learning_rate": 0.0008062917788025417, + "loss": 2.7587, + "step": 10025 + }, + { + "epoch": 0.2973045102748866, + "grad_norm": 0.14052943885326385, + "learning_rate": 0.000806254588092465, + "loss": 2.7601, + "step": 10026 + }, + { + "epoch": 0.29733416362719806, + "grad_norm": 0.11809712648391724, + "learning_rate": 0.0008062173946704513, + "loss": 2.7508, + "step": 10027 + }, + { + "epoch": 0.29736381697950953, + "grad_norm": 0.13628755509853363, + "learning_rate": 0.00080618019853683, + "loss": 2.7384, + "step": 10028 + }, + { + "epoch": 0.297393470331821, + "grad_norm": 0.15324711799621582, + "learning_rate": 0.0008061429996919305, + "loss": 2.7301, + "step": 10029 + }, + { + "epoch": 0.2974231236841325, + "grad_norm": 0.15897053480148315, + "learning_rate": 0.0008061057981360822, + "loss": 2.7231, + "step": 10030 + }, + { + "epoch": 0.29745277703644396, + "grad_norm": 0.14409087598323822, + "learning_rate": 0.0008060685938696146, + "loss": 2.7339, + "step": 10031 + }, + { + "epoch": 0.29748243038875544, + "grad_norm": 0.14074794948101044, + "learning_rate": 0.0008060313868928571, + "loss": 2.7679, + "step": 10032 + }, + { + "epoch": 0.2975120837410669, + "grad_norm": 0.13693338632583618, + "learning_rate": 0.000805994177206139, + "loss": 2.7261, + "step": 10033 + }, + { + "epoch": 0.2975417370933784, + "grad_norm": 0.13849948346614838, + "learning_rate": 0.0008059569648097899, + "loss": 2.7207, + "step": 10034 + }, + { + "epoch": 0.29757139044568987, + "grad_norm": 0.1452471762895584, + "learning_rate": 0.0008059197497041395, + "loss": 2.7542, + "step": 10035 + }, + { + "epoch": 0.29760104379800134, + "grad_norm": 0.15573516488075256, + "learning_rate": 0.0008058825318895171, + "loss": 2.7118, + "step": 10036 + }, + { + "epoch": 0.2976306971503128, + "grad_norm": 0.1705796718597412, + "learning_rate": 0.0008058453113662524, + "loss": 2.7519, + "step": 10037 + }, + { + "epoch": 0.29766035050262435, + "grad_norm": 0.18055041134357452, + "learning_rate": 0.000805808088134675, + "loss": 2.7588, + "step": 10038 + }, + { + "epoch": 0.2976900038549358, + "grad_norm": 0.20115207135677338, + "learning_rate": 0.0008057708621951145, + "loss": 2.7282, + "step": 10039 + }, + { + "epoch": 0.2977196572072473, + "grad_norm": 0.1657675951719284, + "learning_rate": 0.0008057336335479004, + "loss": 2.7326, + "step": 10040 + }, + { + "epoch": 0.2977493105595588, + "grad_norm": 0.15190665423870087, + "learning_rate": 0.0008056964021933625, + "loss": 2.704, + "step": 10041 + }, + { + "epoch": 0.29777896391187025, + "grad_norm": 0.13600599765777588, + "learning_rate": 0.0008056591681318307, + "loss": 2.7351, + "step": 10042 + }, + { + "epoch": 0.29780861726418173, + "grad_norm": 0.12982988357543945, + "learning_rate": 0.0008056219313636344, + "loss": 2.7316, + "step": 10043 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 0.15762704610824585, + "learning_rate": 0.0008055846918891034, + "loss": 2.7257, + "step": 10044 + }, + { + "epoch": 0.2978679239688047, + "grad_norm": 0.14910054206848145, + "learning_rate": 0.0008055474497085676, + "loss": 2.724, + "step": 10045 + }, + { + "epoch": 0.29789757732111616, + "grad_norm": 0.12650227546691895, + "learning_rate": 0.0008055102048223566, + "loss": 2.7666, + "step": 10046 + }, + { + "epoch": 0.29792723067342763, + "grad_norm": 0.13473659753799438, + "learning_rate": 0.0008054729572308003, + "loss": 2.7365, + "step": 10047 + }, + { + "epoch": 0.2979568840257391, + "grad_norm": 0.1466020792722702, + "learning_rate": 0.0008054357069342286, + "loss": 2.7409, + "step": 10048 + }, + { + "epoch": 0.2979865373780506, + "grad_norm": 0.15498848259449005, + "learning_rate": 0.0008053984539329711, + "loss": 2.7364, + "step": 10049 + }, + { + "epoch": 0.29801619073036206, + "grad_norm": 0.1564044952392578, + "learning_rate": 0.0008053611982273581, + "loss": 2.7269, + "step": 10050 + }, + { + "epoch": 0.29804584408267354, + "grad_norm": 0.1252392679452896, + "learning_rate": 0.0008053239398177191, + "loss": 2.7198, + "step": 10051 + }, + { + "epoch": 0.298075497434985, + "grad_norm": 0.12749408185482025, + "learning_rate": 0.0008052866787043843, + "loss": 2.7375, + "step": 10052 + }, + { + "epoch": 0.2981051507872965, + "grad_norm": 0.11345665156841278, + "learning_rate": 0.0008052494148876834, + "loss": 2.6954, + "step": 10053 + }, + { + "epoch": 0.29813480413960797, + "grad_norm": 0.12489758431911469, + "learning_rate": 0.0008052121483679468, + "loss": 2.7118, + "step": 10054 + }, + { + "epoch": 0.29816445749191944, + "grad_norm": 0.13476617634296417, + "learning_rate": 0.000805174879145504, + "loss": 2.7594, + "step": 10055 + }, + { + "epoch": 0.2981941108442309, + "grad_norm": 0.13748575747013092, + "learning_rate": 0.0008051376072206856, + "loss": 2.729, + "step": 10056 + }, + { + "epoch": 0.2982237641965424, + "grad_norm": 0.12514138221740723, + "learning_rate": 0.0008051003325938209, + "loss": 2.7244, + "step": 10057 + }, + { + "epoch": 0.29825341754885387, + "grad_norm": 0.16660176217556, + "learning_rate": 0.0008050630552652406, + "loss": 2.7443, + "step": 10058 + }, + { + "epoch": 0.2982830709011654, + "grad_norm": 0.18334627151489258, + "learning_rate": 0.0008050257752352745, + "loss": 2.7529, + "step": 10059 + }, + { + "epoch": 0.2983127242534769, + "grad_norm": 0.16755716502666473, + "learning_rate": 0.0008049884925042528, + "loss": 2.7702, + "step": 10060 + }, + { + "epoch": 0.29834237760578836, + "grad_norm": 0.1639333963394165, + "learning_rate": 0.0008049512070725058, + "loss": 2.7272, + "step": 10061 + }, + { + "epoch": 0.29837203095809983, + "grad_norm": 0.15043160319328308, + "learning_rate": 0.0008049139189403633, + "loss": 2.7487, + "step": 10062 + }, + { + "epoch": 0.2984016843104113, + "grad_norm": 0.14379991590976715, + "learning_rate": 0.0008048766281081559, + "loss": 2.7205, + "step": 10063 + }, + { + "epoch": 0.2984313376627228, + "grad_norm": 0.14117431640625, + "learning_rate": 0.0008048393345762136, + "loss": 2.7052, + "step": 10064 + }, + { + "epoch": 0.29846099101503426, + "grad_norm": 0.12702420353889465, + "learning_rate": 0.0008048020383448666, + "loss": 2.7284, + "step": 10065 + }, + { + "epoch": 0.29849064436734574, + "grad_norm": 0.13360220193862915, + "learning_rate": 0.0008047647394144453, + "loss": 2.711, + "step": 10066 + }, + { + "epoch": 0.2985202977196572, + "grad_norm": 0.14310972392559052, + "learning_rate": 0.0008047274377852798, + "loss": 2.7509, + "step": 10067 + }, + { + "epoch": 0.2985499510719687, + "grad_norm": 0.13710907101631165, + "learning_rate": 0.0008046901334577006, + "loss": 2.7721, + "step": 10068 + }, + { + "epoch": 0.29857960442428017, + "grad_norm": 0.13241851329803467, + "learning_rate": 0.0008046528264320379, + "loss": 2.7723, + "step": 10069 + }, + { + "epoch": 0.29860925777659164, + "grad_norm": 0.11484309285879135, + "learning_rate": 0.0008046155167086222, + "loss": 2.7186, + "step": 10070 + }, + { + "epoch": 0.2986389111289031, + "grad_norm": 0.12153211236000061, + "learning_rate": 0.0008045782042877839, + "loss": 2.7353, + "step": 10071 + }, + { + "epoch": 0.2986685644812146, + "grad_norm": 0.11374089866876602, + "learning_rate": 0.0008045408891698532, + "loss": 2.6996, + "step": 10072 + }, + { + "epoch": 0.29869821783352607, + "grad_norm": 0.12154358625411987, + "learning_rate": 0.0008045035713551607, + "loss": 2.741, + "step": 10073 + }, + { + "epoch": 0.29872787118583755, + "grad_norm": 0.12128783017396927, + "learning_rate": 0.0008044662508440368, + "loss": 2.7307, + "step": 10074 + }, + { + "epoch": 0.298757524538149, + "grad_norm": 0.11795956641435623, + "learning_rate": 0.0008044289276368119, + "loss": 2.7508, + "step": 10075 + }, + { + "epoch": 0.2987871778904605, + "grad_norm": 0.11456070840358734, + "learning_rate": 0.0008043916017338167, + "loss": 2.7629, + "step": 10076 + }, + { + "epoch": 0.298816831242772, + "grad_norm": 0.15340155363082886, + "learning_rate": 0.0008043542731353817, + "loss": 2.744, + "step": 10077 + }, + { + "epoch": 0.29884648459508345, + "grad_norm": 0.15410810708999634, + "learning_rate": 0.0008043169418418373, + "loss": 2.7201, + "step": 10078 + }, + { + "epoch": 0.2988761379473949, + "grad_norm": 0.16438810527324677, + "learning_rate": 0.0008042796078535139, + "loss": 2.7356, + "step": 10079 + }, + { + "epoch": 0.29890579129970646, + "grad_norm": 0.14949050545692444, + "learning_rate": 0.0008042422711707427, + "loss": 2.724, + "step": 10080 + }, + { + "epoch": 0.29893544465201793, + "grad_norm": 0.15155521035194397, + "learning_rate": 0.0008042049317938538, + "loss": 2.7103, + "step": 10081 + }, + { + "epoch": 0.2989650980043294, + "grad_norm": 0.17464579641819, + "learning_rate": 0.0008041675897231779, + "loss": 2.7536, + "step": 10082 + }, + { + "epoch": 0.2989947513566409, + "grad_norm": 0.1909511834383011, + "learning_rate": 0.0008041302449590461, + "loss": 2.7433, + "step": 10083 + }, + { + "epoch": 0.29902440470895236, + "grad_norm": 0.16548825800418854, + "learning_rate": 0.0008040928975017884, + "loss": 2.7341, + "step": 10084 + }, + { + "epoch": 0.29905405806126384, + "grad_norm": 0.13318046927452087, + "learning_rate": 0.0008040555473517361, + "loss": 2.7444, + "step": 10085 + }, + { + "epoch": 0.2990837114135753, + "grad_norm": 0.14851447939872742, + "learning_rate": 0.0008040181945092198, + "loss": 2.7127, + "step": 10086 + }, + { + "epoch": 0.2991133647658868, + "grad_norm": 0.1373746693134308, + "learning_rate": 0.0008039808389745702, + "loss": 2.705, + "step": 10087 + }, + { + "epoch": 0.29914301811819827, + "grad_norm": 0.11229487508535385, + "learning_rate": 0.0008039434807481181, + "loss": 2.7745, + "step": 10088 + }, + { + "epoch": 0.29917267147050974, + "grad_norm": 0.14165525138378143, + "learning_rate": 0.0008039061198301941, + "loss": 2.6903, + "step": 10089 + }, + { + "epoch": 0.2992023248228212, + "grad_norm": 0.1313355267047882, + "learning_rate": 0.0008038687562211295, + "loss": 2.7015, + "step": 10090 + }, + { + "epoch": 0.2992319781751327, + "grad_norm": 0.13999883830547333, + "learning_rate": 0.0008038313899212548, + "loss": 2.7188, + "step": 10091 + }, + { + "epoch": 0.29926163152744417, + "grad_norm": 0.1477421671152115, + "learning_rate": 0.0008037940209309008, + "loss": 2.7055, + "step": 10092 + }, + { + "epoch": 0.29929128487975565, + "grad_norm": 0.1633843630552292, + "learning_rate": 0.0008037566492503989, + "loss": 2.7288, + "step": 10093 + }, + { + "epoch": 0.2993209382320671, + "grad_norm": 0.1664121448993683, + "learning_rate": 0.0008037192748800795, + "loss": 2.712, + "step": 10094 + }, + { + "epoch": 0.2993505915843786, + "grad_norm": 0.1474197804927826, + "learning_rate": 0.0008036818978202738, + "loss": 2.6839, + "step": 10095 + }, + { + "epoch": 0.2993802449366901, + "grad_norm": 0.133774533867836, + "learning_rate": 0.000803644518071313, + "loss": 2.7523, + "step": 10096 + }, + { + "epoch": 0.29940989828900155, + "grad_norm": 0.16216041147708893, + "learning_rate": 0.0008036071356335278, + "loss": 2.7728, + "step": 10097 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 0.18630953133106232, + "learning_rate": 0.000803569750507249, + "loss": 2.7444, + "step": 10098 + }, + { + "epoch": 0.2994692049936245, + "grad_norm": 0.16257940232753754, + "learning_rate": 0.0008035323626928082, + "loss": 2.7529, + "step": 10099 + }, + { + "epoch": 0.299498858345936, + "grad_norm": 0.13135571777820587, + "learning_rate": 0.0008034949721905363, + "loss": 2.7283, + "step": 10100 + }, + { + "epoch": 0.2995285116982475, + "grad_norm": 0.1579824537038803, + "learning_rate": 0.0008034575790007643, + "loss": 2.7432, + "step": 10101 + }, + { + "epoch": 0.299558165050559, + "grad_norm": 0.15809224545955658, + "learning_rate": 0.0008034201831238233, + "loss": 2.7289, + "step": 10102 + }, + { + "epoch": 0.29958781840287046, + "grad_norm": 0.13272221386432648, + "learning_rate": 0.0008033827845600445, + "loss": 2.7115, + "step": 10103 + }, + { + "epoch": 0.29961747175518194, + "grad_norm": 0.15564590692520142, + "learning_rate": 0.0008033453833097591, + "loss": 2.7349, + "step": 10104 + }, + { + "epoch": 0.2996471251074934, + "grad_norm": 0.16602179408073425, + "learning_rate": 0.000803307979373298, + "loss": 2.7323, + "step": 10105 + }, + { + "epoch": 0.2996767784598049, + "grad_norm": 0.15267302095890045, + "learning_rate": 0.0008032705727509929, + "loss": 2.7507, + "step": 10106 + }, + { + "epoch": 0.29970643181211637, + "grad_norm": 0.1395368129014969, + "learning_rate": 0.0008032331634431749, + "loss": 2.7469, + "step": 10107 + }, + { + "epoch": 0.29973608516442785, + "grad_norm": 0.12432732433080673, + "learning_rate": 0.0008031957514501751, + "loss": 2.7594, + "step": 10108 + }, + { + "epoch": 0.2997657385167393, + "grad_norm": 0.1477808952331543, + "learning_rate": 0.0008031583367723249, + "loss": 2.7269, + "step": 10109 + }, + { + "epoch": 0.2997953918690508, + "grad_norm": 0.14953267574310303, + "learning_rate": 0.0008031209194099556, + "loss": 2.7618, + "step": 10110 + }, + { + "epoch": 0.2998250452213623, + "grad_norm": 0.1170521005988121, + "learning_rate": 0.0008030834993633984, + "loss": 2.7368, + "step": 10111 + }, + { + "epoch": 0.29985469857367375, + "grad_norm": 0.12250961363315582, + "learning_rate": 0.0008030460766329849, + "loss": 2.7245, + "step": 10112 + }, + { + "epoch": 0.2998843519259852, + "grad_norm": 0.1506635695695877, + "learning_rate": 0.0008030086512190464, + "loss": 2.7418, + "step": 10113 + }, + { + "epoch": 0.2999140052782967, + "grad_norm": 0.14695385098457336, + "learning_rate": 0.0008029712231219142, + "loss": 2.7314, + "step": 10114 + }, + { + "epoch": 0.2999436586306082, + "grad_norm": 0.15707512199878693, + "learning_rate": 0.0008029337923419199, + "loss": 2.746, + "step": 10115 + }, + { + "epoch": 0.29997331198291965, + "grad_norm": 0.14929810166358948, + "learning_rate": 0.0008028963588793949, + "loss": 2.6928, + "step": 10116 + }, + { + "epoch": 0.30000296533523113, + "grad_norm": 0.1402774155139923, + "learning_rate": 0.0008028589227346705, + "loss": 2.7453, + "step": 10117 + }, + { + "epoch": 0.3000326186875426, + "grad_norm": 0.14481350779533386, + "learning_rate": 0.0008028214839080784, + "loss": 2.7342, + "step": 10118 + }, + { + "epoch": 0.3000622720398541, + "grad_norm": 0.13775181770324707, + "learning_rate": 0.0008027840423999502, + "loss": 2.7179, + "step": 10119 + }, + { + "epoch": 0.30009192539216556, + "grad_norm": 0.13746541738510132, + "learning_rate": 0.0008027465982106172, + "loss": 2.7391, + "step": 10120 + }, + { + "epoch": 0.30012157874447704, + "grad_norm": 0.13710252940654755, + "learning_rate": 0.0008027091513404112, + "loss": 2.7227, + "step": 10121 + }, + { + "epoch": 0.30015123209678857, + "grad_norm": 0.14271123707294464, + "learning_rate": 0.0008026717017896636, + "loss": 2.7075, + "step": 10122 + }, + { + "epoch": 0.30018088544910004, + "grad_norm": 0.16980783641338348, + "learning_rate": 0.0008026342495587063, + "loss": 2.7463, + "step": 10123 + }, + { + "epoch": 0.3002105388014115, + "grad_norm": 0.1549931764602661, + "learning_rate": 0.0008025967946478705, + "loss": 2.7201, + "step": 10124 + }, + { + "epoch": 0.300240192153723, + "grad_norm": 0.1314072161912918, + "learning_rate": 0.0008025593370574884, + "loss": 2.688, + "step": 10125 + }, + { + "epoch": 0.30026984550603447, + "grad_norm": 0.15769432485103607, + "learning_rate": 0.0008025218767878914, + "loss": 2.7293, + "step": 10126 + }, + { + "epoch": 0.30029949885834595, + "grad_norm": 0.17565113306045532, + "learning_rate": 0.0008024844138394112, + "loss": 2.7339, + "step": 10127 + }, + { + "epoch": 0.3003291522106574, + "grad_norm": 0.15837177634239197, + "learning_rate": 0.0008024469482123796, + "loss": 2.7494, + "step": 10128 + }, + { + "epoch": 0.3003588055629689, + "grad_norm": 0.13997186720371246, + "learning_rate": 0.0008024094799071284, + "loss": 2.6968, + "step": 10129 + }, + { + "epoch": 0.3003884589152804, + "grad_norm": 0.13435465097427368, + "learning_rate": 0.0008023720089239892, + "loss": 2.7623, + "step": 10130 + }, + { + "epoch": 0.30041811226759185, + "grad_norm": 0.14411121606826782, + "learning_rate": 0.000802334535263294, + "loss": 2.748, + "step": 10131 + }, + { + "epoch": 0.30044776561990333, + "grad_norm": 0.14467675983905792, + "learning_rate": 0.0008022970589253748, + "loss": 2.7208, + "step": 10132 + }, + { + "epoch": 0.3004774189722148, + "grad_norm": 0.15410006046295166, + "learning_rate": 0.000802259579910563, + "loss": 2.7135, + "step": 10133 + }, + { + "epoch": 0.3005070723245263, + "grad_norm": 0.16286417841911316, + "learning_rate": 0.0008022220982191909, + "loss": 2.7059, + "step": 10134 + }, + { + "epoch": 0.30053672567683776, + "grad_norm": 0.15022806823253632, + "learning_rate": 0.0008021846138515903, + "loss": 2.7215, + "step": 10135 + }, + { + "epoch": 0.30056637902914923, + "grad_norm": 0.1288459748029709, + "learning_rate": 0.0008021471268080929, + "loss": 2.765, + "step": 10136 + }, + { + "epoch": 0.3005960323814607, + "grad_norm": 0.15796813368797302, + "learning_rate": 0.0008021096370890308, + "loss": 2.7359, + "step": 10137 + }, + { + "epoch": 0.3006256857337722, + "grad_norm": 0.14954935014247894, + "learning_rate": 0.0008020721446947361, + "loss": 2.7253, + "step": 10138 + }, + { + "epoch": 0.30065533908608366, + "grad_norm": 0.13276028633117676, + "learning_rate": 0.0008020346496255407, + "loss": 2.7322, + "step": 10139 + }, + { + "epoch": 0.30068499243839514, + "grad_norm": 0.1354837268590927, + "learning_rate": 0.0008019971518817768, + "loss": 2.7246, + "step": 10140 + }, + { + "epoch": 0.3007146457907066, + "grad_norm": 0.13983510434627533, + "learning_rate": 0.0008019596514637761, + "loss": 2.6951, + "step": 10141 + }, + { + "epoch": 0.30074429914301815, + "grad_norm": 0.13462543487548828, + "learning_rate": 0.0008019221483718708, + "loss": 2.6771, + "step": 10142 + }, + { + "epoch": 0.3007739524953296, + "grad_norm": 0.1524658203125, + "learning_rate": 0.0008018846426063932, + "loss": 2.7586, + "step": 10143 + }, + { + "epoch": 0.3008036058476411, + "grad_norm": 0.13985946774482727, + "learning_rate": 0.0008018471341676752, + "loss": 2.7248, + "step": 10144 + }, + { + "epoch": 0.3008332591999526, + "grad_norm": 0.13117283582687378, + "learning_rate": 0.000801809623056049, + "loss": 2.7432, + "step": 10145 + }, + { + "epoch": 0.30086291255226405, + "grad_norm": 0.149380162358284, + "learning_rate": 0.0008017721092718469, + "loss": 2.7374, + "step": 10146 + }, + { + "epoch": 0.3008925659045755, + "grad_norm": 0.17017030715942383, + "learning_rate": 0.0008017345928154007, + "loss": 2.7221, + "step": 10147 + }, + { + "epoch": 0.300922219256887, + "grad_norm": 0.16860534250736237, + "learning_rate": 0.0008016970736870432, + "loss": 2.7328, + "step": 10148 + }, + { + "epoch": 0.3009518726091985, + "grad_norm": 0.13747593760490417, + "learning_rate": 0.0008016595518871061, + "loss": 2.7556, + "step": 10149 + }, + { + "epoch": 0.30098152596150995, + "grad_norm": 0.14994175732135773, + "learning_rate": 0.0008016220274159221, + "loss": 2.7248, + "step": 10150 + }, + { + "epoch": 0.30101117931382143, + "grad_norm": 0.14677849411964417, + "learning_rate": 0.0008015845002738232, + "loss": 2.7329, + "step": 10151 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 0.1446085274219513, + "learning_rate": 0.0008015469704611417, + "loss": 2.7455, + "step": 10152 + }, + { + "epoch": 0.3010704860184444, + "grad_norm": 0.13822896778583527, + "learning_rate": 0.00080150943797821, + "loss": 2.7166, + "step": 10153 + }, + { + "epoch": 0.30110013937075586, + "grad_norm": 0.1182418093085289, + "learning_rate": 0.0008014719028253606, + "loss": 2.7413, + "step": 10154 + }, + { + "epoch": 0.30112979272306734, + "grad_norm": 0.1291673183441162, + "learning_rate": 0.0008014343650029256, + "loss": 2.7401, + "step": 10155 + }, + { + "epoch": 0.3011594460753788, + "grad_norm": 0.11763856559991837, + "learning_rate": 0.0008013968245112377, + "loss": 2.7264, + "step": 10156 + }, + { + "epoch": 0.3011890994276903, + "grad_norm": 0.1415649950504303, + "learning_rate": 0.0008013592813506291, + "loss": 2.7189, + "step": 10157 + }, + { + "epoch": 0.30121875278000176, + "grad_norm": 0.14495478570461273, + "learning_rate": 0.0008013217355214324, + "loss": 2.7183, + "step": 10158 + }, + { + "epoch": 0.30124840613231324, + "grad_norm": 0.13168562948703766, + "learning_rate": 0.0008012841870239799, + "loss": 2.7255, + "step": 10159 + }, + { + "epoch": 0.3012780594846247, + "grad_norm": 0.1456674486398697, + "learning_rate": 0.0008012466358586044, + "loss": 2.7507, + "step": 10160 + }, + { + "epoch": 0.3013077128369362, + "grad_norm": 0.13881833851337433, + "learning_rate": 0.0008012090820256381, + "loss": 2.6857, + "step": 10161 + }, + { + "epoch": 0.30133736618924767, + "grad_norm": 0.14191418886184692, + "learning_rate": 0.0008011715255254137, + "loss": 2.7131, + "step": 10162 + }, + { + "epoch": 0.3013670195415592, + "grad_norm": 0.15155456960201263, + "learning_rate": 0.0008011339663582638, + "loss": 2.7501, + "step": 10163 + }, + { + "epoch": 0.3013966728938707, + "grad_norm": 0.1476193070411682, + "learning_rate": 0.0008010964045245208, + "loss": 2.7207, + "step": 10164 + }, + { + "epoch": 0.30142632624618215, + "grad_norm": 0.14607210457324982, + "learning_rate": 0.0008010588400245176, + "loss": 2.7088, + "step": 10165 + }, + { + "epoch": 0.30145597959849363, + "grad_norm": 0.17010213434696198, + "learning_rate": 0.0008010212728585866, + "loss": 2.7423, + "step": 10166 + }, + { + "epoch": 0.3014856329508051, + "grad_norm": 0.15943151712417603, + "learning_rate": 0.0008009837030270606, + "loss": 2.772, + "step": 10167 + }, + { + "epoch": 0.3015152863031166, + "grad_norm": 0.12443207949399948, + "learning_rate": 0.0008009461305302722, + "loss": 2.7416, + "step": 10168 + }, + { + "epoch": 0.30154493965542806, + "grad_norm": 0.11030749976634979, + "learning_rate": 0.0008009085553685542, + "loss": 2.7079, + "step": 10169 + }, + { + "epoch": 0.30157459300773953, + "grad_norm": 0.13177986443042755, + "learning_rate": 0.0008008709775422393, + "loss": 2.757, + "step": 10170 + }, + { + "epoch": 0.301604246360051, + "grad_norm": 0.13240289688110352, + "learning_rate": 0.0008008333970516601, + "loss": 2.7559, + "step": 10171 + }, + { + "epoch": 0.3016338997123625, + "grad_norm": 0.13194197416305542, + "learning_rate": 0.0008007958138971497, + "loss": 2.7382, + "step": 10172 + }, + { + "epoch": 0.30166355306467396, + "grad_norm": 0.13785767555236816, + "learning_rate": 0.0008007582280790408, + "loss": 2.6984, + "step": 10173 + }, + { + "epoch": 0.30169320641698544, + "grad_norm": 0.1125045120716095, + "learning_rate": 0.000800720639597666, + "loss": 2.7238, + "step": 10174 + }, + { + "epoch": 0.3017228597692969, + "grad_norm": 0.11560475826263428, + "learning_rate": 0.0008006830484533585, + "loss": 2.7505, + "step": 10175 + }, + { + "epoch": 0.3017525131216084, + "grad_norm": 0.13846039772033691, + "learning_rate": 0.0008006454546464508, + "loss": 2.7452, + "step": 10176 + }, + { + "epoch": 0.30178216647391987, + "grad_norm": 0.14047156274318695, + "learning_rate": 0.000800607858177276, + "loss": 2.7164, + "step": 10177 + }, + { + "epoch": 0.30181181982623134, + "grad_norm": 0.12336088716983795, + "learning_rate": 0.0008005702590461672, + "loss": 2.6897, + "step": 10178 + }, + { + "epoch": 0.3018414731785428, + "grad_norm": 0.1255389153957367, + "learning_rate": 0.000800532657253457, + "loss": 2.7258, + "step": 10179 + }, + { + "epoch": 0.3018711265308543, + "grad_norm": 0.11341304332017899, + "learning_rate": 0.0008004950527994787, + "loss": 2.6807, + "step": 10180 + }, + { + "epoch": 0.30190077988316577, + "grad_norm": 0.14689616858959198, + "learning_rate": 0.0008004574456845651, + "loss": 2.7458, + "step": 10181 + }, + { + "epoch": 0.30193043323547725, + "grad_norm": 0.1726013422012329, + "learning_rate": 0.000800419835909049, + "loss": 2.7379, + "step": 10182 + }, + { + "epoch": 0.3019600865877887, + "grad_norm": 0.1832725554704666, + "learning_rate": 0.0008003822234732639, + "loss": 2.7483, + "step": 10183 + }, + { + "epoch": 0.30198973994010025, + "grad_norm": 0.16635257005691528, + "learning_rate": 0.0008003446083775425, + "loss": 2.729, + "step": 10184 + }, + { + "epoch": 0.30201939329241173, + "grad_norm": 0.15677772462368011, + "learning_rate": 0.0008003069906222182, + "loss": 2.7263, + "step": 10185 + }, + { + "epoch": 0.3020490466447232, + "grad_norm": 0.15924735367298126, + "learning_rate": 0.0008002693702076239, + "loss": 2.7515, + "step": 10186 + }, + { + "epoch": 0.3020786999970347, + "grad_norm": 0.16340519487857819, + "learning_rate": 0.0008002317471340928, + "loss": 2.7604, + "step": 10187 + }, + { + "epoch": 0.30210835334934616, + "grad_norm": 0.18628744781017303, + "learning_rate": 0.000800194121401958, + "loss": 2.7584, + "step": 10188 + }, + { + "epoch": 0.30213800670165764, + "grad_norm": 0.22456811368465424, + "learning_rate": 0.0008001564930115528, + "loss": 2.7206, + "step": 10189 + }, + { + "epoch": 0.3021676600539691, + "grad_norm": 0.21515102684497833, + "learning_rate": 0.0008001188619632103, + "loss": 2.7293, + "step": 10190 + }, + { + "epoch": 0.3021973134062806, + "grad_norm": 0.1421983391046524, + "learning_rate": 0.0008000812282572636, + "loss": 2.7446, + "step": 10191 + }, + { + "epoch": 0.30222696675859206, + "grad_norm": 0.18636590242385864, + "learning_rate": 0.0008000435918940464, + "loss": 2.7198, + "step": 10192 + }, + { + "epoch": 0.30225662011090354, + "grad_norm": 0.18246185779571533, + "learning_rate": 0.0008000059528738916, + "loss": 2.7587, + "step": 10193 + }, + { + "epoch": 0.302286273463215, + "grad_norm": 0.14742396771907806, + "learning_rate": 0.0007999683111971325, + "loss": 2.7119, + "step": 10194 + }, + { + "epoch": 0.3023159268155265, + "grad_norm": 0.13664215803146362, + "learning_rate": 0.0007999306668641025, + "loss": 2.7083, + "step": 10195 + }, + { + "epoch": 0.30234558016783797, + "grad_norm": 0.14564666152000427, + "learning_rate": 0.000799893019875135, + "loss": 2.721, + "step": 10196 + }, + { + "epoch": 0.30237523352014944, + "grad_norm": 0.15554822981357574, + "learning_rate": 0.0007998553702305635, + "loss": 2.7619, + "step": 10197 + }, + { + "epoch": 0.3024048868724609, + "grad_norm": 0.1324368268251419, + "learning_rate": 0.000799817717930721, + "loss": 2.7466, + "step": 10198 + }, + { + "epoch": 0.3024345402247724, + "grad_norm": 0.12033307552337646, + "learning_rate": 0.0007997800629759413, + "loss": 2.7362, + "step": 10199 + }, + { + "epoch": 0.3024641935770839, + "grad_norm": 0.1459653079509735, + "learning_rate": 0.0007997424053665576, + "loss": 2.7333, + "step": 10200 + }, + { + "epoch": 0.30249384692939535, + "grad_norm": 0.12887398898601532, + "learning_rate": 0.0007997047451029035, + "loss": 2.7279, + "step": 10201 + }, + { + "epoch": 0.3025235002817068, + "grad_norm": 0.14055630564689636, + "learning_rate": 0.0007996670821853123, + "loss": 2.7083, + "step": 10202 + }, + { + "epoch": 0.3025531536340183, + "grad_norm": 0.13205137848854065, + "learning_rate": 0.0007996294166141178, + "loss": 2.7317, + "step": 10203 + }, + { + "epoch": 0.3025828069863298, + "grad_norm": 0.12164082378149033, + "learning_rate": 0.0007995917483896533, + "loss": 2.745, + "step": 10204 + }, + { + "epoch": 0.3026124603386413, + "grad_norm": 0.1180487647652626, + "learning_rate": 0.0007995540775122525, + "loss": 2.725, + "step": 10205 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 0.14569289982318878, + "learning_rate": 0.0007995164039822489, + "loss": 2.7361, + "step": 10206 + }, + { + "epoch": 0.30267176704326426, + "grad_norm": 0.12846536934375763, + "learning_rate": 0.0007994787277999762, + "loss": 2.7082, + "step": 10207 + }, + { + "epoch": 0.30270142039557574, + "grad_norm": 0.12465757876634598, + "learning_rate": 0.0007994410489657679, + "loss": 2.7224, + "step": 10208 + }, + { + "epoch": 0.3027310737478872, + "grad_norm": 0.14598293602466583, + "learning_rate": 0.0007994033674799577, + "loss": 2.7089, + "step": 10209 + }, + { + "epoch": 0.3027607271001987, + "grad_norm": 0.13560904562473297, + "learning_rate": 0.0007993656833428793, + "loss": 2.6987, + "step": 10210 + }, + { + "epoch": 0.30279038045251017, + "grad_norm": 0.14473159611225128, + "learning_rate": 0.0007993279965548664, + "loss": 2.7631, + "step": 10211 + }, + { + "epoch": 0.30282003380482164, + "grad_norm": 0.13689564168453217, + "learning_rate": 0.0007992903071162527, + "loss": 2.725, + "step": 10212 + }, + { + "epoch": 0.3028496871571331, + "grad_norm": 0.1390889436006546, + "learning_rate": 0.000799252615027372, + "loss": 2.7435, + "step": 10213 + }, + { + "epoch": 0.3028793405094446, + "grad_norm": 0.15506604313850403, + "learning_rate": 0.000799214920288558, + "loss": 2.7277, + "step": 10214 + }, + { + "epoch": 0.30290899386175607, + "grad_norm": 0.13644598424434662, + "learning_rate": 0.0007991772229001444, + "loss": 2.728, + "step": 10215 + }, + { + "epoch": 0.30293864721406755, + "grad_norm": 0.15519027411937714, + "learning_rate": 0.0007991395228624653, + "loss": 2.7533, + "step": 10216 + }, + { + "epoch": 0.302968300566379, + "grad_norm": 0.15541620552539825, + "learning_rate": 0.0007991018201758543, + "loss": 2.7403, + "step": 10217 + }, + { + "epoch": 0.3029979539186905, + "grad_norm": 0.13027697801589966, + "learning_rate": 0.0007990641148406455, + "loss": 2.7574, + "step": 10218 + }, + { + "epoch": 0.303027607271002, + "grad_norm": 0.13281548023223877, + "learning_rate": 0.0007990264068571724, + "loss": 2.6964, + "step": 10219 + }, + { + "epoch": 0.30305726062331345, + "grad_norm": 0.13795198500156403, + "learning_rate": 0.0007989886962257694, + "loss": 2.7162, + "step": 10220 + }, + { + "epoch": 0.3030869139756249, + "grad_norm": 0.13538271188735962, + "learning_rate": 0.0007989509829467699, + "loss": 2.7132, + "step": 10221 + }, + { + "epoch": 0.3031165673279364, + "grad_norm": 0.15588004887104034, + "learning_rate": 0.0007989132670205082, + "loss": 2.7541, + "step": 10222 + }, + { + "epoch": 0.3031462206802479, + "grad_norm": 0.15339991450309753, + "learning_rate": 0.0007988755484473183, + "loss": 2.7641, + "step": 10223 + }, + { + "epoch": 0.30317587403255936, + "grad_norm": 0.1598479300737381, + "learning_rate": 0.000798837827227534, + "loss": 2.7652, + "step": 10224 + }, + { + "epoch": 0.30320552738487083, + "grad_norm": 0.1599999964237213, + "learning_rate": 0.0007988001033614895, + "loss": 2.7204, + "step": 10225 + }, + { + "epoch": 0.30323518073718236, + "grad_norm": 0.1487663835287094, + "learning_rate": 0.0007987623768495189, + "loss": 2.7133, + "step": 10226 + }, + { + "epoch": 0.30326483408949384, + "grad_norm": 0.10764505714178085, + "learning_rate": 0.0007987246476919561, + "loss": 2.7003, + "step": 10227 + }, + { + "epoch": 0.3032944874418053, + "grad_norm": 0.13439014554023743, + "learning_rate": 0.0007986869158891352, + "loss": 2.6985, + "step": 10228 + }, + { + "epoch": 0.3033241407941168, + "grad_norm": 0.1388644576072693, + "learning_rate": 0.0007986491814413905, + "loss": 2.6872, + "step": 10229 + }, + { + "epoch": 0.30335379414642827, + "grad_norm": 0.16091161966323853, + "learning_rate": 0.000798611444349056, + "loss": 2.738, + "step": 10230 + }, + { + "epoch": 0.30338344749873974, + "grad_norm": 0.14583329856395721, + "learning_rate": 0.0007985737046124658, + "loss": 2.7339, + "step": 10231 + }, + { + "epoch": 0.3034131008510512, + "grad_norm": 0.12773747742176056, + "learning_rate": 0.0007985359622319543, + "loss": 2.7304, + "step": 10232 + }, + { + "epoch": 0.3034427542033627, + "grad_norm": 0.14267918467521667, + "learning_rate": 0.0007984982172078557, + "loss": 2.7324, + "step": 10233 + }, + { + "epoch": 0.3034724075556742, + "grad_norm": 0.14966648817062378, + "learning_rate": 0.0007984604695405039, + "loss": 2.7404, + "step": 10234 + }, + { + "epoch": 0.30350206090798565, + "grad_norm": 0.14123310148715973, + "learning_rate": 0.0007984227192302336, + "loss": 2.7112, + "step": 10235 + }, + { + "epoch": 0.3035317142602971, + "grad_norm": 0.11224684119224548, + "learning_rate": 0.0007983849662773788, + "loss": 2.7285, + "step": 10236 + }, + { + "epoch": 0.3035613676126086, + "grad_norm": 0.13407258689403534, + "learning_rate": 0.000798347210682274, + "loss": 2.7236, + "step": 10237 + }, + { + "epoch": 0.3035910209649201, + "grad_norm": 0.15802694857120514, + "learning_rate": 0.0007983094524452534, + "loss": 2.7239, + "step": 10238 + }, + { + "epoch": 0.30362067431723155, + "grad_norm": 0.1457270383834839, + "learning_rate": 0.0007982716915666515, + "loss": 2.7322, + "step": 10239 + }, + { + "epoch": 0.30365032766954303, + "grad_norm": 0.1457565426826477, + "learning_rate": 0.0007982339280468024, + "loss": 2.7256, + "step": 10240 + }, + { + "epoch": 0.3036799810218545, + "grad_norm": 0.1547347605228424, + "learning_rate": 0.0007981961618860407, + "loss": 2.7093, + "step": 10241 + }, + { + "epoch": 0.303709634374166, + "grad_norm": 0.16550828516483307, + "learning_rate": 0.0007981583930847008, + "loss": 2.6995, + "step": 10242 + }, + { + "epoch": 0.30373928772647746, + "grad_norm": 0.15368783473968506, + "learning_rate": 0.0007981206216431172, + "loss": 2.7334, + "step": 10243 + }, + { + "epoch": 0.30376894107878893, + "grad_norm": 0.13565261662006378, + "learning_rate": 0.0007980828475616244, + "loss": 2.7405, + "step": 10244 + }, + { + "epoch": 0.3037985944311004, + "grad_norm": 0.1276153028011322, + "learning_rate": 0.0007980450708405567, + "loss": 2.7091, + "step": 10245 + }, + { + "epoch": 0.30382824778341194, + "grad_norm": 0.13715937733650208, + "learning_rate": 0.0007980072914802488, + "loss": 2.7193, + "step": 10246 + }, + { + "epoch": 0.3038579011357234, + "grad_norm": 0.1402343213558197, + "learning_rate": 0.0007979695094810351, + "loss": 2.7288, + "step": 10247 + }, + { + "epoch": 0.3038875544880349, + "grad_norm": 0.15182183682918549, + "learning_rate": 0.0007979317248432503, + "loss": 2.7371, + "step": 10248 + }, + { + "epoch": 0.30391720784034637, + "grad_norm": 0.13899478316307068, + "learning_rate": 0.0007978939375672291, + "loss": 2.7015, + "step": 10249 + }, + { + "epoch": 0.30394686119265785, + "grad_norm": 0.1500466912984848, + "learning_rate": 0.0007978561476533057, + "loss": 2.7564, + "step": 10250 + }, + { + "epoch": 0.3039765145449693, + "grad_norm": 0.13049452006816864, + "learning_rate": 0.0007978183551018151, + "loss": 2.7128, + "step": 10251 + }, + { + "epoch": 0.3040061678972808, + "grad_norm": 0.12205958366394043, + "learning_rate": 0.0007977805599130918, + "loss": 2.6969, + "step": 10252 + }, + { + "epoch": 0.3040358212495923, + "grad_norm": 0.12040438503026962, + "learning_rate": 0.0007977427620874707, + "loss": 2.7247, + "step": 10253 + }, + { + "epoch": 0.30406547460190375, + "grad_norm": 0.12925860285758972, + "learning_rate": 0.000797704961625286, + "loss": 2.7052, + "step": 10254 + }, + { + "epoch": 0.3040951279542152, + "grad_norm": 0.12815696001052856, + "learning_rate": 0.0007976671585268731, + "loss": 2.7496, + "step": 10255 + }, + { + "epoch": 0.3041247813065267, + "grad_norm": 0.15375417470932007, + "learning_rate": 0.0007976293527925662, + "loss": 2.7029, + "step": 10256 + }, + { + "epoch": 0.3041544346588382, + "grad_norm": 0.1594112366437912, + "learning_rate": 0.0007975915444227004, + "loss": 2.7152, + "step": 10257 + }, + { + "epoch": 0.30418408801114966, + "grad_norm": 0.1688774973154068, + "learning_rate": 0.0007975537334176104, + "loss": 2.6945, + "step": 10258 + }, + { + "epoch": 0.30421374136346113, + "grad_norm": 0.17064815759658813, + "learning_rate": 0.000797515919777631, + "loss": 2.6977, + "step": 10259 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 0.16078828275203705, + "learning_rate": 0.000797478103503097, + "loss": 2.7391, + "step": 10260 + }, + { + "epoch": 0.3042730480680841, + "grad_norm": 0.13775114715099335, + "learning_rate": 0.0007974402845943434, + "loss": 2.7293, + "step": 10261 + }, + { + "epoch": 0.30430270142039556, + "grad_norm": 0.12971393764019012, + "learning_rate": 0.000797402463051705, + "loss": 2.7308, + "step": 10262 + }, + { + "epoch": 0.30433235477270704, + "grad_norm": 0.13771900534629822, + "learning_rate": 0.0007973646388755167, + "loss": 2.7312, + "step": 10263 + }, + { + "epoch": 0.3043620081250185, + "grad_norm": 0.12996038794517517, + "learning_rate": 0.0007973268120661135, + "loss": 2.726, + "step": 10264 + }, + { + "epoch": 0.30439166147733, + "grad_norm": 0.14088808000087738, + "learning_rate": 0.0007972889826238303, + "loss": 2.7777, + "step": 10265 + }, + { + "epoch": 0.30442131482964147, + "grad_norm": 0.1291407346725464, + "learning_rate": 0.0007972511505490022, + "loss": 2.7396, + "step": 10266 + }, + { + "epoch": 0.304450968181953, + "grad_norm": 0.11406511813402176, + "learning_rate": 0.0007972133158419641, + "loss": 2.6982, + "step": 10267 + }, + { + "epoch": 0.3044806215342645, + "grad_norm": 0.11836441606283188, + "learning_rate": 0.0007971754785030512, + "loss": 2.7036, + "step": 10268 + }, + { + "epoch": 0.30451027488657595, + "grad_norm": 0.12507367134094238, + "learning_rate": 0.0007971376385325984, + "loss": 2.7903, + "step": 10269 + }, + { + "epoch": 0.3045399282388874, + "grad_norm": 0.1403038501739502, + "learning_rate": 0.0007970997959309406, + "loss": 2.6828, + "step": 10270 + }, + { + "epoch": 0.3045695815911989, + "grad_norm": 0.14769455790519714, + "learning_rate": 0.0007970619506984134, + "loss": 2.6977, + "step": 10271 + }, + { + "epoch": 0.3045992349435104, + "grad_norm": 0.15293705463409424, + "learning_rate": 0.0007970241028353514, + "loss": 2.7246, + "step": 10272 + }, + { + "epoch": 0.30462888829582185, + "grad_norm": 0.13135181367397308, + "learning_rate": 0.0007969862523420901, + "loss": 2.7324, + "step": 10273 + }, + { + "epoch": 0.30465854164813333, + "grad_norm": 0.1321904957294464, + "learning_rate": 0.0007969483992189644, + "loss": 2.701, + "step": 10274 + }, + { + "epoch": 0.3046881950004448, + "grad_norm": 0.14165745675563812, + "learning_rate": 0.0007969105434663098, + "loss": 2.7106, + "step": 10275 + }, + { + "epoch": 0.3047178483527563, + "grad_norm": 0.14817652106285095, + "learning_rate": 0.0007968726850844614, + "loss": 2.7381, + "step": 10276 + }, + { + "epoch": 0.30474750170506776, + "grad_norm": 0.15599359571933746, + "learning_rate": 0.0007968348240737544, + "loss": 2.7275, + "step": 10277 + }, + { + "epoch": 0.30477715505737923, + "grad_norm": 0.167430117726326, + "learning_rate": 0.000796796960434524, + "loss": 2.7267, + "step": 10278 + }, + { + "epoch": 0.3048068084096907, + "grad_norm": 0.16719570755958557, + "learning_rate": 0.0007967590941671057, + "loss": 2.7311, + "step": 10279 + }, + { + "epoch": 0.3048364617620022, + "grad_norm": 0.16524142026901245, + "learning_rate": 0.0007967212252718345, + "loss": 2.7318, + "step": 10280 + }, + { + "epoch": 0.30486611511431366, + "grad_norm": 0.16384343802928925, + "learning_rate": 0.0007966833537490461, + "loss": 2.7291, + "step": 10281 + }, + { + "epoch": 0.30489576846662514, + "grad_norm": 0.14609014987945557, + "learning_rate": 0.0007966454795990756, + "loss": 2.7435, + "step": 10282 + }, + { + "epoch": 0.3049254218189366, + "grad_norm": 0.13513414561748505, + "learning_rate": 0.0007966076028222584, + "loss": 2.7286, + "step": 10283 + }, + { + "epoch": 0.3049550751712481, + "grad_norm": 0.13784460723400116, + "learning_rate": 0.0007965697234189302, + "loss": 2.7514, + "step": 10284 + }, + { + "epoch": 0.30498472852355957, + "grad_norm": 0.13355837762355804, + "learning_rate": 0.0007965318413894261, + "loss": 2.7046, + "step": 10285 + }, + { + "epoch": 0.30501438187587104, + "grad_norm": 0.13537685573101044, + "learning_rate": 0.0007964939567340814, + "loss": 2.7388, + "step": 10286 + }, + { + "epoch": 0.3050440352281825, + "grad_norm": 0.13032293319702148, + "learning_rate": 0.000796456069453232, + "loss": 2.7309, + "step": 10287 + }, + { + "epoch": 0.30507368858049405, + "grad_norm": 0.12243563681840897, + "learning_rate": 0.0007964181795472132, + "loss": 2.7281, + "step": 10288 + }, + { + "epoch": 0.3051033419328055, + "grad_norm": 0.14118370413780212, + "learning_rate": 0.0007963802870163605, + "loss": 2.7229, + "step": 10289 + }, + { + "epoch": 0.305132995285117, + "grad_norm": 0.15350103378295898, + "learning_rate": 0.0007963423918610096, + "loss": 2.7278, + "step": 10290 + }, + { + "epoch": 0.3051626486374285, + "grad_norm": 0.1621541976928711, + "learning_rate": 0.0007963044940814958, + "loss": 2.6663, + "step": 10291 + }, + { + "epoch": 0.30519230198973996, + "grad_norm": 0.16497580707073212, + "learning_rate": 0.000796266593678155, + "loss": 2.7425, + "step": 10292 + }, + { + "epoch": 0.30522195534205143, + "grad_norm": 0.1298893243074417, + "learning_rate": 0.0007962286906513225, + "loss": 2.7019, + "step": 10293 + }, + { + "epoch": 0.3052516086943629, + "grad_norm": 0.13175459206104279, + "learning_rate": 0.0007961907850013343, + "loss": 2.7094, + "step": 10294 + }, + { + "epoch": 0.3052812620466744, + "grad_norm": 0.1634853631258011, + "learning_rate": 0.0007961528767285258, + "loss": 2.7158, + "step": 10295 + }, + { + "epoch": 0.30531091539898586, + "grad_norm": 0.1682758629322052, + "learning_rate": 0.0007961149658332327, + "loss": 2.7232, + "step": 10296 + }, + { + "epoch": 0.30534056875129734, + "grad_norm": 0.14955000579357147, + "learning_rate": 0.0007960770523157908, + "loss": 2.7382, + "step": 10297 + }, + { + "epoch": 0.3053702221036088, + "grad_norm": 0.11116749793291092, + "learning_rate": 0.0007960391361765356, + "loss": 2.7318, + "step": 10298 + }, + { + "epoch": 0.3053998754559203, + "grad_norm": 0.13724762201309204, + "learning_rate": 0.0007960012174158031, + "loss": 2.7246, + "step": 10299 + }, + { + "epoch": 0.30542952880823176, + "grad_norm": 0.1540466547012329, + "learning_rate": 0.0007959632960339292, + "loss": 2.7268, + "step": 10300 + }, + { + "epoch": 0.30545918216054324, + "grad_norm": 0.16470399498939514, + "learning_rate": 0.0007959253720312494, + "loss": 2.7447, + "step": 10301 + }, + { + "epoch": 0.3054888355128547, + "grad_norm": 0.16053028404712677, + "learning_rate": 0.0007958874454080995, + "loss": 2.7173, + "step": 10302 + }, + { + "epoch": 0.3055184888651662, + "grad_norm": 0.15539422631263733, + "learning_rate": 0.0007958495161648156, + "loss": 2.7399, + "step": 10303 + }, + { + "epoch": 0.30554814221747767, + "grad_norm": 0.1600581407546997, + "learning_rate": 0.0007958115843017335, + "loss": 2.7294, + "step": 10304 + }, + { + "epoch": 0.30557779556978915, + "grad_norm": 0.15250155329704285, + "learning_rate": 0.000795773649819189, + "loss": 2.7491, + "step": 10305 + }, + { + "epoch": 0.3056074489221006, + "grad_norm": 0.1476811319589615, + "learning_rate": 0.000795735712717518, + "loss": 2.7464, + "step": 10306 + }, + { + "epoch": 0.3056371022744121, + "grad_norm": 0.15802600979804993, + "learning_rate": 0.0007956977729970566, + "loss": 2.7386, + "step": 10307 + }, + { + "epoch": 0.3056667556267236, + "grad_norm": 0.11672025918960571, + "learning_rate": 0.0007956598306581407, + "loss": 2.7087, + "step": 10308 + }, + { + "epoch": 0.3056964089790351, + "grad_norm": 0.11255179345607758, + "learning_rate": 0.0007956218857011061, + "loss": 2.7135, + "step": 10309 + }, + { + "epoch": 0.3057260623313466, + "grad_norm": 0.13087190687656403, + "learning_rate": 0.000795583938126289, + "loss": 2.7952, + "step": 10310 + }, + { + "epoch": 0.30575571568365806, + "grad_norm": 0.14573152363300323, + "learning_rate": 0.0007955459879340254, + "loss": 2.7248, + "step": 10311 + }, + { + "epoch": 0.30578536903596953, + "grad_norm": 0.1206463947892189, + "learning_rate": 0.0007955080351246515, + "loss": 2.7272, + "step": 10312 + }, + { + "epoch": 0.305815022388281, + "grad_norm": 0.11804823577404022, + "learning_rate": 0.0007954700796985031, + "loss": 2.7118, + "step": 10313 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 0.1379375457763672, + "learning_rate": 0.0007954321216559163, + "loss": 2.6846, + "step": 10314 + }, + { + "epoch": 0.30587432909290396, + "grad_norm": 0.14041835069656372, + "learning_rate": 0.0007953941609972275, + "loss": 2.7428, + "step": 10315 + }, + { + "epoch": 0.30590398244521544, + "grad_norm": 0.1302817463874817, + "learning_rate": 0.0007953561977227728, + "loss": 2.7652, + "step": 10316 + }, + { + "epoch": 0.3059336357975269, + "grad_norm": 0.13443408906459808, + "learning_rate": 0.0007953182318328881, + "loss": 2.732, + "step": 10317 + }, + { + "epoch": 0.3059632891498384, + "grad_norm": 0.14580772817134857, + "learning_rate": 0.0007952802633279097, + "loss": 2.7075, + "step": 10318 + }, + { + "epoch": 0.30599294250214987, + "grad_norm": 0.15109911561012268, + "learning_rate": 0.0007952422922081741, + "loss": 2.7008, + "step": 10319 + }, + { + "epoch": 0.30602259585446134, + "grad_norm": 0.1482434719800949, + "learning_rate": 0.0007952043184740172, + "loss": 2.7372, + "step": 10320 + }, + { + "epoch": 0.3060522492067728, + "grad_norm": 0.13640782237052917, + "learning_rate": 0.0007951663421257754, + "loss": 2.7015, + "step": 10321 + }, + { + "epoch": 0.3060819025590843, + "grad_norm": 0.12059824168682098, + "learning_rate": 0.000795128363163785, + "loss": 2.7206, + "step": 10322 + }, + { + "epoch": 0.30611155591139577, + "grad_norm": 0.13956250250339508, + "learning_rate": 0.0007950903815883823, + "loss": 2.7773, + "step": 10323 + }, + { + "epoch": 0.30614120926370725, + "grad_norm": 0.158513605594635, + "learning_rate": 0.0007950523973999037, + "loss": 2.7219, + "step": 10324 + }, + { + "epoch": 0.3061708626160187, + "grad_norm": 0.13514776527881622, + "learning_rate": 0.0007950144105986852, + "loss": 2.7181, + "step": 10325 + }, + { + "epoch": 0.3062005159683302, + "grad_norm": 0.12069971859455109, + "learning_rate": 0.0007949764211850637, + "loss": 2.7646, + "step": 10326 + }, + { + "epoch": 0.3062301693206417, + "grad_norm": 0.12380833923816681, + "learning_rate": 0.0007949384291593753, + "loss": 2.7652, + "step": 10327 + }, + { + "epoch": 0.30625982267295315, + "grad_norm": 0.12612566351890564, + "learning_rate": 0.0007949004345219565, + "loss": 2.7455, + "step": 10328 + }, + { + "epoch": 0.30628947602526463, + "grad_norm": 0.1288183331489563, + "learning_rate": 0.0007948624372731437, + "loss": 2.7214, + "step": 10329 + }, + { + "epoch": 0.30631912937757616, + "grad_norm": 0.13037751615047455, + "learning_rate": 0.0007948244374132733, + "loss": 2.7306, + "step": 10330 + }, + { + "epoch": 0.30634878272988764, + "grad_norm": 0.11758371442556381, + "learning_rate": 0.0007947864349426821, + "loss": 2.7252, + "step": 10331 + }, + { + "epoch": 0.3063784360821991, + "grad_norm": 0.13564305007457733, + "learning_rate": 0.0007947484298617063, + "loss": 2.7192, + "step": 10332 + }, + { + "epoch": 0.3064080894345106, + "grad_norm": 0.15534032881259918, + "learning_rate": 0.0007947104221706826, + "loss": 2.7165, + "step": 10333 + }, + { + "epoch": 0.30643774278682206, + "grad_norm": 0.16368256509304047, + "learning_rate": 0.0007946724118699475, + "loss": 2.7112, + "step": 10334 + }, + { + "epoch": 0.30646739613913354, + "grad_norm": 0.19559769332408905, + "learning_rate": 0.0007946343989598377, + "loss": 2.7371, + "step": 10335 + }, + { + "epoch": 0.306497049491445, + "grad_norm": 0.20206676423549652, + "learning_rate": 0.0007945963834406895, + "loss": 2.7361, + "step": 10336 + }, + { + "epoch": 0.3065267028437565, + "grad_norm": 0.1742257922887802, + "learning_rate": 0.0007945583653128401, + "loss": 2.7344, + "step": 10337 + }, + { + "epoch": 0.30655635619606797, + "grad_norm": 0.1429455429315567, + "learning_rate": 0.0007945203445766254, + "loss": 2.7123, + "step": 10338 + }, + { + "epoch": 0.30658600954837945, + "grad_norm": 0.14628392457962036, + "learning_rate": 0.0007944823212323828, + "loss": 2.7516, + "step": 10339 + }, + { + "epoch": 0.3066156629006909, + "grad_norm": 0.13850678503513336, + "learning_rate": 0.0007944442952804487, + "loss": 2.7184, + "step": 10340 + }, + { + "epoch": 0.3066453162530024, + "grad_norm": 0.1273927241563797, + "learning_rate": 0.0007944062667211598, + "loss": 2.7512, + "step": 10341 + }, + { + "epoch": 0.3066749696053139, + "grad_norm": 0.14889219403266907, + "learning_rate": 0.0007943682355548527, + "loss": 2.7324, + "step": 10342 + }, + { + "epoch": 0.30670462295762535, + "grad_norm": 0.14305448532104492, + "learning_rate": 0.0007943302017818645, + "loss": 2.7325, + "step": 10343 + }, + { + "epoch": 0.3067342763099368, + "grad_norm": 0.1530001163482666, + "learning_rate": 0.0007942921654025318, + "loss": 2.7068, + "step": 10344 + }, + { + "epoch": 0.3067639296622483, + "grad_norm": 0.14654086530208588, + "learning_rate": 0.0007942541264171914, + "loss": 2.7176, + "step": 10345 + }, + { + "epoch": 0.3067935830145598, + "grad_norm": 0.12790361046791077, + "learning_rate": 0.0007942160848261803, + "loss": 2.7187, + "step": 10346 + }, + { + "epoch": 0.30682323636687125, + "grad_norm": 0.16611914336681366, + "learning_rate": 0.0007941780406298353, + "loss": 2.737, + "step": 10347 + }, + { + "epoch": 0.30685288971918273, + "grad_norm": 0.15077176690101624, + "learning_rate": 0.0007941399938284933, + "loss": 2.737, + "step": 10348 + }, + { + "epoch": 0.3068825430714942, + "grad_norm": 0.1334855854511261, + "learning_rate": 0.0007941019444224909, + "loss": 2.7368, + "step": 10349 + }, + { + "epoch": 0.30691219642380574, + "grad_norm": 0.13456663489341736, + "learning_rate": 0.0007940638924121654, + "loss": 2.6923, + "step": 10350 + }, + { + "epoch": 0.3069418497761172, + "grad_norm": 0.12613648176193237, + "learning_rate": 0.0007940258377978537, + "loss": 2.7676, + "step": 10351 + }, + { + "epoch": 0.3069715031284287, + "grad_norm": 0.15785622596740723, + "learning_rate": 0.0007939877805798928, + "loss": 2.7154, + "step": 10352 + }, + { + "epoch": 0.30700115648074017, + "grad_norm": 0.16461870074272156, + "learning_rate": 0.0007939497207586197, + "loss": 2.7092, + "step": 10353 + }, + { + "epoch": 0.30703080983305164, + "grad_norm": 0.17133450508117676, + "learning_rate": 0.0007939116583343712, + "loss": 2.7121, + "step": 10354 + }, + { + "epoch": 0.3070604631853631, + "grad_norm": 0.15218235552310944, + "learning_rate": 0.0007938735933074846, + "loss": 2.7438, + "step": 10355 + }, + { + "epoch": 0.3070901165376746, + "grad_norm": 0.12595361471176147, + "learning_rate": 0.0007938355256782969, + "loss": 2.718, + "step": 10356 + }, + { + "epoch": 0.30711976988998607, + "grad_norm": 0.1445152908563614, + "learning_rate": 0.000793797455447145, + "loss": 2.7036, + "step": 10357 + }, + { + "epoch": 0.30714942324229755, + "grad_norm": 0.13663740456104279, + "learning_rate": 0.0007937593826143664, + "loss": 2.7456, + "step": 10358 + }, + { + "epoch": 0.307179076594609, + "grad_norm": 0.12607930600643158, + "learning_rate": 0.000793721307180298, + "loss": 2.6997, + "step": 10359 + }, + { + "epoch": 0.3072087299469205, + "grad_norm": 0.13162097334861755, + "learning_rate": 0.000793683229145277, + "loss": 2.7287, + "step": 10360 + }, + { + "epoch": 0.307238383299232, + "grad_norm": 0.13851819932460785, + "learning_rate": 0.0007936451485096406, + "loss": 2.7279, + "step": 10361 + }, + { + "epoch": 0.30726803665154345, + "grad_norm": 0.12976668775081635, + "learning_rate": 0.0007936070652737261, + "loss": 2.7681, + "step": 10362 + }, + { + "epoch": 0.30729769000385493, + "grad_norm": 0.125049889087677, + "learning_rate": 0.0007935689794378705, + "loss": 2.769, + "step": 10363 + }, + { + "epoch": 0.3073273433561664, + "grad_norm": 0.14569169282913208, + "learning_rate": 0.0007935308910024113, + "loss": 2.718, + "step": 10364 + }, + { + "epoch": 0.3073569967084779, + "grad_norm": 0.1367487609386444, + "learning_rate": 0.0007934927999676855, + "loss": 2.7189, + "step": 10365 + }, + { + "epoch": 0.30738665006078936, + "grad_norm": 0.13803018629550934, + "learning_rate": 0.0007934547063340307, + "loss": 2.7103, + "step": 10366 + }, + { + "epoch": 0.30741630341310083, + "grad_norm": 0.1302960067987442, + "learning_rate": 0.0007934166101017841, + "loss": 2.7487, + "step": 10367 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 0.12489722669124603, + "learning_rate": 0.0007933785112712831, + "loss": 2.7184, + "step": 10368 + }, + { + "epoch": 0.3074756101177238, + "grad_norm": 0.1438831090927124, + "learning_rate": 0.0007933404098428651, + "loss": 2.7504, + "step": 10369 + }, + { + "epoch": 0.30750526347003526, + "grad_norm": 0.1740063577890396, + "learning_rate": 0.0007933023058168671, + "loss": 2.7632, + "step": 10370 + }, + { + "epoch": 0.3075349168223468, + "grad_norm": 0.15872904658317566, + "learning_rate": 0.0007932641991936271, + "loss": 2.6816, + "step": 10371 + }, + { + "epoch": 0.30756457017465827, + "grad_norm": 0.17002412676811218, + "learning_rate": 0.0007932260899734822, + "loss": 2.7572, + "step": 10372 + }, + { + "epoch": 0.30759422352696975, + "grad_norm": 0.17759175598621368, + "learning_rate": 0.0007931879781567699, + "loss": 2.7557, + "step": 10373 + }, + { + "epoch": 0.3076238768792812, + "grad_norm": 0.18468649685382843, + "learning_rate": 0.0007931498637438279, + "loss": 2.7362, + "step": 10374 + }, + { + "epoch": 0.3076535302315927, + "grad_norm": 0.17023912072181702, + "learning_rate": 0.0007931117467349934, + "loss": 2.744, + "step": 10375 + }, + { + "epoch": 0.3076831835839042, + "grad_norm": 0.1474561244249344, + "learning_rate": 0.000793073627130604, + "loss": 2.7343, + "step": 10376 + }, + { + "epoch": 0.30771283693621565, + "grad_norm": 0.13169518113136292, + "learning_rate": 0.0007930355049309975, + "loss": 2.7351, + "step": 10377 + }, + { + "epoch": 0.3077424902885271, + "grad_norm": 0.1512606143951416, + "learning_rate": 0.0007929973801365113, + "loss": 2.7247, + "step": 10378 + }, + { + "epoch": 0.3077721436408386, + "grad_norm": 0.15702736377716064, + "learning_rate": 0.000792959252747483, + "loss": 2.7296, + "step": 10379 + }, + { + "epoch": 0.3078017969931501, + "grad_norm": 0.1460929661989212, + "learning_rate": 0.0007929211227642501, + "loss": 2.707, + "step": 10380 + }, + { + "epoch": 0.30783145034546155, + "grad_norm": 0.1493493914604187, + "learning_rate": 0.0007928829901871503, + "loss": 2.7552, + "step": 10381 + }, + { + "epoch": 0.30786110369777303, + "grad_norm": 0.1451665163040161, + "learning_rate": 0.0007928448550165216, + "loss": 2.7352, + "step": 10382 + }, + { + "epoch": 0.3078907570500845, + "grad_norm": 0.136112242937088, + "learning_rate": 0.0007928067172527013, + "loss": 2.6824, + "step": 10383 + }, + { + "epoch": 0.307920410402396, + "grad_norm": 0.13896150887012482, + "learning_rate": 0.0007927685768960274, + "loss": 2.723, + "step": 10384 + }, + { + "epoch": 0.30795006375470746, + "grad_norm": 0.1355813592672348, + "learning_rate": 0.0007927304339468373, + "loss": 2.7315, + "step": 10385 + }, + { + "epoch": 0.30797971710701894, + "grad_norm": 0.14220944046974182, + "learning_rate": 0.000792692288405469, + "loss": 2.7659, + "step": 10386 + }, + { + "epoch": 0.3080093704593304, + "grad_norm": 0.1477435827255249, + "learning_rate": 0.0007926541402722603, + "loss": 2.7347, + "step": 10387 + }, + { + "epoch": 0.3080390238116419, + "grad_norm": 0.12382423132658005, + "learning_rate": 0.0007926159895475491, + "loss": 2.7228, + "step": 10388 + }, + { + "epoch": 0.30806867716395336, + "grad_norm": 0.13957001268863678, + "learning_rate": 0.0007925778362316728, + "loss": 2.7282, + "step": 10389 + }, + { + "epoch": 0.30809833051626484, + "grad_norm": 0.1386699378490448, + "learning_rate": 0.0007925396803249697, + "loss": 2.7372, + "step": 10390 + }, + { + "epoch": 0.3081279838685763, + "grad_norm": 0.12526819109916687, + "learning_rate": 0.0007925015218277774, + "loss": 2.7395, + "step": 10391 + }, + { + "epoch": 0.30815763722088785, + "grad_norm": 0.11728226393461227, + "learning_rate": 0.0007924633607404341, + "loss": 2.7158, + "step": 10392 + }, + { + "epoch": 0.3081872905731993, + "grad_norm": 0.11397843807935715, + "learning_rate": 0.0007924251970632774, + "loss": 2.7031, + "step": 10393 + }, + { + "epoch": 0.3082169439255108, + "grad_norm": 0.13399291038513184, + "learning_rate": 0.0007923870307966456, + "loss": 2.6873, + "step": 10394 + }, + { + "epoch": 0.3082465972778223, + "grad_norm": 0.13761217892169952, + "learning_rate": 0.0007923488619408762, + "loss": 2.7344, + "step": 10395 + }, + { + "epoch": 0.30827625063013375, + "grad_norm": 0.13156603276729584, + "learning_rate": 0.0007923106904963075, + "loss": 2.7095, + "step": 10396 + }, + { + "epoch": 0.30830590398244523, + "grad_norm": 0.1584807187318802, + "learning_rate": 0.0007922725164632775, + "loss": 2.7153, + "step": 10397 + }, + { + "epoch": 0.3083355573347567, + "grad_norm": 0.17465929687023163, + "learning_rate": 0.0007922343398421241, + "loss": 2.751, + "step": 10398 + }, + { + "epoch": 0.3083652106870682, + "grad_norm": 0.15969620645046234, + "learning_rate": 0.0007921961606331858, + "loss": 2.7281, + "step": 10399 + }, + { + "epoch": 0.30839486403937966, + "grad_norm": 0.1367846131324768, + "learning_rate": 0.0007921579788368001, + "loss": 2.732, + "step": 10400 + }, + { + "epoch": 0.30842451739169113, + "grad_norm": 0.19716210663318634, + "learning_rate": 0.0007921197944533052, + "loss": 2.7237, + "step": 10401 + }, + { + "epoch": 0.3084541707440026, + "grad_norm": 0.13739006221294403, + "learning_rate": 0.0007920816074830395, + "loss": 2.7033, + "step": 10402 + }, + { + "epoch": 0.3084838240963141, + "grad_norm": 0.1401563584804535, + "learning_rate": 0.0007920434179263412, + "loss": 2.7312, + "step": 10403 + }, + { + "epoch": 0.30851347744862556, + "grad_norm": 0.13451911509037018, + "learning_rate": 0.0007920052257835481, + "loss": 2.7131, + "step": 10404 + }, + { + "epoch": 0.30854313080093704, + "grad_norm": 0.12117689102888107, + "learning_rate": 0.0007919670310549987, + "loss": 2.747, + "step": 10405 + }, + { + "epoch": 0.3085727841532485, + "grad_norm": 0.13720743358135223, + "learning_rate": 0.0007919288337410311, + "loss": 2.7106, + "step": 10406 + }, + { + "epoch": 0.30860243750556, + "grad_norm": 0.14513759315013885, + "learning_rate": 0.0007918906338419835, + "loss": 2.7454, + "step": 10407 + }, + { + "epoch": 0.30863209085787147, + "grad_norm": 0.14685706794261932, + "learning_rate": 0.0007918524313581943, + "loss": 2.7219, + "step": 10408 + }, + { + "epoch": 0.30866174421018294, + "grad_norm": 0.12818549573421478, + "learning_rate": 0.0007918142262900017, + "loss": 2.7356, + "step": 10409 + }, + { + "epoch": 0.3086913975624944, + "grad_norm": 0.12757496535778046, + "learning_rate": 0.000791776018637744, + "loss": 2.7108, + "step": 10410 + }, + { + "epoch": 0.3087210509148059, + "grad_norm": 0.13891173899173737, + "learning_rate": 0.0007917378084017596, + "loss": 2.7192, + "step": 10411 + }, + { + "epoch": 0.30875070426711737, + "grad_norm": 0.12683625519275665, + "learning_rate": 0.0007916995955823869, + "loss": 2.7504, + "step": 10412 + }, + { + "epoch": 0.3087803576194289, + "grad_norm": 0.1077028438448906, + "learning_rate": 0.0007916613801799643, + "loss": 2.7435, + "step": 10413 + }, + { + "epoch": 0.3088100109717404, + "grad_norm": 0.11477367579936981, + "learning_rate": 0.00079162316219483, + "loss": 2.7347, + "step": 10414 + }, + { + "epoch": 0.30883966432405185, + "grad_norm": 0.12586960196495056, + "learning_rate": 0.0007915849416273225, + "loss": 2.6931, + "step": 10415 + }, + { + "epoch": 0.30886931767636333, + "grad_norm": 0.11595217138528824, + "learning_rate": 0.0007915467184777803, + "loss": 2.7476, + "step": 10416 + }, + { + "epoch": 0.3088989710286748, + "grad_norm": 0.11475757509469986, + "learning_rate": 0.0007915084927465419, + "loss": 2.7174, + "step": 10417 + }, + { + "epoch": 0.3089286243809863, + "grad_norm": 0.11892755329608917, + "learning_rate": 0.0007914702644339457, + "loss": 2.7681, + "step": 10418 + }, + { + "epoch": 0.30895827773329776, + "grad_norm": 0.1500832587480545, + "learning_rate": 0.0007914320335403304, + "loss": 2.744, + "step": 10419 + }, + { + "epoch": 0.30898793108560924, + "grad_norm": 0.16387736797332764, + "learning_rate": 0.0007913938000660343, + "loss": 2.705, + "step": 10420 + }, + { + "epoch": 0.3090175844379207, + "grad_norm": 0.16290166974067688, + "learning_rate": 0.0007913555640113961, + "loss": 2.7172, + "step": 10421 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 0.1757676601409912, + "learning_rate": 0.0007913173253767543, + "loss": 2.7555, + "step": 10422 + }, + { + "epoch": 0.30907689114254366, + "grad_norm": 0.1705377846956253, + "learning_rate": 0.0007912790841624477, + "loss": 2.7323, + "step": 10423 + }, + { + "epoch": 0.30910654449485514, + "grad_norm": 0.148240864276886, + "learning_rate": 0.0007912408403688149, + "loss": 2.7329, + "step": 10424 + }, + { + "epoch": 0.3091361978471666, + "grad_norm": 0.13802815973758698, + "learning_rate": 0.0007912025939961943, + "loss": 2.7274, + "step": 10425 + }, + { + "epoch": 0.3091658511994781, + "grad_norm": 0.1395544558763504, + "learning_rate": 0.0007911643450449248, + "loss": 2.7281, + "step": 10426 + }, + { + "epoch": 0.30919550455178957, + "grad_norm": 0.12134741991758347, + "learning_rate": 0.0007911260935153451, + "loss": 2.7227, + "step": 10427 + }, + { + "epoch": 0.30922515790410104, + "grad_norm": 0.1266937106847763, + "learning_rate": 0.0007910878394077938, + "loss": 2.7422, + "step": 10428 + }, + { + "epoch": 0.3092548112564125, + "grad_norm": 0.15327660739421844, + "learning_rate": 0.0007910495827226097, + "loss": 2.7193, + "step": 10429 + }, + { + "epoch": 0.309284464608724, + "grad_norm": 0.1554514616727829, + "learning_rate": 0.0007910113234601317, + "loss": 2.7318, + "step": 10430 + }, + { + "epoch": 0.3093141179610355, + "grad_norm": 0.17962156236171722, + "learning_rate": 0.0007909730616206983, + "loss": 2.7073, + "step": 10431 + }, + { + "epoch": 0.30934377131334695, + "grad_norm": 0.16900387406349182, + "learning_rate": 0.0007909347972046486, + "loss": 2.7314, + "step": 10432 + }, + { + "epoch": 0.3093734246656584, + "grad_norm": 0.15297158062458038, + "learning_rate": 0.0007908965302123214, + "loss": 2.7534, + "step": 10433 + }, + { + "epoch": 0.30940307801796996, + "grad_norm": 0.12935461103916168, + "learning_rate": 0.0007908582606440555, + "loss": 2.7475, + "step": 10434 + }, + { + "epoch": 0.30943273137028143, + "grad_norm": 0.12399613857269287, + "learning_rate": 0.0007908199885001897, + "loss": 2.7511, + "step": 10435 + }, + { + "epoch": 0.3094623847225929, + "grad_norm": 0.14224210381507874, + "learning_rate": 0.0007907817137810629, + "loss": 2.7199, + "step": 10436 + }, + { + "epoch": 0.3094920380749044, + "grad_norm": 0.1444532424211502, + "learning_rate": 0.0007907434364870142, + "loss": 2.6959, + "step": 10437 + }, + { + "epoch": 0.30952169142721586, + "grad_norm": 0.1473320871591568, + "learning_rate": 0.0007907051566183825, + "loss": 2.7207, + "step": 10438 + }, + { + "epoch": 0.30955134477952734, + "grad_norm": 0.14996054768562317, + "learning_rate": 0.0007906668741755066, + "loss": 2.7377, + "step": 10439 + }, + { + "epoch": 0.3095809981318388, + "grad_norm": 0.13532234728336334, + "learning_rate": 0.0007906285891587259, + "loss": 2.7259, + "step": 10440 + }, + { + "epoch": 0.3096106514841503, + "grad_norm": 0.1320274919271469, + "learning_rate": 0.0007905903015683789, + "loss": 2.7263, + "step": 10441 + }, + { + "epoch": 0.30964030483646177, + "grad_norm": 0.12044332176446915, + "learning_rate": 0.0007905520114048051, + "loss": 2.7072, + "step": 10442 + }, + { + "epoch": 0.30966995818877324, + "grad_norm": 0.12380673736333847, + "learning_rate": 0.0007905137186683431, + "loss": 2.7409, + "step": 10443 + }, + { + "epoch": 0.3096996115410847, + "grad_norm": 0.12496955692768097, + "learning_rate": 0.0007904754233593325, + "loss": 2.6891, + "step": 10444 + }, + { + "epoch": 0.3097292648933962, + "grad_norm": 0.11808191239833832, + "learning_rate": 0.0007904371254781121, + "loss": 2.7197, + "step": 10445 + }, + { + "epoch": 0.30975891824570767, + "grad_norm": 0.12140478193759918, + "learning_rate": 0.0007903988250250212, + "loss": 2.7199, + "step": 10446 + }, + { + "epoch": 0.30978857159801915, + "grad_norm": 0.13036467134952545, + "learning_rate": 0.0007903605220003986, + "loss": 2.7015, + "step": 10447 + }, + { + "epoch": 0.3098182249503306, + "grad_norm": 0.13930508494377136, + "learning_rate": 0.0007903222164045838, + "loss": 2.6933, + "step": 10448 + }, + { + "epoch": 0.3098478783026421, + "grad_norm": 0.12400554120540619, + "learning_rate": 0.0007902839082379161, + "loss": 2.7218, + "step": 10449 + }, + { + "epoch": 0.3098775316549536, + "grad_norm": 0.11539696156978607, + "learning_rate": 0.0007902455975007344, + "loss": 2.7364, + "step": 10450 + }, + { + "epoch": 0.30990718500726505, + "grad_norm": 0.15287423133850098, + "learning_rate": 0.0007902072841933783, + "loss": 2.7342, + "step": 10451 + }, + { + "epoch": 0.3099368383595765, + "grad_norm": 0.14097897708415985, + "learning_rate": 0.0007901689683161868, + "loss": 2.6972, + "step": 10452 + }, + { + "epoch": 0.309966491711888, + "grad_norm": 0.13852272927761078, + "learning_rate": 0.0007901306498694993, + "loss": 2.7293, + "step": 10453 + }, + { + "epoch": 0.30999614506419954, + "grad_norm": 0.15864197909832, + "learning_rate": 0.000790092328853655, + "loss": 2.7257, + "step": 10454 + }, + { + "epoch": 0.310025798416511, + "grad_norm": 0.19268929958343506, + "learning_rate": 0.0007900540052689932, + "loss": 2.7075, + "step": 10455 + }, + { + "epoch": 0.3100554517688225, + "grad_norm": 0.20369000732898712, + "learning_rate": 0.0007900156791158538, + "loss": 2.7359, + "step": 10456 + }, + { + "epoch": 0.31008510512113396, + "grad_norm": 0.18033050000667572, + "learning_rate": 0.0007899773503945755, + "loss": 2.744, + "step": 10457 + }, + { + "epoch": 0.31011475847344544, + "grad_norm": 0.15171557664871216, + "learning_rate": 0.000789939019105498, + "loss": 2.6982, + "step": 10458 + }, + { + "epoch": 0.3101444118257569, + "grad_norm": 0.1542479395866394, + "learning_rate": 0.0007899006852489609, + "loss": 2.7482, + "step": 10459 + }, + { + "epoch": 0.3101740651780684, + "grad_norm": 0.1793094426393509, + "learning_rate": 0.0007898623488253033, + "loss": 2.7317, + "step": 10460 + }, + { + "epoch": 0.31020371853037987, + "grad_norm": 0.16099436581134796, + "learning_rate": 0.0007898240098348649, + "loss": 2.7006, + "step": 10461 + }, + { + "epoch": 0.31023337188269134, + "grad_norm": 0.13817043602466583, + "learning_rate": 0.0007897856682779851, + "loss": 2.6759, + "step": 10462 + }, + { + "epoch": 0.3102630252350028, + "grad_norm": 0.13766273856163025, + "learning_rate": 0.0007897473241550036, + "loss": 2.7244, + "step": 10463 + }, + { + "epoch": 0.3102926785873143, + "grad_norm": 0.1359068751335144, + "learning_rate": 0.0007897089774662597, + "loss": 2.6937, + "step": 10464 + }, + { + "epoch": 0.3103223319396258, + "grad_norm": 0.15187707543373108, + "learning_rate": 0.0007896706282120932, + "loss": 2.745, + "step": 10465 + }, + { + "epoch": 0.31035198529193725, + "grad_norm": 0.13428111374378204, + "learning_rate": 0.0007896322763928434, + "loss": 2.729, + "step": 10466 + }, + { + "epoch": 0.3103816386442487, + "grad_norm": 0.1357426792383194, + "learning_rate": 0.00078959392200885, + "loss": 2.709, + "step": 10467 + }, + { + "epoch": 0.3104112919965602, + "grad_norm": 0.15182356536388397, + "learning_rate": 0.0007895555650604529, + "loss": 2.7162, + "step": 10468 + }, + { + "epoch": 0.3104409453488717, + "grad_norm": 0.16552360355854034, + "learning_rate": 0.0007895172055479916, + "loss": 2.767, + "step": 10469 + }, + { + "epoch": 0.31047059870118315, + "grad_norm": 0.14634472131729126, + "learning_rate": 0.0007894788434718057, + "loss": 2.6829, + "step": 10470 + }, + { + "epoch": 0.31050025205349463, + "grad_norm": 0.14415855705738068, + "learning_rate": 0.0007894404788322349, + "loss": 2.7244, + "step": 10471 + }, + { + "epoch": 0.3105299054058061, + "grad_norm": 0.15700498223304749, + "learning_rate": 0.000789402111629619, + "loss": 2.7527, + "step": 10472 + }, + { + "epoch": 0.3105595587581176, + "grad_norm": 0.14673985540866852, + "learning_rate": 0.0007893637418642976, + "loss": 2.6838, + "step": 10473 + }, + { + "epoch": 0.31058921211042906, + "grad_norm": 0.15508754551410675, + "learning_rate": 0.0007893253695366107, + "loss": 2.7045, + "step": 10474 + }, + { + "epoch": 0.3106188654627406, + "grad_norm": 0.15828633308410645, + "learning_rate": 0.0007892869946468981, + "loss": 2.6791, + "step": 10475 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 0.17071981728076935, + "learning_rate": 0.0007892486171954995, + "loss": 2.7594, + "step": 10476 + }, + { + "epoch": 0.31067817216736354, + "grad_norm": 0.15302494168281555, + "learning_rate": 0.0007892102371827545, + "loss": 2.7294, + "step": 10477 + }, + { + "epoch": 0.310707825519675, + "grad_norm": 0.13392049074172974, + "learning_rate": 0.0007891718546090035, + "loss": 2.7501, + "step": 10478 + }, + { + "epoch": 0.3107374788719865, + "grad_norm": 0.1360136717557907, + "learning_rate": 0.000789133469474586, + "loss": 2.7202, + "step": 10479 + }, + { + "epoch": 0.31076713222429797, + "grad_norm": 0.14818817377090454, + "learning_rate": 0.0007890950817798419, + "loss": 2.715, + "step": 10480 + }, + { + "epoch": 0.31079678557660945, + "grad_norm": 0.13188746571540833, + "learning_rate": 0.0007890566915251113, + "loss": 2.7678, + "step": 10481 + }, + { + "epoch": 0.3108264389289209, + "grad_norm": 0.11943085491657257, + "learning_rate": 0.000789018298710734, + "loss": 2.7178, + "step": 10482 + }, + { + "epoch": 0.3108560922812324, + "grad_norm": 0.13678137958049774, + "learning_rate": 0.0007889799033370502, + "loss": 2.7329, + "step": 10483 + }, + { + "epoch": 0.3108857456335439, + "grad_norm": 0.1365475207567215, + "learning_rate": 0.0007889415054043997, + "loss": 2.7287, + "step": 10484 + }, + { + "epoch": 0.31091539898585535, + "grad_norm": 0.1299731433391571, + "learning_rate": 0.0007889031049131225, + "loss": 2.7274, + "step": 10485 + }, + { + "epoch": 0.3109450523381668, + "grad_norm": 0.1343737244606018, + "learning_rate": 0.0007888647018635588, + "loss": 2.7197, + "step": 10486 + }, + { + "epoch": 0.3109747056904783, + "grad_norm": 0.14693830907344818, + "learning_rate": 0.0007888262962560486, + "loss": 2.7493, + "step": 10487 + }, + { + "epoch": 0.3110043590427898, + "grad_norm": 0.1410372108221054, + "learning_rate": 0.0007887878880909318, + "loss": 2.7001, + "step": 10488 + }, + { + "epoch": 0.31103401239510126, + "grad_norm": 0.13074159622192383, + "learning_rate": 0.0007887494773685488, + "loss": 2.7379, + "step": 10489 + }, + { + "epoch": 0.31106366574741273, + "grad_norm": 0.14324796199798584, + "learning_rate": 0.0007887110640892398, + "loss": 2.7039, + "step": 10490 + }, + { + "epoch": 0.3110933190997242, + "grad_norm": 0.13978508114814758, + "learning_rate": 0.0007886726482533445, + "loss": 2.718, + "step": 10491 + }, + { + "epoch": 0.3111229724520357, + "grad_norm": 0.15153832733631134, + "learning_rate": 0.0007886342298612035, + "loss": 2.7147, + "step": 10492 + }, + { + "epoch": 0.31115262580434716, + "grad_norm": 0.18176575005054474, + "learning_rate": 0.0007885958089131566, + "loss": 2.7661, + "step": 10493 + }, + { + "epoch": 0.31118227915665864, + "grad_norm": 0.1706114411354065, + "learning_rate": 0.0007885573854095447, + "loss": 2.7356, + "step": 10494 + }, + { + "epoch": 0.3112119325089701, + "grad_norm": 0.13615739345550537, + "learning_rate": 0.0007885189593507074, + "loss": 2.7368, + "step": 10495 + }, + { + "epoch": 0.31124158586128164, + "grad_norm": 0.1494811475276947, + "learning_rate": 0.0007884805307369851, + "loss": 2.7162, + "step": 10496 + }, + { + "epoch": 0.3112712392135931, + "grad_norm": 0.16103430092334747, + "learning_rate": 0.0007884420995687183, + "loss": 2.7204, + "step": 10497 + }, + { + "epoch": 0.3113008925659046, + "grad_norm": 0.14000627398490906, + "learning_rate": 0.0007884036658462472, + "loss": 2.7118, + "step": 10498 + }, + { + "epoch": 0.3113305459182161, + "grad_norm": 0.1476256549358368, + "learning_rate": 0.000788365229569912, + "loss": 2.7216, + "step": 10499 + }, + { + "epoch": 0.31136019927052755, + "grad_norm": 0.14454321563243866, + "learning_rate": 0.0007883267907400533, + "loss": 2.7169, + "step": 10500 + }, + { + "epoch": 0.311389852622839, + "grad_norm": 0.12355127930641174, + "learning_rate": 0.0007882883493570115, + "loss": 2.7286, + "step": 10501 + }, + { + "epoch": 0.3114195059751505, + "grad_norm": 0.11664676666259766, + "learning_rate": 0.0007882499054211267, + "loss": 2.7368, + "step": 10502 + }, + { + "epoch": 0.311449159327462, + "grad_norm": 0.14580480754375458, + "learning_rate": 0.0007882114589327396, + "loss": 2.7227, + "step": 10503 + }, + { + "epoch": 0.31147881267977345, + "grad_norm": 0.14848437905311584, + "learning_rate": 0.0007881730098921905, + "loss": 2.7312, + "step": 10504 + }, + { + "epoch": 0.31150846603208493, + "grad_norm": 0.13422681391239166, + "learning_rate": 0.0007881345582998198, + "loss": 2.7092, + "step": 10505 + }, + { + "epoch": 0.3115381193843964, + "grad_norm": 0.15414851903915405, + "learning_rate": 0.0007880961041559683, + "loss": 2.729, + "step": 10506 + }, + { + "epoch": 0.3115677727367079, + "grad_norm": 0.13874658942222595, + "learning_rate": 0.0007880576474609764, + "loss": 2.7245, + "step": 10507 + }, + { + "epoch": 0.31159742608901936, + "grad_norm": 0.13584579527378082, + "learning_rate": 0.0007880191882151846, + "loss": 2.6799, + "step": 10508 + }, + { + "epoch": 0.31162707944133083, + "grad_norm": 0.1443198323249817, + "learning_rate": 0.0007879807264189333, + "loss": 2.737, + "step": 10509 + }, + { + "epoch": 0.3116567327936423, + "grad_norm": 0.15445105731487274, + "learning_rate": 0.0007879422620725633, + "loss": 2.7616, + "step": 10510 + }, + { + "epoch": 0.3116863861459538, + "grad_norm": 0.13918085396289825, + "learning_rate": 0.0007879037951764152, + "loss": 2.6932, + "step": 10511 + }, + { + "epoch": 0.31171603949826526, + "grad_norm": 0.1373717486858368, + "learning_rate": 0.0007878653257308295, + "loss": 2.7398, + "step": 10512 + }, + { + "epoch": 0.31174569285057674, + "grad_norm": 0.15094636380672455, + "learning_rate": 0.0007878268537361469, + "loss": 2.6833, + "step": 10513 + }, + { + "epoch": 0.3117753462028882, + "grad_norm": 0.16429443657398224, + "learning_rate": 0.0007877883791927081, + "loss": 2.7344, + "step": 10514 + }, + { + "epoch": 0.3118049995551997, + "grad_norm": 0.16260451078414917, + "learning_rate": 0.000787749902100854, + "loss": 2.7339, + "step": 10515 + }, + { + "epoch": 0.31183465290751117, + "grad_norm": 0.127525195479393, + "learning_rate": 0.0007877114224609249, + "loss": 2.7442, + "step": 10516 + }, + { + "epoch": 0.3118643062598227, + "grad_norm": 0.12202835083007812, + "learning_rate": 0.0007876729402732618, + "loss": 2.6809, + "step": 10517 + }, + { + "epoch": 0.3118939596121342, + "grad_norm": 0.12916061282157898, + "learning_rate": 0.0007876344555382055, + "loss": 2.7265, + "step": 10518 + }, + { + "epoch": 0.31192361296444565, + "grad_norm": 0.1328004002571106, + "learning_rate": 0.0007875959682560968, + "loss": 2.7189, + "step": 10519 + }, + { + "epoch": 0.3119532663167571, + "grad_norm": 0.1435793787240982, + "learning_rate": 0.0007875574784272763, + "loss": 2.7097, + "step": 10520 + }, + { + "epoch": 0.3119829196690686, + "grad_norm": 0.14190427958965302, + "learning_rate": 0.000787518986052085, + "loss": 2.7355, + "step": 10521 + }, + { + "epoch": 0.3120125730213801, + "grad_norm": 0.12550939619541168, + "learning_rate": 0.0007874804911308639, + "loss": 2.7341, + "step": 10522 + }, + { + "epoch": 0.31204222637369156, + "grad_norm": 0.11236374825239182, + "learning_rate": 0.0007874419936639536, + "loss": 2.689, + "step": 10523 + }, + { + "epoch": 0.31207187972600303, + "grad_norm": 0.11471858620643616, + "learning_rate": 0.0007874034936516949, + "loss": 2.7195, + "step": 10524 + }, + { + "epoch": 0.3121015330783145, + "grad_norm": 0.1265261024236679, + "learning_rate": 0.0007873649910944292, + "loss": 2.7375, + "step": 10525 + }, + { + "epoch": 0.312131186430626, + "grad_norm": 0.1347411423921585, + "learning_rate": 0.000787326485992497, + "loss": 2.7012, + "step": 10526 + }, + { + "epoch": 0.31216083978293746, + "grad_norm": 0.12416385114192963, + "learning_rate": 0.0007872879783462395, + "loss": 2.7501, + "step": 10527 + }, + { + "epoch": 0.31219049313524894, + "grad_norm": 0.13531801104545593, + "learning_rate": 0.0007872494681559978, + "loss": 2.6922, + "step": 10528 + }, + { + "epoch": 0.3122201464875604, + "grad_norm": 0.14976616203784943, + "learning_rate": 0.0007872109554221125, + "loss": 2.7084, + "step": 10529 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 0.16652071475982666, + "learning_rate": 0.0007871724401449251, + "loss": 2.715, + "step": 10530 + }, + { + "epoch": 0.31227945319218336, + "grad_norm": 0.18615536391735077, + "learning_rate": 0.0007871339223247762, + "loss": 2.7238, + "step": 10531 + }, + { + "epoch": 0.31230910654449484, + "grad_norm": 0.23905959725379944, + "learning_rate": 0.0007870954019620073, + "loss": 2.7302, + "step": 10532 + }, + { + "epoch": 0.3123387598968063, + "grad_norm": 0.21833805739879608, + "learning_rate": 0.0007870568790569594, + "loss": 2.7103, + "step": 10533 + }, + { + "epoch": 0.3123684132491178, + "grad_norm": 0.16040341556072235, + "learning_rate": 0.0007870183536099736, + "loss": 2.6997, + "step": 10534 + }, + { + "epoch": 0.31239806660142927, + "grad_norm": 0.159804105758667, + "learning_rate": 0.0007869798256213908, + "loss": 2.7203, + "step": 10535 + }, + { + "epoch": 0.31242771995374075, + "grad_norm": 0.16134580969810486, + "learning_rate": 0.0007869412950915524, + "loss": 2.759, + "step": 10536 + }, + { + "epoch": 0.3124573733060522, + "grad_norm": 0.14593163132667542, + "learning_rate": 0.0007869027620207998, + "loss": 2.698, + "step": 10537 + }, + { + "epoch": 0.31248702665836375, + "grad_norm": 0.1436554342508316, + "learning_rate": 0.0007868642264094737, + "loss": 2.7206, + "step": 10538 + }, + { + "epoch": 0.31251668001067523, + "grad_norm": 0.1512216478586197, + "learning_rate": 0.0007868256882579159, + "loss": 2.7297, + "step": 10539 + }, + { + "epoch": 0.3125463333629867, + "grad_norm": 0.13906672596931458, + "learning_rate": 0.0007867871475664673, + "loss": 2.7061, + "step": 10540 + }, + { + "epoch": 0.3125759867152982, + "grad_norm": 0.12905678153038025, + "learning_rate": 0.0007867486043354692, + "loss": 2.7191, + "step": 10541 + }, + { + "epoch": 0.31260564006760966, + "grad_norm": 0.1402817964553833, + "learning_rate": 0.0007867100585652631, + "loss": 2.725, + "step": 10542 + }, + { + "epoch": 0.31263529341992113, + "grad_norm": 0.1284056007862091, + "learning_rate": 0.0007866715102561902, + "loss": 2.717, + "step": 10543 + }, + { + "epoch": 0.3126649467722326, + "grad_norm": 0.14544349908828735, + "learning_rate": 0.0007866329594085919, + "loss": 2.7095, + "step": 10544 + }, + { + "epoch": 0.3126946001245441, + "grad_norm": 0.1268455982208252, + "learning_rate": 0.0007865944060228094, + "loss": 2.737, + "step": 10545 + }, + { + "epoch": 0.31272425347685556, + "grad_norm": 0.11973591893911362, + "learning_rate": 0.0007865558500991844, + "loss": 2.7216, + "step": 10546 + }, + { + "epoch": 0.31275390682916704, + "grad_norm": 0.12158669531345367, + "learning_rate": 0.000786517291638058, + "loss": 2.7098, + "step": 10547 + }, + { + "epoch": 0.3127835601814785, + "grad_norm": 0.1261318475008011, + "learning_rate": 0.0007864787306397721, + "loss": 2.7171, + "step": 10548 + }, + { + "epoch": 0.31281321353379, + "grad_norm": 0.12671267986297607, + "learning_rate": 0.0007864401671046676, + "loss": 2.7387, + "step": 10549 + }, + { + "epoch": 0.31284286688610147, + "grad_norm": 0.10621454566717148, + "learning_rate": 0.0007864016010330863, + "loss": 2.7634, + "step": 10550 + }, + { + "epoch": 0.31287252023841294, + "grad_norm": 0.11721349507570267, + "learning_rate": 0.0007863630324253697, + "loss": 2.7213, + "step": 10551 + }, + { + "epoch": 0.3129021735907244, + "grad_norm": 0.12708279490470886, + "learning_rate": 0.0007863244612818592, + "loss": 2.6681, + "step": 10552 + }, + { + "epoch": 0.3129318269430359, + "grad_norm": 0.12088817358016968, + "learning_rate": 0.0007862858876028965, + "loss": 2.7233, + "step": 10553 + }, + { + "epoch": 0.31296148029534737, + "grad_norm": 0.1265326589345932, + "learning_rate": 0.0007862473113888233, + "loss": 2.6876, + "step": 10554 + }, + { + "epoch": 0.31299113364765885, + "grad_norm": 0.11447639018297195, + "learning_rate": 0.0007862087326399808, + "loss": 2.7488, + "step": 10555 + }, + { + "epoch": 0.3130207869999703, + "grad_norm": 0.10996686667203903, + "learning_rate": 0.000786170151356711, + "loss": 2.7175, + "step": 10556 + }, + { + "epoch": 0.3130504403522818, + "grad_norm": 0.11407709866762161, + "learning_rate": 0.0007861315675393553, + "loss": 2.735, + "step": 10557 + }, + { + "epoch": 0.31308009370459333, + "grad_norm": 0.1135169267654419, + "learning_rate": 0.0007860929811882554, + "loss": 2.7373, + "step": 10558 + }, + { + "epoch": 0.3131097470569048, + "grad_norm": 0.13790862262248993, + "learning_rate": 0.0007860543923037531, + "loss": 2.752, + "step": 10559 + }, + { + "epoch": 0.3131394004092163, + "grad_norm": 0.14039424061775208, + "learning_rate": 0.0007860158008861901, + "loss": 2.7164, + "step": 10560 + }, + { + "epoch": 0.31316905376152776, + "grad_norm": 0.13163518905639648, + "learning_rate": 0.0007859772069359081, + "loss": 2.6987, + "step": 10561 + }, + { + "epoch": 0.31319870711383924, + "grad_norm": 0.13733303546905518, + "learning_rate": 0.0007859386104532486, + "loss": 2.7541, + "step": 10562 + }, + { + "epoch": 0.3132283604661507, + "grad_norm": 0.1596900373697281, + "learning_rate": 0.000785900011438554, + "loss": 2.7107, + "step": 10563 + }, + { + "epoch": 0.3132580138184622, + "grad_norm": 0.17043349146842957, + "learning_rate": 0.0007858614098921655, + "loss": 2.7157, + "step": 10564 + }, + { + "epoch": 0.31328766717077366, + "grad_norm": 0.15616370737552643, + "learning_rate": 0.0007858228058144252, + "loss": 2.7202, + "step": 10565 + }, + { + "epoch": 0.31331732052308514, + "grad_norm": 0.11681990325450897, + "learning_rate": 0.0007857841992056749, + "loss": 2.7056, + "step": 10566 + }, + { + "epoch": 0.3133469738753966, + "grad_norm": 0.13134104013442993, + "learning_rate": 0.0007857455900662564, + "loss": 2.6998, + "step": 10567 + }, + { + "epoch": 0.3133766272277081, + "grad_norm": 0.12709644436836243, + "learning_rate": 0.0007857069783965119, + "loss": 2.7171, + "step": 10568 + }, + { + "epoch": 0.31340628058001957, + "grad_norm": 0.12148689478635788, + "learning_rate": 0.0007856683641967828, + "loss": 2.7051, + "step": 10569 + }, + { + "epoch": 0.31343593393233105, + "grad_norm": 0.14620059728622437, + "learning_rate": 0.0007856297474674112, + "loss": 2.7222, + "step": 10570 + }, + { + "epoch": 0.3134655872846425, + "grad_norm": 0.1370016634464264, + "learning_rate": 0.0007855911282087394, + "loss": 2.7213, + "step": 10571 + }, + { + "epoch": 0.313495240636954, + "grad_norm": 0.15889523923397064, + "learning_rate": 0.0007855525064211091, + "loss": 2.7183, + "step": 10572 + }, + { + "epoch": 0.3135248939892655, + "grad_norm": 0.14715562760829926, + "learning_rate": 0.0007855138821048623, + "loss": 2.7152, + "step": 10573 + }, + { + "epoch": 0.31355454734157695, + "grad_norm": 0.14050160348415375, + "learning_rate": 0.0007854752552603411, + "loss": 2.7093, + "step": 10574 + }, + { + "epoch": 0.3135842006938884, + "grad_norm": 0.13897192478179932, + "learning_rate": 0.0007854366258878874, + "loss": 2.7184, + "step": 10575 + }, + { + "epoch": 0.3136138540461999, + "grad_norm": 0.15459538996219635, + "learning_rate": 0.0007853979939878435, + "loss": 2.7354, + "step": 10576 + }, + { + "epoch": 0.3136435073985114, + "grad_norm": 0.1602177768945694, + "learning_rate": 0.0007853593595605513, + "loss": 2.7101, + "step": 10577 + }, + { + "epoch": 0.31367316075082285, + "grad_norm": 0.15823985636234283, + "learning_rate": 0.0007853207226063531, + "loss": 2.7721, + "step": 10578 + }, + { + "epoch": 0.3137028141031344, + "grad_norm": 0.1729205995798111, + "learning_rate": 0.0007852820831255909, + "loss": 2.6935, + "step": 10579 + }, + { + "epoch": 0.31373246745544586, + "grad_norm": 0.18765141069889069, + "learning_rate": 0.0007852434411186068, + "loss": 2.7455, + "step": 10580 + }, + { + "epoch": 0.31376212080775734, + "grad_norm": 0.16781297326087952, + "learning_rate": 0.0007852047965857432, + "loss": 2.6997, + "step": 10581 + }, + { + "epoch": 0.3137917741600688, + "grad_norm": 0.17958608269691467, + "learning_rate": 0.0007851661495273421, + "loss": 2.7326, + "step": 10582 + }, + { + "epoch": 0.3138214275123803, + "grad_norm": 0.142039492726326, + "learning_rate": 0.0007851274999437456, + "loss": 2.6991, + "step": 10583 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 0.1422424167394638, + "learning_rate": 0.0007850888478352964, + "loss": 2.7098, + "step": 10584 + }, + { + "epoch": 0.31388073421700324, + "grad_norm": 0.15278437733650208, + "learning_rate": 0.0007850501932023364, + "loss": 2.7338, + "step": 10585 + }, + { + "epoch": 0.3139103875693147, + "grad_norm": 0.14753945171833038, + "learning_rate": 0.0007850115360452082, + "loss": 2.7379, + "step": 10586 + }, + { + "epoch": 0.3139400409216262, + "grad_norm": 0.1231422945857048, + "learning_rate": 0.0007849728763642537, + "loss": 2.7445, + "step": 10587 + }, + { + "epoch": 0.31396969427393767, + "grad_norm": 0.12721283733844757, + "learning_rate": 0.0007849342141598156, + "loss": 2.7267, + "step": 10588 + }, + { + "epoch": 0.31399934762624915, + "grad_norm": 0.149134561419487, + "learning_rate": 0.0007848955494322361, + "loss": 2.7413, + "step": 10589 + }, + { + "epoch": 0.3140290009785606, + "grad_norm": 0.12301073223352432, + "learning_rate": 0.0007848568821818576, + "loss": 2.746, + "step": 10590 + }, + { + "epoch": 0.3140586543308721, + "grad_norm": 0.11489685624837875, + "learning_rate": 0.0007848182124090224, + "loss": 2.7387, + "step": 10591 + }, + { + "epoch": 0.3140883076831836, + "grad_norm": 0.11882272362709045, + "learning_rate": 0.0007847795401140731, + "loss": 2.6912, + "step": 10592 + }, + { + "epoch": 0.31411796103549505, + "grad_norm": 0.12110510468482971, + "learning_rate": 0.0007847408652973522, + "loss": 2.7237, + "step": 10593 + }, + { + "epoch": 0.31414761438780653, + "grad_norm": 0.13125671446323395, + "learning_rate": 0.000784702187959202, + "loss": 2.7244, + "step": 10594 + }, + { + "epoch": 0.314177267740118, + "grad_norm": 0.12603285908699036, + "learning_rate": 0.0007846635080999651, + "loss": 2.7307, + "step": 10595 + }, + { + "epoch": 0.3142069210924295, + "grad_norm": 0.13123415410518646, + "learning_rate": 0.0007846248257199839, + "loss": 2.7154, + "step": 10596 + }, + { + "epoch": 0.31423657444474096, + "grad_norm": 0.14329561591148376, + "learning_rate": 0.0007845861408196009, + "loss": 2.7394, + "step": 10597 + }, + { + "epoch": 0.31426622779705243, + "grad_norm": 0.14141426980495453, + "learning_rate": 0.0007845474533991591, + "loss": 2.7181, + "step": 10598 + }, + { + "epoch": 0.3142958811493639, + "grad_norm": 0.1217271238565445, + "learning_rate": 0.0007845087634590005, + "loss": 2.7222, + "step": 10599 + }, + { + "epoch": 0.31432553450167544, + "grad_norm": 0.12316001951694489, + "learning_rate": 0.0007844700709994681, + "loss": 2.7473, + "step": 10600 + }, + { + "epoch": 0.3143551878539869, + "grad_norm": 0.14180462062358856, + "learning_rate": 0.0007844313760209043, + "loss": 2.6971, + "step": 10601 + }, + { + "epoch": 0.3143848412062984, + "grad_norm": 0.12799270451068878, + "learning_rate": 0.0007843926785236519, + "loss": 2.7339, + "step": 10602 + }, + { + "epoch": 0.31441449455860987, + "grad_norm": 0.13426338136196136, + "learning_rate": 0.0007843539785080533, + "loss": 2.741, + "step": 10603 + }, + { + "epoch": 0.31444414791092135, + "grad_norm": 0.16664129495620728, + "learning_rate": 0.0007843152759744516, + "loss": 2.7422, + "step": 10604 + }, + { + "epoch": 0.3144738012632328, + "grad_norm": 0.15815545618534088, + "learning_rate": 0.0007842765709231894, + "loss": 2.7362, + "step": 10605 + }, + { + "epoch": 0.3145034546155443, + "grad_norm": 0.14823086559772491, + "learning_rate": 0.0007842378633546093, + "loss": 2.7153, + "step": 10606 + }, + { + "epoch": 0.3145331079678558, + "grad_norm": 0.15313071012496948, + "learning_rate": 0.0007841991532690542, + "loss": 2.7201, + "step": 10607 + }, + { + "epoch": 0.31456276132016725, + "grad_norm": 0.138764888048172, + "learning_rate": 0.0007841604406668667, + "loss": 2.7126, + "step": 10608 + }, + { + "epoch": 0.3145924146724787, + "grad_norm": 0.14166386425495148, + "learning_rate": 0.0007841217255483896, + "loss": 2.7396, + "step": 10609 + }, + { + "epoch": 0.3146220680247902, + "grad_norm": 0.13847389817237854, + "learning_rate": 0.0007840830079139661, + "loss": 2.7401, + "step": 10610 + }, + { + "epoch": 0.3146517213771017, + "grad_norm": 0.14444762468338013, + "learning_rate": 0.0007840442877639389, + "loss": 2.7392, + "step": 10611 + }, + { + "epoch": 0.31468137472941315, + "grad_norm": 0.1528903841972351, + "learning_rate": 0.0007840055650986505, + "loss": 2.7459, + "step": 10612 + }, + { + "epoch": 0.31471102808172463, + "grad_norm": 0.13299709558486938, + "learning_rate": 0.0007839668399184441, + "loss": 2.697, + "step": 10613 + }, + { + "epoch": 0.3147406814340361, + "grad_norm": 0.13735546171665192, + "learning_rate": 0.0007839281122236628, + "loss": 2.7259, + "step": 10614 + }, + { + "epoch": 0.3147703347863476, + "grad_norm": 0.16111862659454346, + "learning_rate": 0.0007838893820146492, + "loss": 2.7371, + "step": 10615 + }, + { + "epoch": 0.31479998813865906, + "grad_norm": 0.14803974330425262, + "learning_rate": 0.0007838506492917463, + "loss": 2.7422, + "step": 10616 + }, + { + "epoch": 0.31482964149097054, + "grad_norm": 0.13796979188919067, + "learning_rate": 0.0007838119140552973, + "loss": 2.7451, + "step": 10617 + }, + { + "epoch": 0.314859294843282, + "grad_norm": 0.172935351729393, + "learning_rate": 0.0007837731763056451, + "loss": 2.7155, + "step": 10618 + }, + { + "epoch": 0.3148889481955935, + "grad_norm": 0.2175462692975998, + "learning_rate": 0.0007837344360431325, + "loss": 2.7678, + "step": 10619 + }, + { + "epoch": 0.31491860154790496, + "grad_norm": 0.21418248116970062, + "learning_rate": 0.000783695693268103, + "loss": 2.7164, + "step": 10620 + }, + { + "epoch": 0.3149482549002165, + "grad_norm": 0.1399359107017517, + "learning_rate": 0.0007836569479808994, + "loss": 2.6799, + "step": 10621 + }, + { + "epoch": 0.31497790825252797, + "grad_norm": 0.17042092978954315, + "learning_rate": 0.0007836182001818648, + "loss": 2.7502, + "step": 10622 + }, + { + "epoch": 0.31500756160483945, + "grad_norm": 0.19462370872497559, + "learning_rate": 0.0007835794498713422, + "loss": 2.7402, + "step": 10623 + }, + { + "epoch": 0.3150372149571509, + "grad_norm": 0.15819412469863892, + "learning_rate": 0.0007835406970496751, + "loss": 2.7291, + "step": 10624 + }, + { + "epoch": 0.3150668683094624, + "grad_norm": 0.15403428673744202, + "learning_rate": 0.0007835019417172064, + "loss": 2.7109, + "step": 10625 + }, + { + "epoch": 0.3150965216617739, + "grad_norm": 0.1640051305294037, + "learning_rate": 0.0007834631838742794, + "loss": 2.7153, + "step": 10626 + }, + { + "epoch": 0.31512617501408535, + "grad_norm": 0.14956192672252655, + "learning_rate": 0.0007834244235212373, + "loss": 2.7093, + "step": 10627 + }, + { + "epoch": 0.31515582836639683, + "grad_norm": 0.13112547993659973, + "learning_rate": 0.0007833856606584231, + "loss": 2.7147, + "step": 10628 + }, + { + "epoch": 0.3151854817187083, + "grad_norm": 0.14481864869594574, + "learning_rate": 0.0007833468952861803, + "loss": 2.7408, + "step": 10629 + }, + { + "epoch": 0.3152151350710198, + "grad_norm": 0.14750957489013672, + "learning_rate": 0.0007833081274048521, + "loss": 2.7196, + "step": 10630 + }, + { + "epoch": 0.31524478842333126, + "grad_norm": 0.12842942774295807, + "learning_rate": 0.000783269357014782, + "loss": 2.7277, + "step": 10631 + }, + { + "epoch": 0.31527444177564273, + "grad_norm": 0.1292164921760559, + "learning_rate": 0.0007832305841163128, + "loss": 2.7108, + "step": 10632 + }, + { + "epoch": 0.3153040951279542, + "grad_norm": 0.1330457627773285, + "learning_rate": 0.0007831918087097883, + "loss": 2.6914, + "step": 10633 + }, + { + "epoch": 0.3153337484802657, + "grad_norm": 0.11568059027194977, + "learning_rate": 0.0007831530307955518, + "loss": 2.7384, + "step": 10634 + }, + { + "epoch": 0.31536340183257716, + "grad_norm": 0.12040053308010101, + "learning_rate": 0.0007831142503739465, + "loss": 2.7254, + "step": 10635 + }, + { + "epoch": 0.31539305518488864, + "grad_norm": 0.1328078657388687, + "learning_rate": 0.000783075467445316, + "loss": 2.7483, + "step": 10636 + }, + { + "epoch": 0.3154227085372001, + "grad_norm": 0.1177172139286995, + "learning_rate": 0.0007830366820100037, + "loss": 2.7372, + "step": 10637 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 0.10149083286523819, + "learning_rate": 0.0007829978940683529, + "loss": 2.6895, + "step": 10638 + }, + { + "epoch": 0.31548201524182307, + "grad_norm": 0.1260194033384323, + "learning_rate": 0.0007829591036207071, + "loss": 2.6993, + "step": 10639 + }, + { + "epoch": 0.31551166859413454, + "grad_norm": 0.11184239387512207, + "learning_rate": 0.00078292031066741, + "loss": 2.7466, + "step": 10640 + }, + { + "epoch": 0.315541321946446, + "grad_norm": 0.11579586565494537, + "learning_rate": 0.0007828815152088049, + "loss": 2.7159, + "step": 10641 + }, + { + "epoch": 0.31557097529875755, + "grad_norm": 0.1423199474811554, + "learning_rate": 0.0007828427172452355, + "loss": 2.6931, + "step": 10642 + }, + { + "epoch": 0.315600628651069, + "grad_norm": 0.13465209305286407, + "learning_rate": 0.0007828039167770451, + "loss": 2.7289, + "step": 10643 + }, + { + "epoch": 0.3156302820033805, + "grad_norm": 0.13461679220199585, + "learning_rate": 0.0007827651138045777, + "loss": 2.7401, + "step": 10644 + }, + { + "epoch": 0.315659935355692, + "grad_norm": 0.1311320662498474, + "learning_rate": 0.0007827263083281765, + "loss": 2.7236, + "step": 10645 + }, + { + "epoch": 0.31568958870800345, + "grad_norm": 0.14109745621681213, + "learning_rate": 0.0007826875003481855, + "loss": 2.7366, + "step": 10646 + }, + { + "epoch": 0.31571924206031493, + "grad_norm": 0.14646168053150177, + "learning_rate": 0.0007826486898649482, + "loss": 2.7212, + "step": 10647 + }, + { + "epoch": 0.3157488954126264, + "grad_norm": 0.13570711016654968, + "learning_rate": 0.0007826098768788079, + "loss": 2.7616, + "step": 10648 + }, + { + "epoch": 0.3157785487649379, + "grad_norm": 0.12530244886875153, + "learning_rate": 0.000782571061390109, + "loss": 2.7413, + "step": 10649 + }, + { + "epoch": 0.31580820211724936, + "grad_norm": 0.13064780831336975, + "learning_rate": 0.0007825322433991946, + "loss": 2.7528, + "step": 10650 + }, + { + "epoch": 0.31583785546956084, + "grad_norm": 0.12042224407196045, + "learning_rate": 0.0007824934229064089, + "loss": 2.747, + "step": 10651 + }, + { + "epoch": 0.3158675088218723, + "grad_norm": 0.1178787425160408, + "learning_rate": 0.0007824545999120953, + "loss": 2.7387, + "step": 10652 + }, + { + "epoch": 0.3158971621741838, + "grad_norm": 0.12225363403558731, + "learning_rate": 0.0007824157744165979, + "loss": 2.7149, + "step": 10653 + }, + { + "epoch": 0.31592681552649526, + "grad_norm": 0.12674318253993988, + "learning_rate": 0.0007823769464202601, + "loss": 2.7469, + "step": 10654 + }, + { + "epoch": 0.31595646887880674, + "grad_norm": 0.1446712613105774, + "learning_rate": 0.000782338115923426, + "loss": 2.6887, + "step": 10655 + }, + { + "epoch": 0.3159861222311182, + "grad_norm": 0.15393894910812378, + "learning_rate": 0.0007822992829264395, + "loss": 2.724, + "step": 10656 + }, + { + "epoch": 0.3160157755834297, + "grad_norm": 0.17071306705474854, + "learning_rate": 0.0007822604474296444, + "loss": 2.7033, + "step": 10657 + }, + { + "epoch": 0.31604542893574117, + "grad_norm": 0.18871574103832245, + "learning_rate": 0.0007822216094333848, + "loss": 2.7174, + "step": 10658 + }, + { + "epoch": 0.31607508228805264, + "grad_norm": 0.20559433102607727, + "learning_rate": 0.0007821827689380041, + "loss": 2.7163, + "step": 10659 + }, + { + "epoch": 0.3161047356403641, + "grad_norm": 0.17435982823371887, + "learning_rate": 0.0007821439259438466, + "loss": 2.7331, + "step": 10660 + }, + { + "epoch": 0.3161343889926756, + "grad_norm": 0.16682519018650055, + "learning_rate": 0.0007821050804512562, + "loss": 2.705, + "step": 10661 + }, + { + "epoch": 0.31616404234498713, + "grad_norm": 0.18168993294239044, + "learning_rate": 0.0007820662324605769, + "loss": 2.7106, + "step": 10662 + }, + { + "epoch": 0.3161936956972986, + "grad_norm": 0.17150290310382843, + "learning_rate": 0.0007820273819721527, + "loss": 2.7196, + "step": 10663 + }, + { + "epoch": 0.3162233490496101, + "grad_norm": 0.16146890819072723, + "learning_rate": 0.0007819885289863278, + "loss": 2.7391, + "step": 10664 + }, + { + "epoch": 0.31625300240192156, + "grad_norm": 0.15859860181808472, + "learning_rate": 0.0007819496735034458, + "loss": 2.752, + "step": 10665 + }, + { + "epoch": 0.31628265575423303, + "grad_norm": 0.15007509291172028, + "learning_rate": 0.0007819108155238513, + "loss": 2.6944, + "step": 10666 + }, + { + "epoch": 0.3163123091065445, + "grad_norm": 0.1546543389558792, + "learning_rate": 0.0007818719550478878, + "loss": 2.7336, + "step": 10667 + }, + { + "epoch": 0.316341962458856, + "grad_norm": 0.16498248279094696, + "learning_rate": 0.0007818330920759001, + "loss": 2.7426, + "step": 10668 + }, + { + "epoch": 0.31637161581116746, + "grad_norm": 0.1539854109287262, + "learning_rate": 0.0007817942266082319, + "loss": 2.7307, + "step": 10669 + }, + { + "epoch": 0.31640126916347894, + "grad_norm": 0.14662030339241028, + "learning_rate": 0.0007817553586452273, + "loss": 2.7094, + "step": 10670 + }, + { + "epoch": 0.3164309225157904, + "grad_norm": 0.13010790944099426, + "learning_rate": 0.0007817164881872308, + "loss": 2.6895, + "step": 10671 + }, + { + "epoch": 0.3164605758681019, + "grad_norm": 0.13385580480098724, + "learning_rate": 0.0007816776152345864, + "loss": 2.7512, + "step": 10672 + }, + { + "epoch": 0.31649022922041337, + "grad_norm": 0.12791107594966888, + "learning_rate": 0.0007816387397876383, + "loss": 2.6815, + "step": 10673 + }, + { + "epoch": 0.31651988257272484, + "grad_norm": 0.13186083734035492, + "learning_rate": 0.0007815998618467308, + "loss": 2.6984, + "step": 10674 + }, + { + "epoch": 0.3165495359250363, + "grad_norm": 0.12144216895103455, + "learning_rate": 0.0007815609814122082, + "loss": 2.6753, + "step": 10675 + }, + { + "epoch": 0.3165791892773478, + "grad_norm": 0.10652902722358704, + "learning_rate": 0.000781522098484415, + "loss": 2.7169, + "step": 10676 + }, + { + "epoch": 0.31660884262965927, + "grad_norm": 0.11838218569755554, + "learning_rate": 0.0007814832130636951, + "loss": 2.7551, + "step": 10677 + }, + { + "epoch": 0.31663849598197075, + "grad_norm": 0.12105628103017807, + "learning_rate": 0.0007814443251503931, + "loss": 2.7704, + "step": 10678 + }, + { + "epoch": 0.3166681493342822, + "grad_norm": 0.11265765875577927, + "learning_rate": 0.0007814054347448532, + "loss": 2.7034, + "step": 10679 + }, + { + "epoch": 0.3166978026865937, + "grad_norm": 0.11976884305477142, + "learning_rate": 0.0007813665418474198, + "loss": 2.7277, + "step": 10680 + }, + { + "epoch": 0.3167274560389052, + "grad_norm": 0.15059258043766022, + "learning_rate": 0.0007813276464584375, + "loss": 2.7612, + "step": 10681 + }, + { + "epoch": 0.31675710939121665, + "grad_norm": 0.19814100861549377, + "learning_rate": 0.0007812887485782507, + "loss": 2.7284, + "step": 10682 + }, + { + "epoch": 0.3167867627435282, + "grad_norm": 0.2776213586330414, + "learning_rate": 0.0007812498482072037, + "loss": 2.6905, + "step": 10683 + }, + { + "epoch": 0.31681641609583966, + "grad_norm": 0.1256321370601654, + "learning_rate": 0.0007812109453456409, + "loss": 2.7212, + "step": 10684 + }, + { + "epoch": 0.31684606944815114, + "grad_norm": 0.13538581132888794, + "learning_rate": 0.0007811720399939071, + "loss": 2.7467, + "step": 10685 + }, + { + "epoch": 0.3168757228004626, + "grad_norm": 0.1518130898475647, + "learning_rate": 0.0007811331321523465, + "loss": 2.6968, + "step": 10686 + }, + { + "epoch": 0.3169053761527741, + "grad_norm": 0.13818879425525665, + "learning_rate": 0.0007810942218213037, + "loss": 2.7316, + "step": 10687 + }, + { + "epoch": 0.31693502950508556, + "grad_norm": 0.14380139112472534, + "learning_rate": 0.0007810553090011234, + "loss": 2.7586, + "step": 10688 + }, + { + "epoch": 0.31696468285739704, + "grad_norm": 0.1481398344039917, + "learning_rate": 0.0007810163936921502, + "loss": 2.7281, + "step": 10689 + }, + { + "epoch": 0.3169943362097085, + "grad_norm": 0.14642000198364258, + "learning_rate": 0.0007809774758947283, + "loss": 2.6937, + "step": 10690 + }, + { + "epoch": 0.31702398956202, + "grad_norm": 0.13919758796691895, + "learning_rate": 0.0007809385556092029, + "loss": 2.729, + "step": 10691 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 0.1249750405550003, + "learning_rate": 0.0007808996328359184, + "loss": 2.7138, + "step": 10692 + }, + { + "epoch": 0.31708329626664294, + "grad_norm": 0.1403835266828537, + "learning_rate": 0.0007808607075752192, + "loss": 2.6733, + "step": 10693 + }, + { + "epoch": 0.3171129496189544, + "grad_norm": 0.14667728543281555, + "learning_rate": 0.0007808217798274501, + "loss": 2.736, + "step": 10694 + }, + { + "epoch": 0.3171426029712659, + "grad_norm": 0.1469268947839737, + "learning_rate": 0.0007807828495929564, + "loss": 2.7346, + "step": 10695 + }, + { + "epoch": 0.3171722563235774, + "grad_norm": 0.14222866296768188, + "learning_rate": 0.000780743916872082, + "loss": 2.7058, + "step": 10696 + }, + { + "epoch": 0.31720190967588885, + "grad_norm": 0.14685629308223724, + "learning_rate": 0.0007807049816651722, + "loss": 2.6975, + "step": 10697 + }, + { + "epoch": 0.3172315630282003, + "grad_norm": 0.16969193518161774, + "learning_rate": 0.0007806660439725716, + "loss": 2.6805, + "step": 10698 + }, + { + "epoch": 0.3172612163805118, + "grad_norm": 0.1530643254518509, + "learning_rate": 0.0007806271037946251, + "loss": 2.7202, + "step": 10699 + }, + { + "epoch": 0.3172908697328233, + "grad_norm": 0.1398041993379593, + "learning_rate": 0.0007805881611316771, + "loss": 2.6977, + "step": 10700 + }, + { + "epoch": 0.31732052308513475, + "grad_norm": 0.15713578462600708, + "learning_rate": 0.000780549215984073, + "loss": 2.7106, + "step": 10701 + }, + { + "epoch": 0.31735017643744623, + "grad_norm": 0.16308622062206268, + "learning_rate": 0.0007805102683521574, + "loss": 2.7109, + "step": 10702 + }, + { + "epoch": 0.3173798297897577, + "grad_norm": 0.1448633223772049, + "learning_rate": 0.0007804713182362753, + "loss": 2.7247, + "step": 10703 + }, + { + "epoch": 0.31740948314206924, + "grad_norm": 0.14395219087600708, + "learning_rate": 0.0007804323656367716, + "loss": 2.7593, + "step": 10704 + }, + { + "epoch": 0.3174391364943807, + "grad_norm": 0.13426311314105988, + "learning_rate": 0.000780393410553991, + "loss": 2.7238, + "step": 10705 + }, + { + "epoch": 0.3174687898466922, + "grad_norm": 0.13751958310604095, + "learning_rate": 0.0007803544529882785, + "loss": 2.7429, + "step": 10706 + }, + { + "epoch": 0.31749844319900367, + "grad_norm": 0.14474698901176453, + "learning_rate": 0.0007803154929399794, + "loss": 2.6929, + "step": 10707 + }, + { + "epoch": 0.31752809655131514, + "grad_norm": 0.14717932045459747, + "learning_rate": 0.0007802765304094384, + "loss": 2.7504, + "step": 10708 + }, + { + "epoch": 0.3175577499036266, + "grad_norm": 0.13342227041721344, + "learning_rate": 0.0007802375653970006, + "loss": 2.705, + "step": 10709 + }, + { + "epoch": 0.3175874032559381, + "grad_norm": 0.13466964662075043, + "learning_rate": 0.0007801985979030112, + "loss": 2.7541, + "step": 10710 + }, + { + "epoch": 0.31761705660824957, + "grad_norm": 0.1532689481973648, + "learning_rate": 0.0007801596279278151, + "loss": 2.7309, + "step": 10711 + }, + { + "epoch": 0.31764670996056105, + "grad_norm": 0.16756081581115723, + "learning_rate": 0.0007801206554717573, + "loss": 2.7071, + "step": 10712 + }, + { + "epoch": 0.3176763633128725, + "grad_norm": 0.15268836915493011, + "learning_rate": 0.0007800816805351831, + "loss": 2.7237, + "step": 10713 + }, + { + "epoch": 0.317706016665184, + "grad_norm": 0.12323891371488571, + "learning_rate": 0.0007800427031184374, + "loss": 2.7118, + "step": 10714 + }, + { + "epoch": 0.3177356700174955, + "grad_norm": 0.1280631721019745, + "learning_rate": 0.0007800037232218657, + "loss": 2.6978, + "step": 10715 + }, + { + "epoch": 0.31776532336980695, + "grad_norm": 0.11350716650485992, + "learning_rate": 0.0007799647408458128, + "loss": 2.7254, + "step": 10716 + }, + { + "epoch": 0.3177949767221184, + "grad_norm": 0.11094601452350616, + "learning_rate": 0.000779925755990624, + "loss": 2.7364, + "step": 10717 + }, + { + "epoch": 0.3178246300744299, + "grad_norm": 0.1470600962638855, + "learning_rate": 0.0007798867686566449, + "loss": 2.7376, + "step": 10718 + }, + { + "epoch": 0.3178542834267414, + "grad_norm": 0.14075163006782532, + "learning_rate": 0.0007798477788442202, + "loss": 2.7064, + "step": 10719 + }, + { + "epoch": 0.31788393677905286, + "grad_norm": 0.13676515221595764, + "learning_rate": 0.0007798087865536953, + "loss": 2.7182, + "step": 10720 + }, + { + "epoch": 0.31791359013136433, + "grad_norm": 0.13349291682243347, + "learning_rate": 0.0007797697917854158, + "loss": 2.719, + "step": 10721 + }, + { + "epoch": 0.3179432434836758, + "grad_norm": 0.1191684827208519, + "learning_rate": 0.0007797307945397266, + "loss": 2.7146, + "step": 10722 + }, + { + "epoch": 0.3179728968359873, + "grad_norm": 0.10772479325532913, + "learning_rate": 0.0007796917948169733, + "loss": 2.7466, + "step": 10723 + }, + { + "epoch": 0.31800255018829876, + "grad_norm": 0.11997443437576294, + "learning_rate": 0.0007796527926175011, + "loss": 2.6895, + "step": 10724 + }, + { + "epoch": 0.3180322035406103, + "grad_norm": 0.17000222206115723, + "learning_rate": 0.0007796137879416554, + "loss": 2.7691, + "step": 10725 + }, + { + "epoch": 0.31806185689292177, + "grad_norm": 0.12881022691726685, + "learning_rate": 0.0007795747807897816, + "loss": 2.7333, + "step": 10726 + }, + { + "epoch": 0.31809151024523324, + "grad_norm": 0.12226761877536774, + "learning_rate": 0.0007795357711622252, + "loss": 2.7397, + "step": 10727 + }, + { + "epoch": 0.3181211635975447, + "grad_norm": 0.12096518278121948, + "learning_rate": 0.0007794967590593315, + "loss": 2.7277, + "step": 10728 + }, + { + "epoch": 0.3181508169498562, + "grad_norm": 0.12512293457984924, + "learning_rate": 0.0007794577444814461, + "loss": 2.7196, + "step": 10729 + }, + { + "epoch": 0.3181804703021677, + "grad_norm": 0.13336297869682312, + "learning_rate": 0.0007794187274289145, + "loss": 2.7455, + "step": 10730 + }, + { + "epoch": 0.31821012365447915, + "grad_norm": 0.1282784342765808, + "learning_rate": 0.0007793797079020818, + "loss": 2.6585, + "step": 10731 + }, + { + "epoch": 0.3182397770067906, + "grad_norm": 0.10721008479595184, + "learning_rate": 0.0007793406859012939, + "loss": 2.6921, + "step": 10732 + }, + { + "epoch": 0.3182694303591021, + "grad_norm": 0.12446413934230804, + "learning_rate": 0.0007793016614268964, + "loss": 2.7126, + "step": 10733 + }, + { + "epoch": 0.3182990837114136, + "grad_norm": 0.14180587232112885, + "learning_rate": 0.0007792626344792347, + "loss": 2.7206, + "step": 10734 + }, + { + "epoch": 0.31832873706372505, + "grad_norm": 0.13383305072784424, + "learning_rate": 0.0007792236050586545, + "loss": 2.733, + "step": 10735 + }, + { + "epoch": 0.31835839041603653, + "grad_norm": 0.13958196341991425, + "learning_rate": 0.0007791845731655013, + "loss": 2.7215, + "step": 10736 + }, + { + "epoch": 0.318388043768348, + "grad_norm": 0.12590830028057098, + "learning_rate": 0.0007791455388001208, + "loss": 2.7399, + "step": 10737 + }, + { + "epoch": 0.3184176971206595, + "grad_norm": 0.10353672504425049, + "learning_rate": 0.0007791065019628585, + "loss": 2.7355, + "step": 10738 + }, + { + "epoch": 0.31844735047297096, + "grad_norm": 0.12774311006069183, + "learning_rate": 0.0007790674626540605, + "loss": 2.7138, + "step": 10739 + }, + { + "epoch": 0.31847700382528243, + "grad_norm": 0.15950968861579895, + "learning_rate": 0.000779028420874072, + "loss": 2.7151, + "step": 10740 + }, + { + "epoch": 0.3185066571775939, + "grad_norm": 0.18497160077095032, + "learning_rate": 0.000778989376623239, + "loss": 2.6975, + "step": 10741 + }, + { + "epoch": 0.3185363105299054, + "grad_norm": 0.1734134405851364, + "learning_rate": 0.0007789503299019072, + "loss": 2.7203, + "step": 10742 + }, + { + "epoch": 0.31856596388221686, + "grad_norm": 0.14603149890899658, + "learning_rate": 0.0007789112807104224, + "loss": 2.7118, + "step": 10743 + }, + { + "epoch": 0.31859561723452834, + "grad_norm": 0.14306078851222992, + "learning_rate": 0.0007788722290491301, + "loss": 2.7356, + "step": 10744 + }, + { + "epoch": 0.3186252705868398, + "grad_norm": 0.14913411438465118, + "learning_rate": 0.0007788331749183766, + "loss": 2.7629, + "step": 10745 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 0.1327408403158188, + "learning_rate": 0.0007787941183185073, + "loss": 2.7241, + "step": 10746 + }, + { + "epoch": 0.3186845772914628, + "grad_norm": 0.15350034832954407, + "learning_rate": 0.0007787550592498684, + "loss": 2.7129, + "step": 10747 + }, + { + "epoch": 0.3187142306437743, + "grad_norm": 0.15503711998462677, + "learning_rate": 0.0007787159977128055, + "loss": 2.6907, + "step": 10748 + }, + { + "epoch": 0.3187438839960858, + "grad_norm": 0.14363108575344086, + "learning_rate": 0.0007786769337076646, + "loss": 2.7374, + "step": 10749 + }, + { + "epoch": 0.31877353734839725, + "grad_norm": 0.14335951209068298, + "learning_rate": 0.0007786378672347916, + "loss": 2.7119, + "step": 10750 + }, + { + "epoch": 0.3188031907007087, + "grad_norm": 0.12869158387184143, + "learning_rate": 0.0007785987982945324, + "loss": 2.7148, + "step": 10751 + }, + { + "epoch": 0.3188328440530202, + "grad_norm": 0.1362222582101822, + "learning_rate": 0.0007785597268872332, + "loss": 2.7227, + "step": 10752 + }, + { + "epoch": 0.3188624974053317, + "grad_norm": 0.1299365907907486, + "learning_rate": 0.0007785206530132397, + "loss": 2.7192, + "step": 10753 + }, + { + "epoch": 0.31889215075764316, + "grad_norm": 0.13426445424556732, + "learning_rate": 0.000778481576672898, + "loss": 2.7137, + "step": 10754 + }, + { + "epoch": 0.31892180410995463, + "grad_norm": 0.12651965022087097, + "learning_rate": 0.0007784424978665541, + "loss": 2.7081, + "step": 10755 + }, + { + "epoch": 0.3189514574622661, + "grad_norm": 0.13584963977336884, + "learning_rate": 0.0007784034165945543, + "loss": 2.6991, + "step": 10756 + }, + { + "epoch": 0.3189811108145776, + "grad_norm": 0.15025874972343445, + "learning_rate": 0.0007783643328572443, + "loss": 2.7364, + "step": 10757 + }, + { + "epoch": 0.31901076416688906, + "grad_norm": 0.13531722128391266, + "learning_rate": 0.0007783252466549703, + "loss": 2.7219, + "step": 10758 + }, + { + "epoch": 0.31904041751920054, + "grad_norm": 0.14311936497688293, + "learning_rate": 0.0007782861579880785, + "loss": 2.732, + "step": 10759 + }, + { + "epoch": 0.319070070871512, + "grad_norm": 0.16131769120693207, + "learning_rate": 0.000778247066856915, + "loss": 2.7726, + "step": 10760 + }, + { + "epoch": 0.3190997242238235, + "grad_norm": 0.15897169709205627, + "learning_rate": 0.000778207973261826, + "loss": 2.7182, + "step": 10761 + }, + { + "epoch": 0.31912937757613496, + "grad_norm": 0.13055925071239471, + "learning_rate": 0.0007781688772031576, + "loss": 2.6943, + "step": 10762 + }, + { + "epoch": 0.31915903092844644, + "grad_norm": 0.11704488098621368, + "learning_rate": 0.0007781297786812562, + "loss": 2.726, + "step": 10763 + }, + { + "epoch": 0.3191886842807579, + "grad_norm": 0.11823999881744385, + "learning_rate": 0.0007780906776964677, + "loss": 2.7166, + "step": 10764 + }, + { + "epoch": 0.3192183376330694, + "grad_norm": 0.1345658004283905, + "learning_rate": 0.0007780515742491386, + "loss": 2.7492, + "step": 10765 + }, + { + "epoch": 0.3192479909853809, + "grad_norm": 0.12081831693649292, + "learning_rate": 0.000778012468339615, + "loss": 2.7034, + "step": 10766 + }, + { + "epoch": 0.3192776443376924, + "grad_norm": 0.12824667990207672, + "learning_rate": 0.0007779733599682434, + "loss": 2.7351, + "step": 10767 + }, + { + "epoch": 0.3193072976900039, + "grad_norm": 0.14277081191539764, + "learning_rate": 0.0007779342491353698, + "loss": 2.7156, + "step": 10768 + }, + { + "epoch": 0.31933695104231535, + "grad_norm": 0.14186862111091614, + "learning_rate": 0.0007778951358413409, + "loss": 2.7375, + "step": 10769 + }, + { + "epoch": 0.31936660439462683, + "grad_norm": 0.17315873503684998, + "learning_rate": 0.0007778560200865028, + "loss": 2.755, + "step": 10770 + }, + { + "epoch": 0.3193962577469383, + "grad_norm": 0.1813160926103592, + "learning_rate": 0.0007778169018712018, + "loss": 2.7458, + "step": 10771 + }, + { + "epoch": 0.3194259110992498, + "grad_norm": 0.16067110002040863, + "learning_rate": 0.0007777777811957847, + "loss": 2.7315, + "step": 10772 + }, + { + "epoch": 0.31945556445156126, + "grad_norm": 0.1422429382801056, + "learning_rate": 0.0007777386580605975, + "loss": 2.7165, + "step": 10773 + }, + { + "epoch": 0.31948521780387273, + "grad_norm": 0.13872067630290985, + "learning_rate": 0.0007776995324659869, + "loss": 2.7114, + "step": 10774 + }, + { + "epoch": 0.3195148711561842, + "grad_norm": 0.1299208104610443, + "learning_rate": 0.0007776604044122992, + "loss": 2.7158, + "step": 10775 + }, + { + "epoch": 0.3195445245084957, + "grad_norm": 0.14152158796787262, + "learning_rate": 0.0007776212738998811, + "loss": 2.7106, + "step": 10776 + }, + { + "epoch": 0.31957417786080716, + "grad_norm": 0.13165439665317535, + "learning_rate": 0.0007775821409290788, + "loss": 2.7099, + "step": 10777 + }, + { + "epoch": 0.31960383121311864, + "grad_norm": 0.12977032363414764, + "learning_rate": 0.000777543005500239, + "loss": 2.7123, + "step": 10778 + }, + { + "epoch": 0.3196334845654301, + "grad_norm": 0.13395150005817413, + "learning_rate": 0.0007775038676137083, + "loss": 2.742, + "step": 10779 + }, + { + "epoch": 0.3196631379177416, + "grad_norm": 0.13086611032485962, + "learning_rate": 0.0007774647272698332, + "loss": 2.7466, + "step": 10780 + }, + { + "epoch": 0.31969279127005307, + "grad_norm": 0.13382063806056976, + "learning_rate": 0.0007774255844689604, + "loss": 2.72, + "step": 10781 + }, + { + "epoch": 0.31972244462236454, + "grad_norm": 0.14236357808113098, + "learning_rate": 0.0007773864392114365, + "loss": 2.7051, + "step": 10782 + }, + { + "epoch": 0.319752097974676, + "grad_norm": 0.15812215209007263, + "learning_rate": 0.0007773472914976079, + "loss": 2.7168, + "step": 10783 + }, + { + "epoch": 0.3197817513269875, + "grad_norm": 0.12910306453704834, + "learning_rate": 0.0007773081413278214, + "loss": 2.7006, + "step": 10784 + }, + { + "epoch": 0.31981140467929897, + "grad_norm": 0.12786637246608734, + "learning_rate": 0.0007772689887024238, + "loss": 2.7269, + "step": 10785 + }, + { + "epoch": 0.31984105803161045, + "grad_norm": 0.14578677713871002, + "learning_rate": 0.0007772298336217617, + "loss": 2.7394, + "step": 10786 + }, + { + "epoch": 0.319870711383922, + "grad_norm": 0.1619839370250702, + "learning_rate": 0.0007771906760861818, + "loss": 2.7188, + "step": 10787 + }, + { + "epoch": 0.31990036473623346, + "grad_norm": 0.13428939878940582, + "learning_rate": 0.0007771515160960309, + "loss": 2.6975, + "step": 10788 + }, + { + "epoch": 0.31993001808854493, + "grad_norm": 0.1468898057937622, + "learning_rate": 0.0007771123536516558, + "loss": 2.7526, + "step": 10789 + }, + { + "epoch": 0.3199596714408564, + "grad_norm": 0.1714428961277008, + "learning_rate": 0.0007770731887534031, + "loss": 2.7113, + "step": 10790 + }, + { + "epoch": 0.3199893247931679, + "grad_norm": 0.16481523215770721, + "learning_rate": 0.00077703402140162, + "loss": 2.7131, + "step": 10791 + }, + { + "epoch": 0.32001897814547936, + "grad_norm": 0.1619262397289276, + "learning_rate": 0.0007769948515966529, + "loss": 2.7001, + "step": 10792 + }, + { + "epoch": 0.32004863149779084, + "grad_norm": 0.149252250790596, + "learning_rate": 0.0007769556793388488, + "loss": 2.729, + "step": 10793 + }, + { + "epoch": 0.3200782848501023, + "grad_norm": 0.13754640519618988, + "learning_rate": 0.0007769165046285548, + "loss": 2.7279, + "step": 10794 + }, + { + "epoch": 0.3201079382024138, + "grad_norm": 0.14666077494621277, + "learning_rate": 0.0007768773274661176, + "loss": 2.7334, + "step": 10795 + }, + { + "epoch": 0.32013759155472526, + "grad_norm": 0.13316115736961365, + "learning_rate": 0.000776838147851884, + "loss": 2.6894, + "step": 10796 + }, + { + "epoch": 0.32016724490703674, + "grad_norm": 0.11898628622293472, + "learning_rate": 0.0007767989657862011, + "loss": 2.7015, + "step": 10797 + }, + { + "epoch": 0.3201968982593482, + "grad_norm": 0.11175481975078583, + "learning_rate": 0.0007767597812694159, + "loss": 2.7289, + "step": 10798 + }, + { + "epoch": 0.3202265516116597, + "grad_norm": 0.11315032839775085, + "learning_rate": 0.0007767205943018753, + "loss": 2.7144, + "step": 10799 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 0.1258992999792099, + "learning_rate": 0.0007766814048839265, + "loss": 2.703, + "step": 10800 + }, + { + "epoch": 0.32028585831628265, + "grad_norm": 0.1310531347990036, + "learning_rate": 0.0007766422130159162, + "loss": 2.74, + "step": 10801 + }, + { + "epoch": 0.3203155116685941, + "grad_norm": 0.1312689185142517, + "learning_rate": 0.0007766030186981916, + "loss": 2.7066, + "step": 10802 + }, + { + "epoch": 0.3203451650209056, + "grad_norm": 0.1401406228542328, + "learning_rate": 0.0007765638219310998, + "loss": 2.6952, + "step": 10803 + }, + { + "epoch": 0.3203748183732171, + "grad_norm": 0.16438566148281097, + "learning_rate": 0.000776524622714988, + "loss": 2.7232, + "step": 10804 + }, + { + "epoch": 0.32040447172552855, + "grad_norm": 0.19674602150917053, + "learning_rate": 0.0007764854210502031, + "loss": 2.7427, + "step": 10805 + }, + { + "epoch": 0.32043412507784, + "grad_norm": 0.15414756536483765, + "learning_rate": 0.0007764462169370924, + "loss": 2.7307, + "step": 10806 + }, + { + "epoch": 0.3204637784301515, + "grad_norm": 0.1473783552646637, + "learning_rate": 0.000776407010376003, + "loss": 2.7082, + "step": 10807 + }, + { + "epoch": 0.32049343178246303, + "grad_norm": 0.15195618569850922, + "learning_rate": 0.0007763678013672821, + "loss": 2.755, + "step": 10808 + }, + { + "epoch": 0.3205230851347745, + "grad_norm": 0.13355976343154907, + "learning_rate": 0.0007763285899112767, + "loss": 2.7047, + "step": 10809 + }, + { + "epoch": 0.320552738487086, + "grad_norm": 0.15135082602500916, + "learning_rate": 0.0007762893760083344, + "loss": 2.6693, + "step": 10810 + }, + { + "epoch": 0.32058239183939746, + "grad_norm": 0.1251370906829834, + "learning_rate": 0.000776250159658802, + "loss": 2.6851, + "step": 10811 + }, + { + "epoch": 0.32061204519170894, + "grad_norm": 0.1356758028268814, + "learning_rate": 0.0007762109408630273, + "loss": 2.7303, + "step": 10812 + }, + { + "epoch": 0.3206416985440204, + "grad_norm": 0.16621723771095276, + "learning_rate": 0.0007761717196213574, + "loss": 2.7189, + "step": 10813 + }, + { + "epoch": 0.3206713518963319, + "grad_norm": 0.12735684216022491, + "learning_rate": 0.0007761324959341393, + "loss": 2.703, + "step": 10814 + }, + { + "epoch": 0.32070100524864337, + "grad_norm": 0.12865737080574036, + "learning_rate": 0.0007760932698017204, + "loss": 2.6847, + "step": 10815 + }, + { + "epoch": 0.32073065860095484, + "grad_norm": 0.13608798384666443, + "learning_rate": 0.0007760540412244484, + "loss": 2.6984, + "step": 10816 + }, + { + "epoch": 0.3207603119532663, + "grad_norm": 0.14342299103736877, + "learning_rate": 0.0007760148102026705, + "loss": 2.6877, + "step": 10817 + }, + { + "epoch": 0.3207899653055778, + "grad_norm": 0.13411933183670044, + "learning_rate": 0.000775975576736734, + "loss": 2.7173, + "step": 10818 + }, + { + "epoch": 0.32081961865788927, + "grad_norm": 0.11321541666984558, + "learning_rate": 0.0007759363408269866, + "loss": 2.7418, + "step": 10819 + }, + { + "epoch": 0.32084927201020075, + "grad_norm": 0.13014231622219086, + "learning_rate": 0.0007758971024737753, + "loss": 2.7404, + "step": 10820 + }, + { + "epoch": 0.3208789253625122, + "grad_norm": 0.15643517673015594, + "learning_rate": 0.0007758578616774478, + "loss": 2.7045, + "step": 10821 + }, + { + "epoch": 0.3209085787148237, + "grad_norm": 0.12532396614551544, + "learning_rate": 0.0007758186184383518, + "loss": 2.7313, + "step": 10822 + }, + { + "epoch": 0.3209382320671352, + "grad_norm": 0.12423999607563019, + "learning_rate": 0.0007757793727568343, + "loss": 2.709, + "step": 10823 + }, + { + "epoch": 0.32096788541944665, + "grad_norm": 0.11015457659959793, + "learning_rate": 0.0007757401246332434, + "loss": 2.7442, + "step": 10824 + }, + { + "epoch": 0.32099753877175813, + "grad_norm": 0.1164202094078064, + "learning_rate": 0.0007757008740679263, + "loss": 2.7086, + "step": 10825 + }, + { + "epoch": 0.3210271921240696, + "grad_norm": 0.12477301061153412, + "learning_rate": 0.0007756616210612305, + "loss": 2.7096, + "step": 10826 + }, + { + "epoch": 0.3210568454763811, + "grad_norm": 0.15084825456142426, + "learning_rate": 0.0007756223656135039, + "loss": 2.7456, + "step": 10827 + }, + { + "epoch": 0.32108649882869256, + "grad_norm": 0.1396692395210266, + "learning_rate": 0.0007755831077250938, + "loss": 2.6985, + "step": 10828 + }, + { + "epoch": 0.3211161521810041, + "grad_norm": 0.129307821393013, + "learning_rate": 0.0007755438473963479, + "loss": 2.7062, + "step": 10829 + }, + { + "epoch": 0.32114580553331556, + "grad_norm": 0.1350431740283966, + "learning_rate": 0.0007755045846276141, + "loss": 2.7116, + "step": 10830 + }, + { + "epoch": 0.32117545888562704, + "grad_norm": 0.14800584316253662, + "learning_rate": 0.0007754653194192399, + "loss": 2.7292, + "step": 10831 + }, + { + "epoch": 0.3212051122379385, + "grad_norm": 0.16305601596832275, + "learning_rate": 0.000775426051771573, + "loss": 2.7022, + "step": 10832 + }, + { + "epoch": 0.32123476559025, + "grad_norm": 0.164696604013443, + "learning_rate": 0.0007753867816849611, + "loss": 2.7117, + "step": 10833 + }, + { + "epoch": 0.32126441894256147, + "grad_norm": 0.17916660010814667, + "learning_rate": 0.000775347509159752, + "loss": 2.7302, + "step": 10834 + }, + { + "epoch": 0.32129407229487295, + "grad_norm": 0.1429852396249771, + "learning_rate": 0.0007753082341962934, + "loss": 2.7175, + "step": 10835 + }, + { + "epoch": 0.3213237256471844, + "grad_norm": 0.1430688500404358, + "learning_rate": 0.0007752689567949332, + "loss": 2.725, + "step": 10836 + }, + { + "epoch": 0.3213533789994959, + "grad_norm": 0.13530802726745605, + "learning_rate": 0.0007752296769560192, + "loss": 2.7338, + "step": 10837 + }, + { + "epoch": 0.3213830323518074, + "grad_norm": 0.14616619050502777, + "learning_rate": 0.000775190394679899, + "loss": 2.7217, + "step": 10838 + }, + { + "epoch": 0.32141268570411885, + "grad_norm": 0.15024632215499878, + "learning_rate": 0.0007751511099669207, + "loss": 2.7248, + "step": 10839 + }, + { + "epoch": 0.3214423390564303, + "grad_norm": 0.13542404770851135, + "learning_rate": 0.0007751118228174321, + "loss": 2.6875, + "step": 10840 + }, + { + "epoch": 0.3214719924087418, + "grad_norm": 0.15737329423427582, + "learning_rate": 0.0007750725332317811, + "loss": 2.7245, + "step": 10841 + }, + { + "epoch": 0.3215016457610533, + "grad_norm": 0.1882769912481308, + "learning_rate": 0.0007750332412103156, + "loss": 2.734, + "step": 10842 + }, + { + "epoch": 0.32153129911336475, + "grad_norm": 0.17574766278266907, + "learning_rate": 0.0007749939467533836, + "loss": 2.7298, + "step": 10843 + }, + { + "epoch": 0.32156095246567623, + "grad_norm": 0.15285345911979675, + "learning_rate": 0.0007749546498613329, + "loss": 2.7304, + "step": 10844 + }, + { + "epoch": 0.3215906058179877, + "grad_norm": 0.13128459453582764, + "learning_rate": 0.0007749153505345114, + "loss": 2.7064, + "step": 10845 + }, + { + "epoch": 0.3216202591702992, + "grad_norm": 0.12229174375534058, + "learning_rate": 0.0007748760487732676, + "loss": 2.7086, + "step": 10846 + }, + { + "epoch": 0.32164991252261066, + "grad_norm": 0.13639166951179504, + "learning_rate": 0.0007748367445779492, + "loss": 2.7308, + "step": 10847 + }, + { + "epoch": 0.32167956587492214, + "grad_norm": 0.12461528927087784, + "learning_rate": 0.0007747974379489041, + "loss": 2.7186, + "step": 10848 + }, + { + "epoch": 0.3217092192272336, + "grad_norm": 0.12536922097206116, + "learning_rate": 0.0007747581288864804, + "loss": 2.7543, + "step": 10849 + }, + { + "epoch": 0.32173887257954514, + "grad_norm": 0.14147454500198364, + "learning_rate": 0.0007747188173910266, + "loss": 2.737, + "step": 10850 + }, + { + "epoch": 0.3217685259318566, + "grad_norm": 0.13707351684570312, + "learning_rate": 0.0007746795034628904, + "loss": 2.7043, + "step": 10851 + }, + { + "epoch": 0.3217981792841681, + "grad_norm": 0.13135302066802979, + "learning_rate": 0.00077464018710242, + "loss": 2.7474, + "step": 10852 + }, + { + "epoch": 0.32182783263647957, + "grad_norm": 0.1266661137342453, + "learning_rate": 0.0007746008683099637, + "loss": 2.706, + "step": 10853 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 0.11960992962121964, + "learning_rate": 0.0007745615470858694, + "loss": 2.7323, + "step": 10854 + }, + { + "epoch": 0.3218871393411025, + "grad_norm": 0.1262824535369873, + "learning_rate": 0.0007745222234304856, + "loss": 2.7373, + "step": 10855 + }, + { + "epoch": 0.321916792693414, + "grad_norm": 0.12401621043682098, + "learning_rate": 0.0007744828973441603, + "loss": 2.6977, + "step": 10856 + }, + { + "epoch": 0.3219464460457255, + "grad_norm": 0.11505865305662155, + "learning_rate": 0.000774443568827242, + "loss": 2.7281, + "step": 10857 + }, + { + "epoch": 0.32197609939803695, + "grad_norm": 0.13542236387729645, + "learning_rate": 0.0007744042378800786, + "loss": 2.7091, + "step": 10858 + }, + { + "epoch": 0.32200575275034843, + "grad_norm": 0.14056158065795898, + "learning_rate": 0.0007743649045030187, + "loss": 2.7073, + "step": 10859 + }, + { + "epoch": 0.3220354061026599, + "grad_norm": 0.12962090969085693, + "learning_rate": 0.0007743255686964106, + "loss": 2.7164, + "step": 10860 + }, + { + "epoch": 0.3220650594549714, + "grad_norm": 0.12727925181388855, + "learning_rate": 0.0007742862304606022, + "loss": 2.7145, + "step": 10861 + }, + { + "epoch": 0.32209471280728286, + "grad_norm": 0.12553012371063232, + "learning_rate": 0.0007742468897959422, + "loss": 2.6891, + "step": 10862 + }, + { + "epoch": 0.32212436615959433, + "grad_norm": 0.11216486245393753, + "learning_rate": 0.0007742075467027791, + "loss": 2.7084, + "step": 10863 + }, + { + "epoch": 0.3221540195119058, + "grad_norm": 0.1367228627204895, + "learning_rate": 0.000774168201181461, + "loss": 2.755, + "step": 10864 + }, + { + "epoch": 0.3221836728642173, + "grad_norm": 0.11850948631763458, + "learning_rate": 0.0007741288532323365, + "loss": 2.7155, + "step": 10865 + }, + { + "epoch": 0.32221332621652876, + "grad_norm": 0.12202111631631851, + "learning_rate": 0.0007740895028557539, + "loss": 2.7164, + "step": 10866 + }, + { + "epoch": 0.32224297956884024, + "grad_norm": 0.14731621742248535, + "learning_rate": 0.0007740501500520617, + "loss": 2.7025, + "step": 10867 + }, + { + "epoch": 0.3222726329211517, + "grad_norm": 0.19077417254447937, + "learning_rate": 0.0007740107948216084, + "loss": 2.7022, + "step": 10868 + }, + { + "epoch": 0.3223022862734632, + "grad_norm": 0.20159205794334412, + "learning_rate": 0.0007739714371647424, + "loss": 2.723, + "step": 10869 + }, + { + "epoch": 0.3223319396257747, + "grad_norm": 0.1746385097503662, + "learning_rate": 0.0007739320770818124, + "loss": 2.6955, + "step": 10870 + }, + { + "epoch": 0.3223615929780862, + "grad_norm": 0.1669359654188156, + "learning_rate": 0.0007738927145731668, + "loss": 2.7197, + "step": 10871 + }, + { + "epoch": 0.3223912463303977, + "grad_norm": 0.1764335334300995, + "learning_rate": 0.0007738533496391542, + "loss": 2.6813, + "step": 10872 + }, + { + "epoch": 0.32242089968270915, + "grad_norm": 0.15584035217761993, + "learning_rate": 0.0007738139822801232, + "loss": 2.7149, + "step": 10873 + }, + { + "epoch": 0.3224505530350206, + "grad_norm": 0.12319547683000565, + "learning_rate": 0.0007737746124964223, + "loss": 2.6972, + "step": 10874 + }, + { + "epoch": 0.3224802063873321, + "grad_norm": 0.17642375826835632, + "learning_rate": 0.0007737352402884002, + "loss": 2.7431, + "step": 10875 + }, + { + "epoch": 0.3225098597396436, + "grad_norm": 0.15766364336013794, + "learning_rate": 0.0007736958656564057, + "loss": 2.7236, + "step": 10876 + }, + { + "epoch": 0.32253951309195505, + "grad_norm": 0.1419350504875183, + "learning_rate": 0.0007736564886007873, + "loss": 2.6661, + "step": 10877 + }, + { + "epoch": 0.32256916644426653, + "grad_norm": 0.14481325447559357, + "learning_rate": 0.0007736171091218936, + "loss": 2.7086, + "step": 10878 + }, + { + "epoch": 0.322598819796578, + "grad_norm": 0.15593169629573822, + "learning_rate": 0.0007735777272200736, + "loss": 2.72, + "step": 10879 + }, + { + "epoch": 0.3226284731488895, + "grad_norm": 0.14649097621440887, + "learning_rate": 0.0007735383428956757, + "loss": 2.7077, + "step": 10880 + }, + { + "epoch": 0.32265812650120096, + "grad_norm": 0.12034276872873306, + "learning_rate": 0.0007734989561490489, + "loss": 2.723, + "step": 10881 + }, + { + "epoch": 0.32268777985351244, + "grad_norm": 0.1378040909767151, + "learning_rate": 0.0007734595669805418, + "loss": 2.6993, + "step": 10882 + }, + { + "epoch": 0.3227174332058239, + "grad_norm": 0.12158318608999252, + "learning_rate": 0.0007734201753905035, + "loss": 2.6998, + "step": 10883 + }, + { + "epoch": 0.3227470865581354, + "grad_norm": 0.11497674137353897, + "learning_rate": 0.0007733807813792826, + "loss": 2.7125, + "step": 10884 + }, + { + "epoch": 0.32277673991044686, + "grad_norm": 0.11946320533752441, + "learning_rate": 0.0007733413849472278, + "loss": 2.7486, + "step": 10885 + }, + { + "epoch": 0.32280639326275834, + "grad_norm": 0.1216512992978096, + "learning_rate": 0.0007733019860946881, + "loss": 2.7251, + "step": 10886 + }, + { + "epoch": 0.3228360466150698, + "grad_norm": 0.11280492693185806, + "learning_rate": 0.0007732625848220125, + "loss": 2.7289, + "step": 10887 + }, + { + "epoch": 0.3228656999673813, + "grad_norm": 0.11012902110815048, + "learning_rate": 0.0007732231811295498, + "loss": 2.7163, + "step": 10888 + }, + { + "epoch": 0.32289535331969277, + "grad_norm": 0.10951171815395355, + "learning_rate": 0.0007731837750176489, + "loss": 2.7519, + "step": 10889 + }, + { + "epoch": 0.32292500667200424, + "grad_norm": 0.1224924847483635, + "learning_rate": 0.0007731443664866589, + "loss": 2.7274, + "step": 10890 + }, + { + "epoch": 0.3229546600243158, + "grad_norm": 0.11992548406124115, + "learning_rate": 0.0007731049555369285, + "loss": 2.7271, + "step": 10891 + }, + { + "epoch": 0.32298431337662725, + "grad_norm": 0.1350090503692627, + "learning_rate": 0.0007730655421688069, + "loss": 2.7504, + "step": 10892 + }, + { + "epoch": 0.32301396672893873, + "grad_norm": 0.13902992010116577, + "learning_rate": 0.0007730261263826432, + "loss": 2.709, + "step": 10893 + }, + { + "epoch": 0.3230436200812502, + "grad_norm": 0.13752517104148865, + "learning_rate": 0.000772986708178786, + "loss": 2.7555, + "step": 10894 + }, + { + "epoch": 0.3230732734335617, + "grad_norm": 0.16293886303901672, + "learning_rate": 0.0007729472875575848, + "loss": 2.7196, + "step": 10895 + }, + { + "epoch": 0.32310292678587316, + "grad_norm": 0.16261905431747437, + "learning_rate": 0.0007729078645193886, + "loss": 2.7027, + "step": 10896 + }, + { + "epoch": 0.32313258013818463, + "grad_norm": 0.1676570028066635, + "learning_rate": 0.0007728684390645461, + "loss": 2.6818, + "step": 10897 + }, + { + "epoch": 0.3231622334904961, + "grad_norm": 0.17317424714565277, + "learning_rate": 0.0007728290111934071, + "loss": 2.7261, + "step": 10898 + }, + { + "epoch": 0.3231918868428076, + "grad_norm": 0.2017344832420349, + "learning_rate": 0.0007727895809063202, + "loss": 2.7055, + "step": 10899 + }, + { + "epoch": 0.32322154019511906, + "grad_norm": 0.1851554811000824, + "learning_rate": 0.0007727501482036348, + "loss": 2.7138, + "step": 10900 + }, + { + "epoch": 0.32325119354743054, + "grad_norm": 0.18330605328083038, + "learning_rate": 0.0007727107130856999, + "loss": 2.7236, + "step": 10901 + }, + { + "epoch": 0.323280846899742, + "grad_norm": 0.16758233308792114, + "learning_rate": 0.0007726712755528649, + "loss": 2.727, + "step": 10902 + }, + { + "epoch": 0.3233105002520535, + "grad_norm": 0.18176327645778656, + "learning_rate": 0.000772631835605479, + "loss": 2.6986, + "step": 10903 + }, + { + "epoch": 0.32334015360436497, + "grad_norm": 0.15113171935081482, + "learning_rate": 0.0007725923932438914, + "loss": 2.72, + "step": 10904 + }, + { + "epoch": 0.32336980695667644, + "grad_norm": 0.13096770644187927, + "learning_rate": 0.0007725529484684513, + "loss": 2.7196, + "step": 10905 + }, + { + "epoch": 0.3233994603089879, + "grad_norm": 0.16357320547103882, + "learning_rate": 0.000772513501279508, + "loss": 2.7268, + "step": 10906 + }, + { + "epoch": 0.3234291136612994, + "grad_norm": 0.13403205573558807, + "learning_rate": 0.0007724740516774109, + "loss": 2.731, + "step": 10907 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 0.14571984112262726, + "learning_rate": 0.0007724345996625095, + "loss": 2.7217, + "step": 10908 + }, + { + "epoch": 0.32348842036592235, + "grad_norm": 0.13127930462360382, + "learning_rate": 0.0007723951452351527, + "loss": 2.705, + "step": 10909 + }, + { + "epoch": 0.3235180737182338, + "grad_norm": 0.1155281811952591, + "learning_rate": 0.0007723556883956903, + "loss": 2.7335, + "step": 10910 + }, + { + "epoch": 0.3235477270705453, + "grad_norm": 0.1410040706396103, + "learning_rate": 0.0007723162291444715, + "loss": 2.7188, + "step": 10911 + }, + { + "epoch": 0.32357738042285683, + "grad_norm": 0.1464354544878006, + "learning_rate": 0.0007722767674818458, + "loss": 2.6811, + "step": 10912 + }, + { + "epoch": 0.3236070337751683, + "grad_norm": 0.1298673450946808, + "learning_rate": 0.0007722373034081625, + "loss": 2.7134, + "step": 10913 + }, + { + "epoch": 0.3236366871274798, + "grad_norm": 0.1268956959247589, + "learning_rate": 0.0007721978369237711, + "loss": 2.7461, + "step": 10914 + }, + { + "epoch": 0.32366634047979126, + "grad_norm": 0.13444654643535614, + "learning_rate": 0.0007721583680290212, + "loss": 2.6916, + "step": 10915 + }, + { + "epoch": 0.32369599383210274, + "grad_norm": 0.13699223101139069, + "learning_rate": 0.0007721188967242623, + "loss": 2.7563, + "step": 10916 + }, + { + "epoch": 0.3237256471844142, + "grad_norm": 0.127239391207695, + "learning_rate": 0.0007720794230098438, + "loss": 2.715, + "step": 10917 + }, + { + "epoch": 0.3237553005367257, + "grad_norm": 0.12756027281284332, + "learning_rate": 0.0007720399468861153, + "loss": 2.7129, + "step": 10918 + }, + { + "epoch": 0.32378495388903716, + "grad_norm": 0.1231476292014122, + "learning_rate": 0.0007720004683534263, + "loss": 2.7439, + "step": 10919 + }, + { + "epoch": 0.32381460724134864, + "grad_norm": 0.11728861182928085, + "learning_rate": 0.0007719609874121265, + "loss": 2.7344, + "step": 10920 + }, + { + "epoch": 0.3238442605936601, + "grad_norm": 0.12077230215072632, + "learning_rate": 0.0007719215040625655, + "loss": 2.7635, + "step": 10921 + }, + { + "epoch": 0.3238739139459716, + "grad_norm": 0.11590038985013962, + "learning_rate": 0.000771882018305093, + "loss": 2.7289, + "step": 10922 + }, + { + "epoch": 0.32390356729828307, + "grad_norm": 0.11512965708971024, + "learning_rate": 0.0007718425301400585, + "loss": 2.7225, + "step": 10923 + }, + { + "epoch": 0.32393322065059454, + "grad_norm": 0.13012602925300598, + "learning_rate": 0.0007718030395678118, + "loss": 2.7281, + "step": 10924 + }, + { + "epoch": 0.323962874002906, + "grad_norm": 0.1371322125196457, + "learning_rate": 0.0007717635465887023, + "loss": 2.7176, + "step": 10925 + }, + { + "epoch": 0.3239925273552175, + "grad_norm": 0.1432090550661087, + "learning_rate": 0.0007717240512030801, + "loss": 2.7424, + "step": 10926 + }, + { + "epoch": 0.324022180707529, + "grad_norm": 0.17355015873908997, + "learning_rate": 0.0007716845534112949, + "loss": 2.7496, + "step": 10927 + }, + { + "epoch": 0.32405183405984045, + "grad_norm": 0.18344837427139282, + "learning_rate": 0.0007716450532136961, + "loss": 2.7383, + "step": 10928 + }, + { + "epoch": 0.3240814874121519, + "grad_norm": 0.1635764092206955, + "learning_rate": 0.0007716055506106339, + "loss": 2.736, + "step": 10929 + }, + { + "epoch": 0.3241111407644634, + "grad_norm": 0.11644654721021652, + "learning_rate": 0.0007715660456024578, + "loss": 2.7455, + "step": 10930 + }, + { + "epoch": 0.3241407941167749, + "grad_norm": 0.12424953281879425, + "learning_rate": 0.0007715265381895179, + "loss": 2.7108, + "step": 10931 + }, + { + "epoch": 0.32417044746908635, + "grad_norm": 0.13627032935619354, + "learning_rate": 0.0007714870283721637, + "loss": 2.7259, + "step": 10932 + }, + { + "epoch": 0.3242001008213979, + "grad_norm": 0.13592073321342468, + "learning_rate": 0.0007714475161507455, + "loss": 2.6864, + "step": 10933 + }, + { + "epoch": 0.32422975417370936, + "grad_norm": 0.14274907112121582, + "learning_rate": 0.0007714080015256129, + "loss": 2.7021, + "step": 10934 + }, + { + "epoch": 0.32425940752602084, + "grad_norm": 0.14354494214057922, + "learning_rate": 0.0007713684844971157, + "loss": 2.7093, + "step": 10935 + }, + { + "epoch": 0.3242890608783323, + "grad_norm": 0.1484335958957672, + "learning_rate": 0.0007713289650656041, + "loss": 2.7384, + "step": 10936 + }, + { + "epoch": 0.3243187142306438, + "grad_norm": 0.14817674458026886, + "learning_rate": 0.0007712894432314279, + "loss": 2.7767, + "step": 10937 + }, + { + "epoch": 0.32434836758295527, + "grad_norm": 0.17746691405773163, + "learning_rate": 0.0007712499189949371, + "loss": 2.7089, + "step": 10938 + }, + { + "epoch": 0.32437802093526674, + "grad_norm": 0.15768924355506897, + "learning_rate": 0.0007712103923564819, + "loss": 2.7109, + "step": 10939 + }, + { + "epoch": 0.3244076742875782, + "grad_norm": 0.13665583729743958, + "learning_rate": 0.0007711708633164118, + "loss": 2.6987, + "step": 10940 + }, + { + "epoch": 0.3244373276398897, + "grad_norm": 0.13760226964950562, + "learning_rate": 0.0007711313318750774, + "loss": 2.7253, + "step": 10941 + }, + { + "epoch": 0.32446698099220117, + "grad_norm": 0.15356336534023285, + "learning_rate": 0.0007710917980328285, + "loss": 2.7111, + "step": 10942 + }, + { + "epoch": 0.32449663434451265, + "grad_norm": 0.15143749117851257, + "learning_rate": 0.0007710522617900152, + "loss": 2.7077, + "step": 10943 + }, + { + "epoch": 0.3245262876968241, + "grad_norm": 0.16049224138259888, + "learning_rate": 0.0007710127231469876, + "loss": 2.6931, + "step": 10944 + }, + { + "epoch": 0.3245559410491356, + "grad_norm": 0.16904261708259583, + "learning_rate": 0.0007709731821040956, + "loss": 2.7474, + "step": 10945 + }, + { + "epoch": 0.3245855944014471, + "grad_norm": 0.17077182233333588, + "learning_rate": 0.0007709336386616898, + "loss": 2.7309, + "step": 10946 + }, + { + "epoch": 0.32461524775375855, + "grad_norm": 0.16716106235980988, + "learning_rate": 0.00077089409282012, + "loss": 2.6746, + "step": 10947 + }, + { + "epoch": 0.32464490110607, + "grad_norm": 0.16134274005889893, + "learning_rate": 0.0007708545445797366, + "loss": 2.7348, + "step": 10948 + }, + { + "epoch": 0.3246745544583815, + "grad_norm": 0.16756843030452728, + "learning_rate": 0.0007708149939408898, + "loss": 2.7498, + "step": 10949 + }, + { + "epoch": 0.324704207810693, + "grad_norm": 0.14078591763973236, + "learning_rate": 0.0007707754409039296, + "loss": 2.7326, + "step": 10950 + }, + { + "epoch": 0.32473386116300446, + "grad_norm": 0.12861934304237366, + "learning_rate": 0.0007707358854692064, + "loss": 2.7255, + "step": 10951 + }, + { + "epoch": 0.32476351451531593, + "grad_norm": 0.13662925362586975, + "learning_rate": 0.0007706963276370704, + "loss": 2.6492, + "step": 10952 + }, + { + "epoch": 0.3247931678676274, + "grad_norm": 0.11710279434919357, + "learning_rate": 0.0007706567674078719, + "loss": 2.7119, + "step": 10953 + }, + { + "epoch": 0.32482282121993894, + "grad_norm": 0.11913147568702698, + "learning_rate": 0.0007706172047819615, + "loss": 2.7088, + "step": 10954 + }, + { + "epoch": 0.3248524745722504, + "grad_norm": 0.11606588959693909, + "learning_rate": 0.0007705776397596893, + "loss": 2.7118, + "step": 10955 + }, + { + "epoch": 0.3248821279245619, + "grad_norm": 0.13042433559894562, + "learning_rate": 0.0007705380723414055, + "loss": 2.7125, + "step": 10956 + }, + { + "epoch": 0.32491178127687337, + "grad_norm": 0.14966551959514618, + "learning_rate": 0.0007704985025274607, + "loss": 2.7044, + "step": 10957 + }, + { + "epoch": 0.32494143462918484, + "grad_norm": 0.13629896938800812, + "learning_rate": 0.0007704589303182051, + "loss": 2.7066, + "step": 10958 + }, + { + "epoch": 0.3249710879814963, + "grad_norm": 0.12153173983097076, + "learning_rate": 0.0007704193557139893, + "loss": 2.743, + "step": 10959 + }, + { + "epoch": 0.3250007413338078, + "grad_norm": 0.11650701612234116, + "learning_rate": 0.0007703797787151638, + "loss": 2.707, + "step": 10960 + }, + { + "epoch": 0.3250303946861193, + "grad_norm": 0.11877983808517456, + "learning_rate": 0.000770340199322079, + "loss": 2.6997, + "step": 10961 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 0.13393673300743103, + "learning_rate": 0.0007703006175350853, + "loss": 2.6967, + "step": 10962 + }, + { + "epoch": 0.3250897013907422, + "grad_norm": 0.13209830224514008, + "learning_rate": 0.0007702610333545333, + "loss": 2.7399, + "step": 10963 + }, + { + "epoch": 0.3251193547430537, + "grad_norm": 0.14552311599254608, + "learning_rate": 0.0007702214467807732, + "loss": 2.7455, + "step": 10964 + }, + { + "epoch": 0.3251490080953652, + "grad_norm": 0.1619264781475067, + "learning_rate": 0.0007701818578141559, + "loss": 2.716, + "step": 10965 + }, + { + "epoch": 0.32517866144767665, + "grad_norm": 0.16963233053684235, + "learning_rate": 0.0007701422664550318, + "loss": 2.721, + "step": 10966 + }, + { + "epoch": 0.32520831479998813, + "grad_norm": 0.18654103577136993, + "learning_rate": 0.0007701026727037518, + "loss": 2.7241, + "step": 10967 + }, + { + "epoch": 0.3252379681522996, + "grad_norm": 0.19744007289409637, + "learning_rate": 0.0007700630765606661, + "loss": 2.7294, + "step": 10968 + }, + { + "epoch": 0.3252676215046111, + "grad_norm": 0.16828277707099915, + "learning_rate": 0.0007700234780261255, + "loss": 2.7203, + "step": 10969 + }, + { + "epoch": 0.32529727485692256, + "grad_norm": 0.16947466135025024, + "learning_rate": 0.0007699838771004808, + "loss": 2.7317, + "step": 10970 + }, + { + "epoch": 0.32532692820923403, + "grad_norm": 0.16724589467048645, + "learning_rate": 0.0007699442737840823, + "loss": 2.7454, + "step": 10971 + }, + { + "epoch": 0.3253565815615455, + "grad_norm": 0.15202291309833527, + "learning_rate": 0.0007699046680772811, + "loss": 2.7139, + "step": 10972 + }, + { + "epoch": 0.325386234913857, + "grad_norm": 0.13539686799049377, + "learning_rate": 0.0007698650599804276, + "loss": 2.7629, + "step": 10973 + }, + { + "epoch": 0.32541588826616846, + "grad_norm": 0.12693098187446594, + "learning_rate": 0.0007698254494938728, + "loss": 2.7518, + "step": 10974 + }, + { + "epoch": 0.32544554161848, + "grad_norm": 0.13557898998260498, + "learning_rate": 0.0007697858366179673, + "loss": 2.7056, + "step": 10975 + }, + { + "epoch": 0.32547519497079147, + "grad_norm": 0.11330246180295944, + "learning_rate": 0.0007697462213530619, + "loss": 2.7124, + "step": 10976 + }, + { + "epoch": 0.32550484832310295, + "grad_norm": 0.1288554072380066, + "learning_rate": 0.0007697066036995074, + "loss": 2.7072, + "step": 10977 + }, + { + "epoch": 0.3255345016754144, + "grad_norm": 0.13862359523773193, + "learning_rate": 0.0007696669836576547, + "loss": 2.7119, + "step": 10978 + }, + { + "epoch": 0.3255641550277259, + "grad_norm": 0.13881495594978333, + "learning_rate": 0.0007696273612278543, + "loss": 2.6948, + "step": 10979 + }, + { + "epoch": 0.3255938083800374, + "grad_norm": 0.12584513425827026, + "learning_rate": 0.0007695877364104576, + "loss": 2.6884, + "step": 10980 + }, + { + "epoch": 0.32562346173234885, + "grad_norm": 0.13419131934642792, + "learning_rate": 0.0007695481092058152, + "loss": 2.7288, + "step": 10981 + }, + { + "epoch": 0.3256531150846603, + "grad_norm": 0.13778547942638397, + "learning_rate": 0.0007695084796142779, + "loss": 2.7059, + "step": 10982 + }, + { + "epoch": 0.3256827684369718, + "grad_norm": 0.1448163539171219, + "learning_rate": 0.0007694688476361968, + "loss": 2.7116, + "step": 10983 + }, + { + "epoch": 0.3257124217892833, + "grad_norm": 0.13070325553417206, + "learning_rate": 0.000769429213271923, + "loss": 2.7194, + "step": 10984 + }, + { + "epoch": 0.32574207514159476, + "grad_norm": 0.11907871812582016, + "learning_rate": 0.0007693895765218071, + "loss": 2.7301, + "step": 10985 + }, + { + "epoch": 0.32577172849390623, + "grad_norm": 0.11048903316259384, + "learning_rate": 0.0007693499373862001, + "loss": 2.7326, + "step": 10986 + }, + { + "epoch": 0.3258013818462177, + "grad_norm": 0.11230280995368958, + "learning_rate": 0.0007693102958654534, + "loss": 2.7338, + "step": 10987 + }, + { + "epoch": 0.3258310351985292, + "grad_norm": 0.13101275265216827, + "learning_rate": 0.0007692706519599178, + "loss": 2.7028, + "step": 10988 + }, + { + "epoch": 0.32586068855084066, + "grad_norm": 0.12666088342666626, + "learning_rate": 0.0007692310056699443, + "loss": 2.7346, + "step": 10989 + }, + { + "epoch": 0.32589034190315214, + "grad_norm": 0.10274330526590347, + "learning_rate": 0.000769191356995884, + "loss": 2.7229, + "step": 10990 + }, + { + "epoch": 0.3259199952554636, + "grad_norm": 0.13101676106452942, + "learning_rate": 0.000769151705938088, + "loss": 2.7005, + "step": 10991 + }, + { + "epoch": 0.3259496486077751, + "grad_norm": 0.15072333812713623, + "learning_rate": 0.0007691120524969075, + "loss": 2.7265, + "step": 10992 + }, + { + "epoch": 0.32597930196008656, + "grad_norm": 0.1552705615758896, + "learning_rate": 0.0007690723966726936, + "loss": 2.7417, + "step": 10993 + }, + { + "epoch": 0.32600895531239804, + "grad_norm": 0.13260279595851898, + "learning_rate": 0.0007690327384657973, + "loss": 2.7423, + "step": 10994 + }, + { + "epoch": 0.3260386086647096, + "grad_norm": 0.1282617449760437, + "learning_rate": 0.0007689930778765701, + "loss": 2.7, + "step": 10995 + }, + { + "epoch": 0.32606826201702105, + "grad_norm": 0.14405612647533417, + "learning_rate": 0.0007689534149053631, + "loss": 2.7355, + "step": 10996 + }, + { + "epoch": 0.3260979153693325, + "grad_norm": 0.13923008739948273, + "learning_rate": 0.0007689137495525271, + "loss": 2.7136, + "step": 10997 + }, + { + "epoch": 0.326127568721644, + "grad_norm": 0.15115492045879364, + "learning_rate": 0.000768874081818414, + "loss": 2.7181, + "step": 10998 + }, + { + "epoch": 0.3261572220739555, + "grad_norm": 0.16683386266231537, + "learning_rate": 0.0007688344117033747, + "loss": 2.7322, + "step": 10999 + }, + { + "epoch": 0.32618687542626695, + "grad_norm": 0.16383343935012817, + "learning_rate": 0.0007687947392077606, + "loss": 2.7056, + "step": 11000 + }, + { + "epoch": 0.32621652877857843, + "grad_norm": 0.16029460728168488, + "learning_rate": 0.0007687550643319228, + "loss": 2.6928, + "step": 11001 + }, + { + "epoch": 0.3262461821308899, + "grad_norm": 0.16911470890045166, + "learning_rate": 0.0007687153870762127, + "loss": 2.7554, + "step": 11002 + }, + { + "epoch": 0.3262758354832014, + "grad_norm": 0.15637655556201935, + "learning_rate": 0.0007686757074409818, + "loss": 2.7296, + "step": 11003 + }, + { + "epoch": 0.32630548883551286, + "grad_norm": 0.1465001106262207, + "learning_rate": 0.0007686360254265814, + "loss": 2.7611, + "step": 11004 + }, + { + "epoch": 0.32633514218782433, + "grad_norm": 0.15564344823360443, + "learning_rate": 0.0007685963410333631, + "loss": 2.7479, + "step": 11005 + }, + { + "epoch": 0.3263647955401358, + "grad_norm": 0.1494581252336502, + "learning_rate": 0.0007685566542616779, + "loss": 2.7349, + "step": 11006 + }, + { + "epoch": 0.3263944488924473, + "grad_norm": 0.14564251899719238, + "learning_rate": 0.0007685169651118774, + "loss": 2.728, + "step": 11007 + }, + { + "epoch": 0.32642410224475876, + "grad_norm": 0.14896312355995178, + "learning_rate": 0.000768477273584313, + "loss": 2.7202, + "step": 11008 + }, + { + "epoch": 0.32645375559707024, + "grad_norm": 0.12085618823766708, + "learning_rate": 0.0007684375796793365, + "loss": 2.7366, + "step": 11009 + }, + { + "epoch": 0.3264834089493817, + "grad_norm": 0.14478595554828644, + "learning_rate": 0.0007683978833972991, + "loss": 2.7085, + "step": 11010 + }, + { + "epoch": 0.3265130623016932, + "grad_norm": 0.12493132054805756, + "learning_rate": 0.0007683581847385523, + "loss": 2.6818, + "step": 11011 + }, + { + "epoch": 0.32654271565400467, + "grad_norm": 0.13525256514549255, + "learning_rate": 0.0007683184837034476, + "loss": 2.727, + "step": 11012 + }, + { + "epoch": 0.32657236900631614, + "grad_norm": 0.12794849276542664, + "learning_rate": 0.0007682787802923368, + "loss": 2.7065, + "step": 11013 + }, + { + "epoch": 0.3266020223586276, + "grad_norm": 0.11219915002584457, + "learning_rate": 0.0007682390745055714, + "loss": 2.7347, + "step": 11014 + }, + { + "epoch": 0.3266316757109391, + "grad_norm": 0.12241413444280624, + "learning_rate": 0.000768199366343503, + "loss": 2.7427, + "step": 11015 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 0.13021160662174225, + "learning_rate": 0.0007681596558064829, + "loss": 2.7303, + "step": 11016 + }, + { + "epoch": 0.3266909824155621, + "grad_norm": 0.12627780437469482, + "learning_rate": 0.0007681199428948633, + "loss": 2.7399, + "step": 11017 + }, + { + "epoch": 0.3267206357678736, + "grad_norm": 0.13043805956840515, + "learning_rate": 0.0007680802276089954, + "loss": 2.7208, + "step": 11018 + }, + { + "epoch": 0.32675028912018506, + "grad_norm": 0.14596007764339447, + "learning_rate": 0.0007680405099492312, + "loss": 2.7423, + "step": 11019 + }, + { + "epoch": 0.32677994247249653, + "grad_norm": 0.16865423321723938, + "learning_rate": 0.0007680007899159222, + "loss": 2.7165, + "step": 11020 + }, + { + "epoch": 0.326809595824808, + "grad_norm": 0.18428972363471985, + "learning_rate": 0.0007679610675094202, + "loss": 2.7057, + "step": 11021 + }, + { + "epoch": 0.3268392491771195, + "grad_norm": 0.18394340574741364, + "learning_rate": 0.000767921342730077, + "loss": 2.697, + "step": 11022 + }, + { + "epoch": 0.32686890252943096, + "grad_norm": 0.13995204865932465, + "learning_rate": 0.0007678816155782442, + "loss": 2.7022, + "step": 11023 + }, + { + "epoch": 0.32689855588174244, + "grad_norm": 0.15832091867923737, + "learning_rate": 0.0007678418860542738, + "loss": 2.7145, + "step": 11024 + }, + { + "epoch": 0.3269282092340539, + "grad_norm": 0.16745781898498535, + "learning_rate": 0.0007678021541585176, + "loss": 2.7185, + "step": 11025 + }, + { + "epoch": 0.3269578625863654, + "grad_norm": 0.1331261694431305, + "learning_rate": 0.0007677624198913273, + "loss": 2.7079, + "step": 11026 + }, + { + "epoch": 0.32698751593867686, + "grad_norm": 0.18766482174396515, + "learning_rate": 0.0007677226832530548, + "loss": 2.7183, + "step": 11027 + }, + { + "epoch": 0.32701716929098834, + "grad_norm": 0.18086294829845428, + "learning_rate": 0.0007676829442440521, + "loss": 2.7367, + "step": 11028 + }, + { + "epoch": 0.3270468226432998, + "grad_norm": 0.14576977491378784, + "learning_rate": 0.0007676432028646707, + "loss": 2.7289, + "step": 11029 + }, + { + "epoch": 0.3270764759956113, + "grad_norm": 0.141885906457901, + "learning_rate": 0.000767603459115263, + "loss": 2.7279, + "step": 11030 + }, + { + "epoch": 0.32710612934792277, + "grad_norm": 0.11976485699415207, + "learning_rate": 0.0007675637129961807, + "loss": 2.6798, + "step": 11031 + }, + { + "epoch": 0.32713578270023425, + "grad_norm": 0.16430579125881195, + "learning_rate": 0.0007675239645077758, + "loss": 2.7364, + "step": 11032 + }, + { + "epoch": 0.3271654360525457, + "grad_norm": 0.1503637284040451, + "learning_rate": 0.0007674842136504003, + "loss": 2.711, + "step": 11033 + }, + { + "epoch": 0.3271950894048572, + "grad_norm": 0.12572211027145386, + "learning_rate": 0.0007674444604244062, + "loss": 2.7308, + "step": 11034 + }, + { + "epoch": 0.3272247427571687, + "grad_norm": 0.13569866120815277, + "learning_rate": 0.0007674047048301455, + "loss": 2.7085, + "step": 11035 + }, + { + "epoch": 0.32725439610948015, + "grad_norm": 0.11977503448724747, + "learning_rate": 0.00076736494686797, + "loss": 2.7308, + "step": 11036 + }, + { + "epoch": 0.3272840494617917, + "grad_norm": 0.12383834272623062, + "learning_rate": 0.0007673251865382323, + "loss": 2.7227, + "step": 11037 + }, + { + "epoch": 0.32731370281410316, + "grad_norm": 0.13063713908195496, + "learning_rate": 0.000767285423841284, + "loss": 2.7029, + "step": 11038 + }, + { + "epoch": 0.32734335616641463, + "grad_norm": 0.12619926035404205, + "learning_rate": 0.0007672456587774775, + "loss": 2.7025, + "step": 11039 + }, + { + "epoch": 0.3273730095187261, + "grad_norm": 0.13084056973457336, + "learning_rate": 0.0007672058913471649, + "loss": 2.6981, + "step": 11040 + }, + { + "epoch": 0.3274026628710376, + "grad_norm": 0.11235364526510239, + "learning_rate": 0.0007671661215506981, + "loss": 2.7198, + "step": 11041 + }, + { + "epoch": 0.32743231622334906, + "grad_norm": 0.12133161723613739, + "learning_rate": 0.0007671263493884293, + "loss": 2.733, + "step": 11042 + }, + { + "epoch": 0.32746196957566054, + "grad_norm": 0.14556412398815155, + "learning_rate": 0.0007670865748607112, + "loss": 2.7178, + "step": 11043 + }, + { + "epoch": 0.327491622927972, + "grad_norm": 0.1489875763654709, + "learning_rate": 0.0007670467979678955, + "loss": 2.7104, + "step": 11044 + }, + { + "epoch": 0.3275212762802835, + "grad_norm": 0.1190488189458847, + "learning_rate": 0.0007670070187103344, + "loss": 2.7234, + "step": 11045 + }, + { + "epoch": 0.32755092963259497, + "grad_norm": 0.14481842517852783, + "learning_rate": 0.0007669672370883804, + "loss": 2.7047, + "step": 11046 + }, + { + "epoch": 0.32758058298490644, + "grad_norm": 0.13043220341205597, + "learning_rate": 0.0007669274531023857, + "loss": 2.7281, + "step": 11047 + }, + { + "epoch": 0.3276102363372179, + "grad_norm": 0.12686337530612946, + "learning_rate": 0.0007668876667527027, + "loss": 2.686, + "step": 11048 + }, + { + "epoch": 0.3276398896895294, + "grad_norm": 0.13219626247882843, + "learning_rate": 0.0007668478780396835, + "loss": 2.7242, + "step": 11049 + }, + { + "epoch": 0.32766954304184087, + "grad_norm": 0.1408342868089676, + "learning_rate": 0.0007668080869636805, + "loss": 2.7227, + "step": 11050 + }, + { + "epoch": 0.32769919639415235, + "grad_norm": 0.1559421867132187, + "learning_rate": 0.0007667682935250462, + "loss": 2.7165, + "step": 11051 + }, + { + "epoch": 0.3277288497464638, + "grad_norm": 0.18278621137142181, + "learning_rate": 0.0007667284977241328, + "loss": 2.6905, + "step": 11052 + }, + { + "epoch": 0.3277585030987753, + "grad_norm": 0.15404552221298218, + "learning_rate": 0.0007666886995612928, + "loss": 2.7139, + "step": 11053 + }, + { + "epoch": 0.3277881564510868, + "grad_norm": 0.12402892857789993, + "learning_rate": 0.0007666488990368786, + "loss": 2.7357, + "step": 11054 + }, + { + "epoch": 0.32781780980339825, + "grad_norm": 0.13128246366977692, + "learning_rate": 0.0007666090961512425, + "loss": 2.7137, + "step": 11055 + }, + { + "epoch": 0.32784746315570973, + "grad_norm": 0.15550129115581512, + "learning_rate": 0.0007665692909047373, + "loss": 2.7209, + "step": 11056 + }, + { + "epoch": 0.3278771165080212, + "grad_norm": 0.14419469237327576, + "learning_rate": 0.000766529483297715, + "loss": 2.7247, + "step": 11057 + }, + { + "epoch": 0.32790676986033274, + "grad_norm": 0.12818169593811035, + "learning_rate": 0.0007664896733305287, + "loss": 2.7308, + "step": 11058 + }, + { + "epoch": 0.3279364232126442, + "grad_norm": 0.1413184553384781, + "learning_rate": 0.0007664498610035303, + "loss": 2.7505, + "step": 11059 + }, + { + "epoch": 0.3279660765649557, + "grad_norm": 0.1540546417236328, + "learning_rate": 0.0007664100463170729, + "loss": 2.7364, + "step": 11060 + }, + { + "epoch": 0.32799572991726716, + "grad_norm": 0.13632841408252716, + "learning_rate": 0.0007663702292715087, + "loss": 2.7059, + "step": 11061 + }, + { + "epoch": 0.32802538326957864, + "grad_norm": 0.1283373087644577, + "learning_rate": 0.0007663304098671903, + "loss": 2.7316, + "step": 11062 + }, + { + "epoch": 0.3280550366218901, + "grad_norm": 0.12740518152713776, + "learning_rate": 0.0007662905881044705, + "loss": 2.7072, + "step": 11063 + }, + { + "epoch": 0.3280846899742016, + "grad_norm": 0.12138032168149948, + "learning_rate": 0.0007662507639837017, + "loss": 2.6998, + "step": 11064 + }, + { + "epoch": 0.32811434332651307, + "grad_norm": 0.14776451885700226, + "learning_rate": 0.0007662109375052371, + "loss": 2.7224, + "step": 11065 + }, + { + "epoch": 0.32814399667882455, + "grad_norm": 0.1395357847213745, + "learning_rate": 0.0007661711086694286, + "loss": 2.7179, + "step": 11066 + }, + { + "epoch": 0.328173650031136, + "grad_norm": 0.1311621367931366, + "learning_rate": 0.0007661312774766293, + "loss": 2.7011, + "step": 11067 + }, + { + "epoch": 0.3282033033834475, + "grad_norm": 0.11166685074567795, + "learning_rate": 0.0007660914439271918, + "loss": 2.7055, + "step": 11068 + }, + { + "epoch": 0.328232956735759, + "grad_norm": 0.11989432573318481, + "learning_rate": 0.000766051608021469, + "loss": 2.7122, + "step": 11069 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 0.11779582500457764, + "learning_rate": 0.0007660117697598134, + "loss": 2.7208, + "step": 11070 + }, + { + "epoch": 0.3282922634403819, + "grad_norm": 0.13421699404716492, + "learning_rate": 0.0007659719291425781, + "loss": 2.7294, + "step": 11071 + }, + { + "epoch": 0.3283219167926934, + "grad_norm": 0.12032490968704224, + "learning_rate": 0.0007659320861701156, + "loss": 2.709, + "step": 11072 + }, + { + "epoch": 0.3283515701450049, + "grad_norm": 0.1257743537425995, + "learning_rate": 0.0007658922408427789, + "loss": 2.6715, + "step": 11073 + }, + { + "epoch": 0.32838122349731635, + "grad_norm": 0.1269930750131607, + "learning_rate": 0.0007658523931609207, + "loss": 2.7053, + "step": 11074 + }, + { + "epoch": 0.32841087684962783, + "grad_norm": 0.12092161178588867, + "learning_rate": 0.0007658125431248938, + "loss": 2.7322, + "step": 11075 + }, + { + "epoch": 0.3284405302019393, + "grad_norm": 0.12045339494943619, + "learning_rate": 0.0007657726907350515, + "loss": 2.7143, + "step": 11076 + }, + { + "epoch": 0.3284701835542508, + "grad_norm": 0.13060517609119415, + "learning_rate": 0.0007657328359917464, + "loss": 2.6749, + "step": 11077 + }, + { + "epoch": 0.32849983690656226, + "grad_norm": 0.13169963657855988, + "learning_rate": 0.0007656929788953313, + "loss": 2.7303, + "step": 11078 + }, + { + "epoch": 0.3285294902588738, + "grad_norm": 0.1404661387205124, + "learning_rate": 0.0007656531194461593, + "loss": 2.7226, + "step": 11079 + }, + { + "epoch": 0.32855914361118527, + "grad_norm": 0.1440398097038269, + "learning_rate": 0.0007656132576445831, + "loss": 2.7369, + "step": 11080 + }, + { + "epoch": 0.32858879696349674, + "grad_norm": 0.13209885358810425, + "learning_rate": 0.0007655733934909562, + "loss": 2.7151, + "step": 11081 + }, + { + "epoch": 0.3286184503158082, + "grad_norm": 0.11963117867708206, + "learning_rate": 0.0007655335269856311, + "loss": 2.7133, + "step": 11082 + }, + { + "epoch": 0.3286481036681197, + "grad_norm": 0.12866337597370148, + "learning_rate": 0.0007654936581289613, + "loss": 2.7211, + "step": 11083 + }, + { + "epoch": 0.32867775702043117, + "grad_norm": 0.14518605172634125, + "learning_rate": 0.0007654537869212994, + "loss": 2.7005, + "step": 11084 + }, + { + "epoch": 0.32870741037274265, + "grad_norm": 0.16848278045654297, + "learning_rate": 0.0007654139133629987, + "loss": 2.7569, + "step": 11085 + }, + { + "epoch": 0.3287370637250541, + "grad_norm": 0.16797436773777008, + "learning_rate": 0.0007653740374544123, + "loss": 2.718, + "step": 11086 + }, + { + "epoch": 0.3287667170773656, + "grad_norm": 0.16229775547981262, + "learning_rate": 0.0007653341591958931, + "loss": 2.7199, + "step": 11087 + }, + { + "epoch": 0.3287963704296771, + "grad_norm": 0.14676035940647125, + "learning_rate": 0.0007652942785877945, + "loss": 2.7185, + "step": 11088 + }, + { + "epoch": 0.32882602378198855, + "grad_norm": 0.14699280261993408, + "learning_rate": 0.0007652543956304694, + "loss": 2.7434, + "step": 11089 + }, + { + "epoch": 0.32885567713430003, + "grad_norm": 0.19098158180713654, + "learning_rate": 0.0007652145103242712, + "loss": 2.7079, + "step": 11090 + }, + { + "epoch": 0.3288853304866115, + "grad_norm": 0.20547744631767273, + "learning_rate": 0.0007651746226695529, + "loss": 2.6997, + "step": 11091 + }, + { + "epoch": 0.328914983838923, + "grad_norm": 0.19664300978183746, + "learning_rate": 0.000765134732666668, + "loss": 2.7107, + "step": 11092 + }, + { + "epoch": 0.32894463719123446, + "grad_norm": 0.16416995227336884, + "learning_rate": 0.0007650948403159694, + "loss": 2.7343, + "step": 11093 + }, + { + "epoch": 0.32897429054354593, + "grad_norm": 0.16462790966033936, + "learning_rate": 0.0007650549456178104, + "loss": 2.6981, + "step": 11094 + }, + { + "epoch": 0.3290039438958574, + "grad_norm": 0.16446886956691742, + "learning_rate": 0.0007650150485725445, + "loss": 2.7051, + "step": 11095 + }, + { + "epoch": 0.3290335972481689, + "grad_norm": 0.13370859622955322, + "learning_rate": 0.0007649751491805248, + "loss": 2.7187, + "step": 11096 + }, + { + "epoch": 0.32906325060048036, + "grad_norm": 0.15355071425437927, + "learning_rate": 0.0007649352474421047, + "loss": 2.7202, + "step": 11097 + }, + { + "epoch": 0.32909290395279184, + "grad_norm": 0.14426107704639435, + "learning_rate": 0.0007648953433576376, + "loss": 2.7086, + "step": 11098 + }, + { + "epoch": 0.32912255730510337, + "grad_norm": 0.14981012046337128, + "learning_rate": 0.0007648554369274765, + "loss": 2.7597, + "step": 11099 + }, + { + "epoch": 0.32915221065741485, + "grad_norm": 0.1618054211139679, + "learning_rate": 0.0007648155281519751, + "loss": 2.7395, + "step": 11100 + }, + { + "epoch": 0.3291818640097263, + "grad_norm": 0.15999364852905273, + "learning_rate": 0.0007647756170314868, + "loss": 2.712, + "step": 11101 + }, + { + "epoch": 0.3292115173620378, + "grad_norm": 0.12624837458133698, + "learning_rate": 0.0007647357035663651, + "loss": 2.734, + "step": 11102 + }, + { + "epoch": 0.3292411707143493, + "grad_norm": 0.13860781490802765, + "learning_rate": 0.0007646957877569632, + "loss": 2.7365, + "step": 11103 + }, + { + "epoch": 0.32927082406666075, + "grad_norm": 0.1372631937265396, + "learning_rate": 0.0007646558696036348, + "loss": 2.7283, + "step": 11104 + }, + { + "epoch": 0.3293004774189722, + "grad_norm": 0.15790002048015594, + "learning_rate": 0.000764615949106733, + "loss": 2.7141, + "step": 11105 + }, + { + "epoch": 0.3293301307712837, + "grad_norm": 0.1604982614517212, + "learning_rate": 0.0007645760262666117, + "loss": 2.7115, + "step": 11106 + }, + { + "epoch": 0.3293597841235952, + "grad_norm": 0.1457938253879547, + "learning_rate": 0.0007645361010836241, + "loss": 2.6548, + "step": 11107 + }, + { + "epoch": 0.32938943747590665, + "grad_norm": 0.12074277549982071, + "learning_rate": 0.0007644961735581241, + "loss": 2.6745, + "step": 11108 + }, + { + "epoch": 0.32941909082821813, + "grad_norm": 0.1415279060602188, + "learning_rate": 0.0007644562436904652, + "loss": 2.735, + "step": 11109 + }, + { + "epoch": 0.3294487441805296, + "grad_norm": 0.1478719413280487, + "learning_rate": 0.0007644163114810006, + "loss": 2.6964, + "step": 11110 + }, + { + "epoch": 0.3294783975328411, + "grad_norm": 0.1451568603515625, + "learning_rate": 0.0007643763769300842, + "loss": 2.7475, + "step": 11111 + }, + { + "epoch": 0.32950805088515256, + "grad_norm": 0.1228327676653862, + "learning_rate": 0.0007643364400380698, + "loss": 2.7254, + "step": 11112 + }, + { + "epoch": 0.32953770423746404, + "grad_norm": 0.13212859630584717, + "learning_rate": 0.0007642965008053107, + "loss": 2.7401, + "step": 11113 + }, + { + "epoch": 0.3295673575897755, + "grad_norm": 0.13269445300102234, + "learning_rate": 0.0007642565592321607, + "loss": 2.7014, + "step": 11114 + }, + { + "epoch": 0.329597010942087, + "grad_norm": 0.11469466984272003, + "learning_rate": 0.0007642166153189736, + "loss": 2.7687, + "step": 11115 + }, + { + "epoch": 0.32962666429439846, + "grad_norm": 0.11194009333848953, + "learning_rate": 0.000764176669066103, + "loss": 2.723, + "step": 11116 + }, + { + "epoch": 0.32965631764670994, + "grad_norm": 0.1274765431880951, + "learning_rate": 0.0007641367204739027, + "loss": 2.7108, + "step": 11117 + }, + { + "epoch": 0.3296859709990214, + "grad_norm": 0.13577400147914886, + "learning_rate": 0.0007640967695427263, + "loss": 2.7011, + "step": 11118 + }, + { + "epoch": 0.3297156243513329, + "grad_norm": 0.15728777647018433, + "learning_rate": 0.0007640568162729277, + "loss": 2.7187, + "step": 11119 + }, + { + "epoch": 0.3297452777036444, + "grad_norm": 0.14920412003993988, + "learning_rate": 0.0007640168606648606, + "loss": 2.7442, + "step": 11120 + }, + { + "epoch": 0.3297749310559559, + "grad_norm": 0.10844188928604126, + "learning_rate": 0.000763976902718879, + "loss": 2.7413, + "step": 11121 + }, + { + "epoch": 0.3298045844082674, + "grad_norm": 0.12277794629335403, + "learning_rate": 0.0007639369424353366, + "loss": 2.7379, + "step": 11122 + }, + { + "epoch": 0.32983423776057885, + "grad_norm": 0.13491016626358032, + "learning_rate": 0.0007638969798145871, + "loss": 2.6934, + "step": 11123 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 0.1525178998708725, + "learning_rate": 0.0007638570148569847, + "loss": 2.7084, + "step": 11124 + }, + { + "epoch": 0.3298935444652018, + "grad_norm": 0.15255339443683624, + "learning_rate": 0.0007638170475628832, + "loss": 2.709, + "step": 11125 + }, + { + "epoch": 0.3299231978175133, + "grad_norm": 0.129182830452919, + "learning_rate": 0.0007637770779326364, + "loss": 2.7294, + "step": 11126 + }, + { + "epoch": 0.32995285116982476, + "grad_norm": 0.1222313717007637, + "learning_rate": 0.0007637371059665982, + "loss": 2.7102, + "step": 11127 + }, + { + "epoch": 0.32998250452213623, + "grad_norm": 0.15041610598564148, + "learning_rate": 0.0007636971316651228, + "loss": 2.7501, + "step": 11128 + }, + { + "epoch": 0.3300121578744477, + "grad_norm": 0.1769363433122635, + "learning_rate": 0.000763657155028564, + "loss": 2.7469, + "step": 11129 + }, + { + "epoch": 0.3300418112267592, + "grad_norm": 0.16225047409534454, + "learning_rate": 0.0007636171760572759, + "loss": 2.7114, + "step": 11130 + }, + { + "epoch": 0.33007146457907066, + "grad_norm": 0.14032316207885742, + "learning_rate": 0.0007635771947516124, + "loss": 2.7276, + "step": 11131 + }, + { + "epoch": 0.33010111793138214, + "grad_norm": 0.1170981228351593, + "learning_rate": 0.0007635372111119276, + "loss": 2.7183, + "step": 11132 + }, + { + "epoch": 0.3301307712836936, + "grad_norm": 0.12964031100273132, + "learning_rate": 0.0007634972251385755, + "loss": 2.7135, + "step": 11133 + }, + { + "epoch": 0.3301604246360051, + "grad_norm": 0.13006682693958282, + "learning_rate": 0.0007634572368319101, + "loss": 2.7332, + "step": 11134 + }, + { + "epoch": 0.33019007798831657, + "grad_norm": 0.13418036699295044, + "learning_rate": 0.0007634172461922859, + "loss": 2.724, + "step": 11135 + }, + { + "epoch": 0.33021973134062804, + "grad_norm": 0.16620206832885742, + "learning_rate": 0.0007633772532200568, + "loss": 2.7175, + "step": 11136 + }, + { + "epoch": 0.3302493846929395, + "grad_norm": 0.1916954517364502, + "learning_rate": 0.0007633372579155768, + "loss": 2.7102, + "step": 11137 + }, + { + "epoch": 0.330279038045251, + "grad_norm": 0.18601615726947784, + "learning_rate": 0.0007632972602792002, + "loss": 2.739, + "step": 11138 + }, + { + "epoch": 0.33030869139756247, + "grad_norm": 0.1638745814561844, + "learning_rate": 0.000763257260311281, + "loss": 2.7048, + "step": 11139 + }, + { + "epoch": 0.33033834474987395, + "grad_norm": 0.1821967363357544, + "learning_rate": 0.0007632172580121738, + "loss": 2.7125, + "step": 11140 + }, + { + "epoch": 0.3303679981021855, + "grad_norm": 0.16310212016105652, + "learning_rate": 0.0007631772533822325, + "loss": 2.733, + "step": 11141 + }, + { + "epoch": 0.33039765145449695, + "grad_norm": 0.16171866655349731, + "learning_rate": 0.0007631372464218116, + "loss": 2.7055, + "step": 11142 + }, + { + "epoch": 0.33042730480680843, + "grad_norm": 0.15727709233760834, + "learning_rate": 0.000763097237131265, + "loss": 2.7669, + "step": 11143 + }, + { + "epoch": 0.3304569581591199, + "grad_norm": 0.14214184880256653, + "learning_rate": 0.0007630572255109474, + "loss": 2.7106, + "step": 11144 + }, + { + "epoch": 0.3304866115114314, + "grad_norm": 0.14064133167266846, + "learning_rate": 0.0007630172115612127, + "loss": 2.7454, + "step": 11145 + }, + { + "epoch": 0.33051626486374286, + "grad_norm": 0.13955990970134735, + "learning_rate": 0.0007629771952824155, + "loss": 2.7237, + "step": 11146 + }, + { + "epoch": 0.33054591821605434, + "grad_norm": 0.13142384588718414, + "learning_rate": 0.0007629371766749103, + "loss": 2.7264, + "step": 11147 + }, + { + "epoch": 0.3305755715683658, + "grad_norm": 0.12358840554952621, + "learning_rate": 0.000762897155739051, + "loss": 2.7564, + "step": 11148 + }, + { + "epoch": 0.3306052249206773, + "grad_norm": 0.11599932610988617, + "learning_rate": 0.0007628571324751925, + "loss": 2.684, + "step": 11149 + }, + { + "epoch": 0.33063487827298876, + "grad_norm": 0.09969955682754517, + "learning_rate": 0.0007628171068836888, + "loss": 2.7245, + "step": 11150 + }, + { + "epoch": 0.33066453162530024, + "grad_norm": 0.12808498740196228, + "learning_rate": 0.0007627770789648945, + "loss": 2.7222, + "step": 11151 + }, + { + "epoch": 0.3306941849776117, + "grad_norm": 0.13551843166351318, + "learning_rate": 0.0007627370487191642, + "loss": 2.6836, + "step": 11152 + }, + { + "epoch": 0.3307238383299232, + "grad_norm": 0.12736065685749054, + "learning_rate": 0.0007626970161468521, + "loss": 2.6777, + "step": 11153 + }, + { + "epoch": 0.33075349168223467, + "grad_norm": 0.10755382478237152, + "learning_rate": 0.0007626569812483129, + "loss": 2.7134, + "step": 11154 + }, + { + "epoch": 0.33078314503454614, + "grad_norm": 0.12594132125377655, + "learning_rate": 0.0007626169440239011, + "loss": 2.732, + "step": 11155 + }, + { + "epoch": 0.3308127983868576, + "grad_norm": 0.13739722967147827, + "learning_rate": 0.000762576904473971, + "loss": 2.697, + "step": 11156 + }, + { + "epoch": 0.3308424517391691, + "grad_norm": 0.12501685321331024, + "learning_rate": 0.0007625368625988776, + "loss": 2.7022, + "step": 11157 + }, + { + "epoch": 0.3308721050914806, + "grad_norm": 0.13060401380062103, + "learning_rate": 0.0007624968183989749, + "loss": 2.7578, + "step": 11158 + }, + { + "epoch": 0.33090175844379205, + "grad_norm": 0.1501425951719284, + "learning_rate": 0.000762456771874618, + "loss": 2.7241, + "step": 11159 + }, + { + "epoch": 0.3309314117961035, + "grad_norm": 0.15475058555603027, + "learning_rate": 0.0007624167230261614, + "loss": 2.6895, + "step": 11160 + }, + { + "epoch": 0.330961065148415, + "grad_norm": 0.13119123876094818, + "learning_rate": 0.0007623766718539596, + "loss": 2.6844, + "step": 11161 + }, + { + "epoch": 0.33099071850072653, + "grad_norm": 0.11772967129945755, + "learning_rate": 0.0007623366183583673, + "loss": 2.7167, + "step": 11162 + }, + { + "epoch": 0.331020371853038, + "grad_norm": 0.13295477628707886, + "learning_rate": 0.0007622965625397393, + "loss": 2.7066, + "step": 11163 + }, + { + "epoch": 0.3310500252053495, + "grad_norm": 0.12768098711967468, + "learning_rate": 0.0007622565043984301, + "loss": 2.7133, + "step": 11164 + }, + { + "epoch": 0.33107967855766096, + "grad_norm": 0.1139824315905571, + "learning_rate": 0.0007622164439347945, + "loss": 2.7149, + "step": 11165 + }, + { + "epoch": 0.33110933190997244, + "grad_norm": 0.1204090490937233, + "learning_rate": 0.0007621763811491876, + "loss": 2.6867, + "step": 11166 + }, + { + "epoch": 0.3311389852622839, + "grad_norm": 0.1506633758544922, + "learning_rate": 0.0007621363160419634, + "loss": 2.7527, + "step": 11167 + }, + { + "epoch": 0.3311686386145954, + "grad_norm": 0.15147820115089417, + "learning_rate": 0.0007620962486134774, + "loss": 2.7414, + "step": 11168 + }, + { + "epoch": 0.33119829196690687, + "grad_norm": 0.1555178016424179, + "learning_rate": 0.0007620561788640841, + "loss": 2.7097, + "step": 11169 + }, + { + "epoch": 0.33122794531921834, + "grad_norm": 0.16788993775844574, + "learning_rate": 0.0007620161067941384, + "loss": 2.7254, + "step": 11170 + }, + { + "epoch": 0.3312575986715298, + "grad_norm": 0.15361380577087402, + "learning_rate": 0.000761976032403995, + "loss": 2.7269, + "step": 11171 + }, + { + "epoch": 0.3312872520238413, + "grad_norm": 0.12753862142562866, + "learning_rate": 0.0007619359556940089, + "loss": 2.7432, + "step": 11172 + }, + { + "epoch": 0.33131690537615277, + "grad_norm": 0.12638667225837708, + "learning_rate": 0.000761895876664535, + "loss": 2.7198, + "step": 11173 + }, + { + "epoch": 0.33134655872846425, + "grad_norm": 0.1269243359565735, + "learning_rate": 0.0007618557953159282, + "loss": 2.7058, + "step": 11174 + }, + { + "epoch": 0.3313762120807757, + "grad_norm": 0.1522139608860016, + "learning_rate": 0.0007618157116485433, + "loss": 2.7103, + "step": 11175 + }, + { + "epoch": 0.3314058654330872, + "grad_norm": 0.13745523989200592, + "learning_rate": 0.0007617756256627353, + "loss": 2.7064, + "step": 11176 + }, + { + "epoch": 0.3314355187853987, + "grad_norm": 0.1344420164823532, + "learning_rate": 0.0007617355373588593, + "loss": 2.7475, + "step": 11177 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 0.11367880553007126, + "learning_rate": 0.0007616954467372698, + "loss": 2.7054, + "step": 11178 + }, + { + "epoch": 0.3314948254900216, + "grad_norm": 0.12217908352613449, + "learning_rate": 0.0007616553537983226, + "loss": 2.7204, + "step": 11179 + }, + { + "epoch": 0.3315244788423331, + "grad_norm": 0.11881600320339203, + "learning_rate": 0.0007616152585423724, + "loss": 2.6988, + "step": 11180 + }, + { + "epoch": 0.3315541321946446, + "grad_norm": 0.11771945655345917, + "learning_rate": 0.000761575160969774, + "loss": 2.7154, + "step": 11181 + }, + { + "epoch": 0.33158378554695606, + "grad_norm": 0.12571614980697632, + "learning_rate": 0.0007615350610808827, + "loss": 2.7026, + "step": 11182 + }, + { + "epoch": 0.3316134388992676, + "grad_norm": 0.12987960875034332, + "learning_rate": 0.0007614949588760535, + "loss": 2.7269, + "step": 11183 + }, + { + "epoch": 0.33164309225157906, + "grad_norm": 0.1241731345653534, + "learning_rate": 0.0007614548543556414, + "loss": 2.713, + "step": 11184 + }, + { + "epoch": 0.33167274560389054, + "grad_norm": 0.11636409908533096, + "learning_rate": 0.0007614147475200019, + "loss": 2.7329, + "step": 11185 + }, + { + "epoch": 0.331702398956202, + "grad_norm": 0.11833968758583069, + "learning_rate": 0.00076137463836949, + "loss": 2.7158, + "step": 11186 + }, + { + "epoch": 0.3317320523085135, + "grad_norm": 0.15205451846122742, + "learning_rate": 0.0007613345269044607, + "loss": 2.7235, + "step": 11187 + }, + { + "epoch": 0.33176170566082497, + "grad_norm": 0.17109784483909607, + "learning_rate": 0.0007612944131252694, + "loss": 2.7344, + "step": 11188 + }, + { + "epoch": 0.33179135901313644, + "grad_norm": 0.15296241641044617, + "learning_rate": 0.0007612542970322711, + "loss": 2.7195, + "step": 11189 + }, + { + "epoch": 0.3318210123654479, + "grad_norm": 0.1379934400320053, + "learning_rate": 0.0007612141786258212, + "loss": 2.6982, + "step": 11190 + }, + { + "epoch": 0.3318506657177594, + "grad_norm": 0.15734224021434784, + "learning_rate": 0.0007611740579062749, + "loss": 2.7179, + "step": 11191 + }, + { + "epoch": 0.3318803190700709, + "grad_norm": 0.16477763652801514, + "learning_rate": 0.0007611339348739876, + "loss": 2.7075, + "step": 11192 + }, + { + "epoch": 0.33190997242238235, + "grad_norm": 0.1521320343017578, + "learning_rate": 0.0007610938095293143, + "loss": 2.744, + "step": 11193 + }, + { + "epoch": 0.3319396257746938, + "grad_norm": 0.13928844034671783, + "learning_rate": 0.0007610536818726106, + "loss": 2.7284, + "step": 11194 + }, + { + "epoch": 0.3319692791270053, + "grad_norm": 0.13391099870204926, + "learning_rate": 0.0007610135519042316, + "loss": 2.7436, + "step": 11195 + }, + { + "epoch": 0.3319989324793168, + "grad_norm": 0.1464979499578476, + "learning_rate": 0.000760973419624533, + "loss": 2.7402, + "step": 11196 + }, + { + "epoch": 0.33202858583162825, + "grad_norm": 0.1827378273010254, + "learning_rate": 0.00076093328503387, + "loss": 2.6946, + "step": 11197 + }, + { + "epoch": 0.33205823918393973, + "grad_norm": 0.14793767035007477, + "learning_rate": 0.0007608931481325978, + "loss": 2.7458, + "step": 11198 + }, + { + "epoch": 0.3320878925362512, + "grad_norm": 0.12912869453430176, + "learning_rate": 0.000760853008921072, + "loss": 2.6898, + "step": 11199 + }, + { + "epoch": 0.3321175458885627, + "grad_norm": 0.1448148638010025, + "learning_rate": 0.000760812867399648, + "loss": 2.7001, + "step": 11200 + }, + { + "epoch": 0.33214719924087416, + "grad_norm": 0.15187415480613708, + "learning_rate": 0.0007607727235686815, + "loss": 2.7359, + "step": 11201 + }, + { + "epoch": 0.33217685259318563, + "grad_norm": 0.13358934223651886, + "learning_rate": 0.0007607325774285276, + "loss": 2.6898, + "step": 11202 + }, + { + "epoch": 0.33220650594549717, + "grad_norm": 0.11312808096408844, + "learning_rate": 0.0007606924289795421, + "loss": 2.7436, + "step": 11203 + }, + { + "epoch": 0.33223615929780864, + "grad_norm": 0.13297295570373535, + "learning_rate": 0.0007606522782220801, + "loss": 2.7114, + "step": 11204 + }, + { + "epoch": 0.3322658126501201, + "grad_norm": 0.12829893827438354, + "learning_rate": 0.0007606121251564978, + "loss": 2.746, + "step": 11205 + }, + { + "epoch": 0.3322954660024316, + "grad_norm": 0.14141146838665009, + "learning_rate": 0.0007605719697831502, + "loss": 2.713, + "step": 11206 + }, + { + "epoch": 0.33232511935474307, + "grad_norm": 0.15987008810043335, + "learning_rate": 0.0007605318121023932, + "loss": 2.6844, + "step": 11207 + }, + { + "epoch": 0.33235477270705455, + "grad_norm": 0.18953169882297516, + "learning_rate": 0.0007604916521145822, + "loss": 2.7167, + "step": 11208 + }, + { + "epoch": 0.332384426059366, + "grad_norm": 0.17215043306350708, + "learning_rate": 0.0007604514898200729, + "loss": 2.7503, + "step": 11209 + }, + { + "epoch": 0.3324140794116775, + "grad_norm": 0.13560397922992706, + "learning_rate": 0.0007604113252192209, + "loss": 2.713, + "step": 11210 + }, + { + "epoch": 0.332443732763989, + "grad_norm": 0.13116921484470367, + "learning_rate": 0.000760371158312382, + "loss": 2.7331, + "step": 11211 + }, + { + "epoch": 0.33247338611630045, + "grad_norm": 0.1366552859544754, + "learning_rate": 0.0007603309890999119, + "loss": 2.7248, + "step": 11212 + }, + { + "epoch": 0.3325030394686119, + "grad_norm": 0.14172832667827606, + "learning_rate": 0.0007602908175821661, + "loss": 2.7272, + "step": 11213 + }, + { + "epoch": 0.3325326928209234, + "grad_norm": 0.13318423926830292, + "learning_rate": 0.0007602506437595005, + "loss": 2.7437, + "step": 11214 + }, + { + "epoch": 0.3325623461732349, + "grad_norm": 0.1253574788570404, + "learning_rate": 0.0007602104676322707, + "loss": 2.7016, + "step": 11215 + }, + { + "epoch": 0.33259199952554636, + "grad_norm": 0.1421506702899933, + "learning_rate": 0.0007601702892008326, + "loss": 2.7034, + "step": 11216 + }, + { + "epoch": 0.33262165287785783, + "grad_norm": 0.14494019746780396, + "learning_rate": 0.0007601301084655417, + "loss": 2.6908, + "step": 11217 + }, + { + "epoch": 0.3326513062301693, + "grad_norm": 0.1381896287202835, + "learning_rate": 0.0007600899254267544, + "loss": 2.7136, + "step": 11218 + }, + { + "epoch": 0.3326809595824808, + "grad_norm": 0.14169757068157196, + "learning_rate": 0.0007600497400848258, + "loss": 2.741, + "step": 11219 + }, + { + "epoch": 0.33271061293479226, + "grad_norm": 0.13533218204975128, + "learning_rate": 0.0007600095524401124, + "loss": 2.7552, + "step": 11220 + }, + { + "epoch": 0.33274026628710374, + "grad_norm": 0.12344060093164444, + "learning_rate": 0.0007599693624929697, + "loss": 2.6977, + "step": 11221 + }, + { + "epoch": 0.3327699196394152, + "grad_norm": 0.11973655968904495, + "learning_rate": 0.0007599291702437537, + "loss": 2.6961, + "step": 11222 + }, + { + "epoch": 0.3327995729917267, + "grad_norm": 0.13044103980064392, + "learning_rate": 0.0007598889756928203, + "loss": 2.7316, + "step": 11223 + }, + { + "epoch": 0.3328292263440382, + "grad_norm": 0.12947477400302887, + "learning_rate": 0.0007598487788405253, + "loss": 2.7276, + "step": 11224 + }, + { + "epoch": 0.3328588796963497, + "grad_norm": 0.14653825759887695, + "learning_rate": 0.0007598085796872247, + "loss": 2.7451, + "step": 11225 + }, + { + "epoch": 0.3328885330486612, + "grad_norm": 0.15746746957302094, + "learning_rate": 0.0007597683782332747, + "loss": 2.6844, + "step": 11226 + }, + { + "epoch": 0.33291818640097265, + "grad_norm": 0.13347512483596802, + "learning_rate": 0.0007597281744790309, + "loss": 2.7439, + "step": 11227 + }, + { + "epoch": 0.3329478397532841, + "grad_norm": 0.11806285381317139, + "learning_rate": 0.0007596879684248499, + "loss": 2.7196, + "step": 11228 + }, + { + "epoch": 0.3329774931055956, + "grad_norm": 0.12714716792106628, + "learning_rate": 0.0007596477600710871, + "loss": 2.699, + "step": 11229 + }, + { + "epoch": 0.3330071464579071, + "grad_norm": 0.12155535817146301, + "learning_rate": 0.0007596075494180988, + "loss": 2.736, + "step": 11230 + }, + { + "epoch": 0.33303679981021855, + "grad_norm": 0.13039617240428925, + "learning_rate": 0.0007595673364662412, + "loss": 2.729, + "step": 11231 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 0.1331285536289215, + "learning_rate": 0.0007595271212158703, + "loss": 2.7324, + "step": 11232 + }, + { + "epoch": 0.3330961065148415, + "grad_norm": 0.14324283599853516, + "learning_rate": 0.0007594869036673422, + "loss": 2.7148, + "step": 11233 + }, + { + "epoch": 0.333125759867153, + "grad_norm": 0.15577061474323273, + "learning_rate": 0.0007594466838210129, + "loss": 2.724, + "step": 11234 + }, + { + "epoch": 0.33315541321946446, + "grad_norm": 0.13107721507549286, + "learning_rate": 0.0007594064616772388, + "loss": 2.7102, + "step": 11235 + }, + { + "epoch": 0.33318506657177593, + "grad_norm": 0.1141776517033577, + "learning_rate": 0.0007593662372363759, + "loss": 2.6889, + "step": 11236 + }, + { + "epoch": 0.3332147199240874, + "grad_norm": 0.14921829104423523, + "learning_rate": 0.0007593260104987805, + "loss": 2.6707, + "step": 11237 + }, + { + "epoch": 0.3332443732763989, + "grad_norm": 0.15530531108379364, + "learning_rate": 0.0007592857814648086, + "loss": 2.6981, + "step": 11238 + }, + { + "epoch": 0.33327402662871036, + "grad_norm": 0.17881520092487335, + "learning_rate": 0.0007592455501348168, + "loss": 2.6833, + "step": 11239 + }, + { + "epoch": 0.33330367998102184, + "grad_norm": 0.18159568309783936, + "learning_rate": 0.0007592053165091611, + "loss": 2.7157, + "step": 11240 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.17751462757587433, + "learning_rate": 0.0007591650805881978, + "loss": 2.6993, + "step": 11241 + }, + { + "epoch": 0.3333629866856448, + "grad_norm": 0.16419093310832977, + "learning_rate": 0.0007591248423722832, + "loss": 2.6942, + "step": 11242 + }, + { + "epoch": 0.33339264003795627, + "grad_norm": 0.15572768449783325, + "learning_rate": 0.0007590846018617737, + "loss": 2.6847, + "step": 11243 + }, + { + "epoch": 0.33342229339026774, + "grad_norm": 0.13253529369831085, + "learning_rate": 0.0007590443590570255, + "loss": 2.6729, + "step": 11244 + }, + { + "epoch": 0.3334519467425793, + "grad_norm": 0.1351480334997177, + "learning_rate": 0.0007590041139583952, + "loss": 2.7197, + "step": 11245 + }, + { + "epoch": 0.33348160009489075, + "grad_norm": 0.13857519626617432, + "learning_rate": 0.0007589638665662389, + "loss": 2.7261, + "step": 11246 + }, + { + "epoch": 0.3335112534472022, + "grad_norm": 0.14824971556663513, + "learning_rate": 0.000758923616880913, + "loss": 2.7322, + "step": 11247 + }, + { + "epoch": 0.3335409067995137, + "grad_norm": 0.1476651430130005, + "learning_rate": 0.0007588833649027742, + "loss": 2.7412, + "step": 11248 + }, + { + "epoch": 0.3335705601518252, + "grad_norm": 0.1366107314825058, + "learning_rate": 0.0007588431106321787, + "loss": 2.7447, + "step": 11249 + }, + { + "epoch": 0.33360021350413666, + "grad_norm": 0.14375485479831696, + "learning_rate": 0.0007588028540694831, + "loss": 2.7041, + "step": 11250 + }, + { + "epoch": 0.33362986685644813, + "grad_norm": 0.14292693138122559, + "learning_rate": 0.0007587625952150437, + "loss": 2.6855, + "step": 11251 + }, + { + "epoch": 0.3336595202087596, + "grad_norm": 0.12844885885715485, + "learning_rate": 0.000758722334069217, + "loss": 2.6856, + "step": 11252 + }, + { + "epoch": 0.3336891735610711, + "grad_norm": 0.12867633998394012, + "learning_rate": 0.0007586820706323599, + "loss": 2.7116, + "step": 11253 + }, + { + "epoch": 0.33371882691338256, + "grad_norm": 0.12269192188978195, + "learning_rate": 0.0007586418049048284, + "loss": 2.717, + "step": 11254 + }, + { + "epoch": 0.33374848026569404, + "grad_norm": 0.14224956929683685, + "learning_rate": 0.0007586015368869797, + "loss": 2.7386, + "step": 11255 + }, + { + "epoch": 0.3337781336180055, + "grad_norm": 0.1265297532081604, + "learning_rate": 0.0007585612665791697, + "loss": 2.707, + "step": 11256 + }, + { + "epoch": 0.333807786970317, + "grad_norm": 0.1257685124874115, + "learning_rate": 0.0007585209939817552, + "loss": 2.733, + "step": 11257 + }, + { + "epoch": 0.33383744032262846, + "grad_norm": 0.12779051065444946, + "learning_rate": 0.0007584807190950931, + "loss": 2.7345, + "step": 11258 + }, + { + "epoch": 0.33386709367493994, + "grad_norm": 0.1252634972333908, + "learning_rate": 0.0007584404419195399, + "loss": 2.6992, + "step": 11259 + }, + { + "epoch": 0.3338967470272514, + "grad_norm": 0.10710197687149048, + "learning_rate": 0.0007584001624554522, + "loss": 2.7035, + "step": 11260 + }, + { + "epoch": 0.3339264003795629, + "grad_norm": 0.1195344552397728, + "learning_rate": 0.0007583598807031866, + "loss": 2.6998, + "step": 11261 + }, + { + "epoch": 0.33395605373187437, + "grad_norm": 0.13721376657485962, + "learning_rate": 0.0007583195966631, + "loss": 2.7475, + "step": 11262 + }, + { + "epoch": 0.33398570708418585, + "grad_norm": 0.1372641772031784, + "learning_rate": 0.000758279310335549, + "loss": 2.7459, + "step": 11263 + }, + { + "epoch": 0.3340153604364973, + "grad_norm": 0.14946326613426208, + "learning_rate": 0.0007582390217208905, + "loss": 2.7211, + "step": 11264 + }, + { + "epoch": 0.3340450137888088, + "grad_norm": 0.1745232492685318, + "learning_rate": 0.000758198730819481, + "loss": 2.7228, + "step": 11265 + }, + { + "epoch": 0.33407466714112033, + "grad_norm": 0.16516129672527313, + "learning_rate": 0.0007581584376316775, + "loss": 2.669, + "step": 11266 + }, + { + "epoch": 0.3341043204934318, + "grad_norm": 0.12415628135204315, + "learning_rate": 0.0007581181421578367, + "loss": 2.7257, + "step": 11267 + }, + { + "epoch": 0.3341339738457433, + "grad_norm": 0.15050864219665527, + "learning_rate": 0.0007580778443983153, + "loss": 2.6935, + "step": 11268 + }, + { + "epoch": 0.33416362719805476, + "grad_norm": 0.14913097023963928, + "learning_rate": 0.0007580375443534704, + "loss": 2.7035, + "step": 11269 + }, + { + "epoch": 0.33419328055036623, + "grad_norm": 0.13024069368839264, + "learning_rate": 0.0007579972420236588, + "loss": 2.643, + "step": 11270 + }, + { + "epoch": 0.3342229339026777, + "grad_norm": 0.12395495176315308, + "learning_rate": 0.0007579569374092372, + "loss": 2.7236, + "step": 11271 + }, + { + "epoch": 0.3342525872549892, + "grad_norm": 0.1412850171327591, + "learning_rate": 0.0007579166305105628, + "loss": 2.7081, + "step": 11272 + }, + { + "epoch": 0.33428224060730066, + "grad_norm": 0.16369794309139252, + "learning_rate": 0.0007578763213279924, + "loss": 2.6976, + "step": 11273 + }, + { + "epoch": 0.33431189395961214, + "grad_norm": 0.1797226518392563, + "learning_rate": 0.0007578360098618828, + "loss": 2.743, + "step": 11274 + }, + { + "epoch": 0.3343415473119236, + "grad_norm": 0.1616300344467163, + "learning_rate": 0.000757795696112591, + "loss": 2.7398, + "step": 11275 + }, + { + "epoch": 0.3343712006642351, + "grad_norm": 0.15985846519470215, + "learning_rate": 0.0007577553800804742, + "loss": 2.713, + "step": 11276 + }, + { + "epoch": 0.33440085401654657, + "grad_norm": 0.1767430156469345, + "learning_rate": 0.0007577150617658892, + "loss": 2.7705, + "step": 11277 + }, + { + "epoch": 0.33443050736885804, + "grad_norm": 0.2022896260023117, + "learning_rate": 0.0007576747411691931, + "loss": 2.7642, + "step": 11278 + }, + { + "epoch": 0.3344601607211695, + "grad_norm": 0.1500653326511383, + "learning_rate": 0.0007576344182907431, + "loss": 2.7224, + "step": 11279 + }, + { + "epoch": 0.334489814073481, + "grad_norm": 0.1334695667028427, + "learning_rate": 0.0007575940931308959, + "loss": 2.6988, + "step": 11280 + }, + { + "epoch": 0.33451946742579247, + "grad_norm": 0.15861332416534424, + "learning_rate": 0.0007575537656900087, + "loss": 2.7263, + "step": 11281 + }, + { + "epoch": 0.33454912077810395, + "grad_norm": 0.15005676448345184, + "learning_rate": 0.0007575134359684388, + "loss": 2.7058, + "step": 11282 + }, + { + "epoch": 0.3345787741304154, + "grad_norm": 0.14721716940402985, + "learning_rate": 0.0007574731039665434, + "loss": 2.6949, + "step": 11283 + }, + { + "epoch": 0.3346084274827269, + "grad_norm": 0.11815240234136581, + "learning_rate": 0.0007574327696846793, + "loss": 2.7276, + "step": 11284 + }, + { + "epoch": 0.3346380808350384, + "grad_norm": 0.16539743542671204, + "learning_rate": 0.0007573924331232038, + "loss": 2.6933, + "step": 11285 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 0.14482206106185913, + "learning_rate": 0.0007573520942824743, + "loss": 2.694, + "step": 11286 + }, + { + "epoch": 0.3346973875396614, + "grad_norm": 0.13179954886436462, + "learning_rate": 0.0007573117531628477, + "loss": 2.7106, + "step": 11287 + }, + { + "epoch": 0.33472704089197286, + "grad_norm": 0.13348662853240967, + "learning_rate": 0.0007572714097646813, + "loss": 2.7032, + "step": 11288 + }, + { + "epoch": 0.33475669424428434, + "grad_norm": 0.1327822208404541, + "learning_rate": 0.0007572310640883327, + "loss": 2.7067, + "step": 11289 + }, + { + "epoch": 0.3347863475965958, + "grad_norm": 0.13361021876335144, + "learning_rate": 0.0007571907161341585, + "loss": 2.7035, + "step": 11290 + }, + { + "epoch": 0.3348160009489073, + "grad_norm": 0.1380477249622345, + "learning_rate": 0.0007571503659025166, + "loss": 2.7054, + "step": 11291 + }, + { + "epoch": 0.33484565430121876, + "grad_norm": 0.13693290948867798, + "learning_rate": 0.0007571100133937639, + "loss": 2.7226, + "step": 11292 + }, + { + "epoch": 0.33487530765353024, + "grad_norm": 0.13641300797462463, + "learning_rate": 0.0007570696586082581, + "loss": 2.7253, + "step": 11293 + }, + { + "epoch": 0.3349049610058417, + "grad_norm": 0.1193321943283081, + "learning_rate": 0.0007570293015463562, + "loss": 2.7248, + "step": 11294 + }, + { + "epoch": 0.3349346143581532, + "grad_norm": 0.1337047666311264, + "learning_rate": 0.0007569889422084158, + "loss": 2.7015, + "step": 11295 + }, + { + "epoch": 0.33496426771046467, + "grad_norm": 0.11699103564023972, + "learning_rate": 0.000756948580594794, + "loss": 2.7038, + "step": 11296 + }, + { + "epoch": 0.33499392106277615, + "grad_norm": 0.1183348074555397, + "learning_rate": 0.0007569082167058487, + "loss": 2.7161, + "step": 11297 + }, + { + "epoch": 0.3350235744150876, + "grad_norm": 0.1285199522972107, + "learning_rate": 0.0007568678505419368, + "loss": 2.716, + "step": 11298 + }, + { + "epoch": 0.3350532277673991, + "grad_norm": 0.1316680759191513, + "learning_rate": 0.0007568274821034163, + "loss": 2.701, + "step": 11299 + }, + { + "epoch": 0.3350828811197106, + "grad_norm": 0.11170695722103119, + "learning_rate": 0.0007567871113906442, + "loss": 2.7073, + "step": 11300 + }, + { + "epoch": 0.33511253447202205, + "grad_norm": 0.12728656828403473, + "learning_rate": 0.000756746738403978, + "loss": 2.7223, + "step": 11301 + }, + { + "epoch": 0.3351421878243335, + "grad_norm": 0.1450427621603012, + "learning_rate": 0.0007567063631437755, + "loss": 2.7364, + "step": 11302 + }, + { + "epoch": 0.335171841176645, + "grad_norm": 0.16424095630645752, + "learning_rate": 0.0007566659856103941, + "loss": 2.7115, + "step": 11303 + }, + { + "epoch": 0.3352014945289565, + "grad_norm": 0.15093660354614258, + "learning_rate": 0.0007566256058041913, + "loss": 2.7408, + "step": 11304 + }, + { + "epoch": 0.33523114788126795, + "grad_norm": 0.13953979313373566, + "learning_rate": 0.0007565852237255248, + "loss": 2.7084, + "step": 11305 + }, + { + "epoch": 0.33526080123357943, + "grad_norm": 0.16475340723991394, + "learning_rate": 0.000756544839374752, + "loss": 2.731, + "step": 11306 + }, + { + "epoch": 0.33529045458589096, + "grad_norm": 0.13527919352054596, + "learning_rate": 0.0007565044527522306, + "loss": 2.7203, + "step": 11307 + }, + { + "epoch": 0.33532010793820244, + "grad_norm": 0.1442635953426361, + "learning_rate": 0.0007564640638583183, + "loss": 2.7319, + "step": 11308 + }, + { + "epoch": 0.3353497612905139, + "grad_norm": 0.1494290828704834, + "learning_rate": 0.0007564236726933727, + "loss": 2.6702, + "step": 11309 + }, + { + "epoch": 0.3353794146428254, + "grad_norm": 0.13728217780590057, + "learning_rate": 0.0007563832792577514, + "loss": 2.6992, + "step": 11310 + }, + { + "epoch": 0.33540906799513687, + "grad_norm": 0.17026226222515106, + "learning_rate": 0.0007563428835518122, + "loss": 2.719, + "step": 11311 + }, + { + "epoch": 0.33543872134744834, + "grad_norm": 0.1546161025762558, + "learning_rate": 0.0007563024855759128, + "loss": 2.7529, + "step": 11312 + }, + { + "epoch": 0.3354683746997598, + "grad_norm": 0.13606008887290955, + "learning_rate": 0.000756262085330411, + "loss": 2.6967, + "step": 11313 + }, + { + "epoch": 0.3354980280520713, + "grad_norm": 0.12501846253871918, + "learning_rate": 0.0007562216828156642, + "loss": 2.6842, + "step": 11314 + }, + { + "epoch": 0.33552768140438277, + "grad_norm": 0.12542307376861572, + "learning_rate": 0.0007561812780320305, + "loss": 2.7112, + "step": 11315 + }, + { + "epoch": 0.33555733475669425, + "grad_norm": 0.12040236592292786, + "learning_rate": 0.0007561408709798677, + "loss": 2.7124, + "step": 11316 + }, + { + "epoch": 0.3355869881090057, + "grad_norm": 0.13393603265285492, + "learning_rate": 0.0007561004616595335, + "loss": 2.6945, + "step": 11317 + }, + { + "epoch": 0.3356166414613172, + "grad_norm": 0.1323462724685669, + "learning_rate": 0.0007560600500713856, + "loss": 2.7169, + "step": 11318 + }, + { + "epoch": 0.3356462948136287, + "grad_norm": 0.14247575402259827, + "learning_rate": 0.0007560196362157822, + "loss": 2.724, + "step": 11319 + }, + { + "epoch": 0.33567594816594015, + "grad_norm": 0.12975746393203735, + "learning_rate": 0.0007559792200930809, + "loss": 2.7081, + "step": 11320 + }, + { + "epoch": 0.33570560151825163, + "grad_norm": 0.1227576732635498, + "learning_rate": 0.0007559388017036397, + "loss": 2.7109, + "step": 11321 + }, + { + "epoch": 0.3357352548705631, + "grad_norm": 0.1299782544374466, + "learning_rate": 0.0007558983810478164, + "loss": 2.7104, + "step": 11322 + }, + { + "epoch": 0.3357649082228746, + "grad_norm": 0.12262614816427231, + "learning_rate": 0.000755857958125969, + "loss": 2.7401, + "step": 11323 + }, + { + "epoch": 0.33579456157518606, + "grad_norm": 0.1357414871454239, + "learning_rate": 0.0007558175329384556, + "loss": 2.6909, + "step": 11324 + }, + { + "epoch": 0.33582421492749753, + "grad_norm": 0.14933845400810242, + "learning_rate": 0.0007557771054856339, + "loss": 2.692, + "step": 11325 + }, + { + "epoch": 0.335853868279809, + "grad_norm": 0.1273600459098816, + "learning_rate": 0.000755736675767862, + "loss": 2.6862, + "step": 11326 + }, + { + "epoch": 0.3358835216321205, + "grad_norm": 0.11795858293771744, + "learning_rate": 0.000755696243785498, + "loss": 2.6641, + "step": 11327 + }, + { + "epoch": 0.335913174984432, + "grad_norm": 0.13143658638000488, + "learning_rate": 0.0007556558095388999, + "loss": 2.7144, + "step": 11328 + }, + { + "epoch": 0.3359428283367435, + "grad_norm": 0.15198886394500732, + "learning_rate": 0.0007556153730284257, + "loss": 2.6946, + "step": 11329 + }, + { + "epoch": 0.33597248168905497, + "grad_norm": 0.14964380860328674, + "learning_rate": 0.0007555749342544335, + "loss": 2.7161, + "step": 11330 + }, + { + "epoch": 0.33600213504136645, + "grad_norm": 0.19681280851364136, + "learning_rate": 0.0007555344932172814, + "loss": 2.7239, + "step": 11331 + }, + { + "epoch": 0.3360317883936779, + "grad_norm": 0.22580093145370483, + "learning_rate": 0.0007554940499173275, + "loss": 2.7287, + "step": 11332 + }, + { + "epoch": 0.3360614417459894, + "grad_norm": 0.18105629086494446, + "learning_rate": 0.00075545360435493, + "loss": 2.6661, + "step": 11333 + }, + { + "epoch": 0.3360910950983009, + "grad_norm": 0.16164788603782654, + "learning_rate": 0.0007554131565304469, + "loss": 2.7205, + "step": 11334 + }, + { + "epoch": 0.33612074845061235, + "grad_norm": 0.17105908691883087, + "learning_rate": 0.0007553727064442365, + "loss": 2.7077, + "step": 11335 + }, + { + "epoch": 0.3361504018029238, + "grad_norm": 0.1756901890039444, + "learning_rate": 0.000755332254096657, + "loss": 2.7031, + "step": 11336 + }, + { + "epoch": 0.3361800551552353, + "grad_norm": 0.14729739725589752, + "learning_rate": 0.0007552917994880664, + "loss": 2.7, + "step": 11337 + }, + { + "epoch": 0.3362097085075468, + "grad_norm": 0.14377345144748688, + "learning_rate": 0.0007552513426188233, + "loss": 2.6844, + "step": 11338 + }, + { + "epoch": 0.33623936185985825, + "grad_norm": 0.17695049941539764, + "learning_rate": 0.0007552108834892857, + "loss": 2.7372, + "step": 11339 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 0.15550881624221802, + "learning_rate": 0.0007551704220998117, + "loss": 2.6741, + "step": 11340 + }, + { + "epoch": 0.3362986685644812, + "grad_norm": 0.1348988264799118, + "learning_rate": 0.00075512995845076, + "loss": 2.6673, + "step": 11341 + }, + { + "epoch": 0.3363283219167927, + "grad_norm": 0.13734616339206696, + "learning_rate": 0.0007550894925424886, + "loss": 2.7127, + "step": 11342 + }, + { + "epoch": 0.33635797526910416, + "grad_norm": 0.15153443813323975, + "learning_rate": 0.0007550490243753562, + "loss": 2.733, + "step": 11343 + }, + { + "epoch": 0.33638762862141564, + "grad_norm": 0.1379079818725586, + "learning_rate": 0.0007550085539497207, + "loss": 2.717, + "step": 11344 + }, + { + "epoch": 0.3364172819737271, + "grad_norm": 0.12725941836833954, + "learning_rate": 0.0007549680812659408, + "loss": 2.7148, + "step": 11345 + }, + { + "epoch": 0.3364469353260386, + "grad_norm": 0.12505896389484406, + "learning_rate": 0.0007549276063243747, + "loss": 2.7274, + "step": 11346 + }, + { + "epoch": 0.33647658867835006, + "grad_norm": 0.11383289843797684, + "learning_rate": 0.0007548871291253807, + "loss": 2.7376, + "step": 11347 + }, + { + "epoch": 0.33650624203066154, + "grad_norm": 0.12380505353212357, + "learning_rate": 0.0007548466496693177, + "loss": 2.7116, + "step": 11348 + }, + { + "epoch": 0.33653589538297307, + "grad_norm": 0.12736830115318298, + "learning_rate": 0.0007548061679565439, + "loss": 2.7675, + "step": 11349 + }, + { + "epoch": 0.33656554873528455, + "grad_norm": 0.12469875812530518, + "learning_rate": 0.0007547656839874176, + "loss": 2.7161, + "step": 11350 + }, + { + "epoch": 0.336595202087596, + "grad_norm": 0.13213707506656647, + "learning_rate": 0.0007547251977622976, + "loss": 2.7072, + "step": 11351 + }, + { + "epoch": 0.3366248554399075, + "grad_norm": 0.14742279052734375, + "learning_rate": 0.0007546847092815421, + "loss": 2.733, + "step": 11352 + }, + { + "epoch": 0.336654508792219, + "grad_norm": 0.16947075724601746, + "learning_rate": 0.0007546442185455096, + "loss": 2.7108, + "step": 11353 + }, + { + "epoch": 0.33668416214453045, + "grad_norm": 0.15957027673721313, + "learning_rate": 0.0007546037255545592, + "loss": 2.7078, + "step": 11354 + }, + { + "epoch": 0.33671381549684193, + "grad_norm": 0.14170555770397186, + "learning_rate": 0.0007545632303090489, + "loss": 2.7264, + "step": 11355 + }, + { + "epoch": 0.3367434688491534, + "grad_norm": 0.14124758541584015, + "learning_rate": 0.0007545227328093376, + "loss": 2.7112, + "step": 11356 + }, + { + "epoch": 0.3367731222014649, + "grad_norm": 0.1519378274679184, + "learning_rate": 0.0007544822330557838, + "loss": 2.7613, + "step": 11357 + }, + { + "epoch": 0.33680277555377636, + "grad_norm": 0.14391754567623138, + "learning_rate": 0.0007544417310487462, + "loss": 2.7392, + "step": 11358 + }, + { + "epoch": 0.33683242890608783, + "grad_norm": 0.15604999661445618, + "learning_rate": 0.0007544012267885832, + "loss": 2.725, + "step": 11359 + }, + { + "epoch": 0.3368620822583993, + "grad_norm": 0.1349426507949829, + "learning_rate": 0.0007543607202756537, + "loss": 2.6873, + "step": 11360 + }, + { + "epoch": 0.3368917356107108, + "grad_norm": 0.1399097740650177, + "learning_rate": 0.0007543202115103165, + "loss": 2.7221, + "step": 11361 + }, + { + "epoch": 0.33692138896302226, + "grad_norm": 0.13713641464710236, + "learning_rate": 0.0007542797004929301, + "loss": 2.7054, + "step": 11362 + }, + { + "epoch": 0.33695104231533374, + "grad_norm": 0.14112091064453125, + "learning_rate": 0.0007542391872238535, + "loss": 2.7193, + "step": 11363 + }, + { + "epoch": 0.3369806956676452, + "grad_norm": 0.13472265005111694, + "learning_rate": 0.000754198671703445, + "loss": 2.7317, + "step": 11364 + }, + { + "epoch": 0.3370103490199567, + "grad_norm": 0.14017489552497864, + "learning_rate": 0.0007541581539320637, + "loss": 2.6846, + "step": 11365 + }, + { + "epoch": 0.33704000237226817, + "grad_norm": 0.14378687739372253, + "learning_rate": 0.0007541176339100684, + "loss": 2.7219, + "step": 11366 + }, + { + "epoch": 0.33706965572457964, + "grad_norm": 0.14849962294101715, + "learning_rate": 0.0007540771116378177, + "loss": 2.7002, + "step": 11367 + }, + { + "epoch": 0.3370993090768911, + "grad_norm": 0.12994731962680817, + "learning_rate": 0.0007540365871156707, + "loss": 2.7265, + "step": 11368 + }, + { + "epoch": 0.3371289624292026, + "grad_norm": 0.13070952892303467, + "learning_rate": 0.0007539960603439859, + "loss": 2.6843, + "step": 11369 + }, + { + "epoch": 0.3371586157815141, + "grad_norm": 0.13193988800048828, + "learning_rate": 0.0007539555313231226, + "loss": 2.6957, + "step": 11370 + }, + { + "epoch": 0.3371882691338256, + "grad_norm": 0.12886936962604523, + "learning_rate": 0.0007539150000534395, + "loss": 2.7282, + "step": 11371 + }, + { + "epoch": 0.3372179224861371, + "grad_norm": 0.1236925795674324, + "learning_rate": 0.0007538744665352953, + "loss": 2.7261, + "step": 11372 + }, + { + "epoch": 0.33724757583844855, + "grad_norm": 0.13508298993110657, + "learning_rate": 0.0007538339307690492, + "loss": 2.7237, + "step": 11373 + }, + { + "epoch": 0.33727722919076003, + "grad_norm": 0.1328444480895996, + "learning_rate": 0.0007537933927550602, + "loss": 2.7116, + "step": 11374 + }, + { + "epoch": 0.3373068825430715, + "grad_norm": 0.1389523297548294, + "learning_rate": 0.000753752852493687, + "loss": 2.7018, + "step": 11375 + }, + { + "epoch": 0.337336535895383, + "grad_norm": 0.14630751311779022, + "learning_rate": 0.0007537123099852889, + "loss": 2.7495, + "step": 11376 + }, + { + "epoch": 0.33736618924769446, + "grad_norm": 0.16296811401844025, + "learning_rate": 0.0007536717652302245, + "loss": 2.7307, + "step": 11377 + }, + { + "epoch": 0.33739584260000594, + "grad_norm": 0.15918444097042084, + "learning_rate": 0.0007536312182288531, + "loss": 2.7113, + "step": 11378 + }, + { + "epoch": 0.3374254959523174, + "grad_norm": 0.1283050775527954, + "learning_rate": 0.000753590668981534, + "loss": 2.7532, + "step": 11379 + }, + { + "epoch": 0.3374551493046289, + "grad_norm": 0.12355487793684006, + "learning_rate": 0.0007535501174886257, + "loss": 2.7319, + "step": 11380 + }, + { + "epoch": 0.33748480265694036, + "grad_norm": 0.1481219381093979, + "learning_rate": 0.0007535095637504879, + "loss": 2.7171, + "step": 11381 + }, + { + "epoch": 0.33751445600925184, + "grad_norm": 0.1387452483177185, + "learning_rate": 0.0007534690077674791, + "loss": 2.6959, + "step": 11382 + }, + { + "epoch": 0.3375441093615633, + "grad_norm": 0.13616104423999786, + "learning_rate": 0.0007534284495399589, + "loss": 2.7385, + "step": 11383 + }, + { + "epoch": 0.3375737627138748, + "grad_norm": 0.14442536234855652, + "learning_rate": 0.0007533878890682861, + "loss": 2.7032, + "step": 11384 + }, + { + "epoch": 0.33760341606618627, + "grad_norm": 0.1572670191526413, + "learning_rate": 0.0007533473263528201, + "loss": 2.6976, + "step": 11385 + }, + { + "epoch": 0.33763306941849774, + "grad_norm": 0.1409406214952469, + "learning_rate": 0.0007533067613939202, + "loss": 2.7186, + "step": 11386 + }, + { + "epoch": 0.3376627227708092, + "grad_norm": 0.1731477826833725, + "learning_rate": 0.0007532661941919455, + "loss": 2.7032, + "step": 11387 + }, + { + "epoch": 0.3376923761231207, + "grad_norm": 0.1695563942193985, + "learning_rate": 0.000753225624747255, + "loss": 2.7183, + "step": 11388 + }, + { + "epoch": 0.3377220294754322, + "grad_norm": 0.1310531347990036, + "learning_rate": 0.0007531850530602081, + "loss": 2.7187, + "step": 11389 + }, + { + "epoch": 0.33775168282774365, + "grad_norm": 0.12157987058162689, + "learning_rate": 0.0007531444791311641, + "loss": 2.7417, + "step": 11390 + }, + { + "epoch": 0.3377813361800552, + "grad_norm": 0.13095973432064056, + "learning_rate": 0.0007531039029604824, + "loss": 2.6769, + "step": 11391 + }, + { + "epoch": 0.33781098953236666, + "grad_norm": 0.1317734271287918, + "learning_rate": 0.0007530633245485221, + "loss": 2.7494, + "step": 11392 + }, + { + "epoch": 0.33784064288467813, + "grad_norm": 0.11234501004219055, + "learning_rate": 0.0007530227438956428, + "loss": 2.7518, + "step": 11393 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 0.1281026005744934, + "learning_rate": 0.0007529821610022035, + "loss": 2.7421, + "step": 11394 + }, + { + "epoch": 0.3378999495893011, + "grad_norm": 0.16017542779445648, + "learning_rate": 0.0007529415758685638, + "loss": 2.7107, + "step": 11395 + }, + { + "epoch": 0.33792960294161256, + "grad_norm": 0.16507048904895782, + "learning_rate": 0.0007529009884950829, + "loss": 2.722, + "step": 11396 + }, + { + "epoch": 0.33795925629392404, + "grad_norm": 0.14994758367538452, + "learning_rate": 0.0007528603988821205, + "loss": 2.728, + "step": 11397 + }, + { + "epoch": 0.3379889096462355, + "grad_norm": 0.14845307171344757, + "learning_rate": 0.0007528198070300358, + "loss": 2.6675, + "step": 11398 + }, + { + "epoch": 0.338018562998547, + "grad_norm": 0.14155955612659454, + "learning_rate": 0.0007527792129391884, + "loss": 2.7516, + "step": 11399 + }, + { + "epoch": 0.33804821635085847, + "grad_norm": 0.12036136537790298, + "learning_rate": 0.0007527386166099375, + "loss": 2.7166, + "step": 11400 + }, + { + "epoch": 0.33807786970316994, + "grad_norm": 0.1199079379439354, + "learning_rate": 0.0007526980180426428, + "loss": 2.7226, + "step": 11401 + }, + { + "epoch": 0.3381075230554814, + "grad_norm": 0.14759188890457153, + "learning_rate": 0.0007526574172376639, + "loss": 2.7487, + "step": 11402 + }, + { + "epoch": 0.3381371764077929, + "grad_norm": 0.17833448946475983, + "learning_rate": 0.0007526168141953602, + "loss": 2.6792, + "step": 11403 + }, + { + "epoch": 0.33816682976010437, + "grad_norm": 0.1573561131954193, + "learning_rate": 0.0007525762089160912, + "loss": 2.7077, + "step": 11404 + }, + { + "epoch": 0.33819648311241585, + "grad_norm": 0.1433144509792328, + "learning_rate": 0.0007525356014002164, + "loss": 2.7295, + "step": 11405 + }, + { + "epoch": 0.3382261364647273, + "grad_norm": 0.13047412037849426, + "learning_rate": 0.0007524949916480956, + "loss": 2.7044, + "step": 11406 + }, + { + "epoch": 0.3382557898170388, + "grad_norm": 0.11517134308815002, + "learning_rate": 0.0007524543796600885, + "loss": 2.7048, + "step": 11407 + }, + { + "epoch": 0.3382854431693503, + "grad_norm": 0.12866127490997314, + "learning_rate": 0.0007524137654365543, + "loss": 2.7048, + "step": 11408 + }, + { + "epoch": 0.33831509652166175, + "grad_norm": 0.11966000497341156, + "learning_rate": 0.0007523731489778529, + "loss": 2.7143, + "step": 11409 + }, + { + "epoch": 0.3383447498739732, + "grad_norm": 0.12167743593454361, + "learning_rate": 0.000752332530284344, + "loss": 2.7212, + "step": 11410 + }, + { + "epoch": 0.33837440322628476, + "grad_norm": 0.12081259489059448, + "learning_rate": 0.0007522919093563872, + "loss": 2.7317, + "step": 11411 + }, + { + "epoch": 0.33840405657859624, + "grad_norm": 0.13216261565685272, + "learning_rate": 0.0007522512861943422, + "loss": 2.7076, + "step": 11412 + }, + { + "epoch": 0.3384337099309077, + "grad_norm": 0.1368507742881775, + "learning_rate": 0.0007522106607985688, + "loss": 2.7086, + "step": 11413 + }, + { + "epoch": 0.3384633632832192, + "grad_norm": 0.13104389607906342, + "learning_rate": 0.0007521700331694268, + "loss": 2.7132, + "step": 11414 + }, + { + "epoch": 0.33849301663553066, + "grad_norm": 0.1425950527191162, + "learning_rate": 0.0007521294033072758, + "loss": 2.6783, + "step": 11415 + }, + { + "epoch": 0.33852266998784214, + "grad_norm": 0.14171823859214783, + "learning_rate": 0.0007520887712124758, + "loss": 2.7185, + "step": 11416 + }, + { + "epoch": 0.3385523233401536, + "grad_norm": 0.1296028196811676, + "learning_rate": 0.0007520481368853861, + "loss": 2.7013, + "step": 11417 + }, + { + "epoch": 0.3385819766924651, + "grad_norm": 0.14239321649074554, + "learning_rate": 0.0007520075003263671, + "loss": 2.7647, + "step": 11418 + }, + { + "epoch": 0.33861163004477657, + "grad_norm": 0.1446947306394577, + "learning_rate": 0.0007519668615357784, + "loss": 2.7381, + "step": 11419 + }, + { + "epoch": 0.33864128339708804, + "grad_norm": 0.12977217137813568, + "learning_rate": 0.0007519262205139801, + "loss": 2.6899, + "step": 11420 + }, + { + "epoch": 0.3386709367493995, + "grad_norm": 0.11387485265731812, + "learning_rate": 0.0007518855772613316, + "loss": 2.702, + "step": 11421 + }, + { + "epoch": 0.338700590101711, + "grad_norm": 0.12929780781269073, + "learning_rate": 0.0007518449317781932, + "loss": 2.7158, + "step": 11422 + }, + { + "epoch": 0.3387302434540225, + "grad_norm": 0.13486649096012115, + "learning_rate": 0.0007518042840649246, + "loss": 2.7625, + "step": 11423 + }, + { + "epoch": 0.33875989680633395, + "grad_norm": 0.15228070318698883, + "learning_rate": 0.0007517636341218859, + "loss": 2.7327, + "step": 11424 + }, + { + "epoch": 0.3387895501586454, + "grad_norm": 0.14270822703838348, + "learning_rate": 0.0007517229819494372, + "loss": 2.7363, + "step": 11425 + }, + { + "epoch": 0.3388192035109569, + "grad_norm": 0.1253347098827362, + "learning_rate": 0.0007516823275479379, + "loss": 2.7166, + "step": 11426 + }, + { + "epoch": 0.3388488568632684, + "grad_norm": 0.1398467719554901, + "learning_rate": 0.0007516416709177487, + "loss": 2.7231, + "step": 11427 + }, + { + "epoch": 0.33887851021557985, + "grad_norm": 0.11298337578773499, + "learning_rate": 0.0007516010120592291, + "loss": 2.7286, + "step": 11428 + }, + { + "epoch": 0.33890816356789133, + "grad_norm": 0.11010304093360901, + "learning_rate": 0.0007515603509727396, + "loss": 2.7299, + "step": 11429 + }, + { + "epoch": 0.3389378169202028, + "grad_norm": 0.12184290587902069, + "learning_rate": 0.0007515196876586397, + "loss": 2.7014, + "step": 11430 + }, + { + "epoch": 0.3389674702725143, + "grad_norm": 0.11963362246751785, + "learning_rate": 0.00075147902211729, + "loss": 2.7013, + "step": 11431 + }, + { + "epoch": 0.3389971236248258, + "grad_norm": 0.15222454071044922, + "learning_rate": 0.0007514383543490504, + "loss": 2.7056, + "step": 11432 + }, + { + "epoch": 0.3390267769771373, + "grad_norm": 0.1397620588541031, + "learning_rate": 0.0007513976843542809, + "loss": 2.6959, + "step": 11433 + }, + { + "epoch": 0.33905643032944877, + "grad_norm": 0.13997960090637207, + "learning_rate": 0.000751357012133342, + "loss": 2.7363, + "step": 11434 + }, + { + "epoch": 0.33908608368176024, + "grad_norm": 0.1473959982395172, + "learning_rate": 0.0007513163376865932, + "loss": 2.7262, + "step": 11435 + }, + { + "epoch": 0.3391157370340717, + "grad_norm": 0.15018661320209503, + "learning_rate": 0.0007512756610143954, + "loss": 2.6999, + "step": 11436 + }, + { + "epoch": 0.3391453903863832, + "grad_norm": 0.1531723588705063, + "learning_rate": 0.0007512349821171085, + "loss": 2.6897, + "step": 11437 + }, + { + "epoch": 0.33917504373869467, + "grad_norm": 0.1427205204963684, + "learning_rate": 0.0007511943009950925, + "loss": 2.755, + "step": 11438 + }, + { + "epoch": 0.33920469709100615, + "grad_norm": 0.15799756348133087, + "learning_rate": 0.0007511536176487081, + "loss": 2.7256, + "step": 11439 + }, + { + "epoch": 0.3392343504433176, + "grad_norm": 0.17853309214115143, + "learning_rate": 0.0007511129320783151, + "loss": 2.6873, + "step": 11440 + }, + { + "epoch": 0.3392640037956291, + "grad_norm": 0.15510182082653046, + "learning_rate": 0.0007510722442842741, + "loss": 2.7047, + "step": 11441 + }, + { + "epoch": 0.3392936571479406, + "grad_norm": 0.13660724461078644, + "learning_rate": 0.0007510315542669453, + "loss": 2.7371, + "step": 11442 + }, + { + "epoch": 0.33932331050025205, + "grad_norm": 0.15801531076431274, + "learning_rate": 0.0007509908620266888, + "loss": 2.6899, + "step": 11443 + }, + { + "epoch": 0.3393529638525635, + "grad_norm": 0.1678430587053299, + "learning_rate": 0.0007509501675638652, + "loss": 2.7202, + "step": 11444 + }, + { + "epoch": 0.339382617204875, + "grad_norm": 0.1585277020931244, + "learning_rate": 0.000750909470878835, + "loss": 2.7301, + "step": 11445 + }, + { + "epoch": 0.3394122705571865, + "grad_norm": 0.13382603228092194, + "learning_rate": 0.0007508687719719583, + "loss": 2.6824, + "step": 11446 + }, + { + "epoch": 0.33944192390949796, + "grad_norm": 0.14640922844409943, + "learning_rate": 0.0007508280708435955, + "loss": 2.7248, + "step": 11447 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 0.13082434237003326, + "learning_rate": 0.000750787367494107, + "loss": 2.7512, + "step": 11448 + }, + { + "epoch": 0.3395012306141209, + "grad_norm": 0.13482826948165894, + "learning_rate": 0.0007507466619238535, + "loss": 2.7155, + "step": 11449 + }, + { + "epoch": 0.3395308839664324, + "grad_norm": 0.13706074655056, + "learning_rate": 0.0007507059541331951, + "loss": 2.708, + "step": 11450 + }, + { + "epoch": 0.33956053731874386, + "grad_norm": 0.1368027925491333, + "learning_rate": 0.0007506652441224924, + "loss": 2.7131, + "step": 11451 + }, + { + "epoch": 0.33959019067105534, + "grad_norm": 0.11885860562324524, + "learning_rate": 0.0007506245318921061, + "loss": 2.7281, + "step": 11452 + }, + { + "epoch": 0.33961984402336687, + "grad_norm": 0.10609081387519836, + "learning_rate": 0.0007505838174423965, + "loss": 2.7137, + "step": 11453 + }, + { + "epoch": 0.33964949737567834, + "grad_norm": 0.11868211627006531, + "learning_rate": 0.0007505431007737242, + "loss": 2.7154, + "step": 11454 + }, + { + "epoch": 0.3396791507279898, + "grad_norm": 0.13029435276985168, + "learning_rate": 0.0007505023818864497, + "loss": 2.7257, + "step": 11455 + }, + { + "epoch": 0.3397088040803013, + "grad_norm": 0.1361534744501114, + "learning_rate": 0.0007504616607809336, + "loss": 2.7463, + "step": 11456 + }, + { + "epoch": 0.3397384574326128, + "grad_norm": 0.12969475984573364, + "learning_rate": 0.0007504209374575365, + "loss": 2.7257, + "step": 11457 + }, + { + "epoch": 0.33976811078492425, + "grad_norm": 0.1235155388712883, + "learning_rate": 0.000750380211916619, + "loss": 2.7113, + "step": 11458 + }, + { + "epoch": 0.3397977641372357, + "grad_norm": 0.13999365270137787, + "learning_rate": 0.0007503394841585419, + "loss": 2.7348, + "step": 11459 + }, + { + "epoch": 0.3398274174895472, + "grad_norm": 0.1447129249572754, + "learning_rate": 0.0007502987541836655, + "loss": 2.696, + "step": 11460 + }, + { + "epoch": 0.3398570708418587, + "grad_norm": 0.19166627526283264, + "learning_rate": 0.0007502580219923508, + "loss": 2.6921, + "step": 11461 + }, + { + "epoch": 0.33988672419417015, + "grad_norm": 0.2055966705083847, + "learning_rate": 0.0007502172875849582, + "loss": 2.7211, + "step": 11462 + }, + { + "epoch": 0.33991637754648163, + "grad_norm": 0.16427010297775269, + "learning_rate": 0.0007501765509618488, + "loss": 2.7308, + "step": 11463 + }, + { + "epoch": 0.3399460308987931, + "grad_norm": 0.12625615298748016, + "learning_rate": 0.000750135812123383, + "loss": 2.6969, + "step": 11464 + }, + { + "epoch": 0.3399756842511046, + "grad_norm": 0.15894989669322968, + "learning_rate": 0.0007500950710699215, + "loss": 2.6836, + "step": 11465 + }, + { + "epoch": 0.34000533760341606, + "grad_norm": 0.15439359843730927, + "learning_rate": 0.0007500543278018255, + "loss": 2.7087, + "step": 11466 + }, + { + "epoch": 0.34003499095572753, + "grad_norm": 0.11667076498270035, + "learning_rate": 0.0007500135823194552, + "loss": 2.6858, + "step": 11467 + }, + { + "epoch": 0.340064644308039, + "grad_norm": 0.12242981791496277, + "learning_rate": 0.000749972834623172, + "loss": 2.7015, + "step": 11468 + }, + { + "epoch": 0.3400942976603505, + "grad_norm": 0.1198251023888588, + "learning_rate": 0.0007499320847133362, + "loss": 2.7014, + "step": 11469 + }, + { + "epoch": 0.34012395101266196, + "grad_norm": 0.12792441248893738, + "learning_rate": 0.0007498913325903091, + "loss": 2.707, + "step": 11470 + }, + { + "epoch": 0.34015360436497344, + "grad_norm": 0.14249074459075928, + "learning_rate": 0.0007498505782544511, + "loss": 2.7021, + "step": 11471 + }, + { + "epoch": 0.3401832577172849, + "grad_norm": 0.12285546958446503, + "learning_rate": 0.0007498098217061235, + "loss": 2.7157, + "step": 11472 + }, + { + "epoch": 0.3402129110695964, + "grad_norm": 0.11593005806207657, + "learning_rate": 0.0007497690629456871, + "loss": 2.7453, + "step": 11473 + }, + { + "epoch": 0.3402425644219079, + "grad_norm": 0.11856738477945328, + "learning_rate": 0.0007497283019735027, + "loss": 2.6917, + "step": 11474 + }, + { + "epoch": 0.3402722177742194, + "grad_norm": 0.13111482560634613, + "learning_rate": 0.0007496875387899314, + "loss": 2.7158, + "step": 11475 + }, + { + "epoch": 0.3403018711265309, + "grad_norm": 0.13611245155334473, + "learning_rate": 0.0007496467733953341, + "loss": 2.7245, + "step": 11476 + }, + { + "epoch": 0.34033152447884235, + "grad_norm": 0.1192162036895752, + "learning_rate": 0.0007496060057900715, + "loss": 2.7267, + "step": 11477 + }, + { + "epoch": 0.3403611778311538, + "grad_norm": 0.12645751237869263, + "learning_rate": 0.0007495652359745051, + "loss": 2.6968, + "step": 11478 + }, + { + "epoch": 0.3403908311834653, + "grad_norm": 0.13719692826271057, + "learning_rate": 0.0007495244639489958, + "loss": 2.6896, + "step": 11479 + }, + { + "epoch": 0.3404204845357768, + "grad_norm": 0.12032939493656158, + "learning_rate": 0.0007494836897139045, + "loss": 2.6909, + "step": 11480 + }, + { + "epoch": 0.34045013788808826, + "grad_norm": 0.13718870282173157, + "learning_rate": 0.0007494429132695921, + "loss": 2.7337, + "step": 11481 + }, + { + "epoch": 0.34047979124039973, + "grad_norm": 0.14977584779262543, + "learning_rate": 0.0007494021346164199, + "loss": 2.7506, + "step": 11482 + }, + { + "epoch": 0.3405094445927112, + "grad_norm": 0.14882054924964905, + "learning_rate": 0.0007493613537547492, + "loss": 2.7126, + "step": 11483 + }, + { + "epoch": 0.3405390979450227, + "grad_norm": 0.1583000123500824, + "learning_rate": 0.0007493205706849408, + "loss": 2.6715, + "step": 11484 + }, + { + "epoch": 0.34056875129733416, + "grad_norm": 0.14300808310508728, + "learning_rate": 0.000749279785407356, + "loss": 2.7036, + "step": 11485 + }, + { + "epoch": 0.34059840464964564, + "grad_norm": 0.14875388145446777, + "learning_rate": 0.0007492389979223558, + "loss": 2.6539, + "step": 11486 + }, + { + "epoch": 0.3406280580019571, + "grad_norm": 0.1483219712972641, + "learning_rate": 0.0007491982082303016, + "loss": 2.7144, + "step": 11487 + }, + { + "epoch": 0.3406577113542686, + "grad_norm": 0.13343361020088196, + "learning_rate": 0.0007491574163315543, + "loss": 2.699, + "step": 11488 + }, + { + "epoch": 0.34068736470658006, + "grad_norm": 0.13989043235778809, + "learning_rate": 0.0007491166222264755, + "loss": 2.6724, + "step": 11489 + }, + { + "epoch": 0.34071701805889154, + "grad_norm": 0.14130569994449615, + "learning_rate": 0.0007490758259154263, + "loss": 2.6582, + "step": 11490 + }, + { + "epoch": 0.340746671411203, + "grad_norm": 0.11743450909852982, + "learning_rate": 0.0007490350273987678, + "loss": 2.6815, + "step": 11491 + }, + { + "epoch": 0.3407763247635145, + "grad_norm": 0.12400968372821808, + "learning_rate": 0.0007489942266768614, + "loss": 2.7338, + "step": 11492 + }, + { + "epoch": 0.34080597811582597, + "grad_norm": 0.13445349037647247, + "learning_rate": 0.0007489534237500684, + "loss": 2.7168, + "step": 11493 + }, + { + "epoch": 0.34083563146813745, + "grad_norm": 0.1379902958869934, + "learning_rate": 0.00074891261861875, + "loss": 2.7378, + "step": 11494 + }, + { + "epoch": 0.340865284820449, + "grad_norm": 0.15534763038158417, + "learning_rate": 0.0007488718112832678, + "loss": 2.6838, + "step": 11495 + }, + { + "epoch": 0.34089493817276045, + "grad_norm": 0.1453176885843277, + "learning_rate": 0.0007488310017439829, + "loss": 2.6996, + "step": 11496 + }, + { + "epoch": 0.34092459152507193, + "grad_norm": 0.13346004486083984, + "learning_rate": 0.0007487901900012569, + "loss": 2.7324, + "step": 11497 + }, + { + "epoch": 0.3409542448773834, + "grad_norm": 0.13263201713562012, + "learning_rate": 0.0007487493760554509, + "loss": 2.7173, + "step": 11498 + }, + { + "epoch": 0.3409838982296949, + "grad_norm": 0.15098023414611816, + "learning_rate": 0.0007487085599069265, + "loss": 2.7123, + "step": 11499 + }, + { + "epoch": 0.34101355158200636, + "grad_norm": 0.15942655503749847, + "learning_rate": 0.0007486677415560451, + "loss": 2.6901, + "step": 11500 + }, + { + "epoch": 0.34104320493431783, + "grad_norm": 0.12193312495946884, + "learning_rate": 0.0007486269210031682, + "loss": 2.6804, + "step": 11501 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 0.14876271784305573, + "learning_rate": 0.0007485860982486572, + "loss": 2.6916, + "step": 11502 + }, + { + "epoch": 0.3411025116389408, + "grad_norm": 0.1420510858297348, + "learning_rate": 0.0007485452732928737, + "loss": 2.7266, + "step": 11503 + }, + { + "epoch": 0.34113216499125226, + "grad_norm": 0.1382007747888565, + "learning_rate": 0.0007485044461361792, + "loss": 2.7031, + "step": 11504 + }, + { + "epoch": 0.34116181834356374, + "grad_norm": 0.13644583523273468, + "learning_rate": 0.000748463616778935, + "loss": 2.7229, + "step": 11505 + }, + { + "epoch": 0.3411914716958752, + "grad_norm": 0.135369211435318, + "learning_rate": 0.0007484227852215028, + "loss": 2.7018, + "step": 11506 + }, + { + "epoch": 0.3412211250481867, + "grad_norm": 0.12020505964756012, + "learning_rate": 0.0007483819514642445, + "loss": 2.6665, + "step": 11507 + }, + { + "epoch": 0.34125077840049817, + "grad_norm": 0.11776220053434372, + "learning_rate": 0.0007483411155075211, + "loss": 2.7205, + "step": 11508 + }, + { + "epoch": 0.34128043175280964, + "grad_norm": 0.12108348309993744, + "learning_rate": 0.0007483002773516946, + "loss": 2.6685, + "step": 11509 + }, + { + "epoch": 0.3413100851051211, + "grad_norm": 0.12265141308307648, + "learning_rate": 0.0007482594369971266, + "loss": 2.6975, + "step": 11510 + }, + { + "epoch": 0.3413397384574326, + "grad_norm": 0.12356129288673401, + "learning_rate": 0.0007482185944441785, + "loss": 2.6912, + "step": 11511 + }, + { + "epoch": 0.34136939180974407, + "grad_norm": 0.120487280189991, + "learning_rate": 0.0007481777496932123, + "loss": 2.7127, + "step": 11512 + }, + { + "epoch": 0.34139904516205555, + "grad_norm": 0.14258645474910736, + "learning_rate": 0.0007481369027445894, + "loss": 2.7371, + "step": 11513 + }, + { + "epoch": 0.341428698514367, + "grad_norm": 0.15058957040309906, + "learning_rate": 0.0007480960535986716, + "loss": 2.7197, + "step": 11514 + }, + { + "epoch": 0.34145835186667856, + "grad_norm": 0.15786249935626984, + "learning_rate": 0.0007480552022558208, + "loss": 2.6839, + "step": 11515 + }, + { + "epoch": 0.34148800521899003, + "grad_norm": 0.12331486493349075, + "learning_rate": 0.0007480143487163986, + "loss": 2.719, + "step": 11516 + }, + { + "epoch": 0.3415176585713015, + "grad_norm": 0.13001689314842224, + "learning_rate": 0.0007479734929807666, + "loss": 2.7161, + "step": 11517 + }, + { + "epoch": 0.341547311923613, + "grad_norm": 0.15634804964065552, + "learning_rate": 0.0007479326350492871, + "loss": 2.6927, + "step": 11518 + }, + { + "epoch": 0.34157696527592446, + "grad_norm": 0.16616950929164886, + "learning_rate": 0.0007478917749223213, + "loss": 2.7016, + "step": 11519 + }, + { + "epoch": 0.34160661862823594, + "grad_norm": 0.15506383776664734, + "learning_rate": 0.0007478509126002313, + "loss": 2.7051, + "step": 11520 + }, + { + "epoch": 0.3416362719805474, + "grad_norm": 0.1458783596754074, + "learning_rate": 0.000747810048083379, + "loss": 2.7062, + "step": 11521 + }, + { + "epoch": 0.3416659253328589, + "grad_norm": 0.13231098651885986, + "learning_rate": 0.000747769181372126, + "loss": 2.7209, + "step": 11522 + }, + { + "epoch": 0.34169557868517036, + "grad_norm": 0.1161058321595192, + "learning_rate": 0.0007477283124668345, + "loss": 2.703, + "step": 11523 + }, + { + "epoch": 0.34172523203748184, + "grad_norm": 0.14759553968906403, + "learning_rate": 0.0007476874413678663, + "loss": 2.7263, + "step": 11524 + }, + { + "epoch": 0.3417548853897933, + "grad_norm": 0.14846612513065338, + "learning_rate": 0.0007476465680755832, + "loss": 2.7127, + "step": 11525 + }, + { + "epoch": 0.3417845387421048, + "grad_norm": 0.16040393710136414, + "learning_rate": 0.0007476056925903474, + "loss": 2.7082, + "step": 11526 + }, + { + "epoch": 0.34181419209441627, + "grad_norm": 0.16994276642799377, + "learning_rate": 0.0007475648149125205, + "loss": 2.7107, + "step": 11527 + }, + { + "epoch": 0.34184384544672775, + "grad_norm": 0.15858376026153564, + "learning_rate": 0.0007475239350424649, + "loss": 2.7086, + "step": 11528 + }, + { + "epoch": 0.3418734987990392, + "grad_norm": 0.1640513688325882, + "learning_rate": 0.0007474830529805422, + "loss": 2.7234, + "step": 11529 + }, + { + "epoch": 0.3419031521513507, + "grad_norm": 0.17900492250919342, + "learning_rate": 0.0007474421687271147, + "loss": 2.7392, + "step": 11530 + }, + { + "epoch": 0.3419328055036622, + "grad_norm": 0.15722955763339996, + "learning_rate": 0.0007474012822825442, + "loss": 2.7434, + "step": 11531 + }, + { + "epoch": 0.34196245885597365, + "grad_norm": 0.12681785225868225, + "learning_rate": 0.0007473603936471928, + "loss": 2.6954, + "step": 11532 + }, + { + "epoch": 0.3419921122082851, + "grad_norm": 0.13999904692173004, + "learning_rate": 0.0007473195028214229, + "loss": 2.712, + "step": 11533 + }, + { + "epoch": 0.3420217655605966, + "grad_norm": 0.14301559329032898, + "learning_rate": 0.0007472786098055962, + "loss": 2.6876, + "step": 11534 + }, + { + "epoch": 0.3420514189129081, + "grad_norm": 0.12885379791259766, + "learning_rate": 0.000747237714600075, + "loss": 2.6835, + "step": 11535 + }, + { + "epoch": 0.3420810722652196, + "grad_norm": 0.15016710758209229, + "learning_rate": 0.0007471968172052213, + "loss": 2.7123, + "step": 11536 + }, + { + "epoch": 0.3421107256175311, + "grad_norm": 0.14050815999507904, + "learning_rate": 0.0007471559176213976, + "loss": 2.724, + "step": 11537 + }, + { + "epoch": 0.34214037896984256, + "grad_norm": 0.12883307039737701, + "learning_rate": 0.0007471150158489656, + "loss": 2.7418, + "step": 11538 + }, + { + "epoch": 0.34217003232215404, + "grad_norm": 0.12405078113079071, + "learning_rate": 0.0007470741118882878, + "loss": 2.6991, + "step": 11539 + }, + { + "epoch": 0.3421996856744655, + "grad_norm": 0.12635454535484314, + "learning_rate": 0.0007470332057397262, + "loss": 2.6866, + "step": 11540 + }, + { + "epoch": 0.342229339026777, + "grad_norm": 0.11315272748470306, + "learning_rate": 0.0007469922974036434, + "loss": 2.6956, + "step": 11541 + }, + { + "epoch": 0.34225899237908847, + "grad_norm": 0.12119712680578232, + "learning_rate": 0.0007469513868804012, + "loss": 2.7278, + "step": 11542 + }, + { + "epoch": 0.34228864573139994, + "grad_norm": 0.11272744089365005, + "learning_rate": 0.0007469104741703622, + "loss": 2.6939, + "step": 11543 + }, + { + "epoch": 0.3423182990837114, + "grad_norm": 0.11683611571788788, + "learning_rate": 0.0007468695592738885, + "loss": 2.7186, + "step": 11544 + }, + { + "epoch": 0.3423479524360229, + "grad_norm": 0.11559320986270905, + "learning_rate": 0.0007468286421913426, + "loss": 2.7167, + "step": 11545 + }, + { + "epoch": 0.34237760578833437, + "grad_norm": 0.10248345881700516, + "learning_rate": 0.0007467877229230866, + "loss": 2.707, + "step": 11546 + }, + { + "epoch": 0.34240725914064585, + "grad_norm": 0.1148783341050148, + "learning_rate": 0.000746746801469483, + "loss": 2.7021, + "step": 11547 + }, + { + "epoch": 0.3424369124929573, + "grad_norm": 0.10043127089738846, + "learning_rate": 0.0007467058778308941, + "loss": 2.681, + "step": 11548 + }, + { + "epoch": 0.3424665658452688, + "grad_norm": 0.10270321369171143, + "learning_rate": 0.0007466649520076823, + "loss": 2.6867, + "step": 11549 + }, + { + "epoch": 0.3424962191975803, + "grad_norm": 0.10619610548019409, + "learning_rate": 0.0007466240240002099, + "loss": 2.7026, + "step": 11550 + }, + { + "epoch": 0.34252587254989175, + "grad_norm": 0.10629469901323318, + "learning_rate": 0.0007465830938088397, + "loss": 2.7243, + "step": 11551 + }, + { + "epoch": 0.34255552590220323, + "grad_norm": 0.10978484153747559, + "learning_rate": 0.0007465421614339337, + "loss": 2.702, + "step": 11552 + }, + { + "epoch": 0.3425851792545147, + "grad_norm": 0.11280618607997894, + "learning_rate": 0.0007465012268758545, + "loss": 2.7169, + "step": 11553 + }, + { + "epoch": 0.3426148326068262, + "grad_norm": 0.11828041076660156, + "learning_rate": 0.0007464602901349647, + "loss": 2.7092, + "step": 11554 + }, + { + "epoch": 0.34264448595913766, + "grad_norm": 0.12114522606134415, + "learning_rate": 0.0007464193512116266, + "loss": 2.6965, + "step": 11555 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 0.13475339114665985, + "learning_rate": 0.0007463784101062031, + "loss": 2.7285, + "step": 11556 + }, + { + "epoch": 0.34270379266376066, + "grad_norm": 0.1348840445280075, + "learning_rate": 0.0007463374668190563, + "loss": 2.6797, + "step": 11557 + }, + { + "epoch": 0.34273344601607214, + "grad_norm": 0.12081205099821091, + "learning_rate": 0.000746296521350549, + "loss": 2.7236, + "step": 11558 + }, + { + "epoch": 0.3427630993683836, + "grad_norm": 0.13489991426467896, + "learning_rate": 0.0007462555737010437, + "loss": 2.7294, + "step": 11559 + }, + { + "epoch": 0.3427927527206951, + "grad_norm": 0.13342605531215668, + "learning_rate": 0.0007462146238709031, + "loss": 2.7185, + "step": 11560 + }, + { + "epoch": 0.34282240607300657, + "grad_norm": 0.14864394068717957, + "learning_rate": 0.0007461736718604897, + "loss": 2.7103, + "step": 11561 + }, + { + "epoch": 0.34285205942531805, + "grad_norm": 0.16141153872013092, + "learning_rate": 0.0007461327176701662, + "loss": 2.7007, + "step": 11562 + }, + { + "epoch": 0.3428817127776295, + "grad_norm": 0.151976078748703, + "learning_rate": 0.0007460917613002952, + "loss": 2.7185, + "step": 11563 + }, + { + "epoch": 0.342911366129941, + "grad_norm": 0.16664637625217438, + "learning_rate": 0.0007460508027512395, + "loss": 2.726, + "step": 11564 + }, + { + "epoch": 0.3429410194822525, + "grad_norm": 0.20266859233379364, + "learning_rate": 0.0007460098420233617, + "loss": 2.7354, + "step": 11565 + }, + { + "epoch": 0.34297067283456395, + "grad_norm": 0.19475585222244263, + "learning_rate": 0.0007459688791170243, + "loss": 2.7121, + "step": 11566 + }, + { + "epoch": 0.3430003261868754, + "grad_norm": 0.14322976768016815, + "learning_rate": 0.0007459279140325905, + "loss": 2.7236, + "step": 11567 + }, + { + "epoch": 0.3430299795391869, + "grad_norm": 0.13702107965946198, + "learning_rate": 0.0007458869467704227, + "loss": 2.7465, + "step": 11568 + }, + { + "epoch": 0.3430596328914984, + "grad_norm": 0.1377323567867279, + "learning_rate": 0.0007458459773308837, + "loss": 2.6981, + "step": 11569 + }, + { + "epoch": 0.34308928624380985, + "grad_norm": 0.13370594382286072, + "learning_rate": 0.0007458050057143365, + "loss": 2.7042, + "step": 11570 + }, + { + "epoch": 0.34311893959612133, + "grad_norm": 0.12495390325784683, + "learning_rate": 0.0007457640319211438, + "loss": 2.7205, + "step": 11571 + }, + { + "epoch": 0.3431485929484328, + "grad_norm": 0.11898249387741089, + "learning_rate": 0.0007457230559516683, + "loss": 2.7046, + "step": 11572 + }, + { + "epoch": 0.3431782463007443, + "grad_norm": 0.1342281848192215, + "learning_rate": 0.000745682077806273, + "loss": 2.7026, + "step": 11573 + }, + { + "epoch": 0.34320789965305576, + "grad_norm": 0.1445656418800354, + "learning_rate": 0.0007456410974853208, + "loss": 2.7393, + "step": 11574 + }, + { + "epoch": 0.34323755300536724, + "grad_norm": 0.1657722443342209, + "learning_rate": 0.0007456001149891744, + "loss": 2.711, + "step": 11575 + }, + { + "epoch": 0.3432672063576787, + "grad_norm": 0.15683919191360474, + "learning_rate": 0.000745559130318197, + "loss": 2.6589, + "step": 11576 + }, + { + "epoch": 0.3432968597099902, + "grad_norm": 0.13457134366035461, + "learning_rate": 0.0007455181434727513, + "loss": 2.7223, + "step": 11577 + }, + { + "epoch": 0.3433265130623017, + "grad_norm": 0.12303858250379562, + "learning_rate": 0.0007454771544532003, + "loss": 2.6773, + "step": 11578 + }, + { + "epoch": 0.3433561664146132, + "grad_norm": 0.11502961814403534, + "learning_rate": 0.0007454361632599068, + "loss": 2.6984, + "step": 11579 + }, + { + "epoch": 0.34338581976692467, + "grad_norm": 0.12590207159519196, + "learning_rate": 0.0007453951698932341, + "loss": 2.7354, + "step": 11580 + }, + { + "epoch": 0.34341547311923615, + "grad_norm": 0.11882756650447845, + "learning_rate": 0.000745354174353545, + "loss": 2.6959, + "step": 11581 + }, + { + "epoch": 0.3434451264715476, + "grad_norm": 0.10875703394412994, + "learning_rate": 0.0007453131766412026, + "loss": 2.6631, + "step": 11582 + }, + { + "epoch": 0.3434747798238591, + "grad_norm": 0.12449284642934799, + "learning_rate": 0.00074527217675657, + "loss": 2.6928, + "step": 11583 + }, + { + "epoch": 0.3435044331761706, + "grad_norm": 0.13159562647342682, + "learning_rate": 0.00074523117470001, + "loss": 2.7248, + "step": 11584 + }, + { + "epoch": 0.34353408652848205, + "grad_norm": 0.1345735788345337, + "learning_rate": 0.0007451901704718859, + "loss": 2.7041, + "step": 11585 + }, + { + "epoch": 0.34356373988079353, + "grad_norm": 0.1362410932779312, + "learning_rate": 0.0007451491640725607, + "loss": 2.7236, + "step": 11586 + }, + { + "epoch": 0.343593393233105, + "grad_norm": 0.1305781751871109, + "learning_rate": 0.0007451081555023976, + "loss": 2.716, + "step": 11587 + }, + { + "epoch": 0.3436230465854165, + "grad_norm": 0.19780988991260529, + "learning_rate": 0.0007450671447617598, + "loss": 2.714, + "step": 11588 + }, + { + "epoch": 0.34365269993772796, + "grad_norm": 0.1512204259634018, + "learning_rate": 0.0007450261318510104, + "loss": 2.7229, + "step": 11589 + }, + { + "epoch": 0.34368235329003943, + "grad_norm": 0.14808060228824615, + "learning_rate": 0.0007449851167705125, + "loss": 2.7204, + "step": 11590 + }, + { + "epoch": 0.3437120066423509, + "grad_norm": 0.1329728215932846, + "learning_rate": 0.0007449440995206294, + "loss": 2.6989, + "step": 11591 + }, + { + "epoch": 0.3437416599946624, + "grad_norm": 0.10954440385103226, + "learning_rate": 0.0007449030801017241, + "loss": 2.744, + "step": 11592 + }, + { + "epoch": 0.34377131334697386, + "grad_norm": 0.10467267781496048, + "learning_rate": 0.0007448620585141599, + "loss": 2.735, + "step": 11593 + }, + { + "epoch": 0.34380096669928534, + "grad_norm": 0.12293455749750137, + "learning_rate": 0.0007448210347583004, + "loss": 2.7298, + "step": 11594 + }, + { + "epoch": 0.3438306200515968, + "grad_norm": 0.14473573863506317, + "learning_rate": 0.0007447800088345084, + "loss": 2.7029, + "step": 11595 + }, + { + "epoch": 0.3438602734039083, + "grad_norm": 0.14107485115528107, + "learning_rate": 0.0007447389807431476, + "loss": 2.6898, + "step": 11596 + }, + { + "epoch": 0.34388992675621977, + "grad_norm": 0.12474064528942108, + "learning_rate": 0.000744697950484581, + "loss": 2.7382, + "step": 11597 + }, + { + "epoch": 0.34391958010853124, + "grad_norm": 0.1307264119386673, + "learning_rate": 0.000744656918059172, + "loss": 2.7117, + "step": 11598 + }, + { + "epoch": 0.3439492334608428, + "grad_norm": 0.13544119894504547, + "learning_rate": 0.0007446158834672843, + "loss": 2.6904, + "step": 11599 + }, + { + "epoch": 0.34397888681315425, + "grad_norm": 0.1274748295545578, + "learning_rate": 0.0007445748467092806, + "loss": 2.6793, + "step": 11600 + }, + { + "epoch": 0.3440085401654657, + "grad_norm": 0.10520192235708237, + "learning_rate": 0.0007445338077855248, + "loss": 2.735, + "step": 11601 + }, + { + "epoch": 0.3440381935177772, + "grad_norm": 0.1456245630979538, + "learning_rate": 0.0007444927666963801, + "loss": 2.7121, + "step": 11602 + }, + { + "epoch": 0.3440678468700887, + "grad_norm": 0.1583389937877655, + "learning_rate": 0.0007444517234422101, + "loss": 2.7217, + "step": 11603 + }, + { + "epoch": 0.34409750022240015, + "grad_norm": 0.16016599535942078, + "learning_rate": 0.000744410678023378, + "loss": 2.7028, + "step": 11604 + }, + { + "epoch": 0.34412715357471163, + "grad_norm": 0.16294579207897186, + "learning_rate": 0.0007443696304402476, + "loss": 2.7304, + "step": 11605 + }, + { + "epoch": 0.3441568069270231, + "grad_norm": 0.16645321249961853, + "learning_rate": 0.000744328580693182, + "loss": 2.7295, + "step": 11606 + }, + { + "epoch": 0.3441864602793346, + "grad_norm": 0.15867877006530762, + "learning_rate": 0.0007442875287825448, + "loss": 2.7221, + "step": 11607 + }, + { + "epoch": 0.34421611363164606, + "grad_norm": 0.16396380960941315, + "learning_rate": 0.0007442464747086998, + "loss": 2.7182, + "step": 11608 + }, + { + "epoch": 0.34424576698395754, + "grad_norm": 0.1750689446926117, + "learning_rate": 0.0007442054184720101, + "loss": 2.7165, + "step": 11609 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 0.15677455067634583, + "learning_rate": 0.0007441643600728399, + "loss": 2.7054, + "step": 11610 + }, + { + "epoch": 0.3443050736885805, + "grad_norm": 0.13066741824150085, + "learning_rate": 0.000744123299511552, + "loss": 2.7189, + "step": 11611 + }, + { + "epoch": 0.34433472704089196, + "grad_norm": 0.11034875363111496, + "learning_rate": 0.0007440822367885105, + "loss": 2.6726, + "step": 11612 + }, + { + "epoch": 0.34436438039320344, + "grad_norm": 0.14033390581607819, + "learning_rate": 0.0007440411719040789, + "loss": 2.7281, + "step": 11613 + }, + { + "epoch": 0.3443940337455149, + "grad_norm": 0.1639099270105362, + "learning_rate": 0.0007440001048586209, + "loss": 2.7037, + "step": 11614 + }, + { + "epoch": 0.3444236870978264, + "grad_norm": 0.13437536358833313, + "learning_rate": 0.0007439590356525, + "loss": 2.6743, + "step": 11615 + }, + { + "epoch": 0.34445334045013787, + "grad_norm": 0.1292918175458908, + "learning_rate": 0.0007439179642860802, + "loss": 2.6949, + "step": 11616 + }, + { + "epoch": 0.34448299380244934, + "grad_norm": 0.15491171181201935, + "learning_rate": 0.0007438768907597246, + "loss": 2.739, + "step": 11617 + }, + { + "epoch": 0.3445126471547608, + "grad_norm": 0.14793387055397034, + "learning_rate": 0.0007438358150737974, + "loss": 2.739, + "step": 11618 + }, + { + "epoch": 0.34454230050707235, + "grad_norm": 0.11827848106622696, + "learning_rate": 0.0007437947372286622, + "loss": 2.718, + "step": 11619 + }, + { + "epoch": 0.34457195385938383, + "grad_norm": 0.1293746680021286, + "learning_rate": 0.0007437536572246828, + "loss": 2.6989, + "step": 11620 + }, + { + "epoch": 0.3446016072116953, + "grad_norm": 0.13586480915546417, + "learning_rate": 0.0007437125750622229, + "loss": 2.6752, + "step": 11621 + }, + { + "epoch": 0.3446312605640068, + "grad_norm": 0.12375206500291824, + "learning_rate": 0.0007436714907416465, + "loss": 2.7138, + "step": 11622 + }, + { + "epoch": 0.34466091391631826, + "grad_norm": 0.12806130945682526, + "learning_rate": 0.0007436304042633171, + "loss": 2.6814, + "step": 11623 + }, + { + "epoch": 0.34469056726862973, + "grad_norm": 0.13979408144950867, + "learning_rate": 0.0007435893156275985, + "loss": 2.6766, + "step": 11624 + }, + { + "epoch": 0.3447202206209412, + "grad_norm": 0.12226685136556625, + "learning_rate": 0.0007435482248348547, + "loss": 2.6896, + "step": 11625 + }, + { + "epoch": 0.3447498739732527, + "grad_norm": 0.12065441906452179, + "learning_rate": 0.0007435071318854497, + "loss": 2.7127, + "step": 11626 + }, + { + "epoch": 0.34477952732556416, + "grad_norm": 0.12964856624603271, + "learning_rate": 0.0007434660367797474, + "loss": 2.6898, + "step": 11627 + }, + { + "epoch": 0.34480918067787564, + "grad_norm": 0.1263938695192337, + "learning_rate": 0.0007434249395181113, + "loss": 2.7225, + "step": 11628 + }, + { + "epoch": 0.3448388340301871, + "grad_norm": 0.13731147348880768, + "learning_rate": 0.0007433838401009056, + "loss": 2.7466, + "step": 11629 + }, + { + "epoch": 0.3448684873824986, + "grad_norm": 0.1182718500494957, + "learning_rate": 0.0007433427385284944, + "loss": 2.711, + "step": 11630 + }, + { + "epoch": 0.34489814073481007, + "grad_norm": 0.10129188001155853, + "learning_rate": 0.0007433016348012411, + "loss": 2.7215, + "step": 11631 + }, + { + "epoch": 0.34492779408712154, + "grad_norm": 0.11550556123256683, + "learning_rate": 0.0007432605289195104, + "loss": 2.7344, + "step": 11632 + }, + { + "epoch": 0.344957447439433, + "grad_norm": 0.1127890944480896, + "learning_rate": 0.000743219420883666, + "loss": 2.6751, + "step": 11633 + }, + { + "epoch": 0.3449871007917445, + "grad_norm": 0.11405768990516663, + "learning_rate": 0.0007431783106940718, + "loss": 2.7194, + "step": 11634 + }, + { + "epoch": 0.34501675414405597, + "grad_norm": 0.11014878749847412, + "learning_rate": 0.0007431371983510917, + "loss": 2.6727, + "step": 11635 + }, + { + "epoch": 0.34504640749636745, + "grad_norm": 0.12247221171855927, + "learning_rate": 0.0007430960838550902, + "loss": 2.7162, + "step": 11636 + }, + { + "epoch": 0.3450760608486789, + "grad_norm": 0.1290813833475113, + "learning_rate": 0.0007430549672064311, + "loss": 2.7199, + "step": 11637 + }, + { + "epoch": 0.3451057142009904, + "grad_norm": 0.14120174944400787, + "learning_rate": 0.0007430138484054786, + "loss": 2.6622, + "step": 11638 + }, + { + "epoch": 0.3451353675533019, + "grad_norm": 0.1361636519432068, + "learning_rate": 0.0007429727274525967, + "loss": 2.7303, + "step": 11639 + }, + { + "epoch": 0.3451650209056134, + "grad_norm": 0.12915445864200592, + "learning_rate": 0.0007429316043481496, + "loss": 2.701, + "step": 11640 + }, + { + "epoch": 0.3451946742579249, + "grad_norm": 0.1564289927482605, + "learning_rate": 0.0007428904790925014, + "loss": 2.6929, + "step": 11641 + }, + { + "epoch": 0.34522432761023636, + "grad_norm": 0.1625307947397232, + "learning_rate": 0.0007428493516860164, + "loss": 2.6845, + "step": 11642 + }, + { + "epoch": 0.34525398096254784, + "grad_norm": 0.1391894370317459, + "learning_rate": 0.0007428082221290586, + "loss": 2.7081, + "step": 11643 + }, + { + "epoch": 0.3452836343148593, + "grad_norm": 0.13117346167564392, + "learning_rate": 0.0007427670904219925, + "loss": 2.7353, + "step": 11644 + }, + { + "epoch": 0.3453132876671708, + "grad_norm": 0.13014477491378784, + "learning_rate": 0.000742725956565182, + "loss": 2.6987, + "step": 11645 + }, + { + "epoch": 0.34534294101948226, + "grad_norm": 0.13444556295871735, + "learning_rate": 0.0007426848205589915, + "loss": 2.6725, + "step": 11646 + }, + { + "epoch": 0.34537259437179374, + "grad_norm": 0.13710647821426392, + "learning_rate": 0.0007426436824037853, + "loss": 2.6847, + "step": 11647 + }, + { + "epoch": 0.3454022477241052, + "grad_norm": 0.13726350665092468, + "learning_rate": 0.0007426025420999275, + "loss": 2.7325, + "step": 11648 + }, + { + "epoch": 0.3454319010764167, + "grad_norm": 0.13191433250904083, + "learning_rate": 0.0007425613996477828, + "loss": 2.693, + "step": 11649 + }, + { + "epoch": 0.34546155442872817, + "grad_norm": 0.14379194378852844, + "learning_rate": 0.000742520255047715, + "loss": 2.6907, + "step": 11650 + }, + { + "epoch": 0.34549120778103964, + "grad_norm": 0.14549501240253448, + "learning_rate": 0.0007424791083000888, + "loss": 2.7245, + "step": 11651 + }, + { + "epoch": 0.3455208611333511, + "grad_norm": 0.1626100093126297, + "learning_rate": 0.0007424379594052686, + "loss": 2.7024, + "step": 11652 + }, + { + "epoch": 0.3455505144856626, + "grad_norm": 0.1517837941646576, + "learning_rate": 0.0007423968083636185, + "loss": 2.6887, + "step": 11653 + }, + { + "epoch": 0.3455801678379741, + "grad_norm": 0.15008558332920074, + "learning_rate": 0.0007423556551755032, + "loss": 2.713, + "step": 11654 + }, + { + "epoch": 0.34560982119028555, + "grad_norm": 0.15915358066558838, + "learning_rate": 0.000742314499841287, + "loss": 2.7165, + "step": 11655 + }, + { + "epoch": 0.345639474542597, + "grad_norm": 0.1463482677936554, + "learning_rate": 0.0007422733423613342, + "loss": 2.6913, + "step": 11656 + }, + { + "epoch": 0.3456691278949085, + "grad_norm": 0.12959174811840057, + "learning_rate": 0.0007422321827360093, + "loss": 2.7107, + "step": 11657 + }, + { + "epoch": 0.34569878124722, + "grad_norm": 0.1285102218389511, + "learning_rate": 0.0007421910209656769, + "loss": 2.6989, + "step": 11658 + }, + { + "epoch": 0.34572843459953145, + "grad_norm": 0.12981963157653809, + "learning_rate": 0.0007421498570507015, + "loss": 2.7306, + "step": 11659 + }, + { + "epoch": 0.34575808795184293, + "grad_norm": 0.12964275479316711, + "learning_rate": 0.0007421086909914477, + "loss": 2.7073, + "step": 11660 + }, + { + "epoch": 0.34578774130415446, + "grad_norm": 0.1415739357471466, + "learning_rate": 0.0007420675227882796, + "loss": 2.707, + "step": 11661 + }, + { + "epoch": 0.34581739465646594, + "grad_norm": 0.1359989494085312, + "learning_rate": 0.0007420263524415622, + "loss": 2.6948, + "step": 11662 + }, + { + "epoch": 0.3458470480087774, + "grad_norm": 0.1461770236492157, + "learning_rate": 0.0007419851799516597, + "loss": 2.7277, + "step": 11663 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 0.1602599173784256, + "learning_rate": 0.000741944005318937, + "loss": 2.7122, + "step": 11664 + }, + { + "epoch": 0.34590635471340037, + "grad_norm": 0.1619303822517395, + "learning_rate": 0.0007419028285437588, + "loss": 2.6842, + "step": 11665 + }, + { + "epoch": 0.34593600806571184, + "grad_norm": 0.1455519050359726, + "learning_rate": 0.0007418616496264892, + "loss": 2.7146, + "step": 11666 + }, + { + "epoch": 0.3459656614180233, + "grad_norm": 0.11920200288295746, + "learning_rate": 0.0007418204685674934, + "loss": 2.7178, + "step": 11667 + }, + { + "epoch": 0.3459953147703348, + "grad_norm": 0.12589925527572632, + "learning_rate": 0.0007417792853671358, + "loss": 2.7137, + "step": 11668 + }, + { + "epoch": 0.34602496812264627, + "grad_norm": 0.14152728021144867, + "learning_rate": 0.000741738100025781, + "loss": 2.7164, + "step": 11669 + }, + { + "epoch": 0.34605462147495775, + "grad_norm": 0.14509466290473938, + "learning_rate": 0.0007416969125437941, + "loss": 2.7209, + "step": 11670 + }, + { + "epoch": 0.3460842748272692, + "grad_norm": 0.1273423582315445, + "learning_rate": 0.0007416557229215394, + "loss": 2.6917, + "step": 11671 + }, + { + "epoch": 0.3461139281795807, + "grad_norm": 0.11462123692035675, + "learning_rate": 0.0007416145311593818, + "loss": 2.6536, + "step": 11672 + }, + { + "epoch": 0.3461435815318922, + "grad_norm": 0.12214559316635132, + "learning_rate": 0.000741573337257686, + "loss": 2.7104, + "step": 11673 + }, + { + "epoch": 0.34617323488420365, + "grad_norm": 0.1282971054315567, + "learning_rate": 0.0007415321412168169, + "loss": 2.7254, + "step": 11674 + }, + { + "epoch": 0.3462028882365151, + "grad_norm": 0.13683901727199554, + "learning_rate": 0.0007414909430371393, + "loss": 2.7143, + "step": 11675 + }, + { + "epoch": 0.3462325415888266, + "grad_norm": 0.12714971601963043, + "learning_rate": 0.0007414497427190177, + "loss": 2.7246, + "step": 11676 + }, + { + "epoch": 0.3462621949411381, + "grad_norm": 0.12230009585618973, + "learning_rate": 0.0007414085402628175, + "loss": 2.7321, + "step": 11677 + }, + { + "epoch": 0.34629184829344956, + "grad_norm": 0.1355016529560089, + "learning_rate": 0.000741367335668903, + "loss": 2.7089, + "step": 11678 + }, + { + "epoch": 0.34632150164576103, + "grad_norm": 0.157108873128891, + "learning_rate": 0.0007413261289376394, + "loss": 2.7211, + "step": 11679 + }, + { + "epoch": 0.3463511549980725, + "grad_norm": 0.1745269000530243, + "learning_rate": 0.0007412849200693915, + "loss": 2.7213, + "step": 11680 + }, + { + "epoch": 0.346380808350384, + "grad_norm": 0.1707896590232849, + "learning_rate": 0.0007412437090645242, + "loss": 2.7221, + "step": 11681 + }, + { + "epoch": 0.3464104617026955, + "grad_norm": 0.16950054466724396, + "learning_rate": 0.0007412024959234025, + "loss": 2.7195, + "step": 11682 + }, + { + "epoch": 0.346440115055007, + "grad_norm": 0.1827971190214157, + "learning_rate": 0.0007411612806463912, + "loss": 2.7113, + "step": 11683 + }, + { + "epoch": 0.34646976840731847, + "grad_norm": 0.17011204361915588, + "learning_rate": 0.0007411200632338555, + "loss": 2.7137, + "step": 11684 + }, + { + "epoch": 0.34649942175962994, + "grad_norm": 0.12839075922966003, + "learning_rate": 0.0007410788436861602, + "loss": 2.7099, + "step": 11685 + }, + { + "epoch": 0.3465290751119414, + "grad_norm": 0.12459049373865128, + "learning_rate": 0.0007410376220036703, + "loss": 2.7097, + "step": 11686 + }, + { + "epoch": 0.3465587284642529, + "grad_norm": 0.1348632574081421, + "learning_rate": 0.000740996398186751, + "loss": 2.7251, + "step": 11687 + }, + { + "epoch": 0.3465883818165644, + "grad_norm": 0.12324494868516922, + "learning_rate": 0.0007409551722357672, + "loss": 2.7263, + "step": 11688 + }, + { + "epoch": 0.34661803516887585, + "grad_norm": 0.11908023804426193, + "learning_rate": 0.0007409139441510839, + "loss": 2.67, + "step": 11689 + }, + { + "epoch": 0.3466476885211873, + "grad_norm": 0.14301255345344543, + "learning_rate": 0.0007408727139330663, + "loss": 2.69, + "step": 11690 + }, + { + "epoch": 0.3466773418734988, + "grad_norm": 0.14987348020076752, + "learning_rate": 0.0007408314815820794, + "loss": 2.7144, + "step": 11691 + }, + { + "epoch": 0.3467069952258103, + "grad_norm": 0.1337135285139084, + "learning_rate": 0.0007407902470984887, + "loss": 2.6903, + "step": 11692 + }, + { + "epoch": 0.34673664857812175, + "grad_norm": 0.13053086400032043, + "learning_rate": 0.0007407490104826587, + "loss": 2.7167, + "step": 11693 + }, + { + "epoch": 0.34676630193043323, + "grad_norm": 0.1450785994529724, + "learning_rate": 0.000740707771734955, + "loss": 2.7092, + "step": 11694 + }, + { + "epoch": 0.3467959552827447, + "grad_norm": 0.1541498452425003, + "learning_rate": 0.0007406665308557426, + "loss": 2.6959, + "step": 11695 + }, + { + "epoch": 0.3468256086350562, + "grad_norm": 0.13258926570415497, + "learning_rate": 0.0007406252878453867, + "loss": 2.6939, + "step": 11696 + }, + { + "epoch": 0.34685526198736766, + "grad_norm": 0.1631488800048828, + "learning_rate": 0.0007405840427042528, + "loss": 2.7084, + "step": 11697 + }, + { + "epoch": 0.34688491533967913, + "grad_norm": 0.158323734998703, + "learning_rate": 0.0007405427954327059, + "loss": 2.7033, + "step": 11698 + }, + { + "epoch": 0.3469145686919906, + "grad_norm": 0.17605382204055786, + "learning_rate": 0.0007405015460311111, + "loss": 2.6738, + "step": 11699 + }, + { + "epoch": 0.3469442220443021, + "grad_norm": 0.19264477491378784, + "learning_rate": 0.0007404602944998339, + "loss": 2.7235, + "step": 11700 + }, + { + "epoch": 0.34697387539661356, + "grad_norm": 0.14533253014087677, + "learning_rate": 0.0007404190408392395, + "loss": 2.7078, + "step": 11701 + }, + { + "epoch": 0.34700352874892504, + "grad_norm": 0.14759083092212677, + "learning_rate": 0.0007403777850496931, + "loss": 2.7291, + "step": 11702 + }, + { + "epoch": 0.34703318210123657, + "grad_norm": 0.16349723935127258, + "learning_rate": 0.0007403365271315604, + "loss": 2.6881, + "step": 11703 + }, + { + "epoch": 0.34706283545354805, + "grad_norm": 0.1425987333059311, + "learning_rate": 0.0007402952670852065, + "loss": 2.7427, + "step": 11704 + }, + { + "epoch": 0.3470924888058595, + "grad_norm": 0.14449870586395264, + "learning_rate": 0.0007402540049109966, + "loss": 2.7097, + "step": 11705 + }, + { + "epoch": 0.347122142158171, + "grad_norm": 0.13873189687728882, + "learning_rate": 0.0007402127406092962, + "loss": 2.731, + "step": 11706 + }, + { + "epoch": 0.3471517955104825, + "grad_norm": 0.1275952011346817, + "learning_rate": 0.0007401714741804709, + "loss": 2.6865, + "step": 11707 + }, + { + "epoch": 0.34718144886279395, + "grad_norm": 0.12397986650466919, + "learning_rate": 0.0007401302056248859, + "loss": 2.7031, + "step": 11708 + }, + { + "epoch": 0.3472111022151054, + "grad_norm": 0.1202307865023613, + "learning_rate": 0.0007400889349429067, + "loss": 2.678, + "step": 11709 + }, + { + "epoch": 0.3472407555674169, + "grad_norm": 0.1164819523692131, + "learning_rate": 0.0007400476621348989, + "loss": 2.7419, + "step": 11710 + }, + { + "epoch": 0.3472704089197284, + "grad_norm": 0.1128787025809288, + "learning_rate": 0.0007400063872012277, + "loss": 2.6597, + "step": 11711 + }, + { + "epoch": 0.34730006227203986, + "grad_norm": 0.11632820963859558, + "learning_rate": 0.0007399651101422588, + "loss": 2.7132, + "step": 11712 + }, + { + "epoch": 0.34732971562435133, + "grad_norm": 0.13477975130081177, + "learning_rate": 0.0007399238309583575, + "loss": 2.7312, + "step": 11713 + }, + { + "epoch": 0.3473593689766628, + "grad_norm": 0.13850994408130646, + "learning_rate": 0.0007398825496498898, + "loss": 2.7033, + "step": 11714 + }, + { + "epoch": 0.3473890223289743, + "grad_norm": 0.140826016664505, + "learning_rate": 0.0007398412662172207, + "loss": 2.7415, + "step": 11715 + }, + { + "epoch": 0.34741867568128576, + "grad_norm": 0.14127355813980103, + "learning_rate": 0.0007397999806607161, + "loss": 2.7379, + "step": 11716 + }, + { + "epoch": 0.34744832903359724, + "grad_norm": 0.14275527000427246, + "learning_rate": 0.0007397586929807416, + "loss": 2.6949, + "step": 11717 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 0.15957292914390564, + "learning_rate": 0.0007397174031776626, + "loss": 2.6882, + "step": 11718 + }, + { + "epoch": 0.3475076357382202, + "grad_norm": 0.1431155800819397, + "learning_rate": 0.0007396761112518449, + "loss": 2.6849, + "step": 11719 + }, + { + "epoch": 0.34753728909053166, + "grad_norm": 0.11675826460123062, + "learning_rate": 0.0007396348172036542, + "loss": 2.7346, + "step": 11720 + }, + { + "epoch": 0.34756694244284314, + "grad_norm": 0.12785489857196808, + "learning_rate": 0.0007395935210334558, + "loss": 2.7365, + "step": 11721 + }, + { + "epoch": 0.3475965957951546, + "grad_norm": 0.15750627219676971, + "learning_rate": 0.0007395522227416158, + "loss": 2.7091, + "step": 11722 + }, + { + "epoch": 0.34762624914746615, + "grad_norm": 0.1426733136177063, + "learning_rate": 0.0007395109223284997, + "loss": 2.6883, + "step": 11723 + }, + { + "epoch": 0.3476559024997776, + "grad_norm": 0.13163794577121735, + "learning_rate": 0.0007394696197944733, + "loss": 2.6978, + "step": 11724 + }, + { + "epoch": 0.3476855558520891, + "grad_norm": 0.13178054988384247, + "learning_rate": 0.0007394283151399025, + "loss": 2.7144, + "step": 11725 + }, + { + "epoch": 0.3477152092044006, + "grad_norm": 0.1467788815498352, + "learning_rate": 0.0007393870083651526, + "loss": 2.7338, + "step": 11726 + }, + { + "epoch": 0.34774486255671205, + "grad_norm": 0.15238915383815765, + "learning_rate": 0.0007393456994705898, + "loss": 2.7405, + "step": 11727 + }, + { + "epoch": 0.34777451590902353, + "grad_norm": 0.1282469481229782, + "learning_rate": 0.0007393043884565795, + "loss": 2.7122, + "step": 11728 + }, + { + "epoch": 0.347804169261335, + "grad_norm": 0.12203970551490784, + "learning_rate": 0.000739263075323488, + "loss": 2.6981, + "step": 11729 + }, + { + "epoch": 0.3478338226136465, + "grad_norm": 0.12430267035961151, + "learning_rate": 0.000739221760071681, + "loss": 2.707, + "step": 11730 + }, + { + "epoch": 0.34786347596595796, + "grad_norm": 0.1071382686495781, + "learning_rate": 0.0007391804427015241, + "loss": 2.7283, + "step": 11731 + }, + { + "epoch": 0.34789312931826943, + "grad_norm": 0.1285751461982727, + "learning_rate": 0.0007391391232133834, + "loss": 2.7058, + "step": 11732 + }, + { + "epoch": 0.3479227826705809, + "grad_norm": 0.1501922905445099, + "learning_rate": 0.0007390978016076247, + "loss": 2.7245, + "step": 11733 + }, + { + "epoch": 0.3479524360228924, + "grad_norm": 0.15852892398834229, + "learning_rate": 0.0007390564778846137, + "loss": 2.7224, + "step": 11734 + }, + { + "epoch": 0.34798208937520386, + "grad_norm": 0.16053782403469086, + "learning_rate": 0.0007390151520447166, + "loss": 2.7315, + "step": 11735 + }, + { + "epoch": 0.34801174272751534, + "grad_norm": 0.1362522840499878, + "learning_rate": 0.0007389738240882996, + "loss": 2.6925, + "step": 11736 + }, + { + "epoch": 0.3480413960798268, + "grad_norm": 0.13254761695861816, + "learning_rate": 0.0007389324940157283, + "loss": 2.7256, + "step": 11737 + }, + { + "epoch": 0.3480710494321383, + "grad_norm": 0.13947109878063202, + "learning_rate": 0.0007388911618273685, + "loss": 2.7085, + "step": 11738 + }, + { + "epoch": 0.34810070278444977, + "grad_norm": 0.15107281506061554, + "learning_rate": 0.0007388498275235866, + "loss": 2.7221, + "step": 11739 + }, + { + "epoch": 0.34813035613676124, + "grad_norm": 0.15770018100738525, + "learning_rate": 0.0007388084911047485, + "loss": 2.7113, + "step": 11740 + }, + { + "epoch": 0.3481600094890727, + "grad_norm": 0.16092313826084137, + "learning_rate": 0.0007387671525712203, + "loss": 2.7039, + "step": 11741 + }, + { + "epoch": 0.3481896628413842, + "grad_norm": 0.18209169805049896, + "learning_rate": 0.0007387258119233679, + "loss": 2.7148, + "step": 11742 + }, + { + "epoch": 0.34821931619369567, + "grad_norm": 0.17441371083259583, + "learning_rate": 0.0007386844691615576, + "loss": 2.6998, + "step": 11743 + }, + { + "epoch": 0.3482489695460072, + "grad_norm": 0.1343757063150406, + "learning_rate": 0.0007386431242861551, + "loss": 2.7085, + "step": 11744 + }, + { + "epoch": 0.3482786228983187, + "grad_norm": 0.1350119262933731, + "learning_rate": 0.0007386017772975269, + "loss": 2.6905, + "step": 11745 + }, + { + "epoch": 0.34830827625063016, + "grad_norm": 0.14811554551124573, + "learning_rate": 0.0007385604281960389, + "loss": 2.7007, + "step": 11746 + }, + { + "epoch": 0.34833792960294163, + "grad_norm": 0.13543593883514404, + "learning_rate": 0.0007385190769820574, + "loss": 2.7084, + "step": 11747 + }, + { + "epoch": 0.3483675829552531, + "grad_norm": 0.12427181005477905, + "learning_rate": 0.0007384777236559487, + "loss": 2.6897, + "step": 11748 + }, + { + "epoch": 0.3483972363075646, + "grad_norm": 0.14115473628044128, + "learning_rate": 0.0007384363682180786, + "loss": 2.6935, + "step": 11749 + }, + { + "epoch": 0.34842688965987606, + "grad_norm": 0.13925573229789734, + "learning_rate": 0.0007383950106688137, + "loss": 2.6797, + "step": 11750 + }, + { + "epoch": 0.34845654301218754, + "grad_norm": 0.12909246981143951, + "learning_rate": 0.00073835365100852, + "loss": 2.7221, + "step": 11751 + }, + { + "epoch": 0.348486196364499, + "grad_norm": 0.14055460691452026, + "learning_rate": 0.0007383122892375638, + "loss": 2.708, + "step": 11752 + }, + { + "epoch": 0.3485158497168105, + "grad_norm": 0.11930206418037415, + "learning_rate": 0.0007382709253563114, + "loss": 2.6963, + "step": 11753 + }, + { + "epoch": 0.34854550306912196, + "grad_norm": 0.12395763397216797, + "learning_rate": 0.0007382295593651289, + "loss": 2.6869, + "step": 11754 + }, + { + "epoch": 0.34857515642143344, + "grad_norm": 0.12522879242897034, + "learning_rate": 0.000738188191264383, + "loss": 2.7301, + "step": 11755 + }, + { + "epoch": 0.3486048097737449, + "grad_norm": 0.13450011610984802, + "learning_rate": 0.0007381468210544397, + "loss": 2.6881, + "step": 11756 + }, + { + "epoch": 0.3486344631260564, + "grad_norm": 0.13622203469276428, + "learning_rate": 0.0007381054487356653, + "loss": 2.6818, + "step": 11757 + }, + { + "epoch": 0.34866411647836787, + "grad_norm": 0.1365165412425995, + "learning_rate": 0.0007380640743084265, + "loss": 2.7037, + "step": 11758 + }, + { + "epoch": 0.34869376983067935, + "grad_norm": 0.12806379795074463, + "learning_rate": 0.0007380226977730893, + "loss": 2.7043, + "step": 11759 + }, + { + "epoch": 0.3487234231829908, + "grad_norm": 0.1450096219778061, + "learning_rate": 0.0007379813191300202, + "loss": 2.7107, + "step": 11760 + }, + { + "epoch": 0.3487530765353023, + "grad_norm": 0.16406142711639404, + "learning_rate": 0.0007379399383795857, + "loss": 2.7345, + "step": 11761 + }, + { + "epoch": 0.3487827298876138, + "grad_norm": 0.16871528327465057, + "learning_rate": 0.0007378985555221523, + "loss": 2.6764, + "step": 11762 + }, + { + "epoch": 0.34881238323992525, + "grad_norm": 0.18230292201042175, + "learning_rate": 0.0007378571705580864, + "loss": 2.7135, + "step": 11763 + }, + { + "epoch": 0.3488420365922367, + "grad_norm": 0.16952721774578094, + "learning_rate": 0.0007378157834877543, + "loss": 2.7323, + "step": 11764 + }, + { + "epoch": 0.34887168994454826, + "grad_norm": 0.13965140283107758, + "learning_rate": 0.0007377743943115227, + "loss": 2.6675, + "step": 11765 + }, + { + "epoch": 0.34890134329685973, + "grad_norm": 0.1447756439447403, + "learning_rate": 0.0007377330030297579, + "loss": 2.748, + "step": 11766 + }, + { + "epoch": 0.3489309966491712, + "grad_norm": 0.1450134664773941, + "learning_rate": 0.0007376916096428266, + "loss": 2.6914, + "step": 11767 + }, + { + "epoch": 0.3489606500014827, + "grad_norm": 0.13800442218780518, + "learning_rate": 0.0007376502141510956, + "loss": 2.7073, + "step": 11768 + }, + { + "epoch": 0.34899030335379416, + "grad_norm": 0.1274445652961731, + "learning_rate": 0.0007376088165549308, + "loss": 2.7186, + "step": 11769 + }, + { + "epoch": 0.34901995670610564, + "grad_norm": 0.13681696355342865, + "learning_rate": 0.0007375674168546993, + "loss": 2.6975, + "step": 11770 + }, + { + "epoch": 0.3490496100584171, + "grad_norm": 0.14682556688785553, + "learning_rate": 0.0007375260150507674, + "loss": 2.7107, + "step": 11771 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 0.11828472465276718, + "learning_rate": 0.0007374846111435021, + "loss": 2.6583, + "step": 11772 + }, + { + "epoch": 0.34910891676304007, + "grad_norm": 0.1250905692577362, + "learning_rate": 0.0007374432051332695, + "loss": 2.6931, + "step": 11773 + }, + { + "epoch": 0.34913857011535154, + "grad_norm": 0.13518112897872925, + "learning_rate": 0.0007374017970204368, + "loss": 2.6812, + "step": 11774 + }, + { + "epoch": 0.349168223467663, + "grad_norm": 0.13105636835098267, + "learning_rate": 0.0007373603868053703, + "loss": 2.6785, + "step": 11775 + }, + { + "epoch": 0.3491978768199745, + "grad_norm": 0.1257840394973755, + "learning_rate": 0.000737318974488437, + "loss": 2.6891, + "step": 11776 + }, + { + "epoch": 0.34922753017228597, + "grad_norm": 0.13274292647838593, + "learning_rate": 0.0007372775600700032, + "loss": 2.6828, + "step": 11777 + }, + { + "epoch": 0.34925718352459745, + "grad_norm": 0.14654354751110077, + "learning_rate": 0.0007372361435504361, + "loss": 2.7021, + "step": 11778 + }, + { + "epoch": 0.3492868368769089, + "grad_norm": 0.1417524814605713, + "learning_rate": 0.0007371947249301021, + "loss": 2.6789, + "step": 11779 + }, + { + "epoch": 0.3493164902292204, + "grad_norm": 0.13056506216526031, + "learning_rate": 0.0007371533042093682, + "loss": 2.7295, + "step": 11780 + }, + { + "epoch": 0.3493461435815319, + "grad_norm": 0.1225033849477768, + "learning_rate": 0.0007371118813886009, + "loss": 2.6966, + "step": 11781 + }, + { + "epoch": 0.34937579693384335, + "grad_norm": 0.13339261710643768, + "learning_rate": 0.0007370704564681673, + "loss": 2.7359, + "step": 11782 + }, + { + "epoch": 0.34940545028615483, + "grad_norm": 0.13234874606132507, + "learning_rate": 0.0007370290294484343, + "loss": 2.7093, + "step": 11783 + }, + { + "epoch": 0.3494351036384663, + "grad_norm": 0.13208317756652832, + "learning_rate": 0.0007369876003297684, + "loss": 2.734, + "step": 11784 + }, + { + "epoch": 0.3494647569907778, + "grad_norm": 0.140350803732872, + "learning_rate": 0.0007369461691125366, + "loss": 2.7131, + "step": 11785 + }, + { + "epoch": 0.3494944103430893, + "grad_norm": 0.1645546853542328, + "learning_rate": 0.0007369047357971057, + "loss": 2.7088, + "step": 11786 + }, + { + "epoch": 0.3495240636954008, + "grad_norm": 0.14086413383483887, + "learning_rate": 0.0007368633003838428, + "loss": 2.722, + "step": 11787 + }, + { + "epoch": 0.34955371704771226, + "grad_norm": 0.1418641209602356, + "learning_rate": 0.0007368218628731148, + "loss": 2.7013, + "step": 11788 + }, + { + "epoch": 0.34958337040002374, + "grad_norm": 0.13060133159160614, + "learning_rate": 0.0007367804232652885, + "loss": 2.7186, + "step": 11789 + }, + { + "epoch": 0.3496130237523352, + "grad_norm": 0.13165883719921112, + "learning_rate": 0.0007367389815607308, + "loss": 2.7071, + "step": 11790 + }, + { + "epoch": 0.3496426771046467, + "grad_norm": 0.13318605720996857, + "learning_rate": 0.000736697537759809, + "loss": 2.6899, + "step": 11791 + }, + { + "epoch": 0.34967233045695817, + "grad_norm": 0.11372331529855728, + "learning_rate": 0.0007366560918628897, + "loss": 2.708, + "step": 11792 + }, + { + "epoch": 0.34970198380926965, + "grad_norm": 0.1258470118045807, + "learning_rate": 0.0007366146438703402, + "loss": 2.6999, + "step": 11793 + }, + { + "epoch": 0.3497316371615811, + "grad_norm": 0.1309257447719574, + "learning_rate": 0.0007365731937825273, + "loss": 2.7334, + "step": 11794 + }, + { + "epoch": 0.3497612905138926, + "grad_norm": 0.12258006632328033, + "learning_rate": 0.0007365317415998181, + "loss": 2.7592, + "step": 11795 + }, + { + "epoch": 0.3497909438662041, + "grad_norm": 0.1319655478000641, + "learning_rate": 0.00073649028732258, + "loss": 2.7082, + "step": 11796 + }, + { + "epoch": 0.34982059721851555, + "grad_norm": 0.12640278041362762, + "learning_rate": 0.0007364488309511796, + "loss": 2.7216, + "step": 11797 + }, + { + "epoch": 0.349850250570827, + "grad_norm": 0.15667173266410828, + "learning_rate": 0.0007364073724859843, + "loss": 2.7155, + "step": 11798 + }, + { + "epoch": 0.3498799039231385, + "grad_norm": 0.13208967447280884, + "learning_rate": 0.000736365911927361, + "loss": 2.6854, + "step": 11799 + }, + { + "epoch": 0.34990955727545, + "grad_norm": 0.12531836330890656, + "learning_rate": 0.0007363244492756771, + "loss": 2.7009, + "step": 11800 + }, + { + "epoch": 0.34993921062776145, + "grad_norm": 0.131753072142601, + "learning_rate": 0.0007362829845312997, + "loss": 2.6903, + "step": 11801 + }, + { + "epoch": 0.34996886398007293, + "grad_norm": 0.12698814272880554, + "learning_rate": 0.000736241517694596, + "loss": 2.7018, + "step": 11802 + }, + { + "epoch": 0.3499985173323844, + "grad_norm": 0.12149021774530411, + "learning_rate": 0.0007362000487659328, + "loss": 2.7263, + "step": 11803 + }, + { + "epoch": 0.3500281706846959, + "grad_norm": 0.1228727176785469, + "learning_rate": 0.0007361585777456779, + "loss": 2.7234, + "step": 11804 + }, + { + "epoch": 0.35005782403700736, + "grad_norm": 0.1259680837392807, + "learning_rate": 0.0007361171046341979, + "loss": 2.7182, + "step": 11805 + }, + { + "epoch": 0.35008747738931884, + "grad_norm": 0.13218162953853607, + "learning_rate": 0.0007360756294318608, + "loss": 2.7181, + "step": 11806 + }, + { + "epoch": 0.35011713074163037, + "grad_norm": 0.1463705450296402, + "learning_rate": 0.0007360341521390332, + "loss": 2.7118, + "step": 11807 + }, + { + "epoch": 0.35014678409394184, + "grad_norm": 0.14679810404777527, + "learning_rate": 0.0007359926727560828, + "loss": 2.723, + "step": 11808 + }, + { + "epoch": 0.3501764374462533, + "grad_norm": 0.18029898405075073, + "learning_rate": 0.0007359511912833768, + "loss": 2.7473, + "step": 11809 + }, + { + "epoch": 0.3502060907985648, + "grad_norm": 0.18020303547382355, + "learning_rate": 0.0007359097077212825, + "loss": 2.7162, + "step": 11810 + }, + { + "epoch": 0.35023574415087627, + "grad_norm": 0.16713552176952362, + "learning_rate": 0.0007358682220701671, + "loss": 2.7216, + "step": 11811 + }, + { + "epoch": 0.35026539750318775, + "grad_norm": 0.17420357465744019, + "learning_rate": 0.0007358267343303983, + "loss": 2.6375, + "step": 11812 + }, + { + "epoch": 0.3502950508554992, + "grad_norm": 0.212269127368927, + "learning_rate": 0.0007357852445023432, + "loss": 2.704, + "step": 11813 + }, + { + "epoch": 0.3503247042078107, + "grad_norm": 0.1315125674009323, + "learning_rate": 0.0007357437525863693, + "loss": 2.7016, + "step": 11814 + }, + { + "epoch": 0.3503543575601222, + "grad_norm": 0.18939867615699768, + "learning_rate": 0.000735702258582844, + "loss": 2.7003, + "step": 11815 + }, + { + "epoch": 0.35038401091243365, + "grad_norm": 0.16691678762435913, + "learning_rate": 0.0007356607624921347, + "loss": 2.7365, + "step": 11816 + }, + { + "epoch": 0.35041366426474513, + "grad_norm": 0.14647814631462097, + "learning_rate": 0.0007356192643146089, + "loss": 2.7168, + "step": 11817 + }, + { + "epoch": 0.3504433176170566, + "grad_norm": 0.14422111213207245, + "learning_rate": 0.0007355777640506341, + "loss": 2.6922, + "step": 11818 + }, + { + "epoch": 0.3504729709693681, + "grad_norm": 0.13452595472335815, + "learning_rate": 0.0007355362617005778, + "loss": 2.723, + "step": 11819 + }, + { + "epoch": 0.35050262432167956, + "grad_norm": 0.1360434591770172, + "learning_rate": 0.0007354947572648074, + "loss": 2.7209, + "step": 11820 + }, + { + "epoch": 0.35053227767399103, + "grad_norm": 0.13364547491073608, + "learning_rate": 0.0007354532507436906, + "loss": 2.7032, + "step": 11821 + }, + { + "epoch": 0.3505619310263025, + "grad_norm": 0.1175767108798027, + "learning_rate": 0.0007354117421375948, + "loss": 2.6899, + "step": 11822 + }, + { + "epoch": 0.350591584378614, + "grad_norm": 0.1260121464729309, + "learning_rate": 0.0007353702314468878, + "loss": 2.7045, + "step": 11823 + }, + { + "epoch": 0.35062123773092546, + "grad_norm": 0.13577619194984436, + "learning_rate": 0.0007353287186719367, + "loss": 2.739, + "step": 11824 + }, + { + "epoch": 0.35065089108323694, + "grad_norm": 0.12831972539424896, + "learning_rate": 0.0007352872038131095, + "loss": 2.7167, + "step": 11825 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 0.13612507283687592, + "learning_rate": 0.0007352456868707737, + "loss": 2.7375, + "step": 11826 + }, + { + "epoch": 0.35071019778785995, + "grad_norm": 0.14269587397575378, + "learning_rate": 0.000735204167845297, + "loss": 2.7008, + "step": 11827 + }, + { + "epoch": 0.3507398511401714, + "grad_norm": 0.1519625335931778, + "learning_rate": 0.000735162646737047, + "loss": 2.7303, + "step": 11828 + }, + { + "epoch": 0.3507695044924829, + "grad_norm": 0.14772504568099976, + "learning_rate": 0.0007351211235463915, + "loss": 2.6982, + "step": 11829 + }, + { + "epoch": 0.3507991578447944, + "grad_norm": 0.13650740683078766, + "learning_rate": 0.000735079598273698, + "loss": 2.6889, + "step": 11830 + }, + { + "epoch": 0.35082881119710585, + "grad_norm": 0.15220893919467926, + "learning_rate": 0.0007350380709193342, + "loss": 2.7316, + "step": 11831 + }, + { + "epoch": 0.3508584645494173, + "grad_norm": 0.18411721289157867, + "learning_rate": 0.0007349965414836681, + "loss": 2.7003, + "step": 11832 + }, + { + "epoch": 0.3508881179017288, + "grad_norm": 0.16136783361434937, + "learning_rate": 0.0007349550099670672, + "loss": 2.6771, + "step": 11833 + }, + { + "epoch": 0.3509177712540403, + "grad_norm": 0.15096616744995117, + "learning_rate": 0.0007349134763698996, + "loss": 2.6973, + "step": 11834 + }, + { + "epoch": 0.35094742460635175, + "grad_norm": 0.1629086583852768, + "learning_rate": 0.0007348719406925326, + "loss": 2.7062, + "step": 11835 + }, + { + "epoch": 0.35097707795866323, + "grad_norm": 0.15183432400226593, + "learning_rate": 0.0007348304029353343, + "loss": 2.7048, + "step": 11836 + }, + { + "epoch": 0.3510067313109747, + "grad_norm": 0.13290049135684967, + "learning_rate": 0.0007347888630986723, + "loss": 2.7004, + "step": 11837 + }, + { + "epoch": 0.3510363846632862, + "grad_norm": 0.1473473757505417, + "learning_rate": 0.0007347473211829149, + "loss": 2.6752, + "step": 11838 + }, + { + "epoch": 0.35106603801559766, + "grad_norm": 0.12016640603542328, + "learning_rate": 0.0007347057771884296, + "loss": 2.7389, + "step": 11839 + }, + { + "epoch": 0.35109569136790914, + "grad_norm": 0.10988399386405945, + "learning_rate": 0.0007346642311155844, + "loss": 2.6756, + "step": 11840 + }, + { + "epoch": 0.3511253447202206, + "grad_norm": 0.12414488941431046, + "learning_rate": 0.000734622682964747, + "loss": 2.7107, + "step": 11841 + }, + { + "epoch": 0.3511549980725321, + "grad_norm": 0.1198183074593544, + "learning_rate": 0.0007345811327362856, + "loss": 2.7183, + "step": 11842 + }, + { + "epoch": 0.35118465142484356, + "grad_norm": 0.1174241304397583, + "learning_rate": 0.0007345395804305678, + "loss": 2.6961, + "step": 11843 + }, + { + "epoch": 0.35121430477715504, + "grad_norm": 0.11332199722528458, + "learning_rate": 0.0007344980260479621, + "loss": 2.6847, + "step": 11844 + }, + { + "epoch": 0.3512439581294665, + "grad_norm": 0.11942749470472336, + "learning_rate": 0.0007344564695888358, + "loss": 2.71, + "step": 11845 + }, + { + "epoch": 0.351273611481778, + "grad_norm": 0.12276478856801987, + "learning_rate": 0.0007344149110535574, + "loss": 2.6801, + "step": 11846 + }, + { + "epoch": 0.35130326483408947, + "grad_norm": 0.15539532899856567, + "learning_rate": 0.0007343733504424947, + "loss": 2.6973, + "step": 11847 + }, + { + "epoch": 0.351332918186401, + "grad_norm": 0.1702357530593872, + "learning_rate": 0.0007343317877560158, + "loss": 2.7086, + "step": 11848 + }, + { + "epoch": 0.3513625715387125, + "grad_norm": 0.15152990818023682, + "learning_rate": 0.0007342902229944885, + "loss": 2.6995, + "step": 11849 + }, + { + "epoch": 0.35139222489102395, + "grad_norm": 0.14477790892124176, + "learning_rate": 0.0007342486561582812, + "loss": 2.7141, + "step": 11850 + }, + { + "epoch": 0.35142187824333543, + "grad_norm": 0.13992618024349213, + "learning_rate": 0.0007342070872477618, + "loss": 2.7102, + "step": 11851 + }, + { + "epoch": 0.3514515315956469, + "grad_norm": 0.13402627408504486, + "learning_rate": 0.0007341655162632986, + "loss": 2.7321, + "step": 11852 + }, + { + "epoch": 0.3514811849479584, + "grad_norm": 0.1473323106765747, + "learning_rate": 0.0007341239432052593, + "loss": 2.6973, + "step": 11853 + }, + { + "epoch": 0.35151083830026986, + "grad_norm": 0.14719422161579132, + "learning_rate": 0.0007340823680740125, + "loss": 2.6763, + "step": 11854 + }, + { + "epoch": 0.35154049165258133, + "grad_norm": 0.1268981397151947, + "learning_rate": 0.000734040790869926, + "loss": 2.7284, + "step": 11855 + }, + { + "epoch": 0.3515701450048928, + "grad_norm": 0.12866650521755219, + "learning_rate": 0.0007339992115933683, + "loss": 2.701, + "step": 11856 + }, + { + "epoch": 0.3515997983572043, + "grad_norm": 0.11534550040960312, + "learning_rate": 0.0007339576302447073, + "loss": 2.7311, + "step": 11857 + }, + { + "epoch": 0.35162945170951576, + "grad_norm": 0.12279167771339417, + "learning_rate": 0.0007339160468243114, + "loss": 2.6931, + "step": 11858 + }, + { + "epoch": 0.35165910506182724, + "grad_norm": 0.14104102551937103, + "learning_rate": 0.0007338744613325486, + "loss": 2.7402, + "step": 11859 + }, + { + "epoch": 0.3516887584141387, + "grad_norm": 0.1563587188720703, + "learning_rate": 0.0007338328737697873, + "loss": 2.7314, + "step": 11860 + }, + { + "epoch": 0.3517184117664502, + "grad_norm": 0.14931783080101013, + "learning_rate": 0.000733791284136396, + "loss": 2.7211, + "step": 11861 + }, + { + "epoch": 0.35174806511876167, + "grad_norm": 0.1034044697880745, + "learning_rate": 0.0007337496924327427, + "loss": 2.7054, + "step": 11862 + }, + { + "epoch": 0.35177771847107314, + "grad_norm": 0.12466644495725632, + "learning_rate": 0.0007337080986591955, + "loss": 2.6932, + "step": 11863 + }, + { + "epoch": 0.3518073718233846, + "grad_norm": 0.12647876143455505, + "learning_rate": 0.0007336665028161232, + "loss": 2.7223, + "step": 11864 + }, + { + "epoch": 0.3518370251756961, + "grad_norm": 0.1339520961046219, + "learning_rate": 0.0007336249049038938, + "loss": 2.7414, + "step": 11865 + }, + { + "epoch": 0.35186667852800757, + "grad_norm": 0.14244680106639862, + "learning_rate": 0.0007335833049228758, + "loss": 2.6909, + "step": 11866 + }, + { + "epoch": 0.35189633188031905, + "grad_norm": 0.15355630218982697, + "learning_rate": 0.0007335417028734377, + "loss": 2.6735, + "step": 11867 + }, + { + "epoch": 0.3519259852326305, + "grad_norm": 0.15080618858337402, + "learning_rate": 0.0007335000987559474, + "loss": 2.7338, + "step": 11868 + }, + { + "epoch": 0.35195563858494205, + "grad_norm": 0.13260841369628906, + "learning_rate": 0.0007334584925707739, + "loss": 2.681, + "step": 11869 + }, + { + "epoch": 0.35198529193725353, + "grad_norm": 0.12433555722236633, + "learning_rate": 0.0007334168843182852, + "loss": 2.7043, + "step": 11870 + }, + { + "epoch": 0.352014945289565, + "grad_norm": 0.12378189712762833, + "learning_rate": 0.0007333752739988501, + "loss": 2.6999, + "step": 11871 + }, + { + "epoch": 0.3520445986418765, + "grad_norm": 0.14909830689430237, + "learning_rate": 0.0007333336616128369, + "loss": 2.7169, + "step": 11872 + }, + { + "epoch": 0.35207425199418796, + "grad_norm": 0.1455027461051941, + "learning_rate": 0.000733292047160614, + "loss": 2.7179, + "step": 11873 + }, + { + "epoch": 0.35210390534649944, + "grad_norm": 0.1396310180425644, + "learning_rate": 0.00073325043064255, + "loss": 2.698, + "step": 11874 + }, + { + "epoch": 0.3521335586988109, + "grad_norm": 0.13003981113433838, + "learning_rate": 0.0007332088120590133, + "loss": 2.714, + "step": 11875 + }, + { + "epoch": 0.3521632120511224, + "grad_norm": 0.12334530055522919, + "learning_rate": 0.0007331671914103725, + "loss": 2.6936, + "step": 11876 + }, + { + "epoch": 0.35219286540343386, + "grad_norm": 0.11321516335010529, + "learning_rate": 0.0007331255686969964, + "loss": 2.6968, + "step": 11877 + }, + { + "epoch": 0.35222251875574534, + "grad_norm": 0.12465595453977585, + "learning_rate": 0.0007330839439192533, + "loss": 2.7467, + "step": 11878 + }, + { + "epoch": 0.3522521721080568, + "grad_norm": 0.1267177164554596, + "learning_rate": 0.0007330423170775119, + "loss": 2.7482, + "step": 11879 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 0.13627809286117554, + "learning_rate": 0.0007330006881721407, + "loss": 2.7018, + "step": 11880 + }, + { + "epoch": 0.35231147881267977, + "grad_norm": 0.13360901176929474, + "learning_rate": 0.0007329590572035086, + "loss": 2.6669, + "step": 11881 + }, + { + "epoch": 0.35234113216499124, + "grad_norm": 0.12572729587554932, + "learning_rate": 0.0007329174241719838, + "loss": 2.7061, + "step": 11882 + }, + { + "epoch": 0.3523707855173027, + "grad_norm": 0.12251551449298859, + "learning_rate": 0.0007328757890779354, + "loss": 2.6934, + "step": 11883 + }, + { + "epoch": 0.3524004388696142, + "grad_norm": 0.14433419704437256, + "learning_rate": 0.0007328341519217317, + "loss": 2.7362, + "step": 11884 + }, + { + "epoch": 0.3524300922219257, + "grad_norm": 0.1570558100938797, + "learning_rate": 0.0007327925127037418, + "loss": 2.7161, + "step": 11885 + }, + { + "epoch": 0.35245974557423715, + "grad_norm": 0.15072184801101685, + "learning_rate": 0.0007327508714243343, + "loss": 2.7086, + "step": 11886 + }, + { + "epoch": 0.3524893989265486, + "grad_norm": 0.13624756038188934, + "learning_rate": 0.0007327092280838777, + "loss": 2.6763, + "step": 11887 + }, + { + "epoch": 0.3525190522788601, + "grad_norm": 0.15244275331497192, + "learning_rate": 0.0007326675826827411, + "loss": 2.7428, + "step": 11888 + }, + { + "epoch": 0.3525487056311716, + "grad_norm": 0.15898314118385315, + "learning_rate": 0.0007326259352212931, + "loss": 2.6797, + "step": 11889 + }, + { + "epoch": 0.3525783589834831, + "grad_norm": 0.15559709072113037, + "learning_rate": 0.0007325842856999024, + "loss": 2.6866, + "step": 11890 + }, + { + "epoch": 0.3526080123357946, + "grad_norm": 0.153151735663414, + "learning_rate": 0.0007325426341189381, + "loss": 2.7225, + "step": 11891 + }, + { + "epoch": 0.35263766568810606, + "grad_norm": 0.15759456157684326, + "learning_rate": 0.0007325009804787687, + "loss": 2.7107, + "step": 11892 + }, + { + "epoch": 0.35266731904041754, + "grad_norm": 0.13832511007785797, + "learning_rate": 0.0007324593247797633, + "loss": 2.708, + "step": 11893 + }, + { + "epoch": 0.352696972392729, + "grad_norm": 0.1271076649427414, + "learning_rate": 0.0007324176670222907, + "loss": 2.7063, + "step": 11894 + }, + { + "epoch": 0.3527266257450405, + "grad_norm": 0.14045368134975433, + "learning_rate": 0.0007323760072067197, + "loss": 2.714, + "step": 11895 + }, + { + "epoch": 0.35275627909735197, + "grad_norm": 0.13668549060821533, + "learning_rate": 0.0007323343453334192, + "loss": 2.6968, + "step": 11896 + }, + { + "epoch": 0.35278593244966344, + "grad_norm": 0.12488990277051926, + "learning_rate": 0.0007322926814027582, + "loss": 2.6996, + "step": 11897 + }, + { + "epoch": 0.3528155858019749, + "grad_norm": 0.13675880432128906, + "learning_rate": 0.0007322510154151058, + "loss": 2.7025, + "step": 11898 + }, + { + "epoch": 0.3528452391542864, + "grad_norm": 0.13432109355926514, + "learning_rate": 0.0007322093473708307, + "loss": 2.7097, + "step": 11899 + }, + { + "epoch": 0.35287489250659787, + "grad_norm": 0.1436067670583725, + "learning_rate": 0.000732167677270302, + "loss": 2.7008, + "step": 11900 + }, + { + "epoch": 0.35290454585890935, + "grad_norm": 0.1418599784374237, + "learning_rate": 0.0007321260051138886, + "loss": 2.664, + "step": 11901 + }, + { + "epoch": 0.3529341992112208, + "grad_norm": 0.14221429824829102, + "learning_rate": 0.0007320843309019596, + "loss": 2.6988, + "step": 11902 + }, + { + "epoch": 0.3529638525635323, + "grad_norm": 0.13341079652309418, + "learning_rate": 0.000732042654634884, + "loss": 2.6915, + "step": 11903 + }, + { + "epoch": 0.3529935059158438, + "grad_norm": 0.12504631280899048, + "learning_rate": 0.0007320009763130309, + "loss": 2.6987, + "step": 11904 + }, + { + "epoch": 0.35302315926815525, + "grad_norm": 0.12099955976009369, + "learning_rate": 0.0007319592959367694, + "loss": 2.6945, + "step": 11905 + }, + { + "epoch": 0.3530528126204667, + "grad_norm": 0.13301315903663635, + "learning_rate": 0.0007319176135064685, + "loss": 2.6942, + "step": 11906 + }, + { + "epoch": 0.3530824659727782, + "grad_norm": 0.13660290837287903, + "learning_rate": 0.0007318759290224973, + "loss": 2.7186, + "step": 11907 + }, + { + "epoch": 0.3531121193250897, + "grad_norm": 0.13579587638378143, + "learning_rate": 0.0007318342424852248, + "loss": 2.6696, + "step": 11908 + }, + { + "epoch": 0.35314177267740116, + "grad_norm": 0.11912131309509277, + "learning_rate": 0.0007317925538950203, + "loss": 2.7385, + "step": 11909 + }, + { + "epoch": 0.35317142602971263, + "grad_norm": 0.1452917456626892, + "learning_rate": 0.0007317508632522532, + "loss": 2.7146, + "step": 11910 + }, + { + "epoch": 0.35320107938202416, + "grad_norm": 0.177597776055336, + "learning_rate": 0.0007317091705572922, + "loss": 2.7275, + "step": 11911 + }, + { + "epoch": 0.35323073273433564, + "grad_norm": 0.16571864485740662, + "learning_rate": 0.0007316674758105069, + "loss": 2.7287, + "step": 11912 + }, + { + "epoch": 0.3532603860866471, + "grad_norm": 0.13622146844863892, + "learning_rate": 0.0007316257790122661, + "loss": 2.7381, + "step": 11913 + }, + { + "epoch": 0.3532900394389586, + "grad_norm": 0.11344141513109207, + "learning_rate": 0.0007315840801629394, + "loss": 2.7054, + "step": 11914 + }, + { + "epoch": 0.35331969279127007, + "grad_norm": 0.14243534207344055, + "learning_rate": 0.000731542379262896, + "loss": 2.7206, + "step": 11915 + }, + { + "epoch": 0.35334934614358154, + "grad_norm": 0.14673975110054016, + "learning_rate": 0.000731500676312505, + "loss": 2.7105, + "step": 11916 + }, + { + "epoch": 0.353378999495893, + "grad_norm": 0.1418551802635193, + "learning_rate": 0.0007314589713121358, + "loss": 2.705, + "step": 11917 + }, + { + "epoch": 0.3534086528482045, + "grad_norm": 0.18228016793727875, + "learning_rate": 0.0007314172642621577, + "loss": 2.6804, + "step": 11918 + }, + { + "epoch": 0.353438306200516, + "grad_norm": 0.1751507669687271, + "learning_rate": 0.00073137555516294, + "loss": 2.6917, + "step": 11919 + }, + { + "epoch": 0.35346795955282745, + "grad_norm": 0.15113583207130432, + "learning_rate": 0.000731333844014852, + "loss": 2.7202, + "step": 11920 + }, + { + "epoch": 0.3534976129051389, + "grad_norm": 0.1456848531961441, + "learning_rate": 0.000731292130818263, + "loss": 2.735, + "step": 11921 + }, + { + "epoch": 0.3535272662574504, + "grad_norm": 0.15759064257144928, + "learning_rate": 0.0007312504155735426, + "loss": 2.7042, + "step": 11922 + }, + { + "epoch": 0.3535569196097619, + "grad_norm": 0.14518007636070251, + "learning_rate": 0.0007312086982810602, + "loss": 2.6958, + "step": 11923 + }, + { + "epoch": 0.35358657296207335, + "grad_norm": 0.12593887746334076, + "learning_rate": 0.0007311669789411848, + "loss": 2.7143, + "step": 11924 + }, + { + "epoch": 0.35361622631438483, + "grad_norm": 0.1602015346288681, + "learning_rate": 0.0007311252575542864, + "loss": 2.7046, + "step": 11925 + }, + { + "epoch": 0.3536458796666963, + "grad_norm": 0.13965383172035217, + "learning_rate": 0.0007310835341207341, + "loss": 2.6838, + "step": 11926 + }, + { + "epoch": 0.3536755330190078, + "grad_norm": 0.12583276629447937, + "learning_rate": 0.0007310418086408974, + "loss": 2.7055, + "step": 11927 + }, + { + "epoch": 0.35370518637131926, + "grad_norm": 0.1169506162405014, + "learning_rate": 0.0007310000811151457, + "loss": 2.6935, + "step": 11928 + }, + { + "epoch": 0.35373483972363073, + "grad_norm": 0.14647379517555237, + "learning_rate": 0.0007309583515438488, + "loss": 2.7384, + "step": 11929 + }, + { + "epoch": 0.3537644930759422, + "grad_norm": 0.1636558324098587, + "learning_rate": 0.000730916619927376, + "loss": 2.7147, + "step": 11930 + }, + { + "epoch": 0.35379414642825374, + "grad_norm": 0.15752367675304413, + "learning_rate": 0.0007308748862660969, + "loss": 2.6657, + "step": 11931 + }, + { + "epoch": 0.3538237997805652, + "grad_norm": 0.14817936718463898, + "learning_rate": 0.0007308331505603812, + "loss": 2.7073, + "step": 11932 + }, + { + "epoch": 0.3538534531328767, + "grad_norm": 0.13601088523864746, + "learning_rate": 0.0007307914128105982, + "loss": 2.7083, + "step": 11933 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 0.12612061202526093, + "learning_rate": 0.0007307496730171175, + "loss": 2.6865, + "step": 11934 + }, + { + "epoch": 0.35391275983749965, + "grad_norm": 0.1410272717475891, + "learning_rate": 0.0007307079311803089, + "loss": 2.6991, + "step": 11935 + }, + { + "epoch": 0.3539424131898111, + "grad_norm": 0.12412561476230621, + "learning_rate": 0.000730666187300542, + "loss": 2.6792, + "step": 11936 + }, + { + "epoch": 0.3539720665421226, + "grad_norm": 0.14931650459766388, + "learning_rate": 0.0007306244413781865, + "loss": 2.6636, + "step": 11937 + }, + { + "epoch": 0.3540017198944341, + "grad_norm": 0.16205279529094696, + "learning_rate": 0.0007305826934136119, + "loss": 2.7382, + "step": 11938 + }, + { + "epoch": 0.35403137324674555, + "grad_norm": 0.15434855222702026, + "learning_rate": 0.0007305409434071881, + "loss": 2.735, + "step": 11939 + }, + { + "epoch": 0.354061026599057, + "grad_norm": 0.13318988680839539, + "learning_rate": 0.0007304991913592846, + "loss": 2.7246, + "step": 11940 + }, + { + "epoch": 0.3540906799513685, + "grad_norm": 0.12235333025455475, + "learning_rate": 0.000730457437270271, + "loss": 2.7317, + "step": 11941 + }, + { + "epoch": 0.35412033330368, + "grad_norm": 0.12138044089078903, + "learning_rate": 0.0007304156811405174, + "loss": 2.7154, + "step": 11942 + }, + { + "epoch": 0.35414998665599146, + "grad_norm": 0.1748422086238861, + "learning_rate": 0.0007303739229703936, + "loss": 2.6954, + "step": 11943 + }, + { + "epoch": 0.35417964000830293, + "grad_norm": 0.14772500097751617, + "learning_rate": 0.0007303321627602688, + "loss": 2.6959, + "step": 11944 + }, + { + "epoch": 0.3542092933606144, + "grad_norm": 0.12319223582744598, + "learning_rate": 0.0007302904005105134, + "loss": 2.7104, + "step": 11945 + }, + { + "epoch": 0.3542389467129259, + "grad_norm": 0.13384635746479034, + "learning_rate": 0.0007302486362214969, + "loss": 2.6902, + "step": 11946 + }, + { + "epoch": 0.35426860006523736, + "grad_norm": 0.15154118835926056, + "learning_rate": 0.0007302068698935891, + "loss": 2.7032, + "step": 11947 + }, + { + "epoch": 0.35429825341754884, + "grad_norm": 0.13928590714931488, + "learning_rate": 0.0007301651015271602, + "loss": 2.7221, + "step": 11948 + }, + { + "epoch": 0.3543279067698603, + "grad_norm": 0.11445271223783493, + "learning_rate": 0.0007301233311225797, + "loss": 2.6895, + "step": 11949 + }, + { + "epoch": 0.3543575601221718, + "grad_norm": 0.12009520828723907, + "learning_rate": 0.0007300815586802175, + "loss": 2.6739, + "step": 11950 + }, + { + "epoch": 0.35438721347448326, + "grad_norm": 0.12219703197479248, + "learning_rate": 0.0007300397842004437, + "loss": 2.6865, + "step": 11951 + }, + { + "epoch": 0.3544168668267948, + "grad_norm": 0.11491873115301132, + "learning_rate": 0.0007299980076836281, + "loss": 2.7317, + "step": 11952 + }, + { + "epoch": 0.3544465201791063, + "grad_norm": 0.11617022007703781, + "learning_rate": 0.0007299562291301407, + "loss": 2.721, + "step": 11953 + }, + { + "epoch": 0.35447617353141775, + "grad_norm": 0.14240844547748566, + "learning_rate": 0.0007299144485403514, + "loss": 2.7328, + "step": 11954 + }, + { + "epoch": 0.3545058268837292, + "grad_norm": 0.1498793363571167, + "learning_rate": 0.0007298726659146302, + "loss": 2.6925, + "step": 11955 + }, + { + "epoch": 0.3545354802360407, + "grad_norm": 0.12535697221755981, + "learning_rate": 0.000729830881253347, + "loss": 2.6909, + "step": 11956 + }, + { + "epoch": 0.3545651335883522, + "grad_norm": 0.11623368412256241, + "learning_rate": 0.0007297890945568719, + "loss": 2.6745, + "step": 11957 + }, + { + "epoch": 0.35459478694066365, + "grad_norm": 0.12096768617630005, + "learning_rate": 0.000729747305825575, + "loss": 2.7294, + "step": 11958 + }, + { + "epoch": 0.35462444029297513, + "grad_norm": 0.12244077771902084, + "learning_rate": 0.0007297055150598263, + "loss": 2.7084, + "step": 11959 + }, + { + "epoch": 0.3546540936452866, + "grad_norm": 0.12091545015573502, + "learning_rate": 0.0007296637222599958, + "loss": 2.705, + "step": 11960 + }, + { + "epoch": 0.3546837469975981, + "grad_norm": 0.1070323958992958, + "learning_rate": 0.0007296219274264536, + "loss": 2.691, + "step": 11961 + }, + { + "epoch": 0.35471340034990956, + "grad_norm": 0.11262901872396469, + "learning_rate": 0.0007295801305595698, + "loss": 2.6755, + "step": 11962 + }, + { + "epoch": 0.35474305370222103, + "grad_norm": 0.11323113739490509, + "learning_rate": 0.0007295383316597146, + "loss": 2.729, + "step": 11963 + }, + { + "epoch": 0.3547727070545325, + "grad_norm": 0.10737192630767822, + "learning_rate": 0.0007294965307272581, + "loss": 2.6866, + "step": 11964 + }, + { + "epoch": 0.354802360406844, + "grad_norm": 0.125172957777977, + "learning_rate": 0.0007294547277625705, + "loss": 2.7096, + "step": 11965 + }, + { + "epoch": 0.35483201375915546, + "grad_norm": 0.12684372067451477, + "learning_rate": 0.0007294129227660218, + "loss": 2.7001, + "step": 11966 + }, + { + "epoch": 0.35486166711146694, + "grad_norm": 0.1352333277463913, + "learning_rate": 0.0007293711157379821, + "loss": 2.7155, + "step": 11967 + }, + { + "epoch": 0.3548913204637784, + "grad_norm": 0.16077576577663422, + "learning_rate": 0.000729329306678822, + "loss": 2.7148, + "step": 11968 + }, + { + "epoch": 0.3549209738160899, + "grad_norm": 0.182181715965271, + "learning_rate": 0.0007292874955889115, + "loss": 2.7237, + "step": 11969 + }, + { + "epoch": 0.35495062716840137, + "grad_norm": 0.18406783044338226, + "learning_rate": 0.0007292456824686209, + "loss": 2.7286, + "step": 11970 + }, + { + "epoch": 0.35498028052071284, + "grad_norm": 0.17911593616008759, + "learning_rate": 0.0007292038673183203, + "loss": 2.6974, + "step": 11971 + }, + { + "epoch": 0.3550099338730243, + "grad_norm": 0.15427234768867493, + "learning_rate": 0.0007291620501383803, + "loss": 2.7065, + "step": 11972 + }, + { + "epoch": 0.35503958722533585, + "grad_norm": 0.153425931930542, + "learning_rate": 0.0007291202309291708, + "loss": 2.6953, + "step": 11973 + }, + { + "epoch": 0.3550692405776473, + "grad_norm": 0.1545218825340271, + "learning_rate": 0.0007290784096910624, + "loss": 2.6925, + "step": 11974 + }, + { + "epoch": 0.3550988939299588, + "grad_norm": 0.14788101613521576, + "learning_rate": 0.0007290365864244255, + "loss": 2.7219, + "step": 11975 + }, + { + "epoch": 0.3551285472822703, + "grad_norm": 0.14266405999660492, + "learning_rate": 0.0007289947611296303, + "loss": 2.72, + "step": 11976 + }, + { + "epoch": 0.35515820063458176, + "grad_norm": 0.13660113513469696, + "learning_rate": 0.000728952933807047, + "loss": 2.7094, + "step": 11977 + }, + { + "epoch": 0.35518785398689323, + "grad_norm": 0.11579382419586182, + "learning_rate": 0.0007289111044570464, + "loss": 2.6867, + "step": 11978 + }, + { + "epoch": 0.3552175073392047, + "grad_norm": 0.12520308792591095, + "learning_rate": 0.0007288692730799985, + "loss": 2.6627, + "step": 11979 + }, + { + "epoch": 0.3552471606915162, + "grad_norm": 0.1415855586528778, + "learning_rate": 0.0007288274396762738, + "loss": 2.715, + "step": 11980 + }, + { + "epoch": 0.35527681404382766, + "grad_norm": 0.13665872812271118, + "learning_rate": 0.0007287856042462431, + "loss": 2.6762, + "step": 11981 + }, + { + "epoch": 0.35530646739613914, + "grad_norm": 0.1214568018913269, + "learning_rate": 0.0007287437667902766, + "loss": 2.6839, + "step": 11982 + }, + { + "epoch": 0.3553361207484506, + "grad_norm": 0.11963307112455368, + "learning_rate": 0.0007287019273087447, + "loss": 2.6706, + "step": 11983 + }, + { + "epoch": 0.3553657741007621, + "grad_norm": 0.11570420861244202, + "learning_rate": 0.0007286600858020178, + "loss": 2.6696, + "step": 11984 + }, + { + "epoch": 0.35539542745307356, + "grad_norm": 0.112466000020504, + "learning_rate": 0.0007286182422704668, + "loss": 2.692, + "step": 11985 + }, + { + "epoch": 0.35542508080538504, + "grad_norm": 0.12563076615333557, + "learning_rate": 0.0007285763967144619, + "loss": 2.6992, + "step": 11986 + }, + { + "epoch": 0.3554547341576965, + "grad_norm": 0.12219684571027756, + "learning_rate": 0.000728534549134374, + "loss": 2.6641, + "step": 11987 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 0.12309777736663818, + "learning_rate": 0.0007284926995305732, + "loss": 2.6662, + "step": 11988 + }, + { + "epoch": 0.35551404086231947, + "grad_norm": 0.12305549532175064, + "learning_rate": 0.0007284508479034304, + "loss": 2.7081, + "step": 11989 + }, + { + "epoch": 0.35554369421463095, + "grad_norm": 0.10620016604661942, + "learning_rate": 0.0007284089942533162, + "loss": 2.6984, + "step": 11990 + }, + { + "epoch": 0.3555733475669424, + "grad_norm": 0.13236775994300842, + "learning_rate": 0.0007283671385806012, + "loss": 2.7323, + "step": 11991 + }, + { + "epoch": 0.3556030009192539, + "grad_norm": 0.14262071251869202, + "learning_rate": 0.0007283252808856557, + "loss": 2.7376, + "step": 11992 + }, + { + "epoch": 0.3556326542715654, + "grad_norm": 0.13434350490570068, + "learning_rate": 0.0007282834211688509, + "loss": 2.7042, + "step": 11993 + }, + { + "epoch": 0.3556623076238769, + "grad_norm": 0.13814875483512878, + "learning_rate": 0.0007282415594305571, + "loss": 2.7035, + "step": 11994 + }, + { + "epoch": 0.3556919609761884, + "grad_norm": 0.148178368806839, + "learning_rate": 0.0007281996956711452, + "loss": 2.7064, + "step": 11995 + }, + { + "epoch": 0.35572161432849986, + "grad_norm": 0.11867444962263107, + "learning_rate": 0.0007281578298909858, + "loss": 2.7016, + "step": 11996 + }, + { + "epoch": 0.35575126768081133, + "grad_norm": 0.11199085414409637, + "learning_rate": 0.0007281159620904496, + "loss": 2.6983, + "step": 11997 + }, + { + "epoch": 0.3557809210331228, + "grad_norm": 0.13302329182624817, + "learning_rate": 0.0007280740922699075, + "loss": 2.7231, + "step": 11998 + }, + { + "epoch": 0.3558105743854343, + "grad_norm": 0.14465732872486115, + "learning_rate": 0.00072803222042973, + "loss": 2.6977, + "step": 11999 + }, + { + "epoch": 0.35584022773774576, + "grad_norm": 0.14427988231182098, + "learning_rate": 0.0007279903465702882, + "loss": 2.7177, + "step": 12000 + }, + { + "epoch": 0.35586988109005724, + "grad_norm": 0.16114424169063568, + "learning_rate": 0.0007279484706919527, + "loss": 2.6991, + "step": 12001 + }, + { + "epoch": 0.3558995344423687, + "grad_norm": 0.14990071952342987, + "learning_rate": 0.0007279065927950943, + "loss": 2.712, + "step": 12002 + }, + { + "epoch": 0.3559291877946802, + "grad_norm": 0.13630247116088867, + "learning_rate": 0.0007278647128800841, + "loss": 2.6956, + "step": 12003 + }, + { + "epoch": 0.35595884114699167, + "grad_norm": 0.13566774129867554, + "learning_rate": 0.0007278228309472927, + "loss": 2.703, + "step": 12004 + }, + { + "epoch": 0.35598849449930314, + "grad_norm": 0.15025413036346436, + "learning_rate": 0.0007277809469970908, + "loss": 2.6821, + "step": 12005 + }, + { + "epoch": 0.3560181478516146, + "grad_norm": 0.11622266471385956, + "learning_rate": 0.0007277390610298496, + "loss": 2.7054, + "step": 12006 + }, + { + "epoch": 0.3560478012039261, + "grad_norm": 0.1445816308259964, + "learning_rate": 0.0007276971730459401, + "loss": 2.7009, + "step": 12007 + }, + { + "epoch": 0.35607745455623757, + "grad_norm": 0.15126970410346985, + "learning_rate": 0.0007276552830457329, + "loss": 2.7077, + "step": 12008 + }, + { + "epoch": 0.35610710790854905, + "grad_norm": 0.14146554470062256, + "learning_rate": 0.0007276133910295992, + "loss": 2.7139, + "step": 12009 + }, + { + "epoch": 0.3561367612608605, + "grad_norm": 0.14566153287887573, + "learning_rate": 0.0007275714969979097, + "loss": 2.7307, + "step": 12010 + }, + { + "epoch": 0.356166414613172, + "grad_norm": 0.14975297451019287, + "learning_rate": 0.0007275296009510357, + "loss": 2.7046, + "step": 12011 + }, + { + "epoch": 0.3561960679654835, + "grad_norm": 0.14531226456165314, + "learning_rate": 0.0007274877028893478, + "loss": 2.7293, + "step": 12012 + }, + { + "epoch": 0.35622572131779495, + "grad_norm": 0.13872382044792175, + "learning_rate": 0.0007274458028132173, + "loss": 2.7335, + "step": 12013 + }, + { + "epoch": 0.35625537467010643, + "grad_norm": 0.1229226365685463, + "learning_rate": 0.0007274039007230154, + "loss": 2.7084, + "step": 12014 + }, + { + "epoch": 0.35628502802241796, + "grad_norm": 0.14405220746994019, + "learning_rate": 0.0007273619966191128, + "loss": 2.6867, + "step": 12015 + }, + { + "epoch": 0.35631468137472944, + "grad_norm": 0.16808678209781647, + "learning_rate": 0.0007273200905018808, + "loss": 2.6768, + "step": 12016 + }, + { + "epoch": 0.3563443347270409, + "grad_norm": 0.15558961033821106, + "learning_rate": 0.0007272781823716902, + "loss": 2.7063, + "step": 12017 + }, + { + "epoch": 0.3563739880793524, + "grad_norm": 0.17526468634605408, + "learning_rate": 0.0007272362722289122, + "loss": 2.6872, + "step": 12018 + }, + { + "epoch": 0.35640364143166386, + "grad_norm": 0.16513179242610931, + "learning_rate": 0.0007271943600739183, + "loss": 2.706, + "step": 12019 + }, + { + "epoch": 0.35643329478397534, + "grad_norm": 0.12467946112155914, + "learning_rate": 0.0007271524459070792, + "loss": 2.7029, + "step": 12020 + }, + { + "epoch": 0.3564629481362868, + "grad_norm": 0.1576586812734604, + "learning_rate": 0.0007271105297287662, + "loss": 2.7237, + "step": 12021 + }, + { + "epoch": 0.3564926014885983, + "grad_norm": 0.15283679962158203, + "learning_rate": 0.0007270686115393504, + "loss": 2.6982, + "step": 12022 + }, + { + "epoch": 0.35652225484090977, + "grad_norm": 0.1378481686115265, + "learning_rate": 0.0007270266913392032, + "loss": 2.7073, + "step": 12023 + }, + { + "epoch": 0.35655190819322125, + "grad_norm": 0.14971649646759033, + "learning_rate": 0.0007269847691286955, + "loss": 2.7046, + "step": 12024 + }, + { + "epoch": 0.3565815615455327, + "grad_norm": 0.1465962678194046, + "learning_rate": 0.0007269428449081988, + "loss": 2.7329, + "step": 12025 + }, + { + "epoch": 0.3566112148978442, + "grad_norm": 0.14030401408672333, + "learning_rate": 0.0007269009186780844, + "loss": 2.7175, + "step": 12026 + }, + { + "epoch": 0.3566408682501557, + "grad_norm": 0.14034464955329895, + "learning_rate": 0.0007268589904387233, + "loss": 2.7073, + "step": 12027 + }, + { + "epoch": 0.35667052160246715, + "grad_norm": 0.13095898926258087, + "learning_rate": 0.0007268170601904869, + "loss": 2.7264, + "step": 12028 + }, + { + "epoch": 0.3567001749547786, + "grad_norm": 0.118498794734478, + "learning_rate": 0.0007267751279337464, + "loss": 2.6694, + "step": 12029 + }, + { + "epoch": 0.3567298283070901, + "grad_norm": 0.12613525986671448, + "learning_rate": 0.0007267331936688734, + "loss": 2.7155, + "step": 12030 + }, + { + "epoch": 0.3567594816594016, + "grad_norm": 0.13472683727741241, + "learning_rate": 0.000726691257396239, + "loss": 2.7316, + "step": 12031 + }, + { + "epoch": 0.35678913501171305, + "grad_norm": 0.1466342955827713, + "learning_rate": 0.0007266493191162145, + "loss": 2.6918, + "step": 12032 + }, + { + "epoch": 0.35681878836402453, + "grad_norm": 0.1353515088558197, + "learning_rate": 0.0007266073788291714, + "loss": 2.683, + "step": 12033 + }, + { + "epoch": 0.356848441716336, + "grad_norm": 0.1524142622947693, + "learning_rate": 0.0007265654365354811, + "loss": 2.7217, + "step": 12034 + }, + { + "epoch": 0.3568780950686475, + "grad_norm": 0.15754593908786774, + "learning_rate": 0.000726523492235515, + "loss": 2.7366, + "step": 12035 + }, + { + "epoch": 0.356907748420959, + "grad_norm": 0.1257590800523758, + "learning_rate": 0.0007264815459296445, + "loss": 2.7051, + "step": 12036 + }, + { + "epoch": 0.3569374017732705, + "grad_norm": 0.1293070912361145, + "learning_rate": 0.0007264395976182411, + "loss": 2.713, + "step": 12037 + }, + { + "epoch": 0.35696705512558197, + "grad_norm": 0.14250902831554413, + "learning_rate": 0.0007263976473016761, + "loss": 2.7022, + "step": 12038 + }, + { + "epoch": 0.35699670847789344, + "grad_norm": 0.13114233314990997, + "learning_rate": 0.0007263556949803209, + "loss": 2.7177, + "step": 12039 + }, + { + "epoch": 0.3570263618302049, + "grad_norm": 0.12278267741203308, + "learning_rate": 0.0007263137406545475, + "loss": 2.7174, + "step": 12040 + }, + { + "epoch": 0.3570560151825164, + "grad_norm": 0.1310197114944458, + "learning_rate": 0.0007262717843247269, + "loss": 2.6795, + "step": 12041 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 0.1542801558971405, + "learning_rate": 0.0007262298259912309, + "loss": 2.7134, + "step": 12042 + }, + { + "epoch": 0.35711532188713935, + "grad_norm": 0.15479373931884766, + "learning_rate": 0.0007261878656544308, + "loss": 2.6837, + "step": 12043 + }, + { + "epoch": 0.3571449752394508, + "grad_norm": 0.1306062638759613, + "learning_rate": 0.0007261459033146984, + "loss": 2.674, + "step": 12044 + }, + { + "epoch": 0.3571746285917623, + "grad_norm": 0.11664330214262009, + "learning_rate": 0.0007261039389724052, + "loss": 2.6745, + "step": 12045 + }, + { + "epoch": 0.3572042819440738, + "grad_norm": 0.11078078299760818, + "learning_rate": 0.0007260619726279229, + "loss": 2.66, + "step": 12046 + }, + { + "epoch": 0.35723393529638525, + "grad_norm": 0.11767128109931946, + "learning_rate": 0.000726020004281623, + "loss": 2.6899, + "step": 12047 + }, + { + "epoch": 0.35726358864869673, + "grad_norm": 0.11528587341308594, + "learning_rate": 0.0007259780339338771, + "loss": 2.7104, + "step": 12048 + }, + { + "epoch": 0.3572932420010082, + "grad_norm": 0.11603116244077682, + "learning_rate": 0.000725936061585057, + "loss": 2.6754, + "step": 12049 + }, + { + "epoch": 0.3573228953533197, + "grad_norm": 0.1368081420660019, + "learning_rate": 0.0007258940872355342, + "loss": 2.723, + "step": 12050 + }, + { + "epoch": 0.35735254870563116, + "grad_norm": 0.13500113785266876, + "learning_rate": 0.0007258521108856804, + "loss": 2.7036, + "step": 12051 + }, + { + "epoch": 0.35738220205794263, + "grad_norm": 0.12835347652435303, + "learning_rate": 0.0007258101325358677, + "loss": 2.7177, + "step": 12052 + }, + { + "epoch": 0.3574118554102541, + "grad_norm": 0.12617063522338867, + "learning_rate": 0.0007257681521864673, + "loss": 2.7061, + "step": 12053 + }, + { + "epoch": 0.3574415087625656, + "grad_norm": 0.12806911766529083, + "learning_rate": 0.0007257261698378512, + "loss": 2.735, + "step": 12054 + }, + { + "epoch": 0.35747116211487706, + "grad_norm": 0.12960653007030487, + "learning_rate": 0.0007256841854903912, + "loss": 2.7077, + "step": 12055 + }, + { + "epoch": 0.3575008154671886, + "grad_norm": 0.12913468480110168, + "learning_rate": 0.0007256421991444588, + "loss": 2.696, + "step": 12056 + }, + { + "epoch": 0.35753046881950007, + "grad_norm": 0.1393943876028061, + "learning_rate": 0.0007256002108004261, + "loss": 2.693, + "step": 12057 + }, + { + "epoch": 0.35756012217181155, + "grad_norm": 0.14857174456119537, + "learning_rate": 0.000725558220458665, + "loss": 2.726, + "step": 12058 + }, + { + "epoch": 0.357589775524123, + "grad_norm": 0.14288084208965302, + "learning_rate": 0.0007255162281195468, + "loss": 2.7348, + "step": 12059 + }, + { + "epoch": 0.3576194288764345, + "grad_norm": 0.13649383187294006, + "learning_rate": 0.0007254742337834439, + "loss": 2.7205, + "step": 12060 + }, + { + "epoch": 0.357649082228746, + "grad_norm": 0.16050662100315094, + "learning_rate": 0.000725432237450728, + "loss": 2.6764, + "step": 12061 + }, + { + "epoch": 0.35767873558105745, + "grad_norm": 0.17095965147018433, + "learning_rate": 0.0007253902391217708, + "loss": 2.7082, + "step": 12062 + }, + { + "epoch": 0.3577083889333689, + "grad_norm": 0.17760051786899567, + "learning_rate": 0.0007253482387969444, + "loss": 2.6879, + "step": 12063 + }, + { + "epoch": 0.3577380422856804, + "grad_norm": 0.15674981474876404, + "learning_rate": 0.0007253062364766206, + "loss": 2.7492, + "step": 12064 + }, + { + "epoch": 0.3577676956379919, + "grad_norm": 0.11975321918725967, + "learning_rate": 0.0007252642321611716, + "loss": 2.6957, + "step": 12065 + }, + { + "epoch": 0.35779734899030335, + "grad_norm": 0.13777309656143188, + "learning_rate": 0.0007252222258509689, + "loss": 2.7145, + "step": 12066 + }, + { + "epoch": 0.35782700234261483, + "grad_norm": 0.1325690597295761, + "learning_rate": 0.0007251802175463848, + "loss": 2.6902, + "step": 12067 + }, + { + "epoch": 0.3578566556949263, + "grad_norm": 0.13601702451705933, + "learning_rate": 0.0007251382072477914, + "loss": 2.714, + "step": 12068 + }, + { + "epoch": 0.3578863090472378, + "grad_norm": 0.12258940190076828, + "learning_rate": 0.0007250961949555604, + "loss": 2.6776, + "step": 12069 + }, + { + "epoch": 0.35791596239954926, + "grad_norm": 0.13496296107769012, + "learning_rate": 0.0007250541806700639, + "loss": 2.6852, + "step": 12070 + }, + { + "epoch": 0.35794561575186074, + "grad_norm": 0.1309240460395813, + "learning_rate": 0.000725012164391674, + "loss": 2.7244, + "step": 12071 + }, + { + "epoch": 0.3579752691041722, + "grad_norm": 0.12886938452720642, + "learning_rate": 0.000724970146120763, + "loss": 2.7039, + "step": 12072 + }, + { + "epoch": 0.3580049224564837, + "grad_norm": 0.12439519166946411, + "learning_rate": 0.0007249281258577025, + "loss": 2.738, + "step": 12073 + }, + { + "epoch": 0.35803457580879516, + "grad_norm": 0.1405601054430008, + "learning_rate": 0.000724886103602865, + "loss": 2.7159, + "step": 12074 + }, + { + "epoch": 0.35806422916110664, + "grad_norm": 0.14085686206817627, + "learning_rate": 0.0007248440793566223, + "loss": 2.71, + "step": 12075 + }, + { + "epoch": 0.3580938825134181, + "grad_norm": 0.12329771369695663, + "learning_rate": 0.0007248020531193468, + "loss": 2.704, + "step": 12076 + }, + { + "epoch": 0.35812353586572965, + "grad_norm": 0.13991229236125946, + "learning_rate": 0.0007247600248914104, + "loss": 2.7353, + "step": 12077 + }, + { + "epoch": 0.3581531892180411, + "grad_norm": 0.14284412562847137, + "learning_rate": 0.0007247179946731854, + "loss": 2.7174, + "step": 12078 + }, + { + "epoch": 0.3581828425703526, + "grad_norm": 0.12879125773906708, + "learning_rate": 0.0007246759624650442, + "loss": 2.7084, + "step": 12079 + }, + { + "epoch": 0.3582124959226641, + "grad_norm": 0.13897909224033356, + "learning_rate": 0.0007246339282673586, + "loss": 2.6836, + "step": 12080 + }, + { + "epoch": 0.35824214927497555, + "grad_norm": 0.10932256281375885, + "learning_rate": 0.0007245918920805011, + "loss": 2.6992, + "step": 12081 + }, + { + "epoch": 0.35827180262728703, + "grad_norm": 0.1412704885005951, + "learning_rate": 0.0007245498539048438, + "loss": 2.7115, + "step": 12082 + }, + { + "epoch": 0.3583014559795985, + "grad_norm": 0.12057366222143173, + "learning_rate": 0.0007245078137407588, + "loss": 2.707, + "step": 12083 + }, + { + "epoch": 0.35833110933191, + "grad_norm": 0.12230546027421951, + "learning_rate": 0.0007244657715886189, + "loss": 2.6999, + "step": 12084 + }, + { + "epoch": 0.35836076268422146, + "grad_norm": 0.12461359798908234, + "learning_rate": 0.0007244237274487959, + "loss": 2.6763, + "step": 12085 + }, + { + "epoch": 0.35839041603653293, + "grad_norm": 0.14803063869476318, + "learning_rate": 0.0007243816813216624, + "loss": 2.7184, + "step": 12086 + }, + { + "epoch": 0.3584200693888444, + "grad_norm": 0.17461822926998138, + "learning_rate": 0.0007243396332075905, + "loss": 2.6398, + "step": 12087 + }, + { + "epoch": 0.3584497227411559, + "grad_norm": 0.18315638601779938, + "learning_rate": 0.0007242975831069526, + "loss": 2.7337, + "step": 12088 + }, + { + "epoch": 0.35847937609346736, + "grad_norm": 0.16734708845615387, + "learning_rate": 0.0007242555310201211, + "loss": 2.7175, + "step": 12089 + }, + { + "epoch": 0.35850902944577884, + "grad_norm": 0.13401591777801514, + "learning_rate": 0.0007242134769474684, + "loss": 2.7001, + "step": 12090 + }, + { + "epoch": 0.3585386827980903, + "grad_norm": 0.14818482100963593, + "learning_rate": 0.0007241714208893671, + "loss": 2.7127, + "step": 12091 + }, + { + "epoch": 0.3585683361504018, + "grad_norm": 0.14573296904563904, + "learning_rate": 0.0007241293628461891, + "loss": 2.6965, + "step": 12092 + }, + { + "epoch": 0.35859798950271327, + "grad_norm": 0.12614747881889343, + "learning_rate": 0.0007240873028183071, + "loss": 2.7069, + "step": 12093 + }, + { + "epoch": 0.35862764285502474, + "grad_norm": 0.1557687222957611, + "learning_rate": 0.0007240452408060938, + "loss": 2.6799, + "step": 12094 + }, + { + "epoch": 0.3586572962073362, + "grad_norm": 0.14195825159549713, + "learning_rate": 0.0007240031768099214, + "loss": 2.7121, + "step": 12095 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 0.13561001420021057, + "learning_rate": 0.0007239611108301623, + "loss": 2.712, + "step": 12096 + }, + { + "epoch": 0.35871660291195917, + "grad_norm": 0.15026775002479553, + "learning_rate": 0.0007239190428671891, + "loss": 2.7355, + "step": 12097 + }, + { + "epoch": 0.3587462562642707, + "grad_norm": 0.15564389526844025, + "learning_rate": 0.0007238769729213744, + "loss": 2.7192, + "step": 12098 + }, + { + "epoch": 0.3587759096165822, + "grad_norm": 0.14165863394737244, + "learning_rate": 0.0007238349009930907, + "loss": 2.7122, + "step": 12099 + }, + { + "epoch": 0.35880556296889365, + "grad_norm": 0.12601147592067719, + "learning_rate": 0.0007237928270827104, + "loss": 2.6931, + "step": 12100 + }, + { + "epoch": 0.35883521632120513, + "grad_norm": 0.11928015202283859, + "learning_rate": 0.0007237507511906062, + "loss": 2.7187, + "step": 12101 + }, + { + "epoch": 0.3588648696735166, + "grad_norm": 0.1373997926712036, + "learning_rate": 0.0007237086733171509, + "loss": 2.7223, + "step": 12102 + }, + { + "epoch": 0.3588945230258281, + "grad_norm": 0.13141444325447083, + "learning_rate": 0.0007236665934627169, + "loss": 2.6951, + "step": 12103 + }, + { + "epoch": 0.35892417637813956, + "grad_norm": 0.12652581930160522, + "learning_rate": 0.0007236245116276766, + "loss": 2.6872, + "step": 12104 + }, + { + "epoch": 0.35895382973045104, + "grad_norm": 0.11974209547042847, + "learning_rate": 0.000723582427812403, + "loss": 2.7204, + "step": 12105 + }, + { + "epoch": 0.3589834830827625, + "grad_norm": 0.12112011760473251, + "learning_rate": 0.0007235403420172686, + "loss": 2.6677, + "step": 12106 + }, + { + "epoch": 0.359013136435074, + "grad_norm": 0.1259862780570984, + "learning_rate": 0.0007234982542426463, + "loss": 2.6956, + "step": 12107 + }, + { + "epoch": 0.35904278978738546, + "grad_norm": 0.13855254650115967, + "learning_rate": 0.0007234561644889084, + "loss": 2.7058, + "step": 12108 + }, + { + "epoch": 0.35907244313969694, + "grad_norm": 0.15203820168972015, + "learning_rate": 0.0007234140727564276, + "loss": 2.7003, + "step": 12109 + }, + { + "epoch": 0.3591020964920084, + "grad_norm": 0.13509047031402588, + "learning_rate": 0.0007233719790455771, + "loss": 2.6952, + "step": 12110 + }, + { + "epoch": 0.3591317498443199, + "grad_norm": 0.12825006246566772, + "learning_rate": 0.0007233298833567293, + "loss": 2.7057, + "step": 12111 + }, + { + "epoch": 0.35916140319663137, + "grad_norm": 0.11361618340015411, + "learning_rate": 0.0007232877856902572, + "loss": 2.6881, + "step": 12112 + }, + { + "epoch": 0.35919105654894284, + "grad_norm": 0.11654015630483627, + "learning_rate": 0.0007232456860465333, + "loss": 2.7416, + "step": 12113 + }, + { + "epoch": 0.3592207099012543, + "grad_norm": 0.14624255895614624, + "learning_rate": 0.0007232035844259306, + "loss": 2.7124, + "step": 12114 + }, + { + "epoch": 0.3592503632535658, + "grad_norm": 0.15421779453754425, + "learning_rate": 0.0007231614808288217, + "loss": 2.6926, + "step": 12115 + }, + { + "epoch": 0.3592800166058773, + "grad_norm": 0.1319016069173813, + "learning_rate": 0.0007231193752555797, + "loss": 2.672, + "step": 12116 + }, + { + "epoch": 0.35930966995818875, + "grad_norm": 0.12002355605363846, + "learning_rate": 0.0007230772677065773, + "loss": 2.7251, + "step": 12117 + }, + { + "epoch": 0.3593393233105002, + "grad_norm": 0.14113591611385345, + "learning_rate": 0.0007230351581821874, + "loss": 2.7029, + "step": 12118 + }, + { + "epoch": 0.35936897666281176, + "grad_norm": 0.13733892142772675, + "learning_rate": 0.000722993046682783, + "loss": 2.7166, + "step": 12119 + }, + { + "epoch": 0.35939863001512323, + "grad_norm": 0.13160677254199982, + "learning_rate": 0.0007229509332087367, + "loss": 2.7178, + "step": 12120 + }, + { + "epoch": 0.3594282833674347, + "grad_norm": 0.11556127667427063, + "learning_rate": 0.0007229088177604218, + "loss": 2.6614, + "step": 12121 + }, + { + "epoch": 0.3594579367197462, + "grad_norm": 0.1249336302280426, + "learning_rate": 0.000722866700338211, + "loss": 2.6839, + "step": 12122 + }, + { + "epoch": 0.35948759007205766, + "grad_norm": 0.16024348139762878, + "learning_rate": 0.0007228245809424772, + "loss": 2.7077, + "step": 12123 + }, + { + "epoch": 0.35951724342436914, + "grad_norm": 0.13872264325618744, + "learning_rate": 0.0007227824595735936, + "loss": 2.6622, + "step": 12124 + }, + { + "epoch": 0.3595468967766806, + "grad_norm": 0.14221547544002533, + "learning_rate": 0.0007227403362319332, + "loss": 2.692, + "step": 12125 + }, + { + "epoch": 0.3595765501289921, + "grad_norm": 0.14867086708545685, + "learning_rate": 0.0007226982109178686, + "loss": 2.7363, + "step": 12126 + }, + { + "epoch": 0.35960620348130357, + "grad_norm": 0.1654907464981079, + "learning_rate": 0.0007226560836317733, + "loss": 2.7029, + "step": 12127 + }, + { + "epoch": 0.35963585683361504, + "grad_norm": 0.17895260453224182, + "learning_rate": 0.0007226139543740201, + "loss": 2.7029, + "step": 12128 + }, + { + "epoch": 0.3596655101859265, + "grad_norm": 0.1507062017917633, + "learning_rate": 0.0007225718231449822, + "loss": 2.6805, + "step": 12129 + }, + { + "epoch": 0.359695163538238, + "grad_norm": 0.1143120601773262, + "learning_rate": 0.0007225296899450325, + "loss": 2.686, + "step": 12130 + }, + { + "epoch": 0.35972481689054947, + "grad_norm": 0.11594976484775543, + "learning_rate": 0.0007224875547745443, + "loss": 2.6974, + "step": 12131 + }, + { + "epoch": 0.35975447024286095, + "grad_norm": 0.11720706522464752, + "learning_rate": 0.0007224454176338906, + "loss": 2.6875, + "step": 12132 + }, + { + "epoch": 0.3597841235951724, + "grad_norm": 0.13910886645317078, + "learning_rate": 0.0007224032785234445, + "loss": 2.7282, + "step": 12133 + }, + { + "epoch": 0.3598137769474839, + "grad_norm": 0.1444852203130722, + "learning_rate": 0.0007223611374435792, + "loss": 2.73, + "step": 12134 + }, + { + "epoch": 0.3598434302997954, + "grad_norm": 0.14779333770275116, + "learning_rate": 0.0007223189943946677, + "loss": 2.6919, + "step": 12135 + }, + { + "epoch": 0.35987308365210685, + "grad_norm": 0.13611562550067902, + "learning_rate": 0.0007222768493770836, + "loss": 2.7026, + "step": 12136 + }, + { + "epoch": 0.3599027370044183, + "grad_norm": 0.1543048620223999, + "learning_rate": 0.0007222347023911997, + "loss": 2.6858, + "step": 12137 + }, + { + "epoch": 0.3599323903567298, + "grad_norm": 0.15783283114433289, + "learning_rate": 0.0007221925534373894, + "loss": 2.6707, + "step": 12138 + }, + { + "epoch": 0.3599620437090413, + "grad_norm": 0.1385856419801712, + "learning_rate": 0.0007221504025160259, + "loss": 2.7018, + "step": 12139 + }, + { + "epoch": 0.3599916970613528, + "grad_norm": 0.13232864439487457, + "learning_rate": 0.0007221082496274827, + "loss": 2.6869, + "step": 12140 + }, + { + "epoch": 0.3600213504136643, + "grad_norm": 0.13787782192230225, + "learning_rate": 0.0007220660947721325, + "loss": 2.6511, + "step": 12141 + }, + { + "epoch": 0.36005100376597576, + "grad_norm": 0.11926361918449402, + "learning_rate": 0.0007220239379503489, + "loss": 2.7083, + "step": 12142 + }, + { + "epoch": 0.36008065711828724, + "grad_norm": 0.16855628788471222, + "learning_rate": 0.0007219817791625054, + "loss": 2.682, + "step": 12143 + }, + { + "epoch": 0.3601103104705987, + "grad_norm": 0.0984441339969635, + "learning_rate": 0.0007219396184089751, + "loss": 2.7151, + "step": 12144 + }, + { + "epoch": 0.3601399638229102, + "grad_norm": 0.12108266353607178, + "learning_rate": 0.0007218974556901315, + "loss": 2.6955, + "step": 12145 + }, + { + "epoch": 0.36016961717522167, + "grad_norm": 0.1278301328420639, + "learning_rate": 0.0007218552910063476, + "loss": 2.7262, + "step": 12146 + }, + { + "epoch": 0.36019927052753314, + "grad_norm": 0.1194625198841095, + "learning_rate": 0.0007218131243579971, + "loss": 2.7228, + "step": 12147 + }, + { + "epoch": 0.3602289238798446, + "grad_norm": 0.10802888870239258, + "learning_rate": 0.0007217709557454532, + "loss": 2.7117, + "step": 12148 + }, + { + "epoch": 0.3602585772321561, + "grad_norm": 0.10718067735433578, + "learning_rate": 0.0007217287851690896, + "loss": 2.723, + "step": 12149 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 0.10988151282072067, + "learning_rate": 0.0007216866126292796, + "loss": 2.705, + "step": 12150 + }, + { + "epoch": 0.36031788393677905, + "grad_norm": 0.11496657878160477, + "learning_rate": 0.0007216444381263965, + "loss": 2.6989, + "step": 12151 + }, + { + "epoch": 0.3603475372890905, + "grad_norm": 0.131794735789299, + "learning_rate": 0.0007216022616608138, + "loss": 2.6967, + "step": 12152 + }, + { + "epoch": 0.360377190641402, + "grad_norm": 0.12414932996034622, + "learning_rate": 0.000721560083232905, + "loss": 2.7466, + "step": 12153 + }, + { + "epoch": 0.3604068439937135, + "grad_norm": 0.13089518249034882, + "learning_rate": 0.0007215179028430437, + "loss": 2.6855, + "step": 12154 + }, + { + "epoch": 0.36043649734602495, + "grad_norm": 0.12012162804603577, + "learning_rate": 0.0007214757204916034, + "loss": 2.685, + "step": 12155 + }, + { + "epoch": 0.36046615069833643, + "grad_norm": 0.125794917345047, + "learning_rate": 0.0007214335361789574, + "loss": 2.7292, + "step": 12156 + }, + { + "epoch": 0.3604958040506479, + "grad_norm": 0.14598533511161804, + "learning_rate": 0.0007213913499054796, + "loss": 2.7185, + "step": 12157 + }, + { + "epoch": 0.3605254574029594, + "grad_norm": 0.12060786783695221, + "learning_rate": 0.0007213491616715434, + "loss": 2.6978, + "step": 12158 + }, + { + "epoch": 0.36055511075527086, + "grad_norm": 0.1296418011188507, + "learning_rate": 0.0007213069714775224, + "loss": 2.7235, + "step": 12159 + }, + { + "epoch": 0.3605847641075824, + "grad_norm": 0.12926770746707916, + "learning_rate": 0.0007212647793237901, + "loss": 2.7063, + "step": 12160 + }, + { + "epoch": 0.36061441745989387, + "grad_norm": 0.12719348073005676, + "learning_rate": 0.0007212225852107201, + "loss": 2.7082, + "step": 12161 + }, + { + "epoch": 0.36064407081220534, + "grad_norm": 0.13115543127059937, + "learning_rate": 0.0007211803891386863, + "loss": 2.6833, + "step": 12162 + }, + { + "epoch": 0.3606737241645168, + "grad_norm": 0.1338227391242981, + "learning_rate": 0.0007211381911080621, + "loss": 2.7405, + "step": 12163 + }, + { + "epoch": 0.3607033775168283, + "grad_norm": 0.15208905935287476, + "learning_rate": 0.0007210959911192215, + "loss": 2.7377, + "step": 12164 + }, + { + "epoch": 0.36073303086913977, + "grad_norm": 0.14557981491088867, + "learning_rate": 0.0007210537891725376, + "loss": 2.6967, + "step": 12165 + }, + { + "epoch": 0.36076268422145125, + "grad_norm": 0.14747588336467743, + "learning_rate": 0.0007210115852683846, + "loss": 2.7236, + "step": 12166 + }, + { + "epoch": 0.3607923375737627, + "grad_norm": 0.1539311408996582, + "learning_rate": 0.0007209693794071361, + "loss": 2.751, + "step": 12167 + }, + { + "epoch": 0.3608219909260742, + "grad_norm": 0.13272082805633545, + "learning_rate": 0.0007209271715891657, + "loss": 2.6869, + "step": 12168 + }, + { + "epoch": 0.3608516442783857, + "grad_norm": 0.13430681824684143, + "learning_rate": 0.0007208849618148475, + "loss": 2.659, + "step": 12169 + }, + { + "epoch": 0.36088129763069715, + "grad_norm": 0.1244708001613617, + "learning_rate": 0.0007208427500845549, + "loss": 2.6808, + "step": 12170 + }, + { + "epoch": 0.3609109509830086, + "grad_norm": 0.1536405235528946, + "learning_rate": 0.0007208005363986619, + "loss": 2.7341, + "step": 12171 + }, + { + "epoch": 0.3609406043353201, + "grad_norm": 0.18939298391342163, + "learning_rate": 0.0007207583207575422, + "loss": 2.7056, + "step": 12172 + }, + { + "epoch": 0.3609702576876316, + "grad_norm": 0.19299612939357758, + "learning_rate": 0.0007207161031615697, + "loss": 2.6889, + "step": 12173 + }, + { + "epoch": 0.36099991103994306, + "grad_norm": 0.16535083949565887, + "learning_rate": 0.0007206738836111182, + "loss": 2.7182, + "step": 12174 + }, + { + "epoch": 0.36102956439225453, + "grad_norm": 0.14767038822174072, + "learning_rate": 0.0007206316621065615, + "loss": 2.6505, + "step": 12175 + }, + { + "epoch": 0.361059217744566, + "grad_norm": 0.16066129505634308, + "learning_rate": 0.0007205894386482736, + "loss": 2.7117, + "step": 12176 + }, + { + "epoch": 0.3610888710968775, + "grad_norm": 0.14722523093223572, + "learning_rate": 0.0007205472132366285, + "loss": 2.6834, + "step": 12177 + }, + { + "epoch": 0.36111852444918896, + "grad_norm": 0.13214711844921112, + "learning_rate": 0.000720504985872, + "loss": 2.7166, + "step": 12178 + }, + { + "epoch": 0.36114817780150044, + "grad_norm": 0.13898243010044098, + "learning_rate": 0.0007204627565547619, + "loss": 2.6941, + "step": 12179 + }, + { + "epoch": 0.3611778311538119, + "grad_norm": 0.14250659942626953, + "learning_rate": 0.000720420525285288, + "loss": 2.6699, + "step": 12180 + }, + { + "epoch": 0.36120748450612344, + "grad_norm": 0.11631789058446884, + "learning_rate": 0.0007203782920639528, + "loss": 2.6885, + "step": 12181 + }, + { + "epoch": 0.3612371378584349, + "grad_norm": 0.11886324733495712, + "learning_rate": 0.00072033605689113, + "loss": 2.7033, + "step": 12182 + }, + { + "epoch": 0.3612667912107464, + "grad_norm": 0.1285555511713028, + "learning_rate": 0.0007202938197671936, + "loss": 2.7467, + "step": 12183 + }, + { + "epoch": 0.3612964445630579, + "grad_norm": 0.13392841815948486, + "learning_rate": 0.0007202515806925175, + "loss": 2.6954, + "step": 12184 + }, + { + "epoch": 0.36132609791536935, + "grad_norm": 0.13848820328712463, + "learning_rate": 0.000720209339667476, + "loss": 2.6875, + "step": 12185 + }, + { + "epoch": 0.3613557512676808, + "grad_norm": 0.1281963735818863, + "learning_rate": 0.0007201670966924429, + "loss": 2.6745, + "step": 12186 + }, + { + "epoch": 0.3613854046199923, + "grad_norm": 0.12206043303012848, + "learning_rate": 0.0007201248517677922, + "loss": 2.6852, + "step": 12187 + }, + { + "epoch": 0.3614150579723038, + "grad_norm": 0.12062828242778778, + "learning_rate": 0.0007200826048938985, + "loss": 2.676, + "step": 12188 + }, + { + "epoch": 0.36144471132461525, + "grad_norm": 0.1288410723209381, + "learning_rate": 0.0007200403560711353, + "loss": 2.6722, + "step": 12189 + }, + { + "epoch": 0.36147436467692673, + "grad_norm": 0.11794733256101608, + "learning_rate": 0.000719998105299877, + "loss": 2.7114, + "step": 12190 + }, + { + "epoch": 0.3615040180292382, + "grad_norm": 0.14442875981330872, + "learning_rate": 0.0007199558525804978, + "loss": 2.7164, + "step": 12191 + }, + { + "epoch": 0.3615336713815497, + "grad_norm": 0.13519826531410217, + "learning_rate": 0.0007199135979133718, + "loss": 2.697, + "step": 12192 + }, + { + "epoch": 0.36156332473386116, + "grad_norm": 0.144073024392128, + "learning_rate": 0.000719871341298873, + "loss": 2.6845, + "step": 12193 + }, + { + "epoch": 0.36159297808617263, + "grad_norm": 0.1499084085226059, + "learning_rate": 0.0007198290827373758, + "loss": 2.7098, + "step": 12194 + }, + { + "epoch": 0.3616226314384841, + "grad_norm": 0.13430257141590118, + "learning_rate": 0.0007197868222292543, + "loss": 2.7026, + "step": 12195 + }, + { + "epoch": 0.3616522847907956, + "grad_norm": 0.14123952388763428, + "learning_rate": 0.0007197445597748828, + "loss": 2.648, + "step": 12196 + }, + { + "epoch": 0.36168193814310706, + "grad_norm": 0.1561986207962036, + "learning_rate": 0.0007197022953746355, + "loss": 2.7281, + "step": 12197 + }, + { + "epoch": 0.36171159149541854, + "grad_norm": 0.16657394170761108, + "learning_rate": 0.0007196600290288867, + "loss": 2.7152, + "step": 12198 + }, + { + "epoch": 0.36174124484773, + "grad_norm": 0.16254399716854095, + "learning_rate": 0.0007196177607380106, + "loss": 2.7382, + "step": 12199 + }, + { + "epoch": 0.3617708982000415, + "grad_norm": 0.149460569024086, + "learning_rate": 0.0007195754905023816, + "loss": 2.7228, + "step": 12200 + }, + { + "epoch": 0.36180055155235297, + "grad_norm": 0.1193317249417305, + "learning_rate": 0.0007195332183223739, + "loss": 2.6619, + "step": 12201 + }, + { + "epoch": 0.3618302049046645, + "grad_norm": 0.13261502981185913, + "learning_rate": 0.0007194909441983619, + "loss": 2.6803, + "step": 12202 + }, + { + "epoch": 0.361859858256976, + "grad_norm": 0.12017999589443207, + "learning_rate": 0.0007194486681307198, + "loss": 2.7195, + "step": 12203 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 0.13091330230236053, + "learning_rate": 0.0007194063901198222, + "loss": 2.6633, + "step": 12204 + }, + { + "epoch": 0.3619191649615989, + "grad_norm": 0.13763950765132904, + "learning_rate": 0.0007193641101660434, + "loss": 2.728, + "step": 12205 + }, + { + "epoch": 0.3619488183139104, + "grad_norm": 0.14909793436527252, + "learning_rate": 0.0007193218282697576, + "loss": 2.7287, + "step": 12206 + }, + { + "epoch": 0.3619784716662219, + "grad_norm": 0.1361771523952484, + "learning_rate": 0.0007192795444313394, + "loss": 2.7232, + "step": 12207 + }, + { + "epoch": 0.36200812501853336, + "grad_norm": 0.1179826632142067, + "learning_rate": 0.0007192372586511632, + "loss": 2.7119, + "step": 12208 + }, + { + "epoch": 0.36203777837084483, + "grad_norm": 0.11186859011650085, + "learning_rate": 0.0007191949709296035, + "loss": 2.7105, + "step": 12209 + }, + { + "epoch": 0.3620674317231563, + "grad_norm": 0.11842260509729385, + "learning_rate": 0.0007191526812670347, + "loss": 2.6953, + "step": 12210 + }, + { + "epoch": 0.3620970850754678, + "grad_norm": 0.10570599138736725, + "learning_rate": 0.0007191103896638313, + "loss": 2.6776, + "step": 12211 + }, + { + "epoch": 0.36212673842777926, + "grad_norm": 0.12439271807670593, + "learning_rate": 0.0007190680961203676, + "loss": 2.6936, + "step": 12212 + }, + { + "epoch": 0.36215639178009074, + "grad_norm": 0.11625272035598755, + "learning_rate": 0.0007190258006370185, + "loss": 2.7112, + "step": 12213 + }, + { + "epoch": 0.3621860451324022, + "grad_norm": 0.11966148018836975, + "learning_rate": 0.0007189835032141582, + "loss": 2.6778, + "step": 12214 + }, + { + "epoch": 0.3622156984847137, + "grad_norm": 0.11473158746957779, + "learning_rate": 0.0007189412038521616, + "loss": 2.7175, + "step": 12215 + }, + { + "epoch": 0.36224535183702516, + "grad_norm": 0.1110680028796196, + "learning_rate": 0.000718898902551403, + "loss": 2.6856, + "step": 12216 + }, + { + "epoch": 0.36227500518933664, + "grad_norm": 0.12388654798269272, + "learning_rate": 0.000718856599312257, + "loss": 2.6762, + "step": 12217 + }, + { + "epoch": 0.3623046585416481, + "grad_norm": 0.1311769038438797, + "learning_rate": 0.0007188142941350982, + "loss": 2.6944, + "step": 12218 + }, + { + "epoch": 0.3623343118939596, + "grad_norm": 0.12959663569927216, + "learning_rate": 0.0007187719870203012, + "loss": 2.7511, + "step": 12219 + }, + { + "epoch": 0.36236396524627107, + "grad_norm": 0.14087224006652832, + "learning_rate": 0.0007187296779682409, + "loss": 2.6949, + "step": 12220 + }, + { + "epoch": 0.36239361859858255, + "grad_norm": 0.14320868253707886, + "learning_rate": 0.0007186873669792918, + "loss": 2.7216, + "step": 12221 + }, + { + "epoch": 0.362423271950894, + "grad_norm": 0.15685990452766418, + "learning_rate": 0.0007186450540538283, + "loss": 2.6939, + "step": 12222 + }, + { + "epoch": 0.36245292530320555, + "grad_norm": 0.16166429221630096, + "learning_rate": 0.0007186027391922254, + "loss": 2.7057, + "step": 12223 + }, + { + "epoch": 0.36248257865551703, + "grad_norm": 0.15362226963043213, + "learning_rate": 0.0007185604223948577, + "loss": 2.6984, + "step": 12224 + }, + { + "epoch": 0.3625122320078285, + "grad_norm": 0.1588171273469925, + "learning_rate": 0.0007185181036620999, + "loss": 2.7173, + "step": 12225 + }, + { + "epoch": 0.36254188536014, + "grad_norm": 0.15120795369148254, + "learning_rate": 0.0007184757829943269, + "loss": 2.7062, + "step": 12226 + }, + { + "epoch": 0.36257153871245146, + "grad_norm": 0.15117555856704712, + "learning_rate": 0.0007184334603919134, + "loss": 2.7164, + "step": 12227 + }, + { + "epoch": 0.36260119206476293, + "grad_norm": 0.16848987340927124, + "learning_rate": 0.0007183911358552339, + "loss": 2.697, + "step": 12228 + }, + { + "epoch": 0.3626308454170744, + "grad_norm": 0.16943985223770142, + "learning_rate": 0.0007183488093846635, + "loss": 2.719, + "step": 12229 + }, + { + "epoch": 0.3626604987693859, + "grad_norm": 0.13976246118545532, + "learning_rate": 0.000718306480980577, + "loss": 2.7039, + "step": 12230 + }, + { + "epoch": 0.36269015212169736, + "grad_norm": 0.10784177482128143, + "learning_rate": 0.0007182641506433491, + "loss": 2.7016, + "step": 12231 + }, + { + "epoch": 0.36271980547400884, + "grad_norm": 0.12629735469818115, + "learning_rate": 0.0007182218183733547, + "loss": 2.6927, + "step": 12232 + }, + { + "epoch": 0.3627494588263203, + "grad_norm": 0.13725809752941132, + "learning_rate": 0.0007181794841709686, + "loss": 2.6982, + "step": 12233 + }, + { + "epoch": 0.3627791121786318, + "grad_norm": 0.13075102865695953, + "learning_rate": 0.0007181371480365656, + "loss": 2.6653, + "step": 12234 + }, + { + "epoch": 0.36280876553094327, + "grad_norm": 0.12470319867134094, + "learning_rate": 0.000718094809970521, + "loss": 2.7184, + "step": 12235 + }, + { + "epoch": 0.36283841888325474, + "grad_norm": 0.14109061658382416, + "learning_rate": 0.0007180524699732091, + "loss": 2.7161, + "step": 12236 + }, + { + "epoch": 0.3628680722355662, + "grad_norm": 0.1419198215007782, + "learning_rate": 0.0007180101280450053, + "loss": 2.6911, + "step": 12237 + }, + { + "epoch": 0.3628977255878777, + "grad_norm": 0.12982520461082458, + "learning_rate": 0.0007179677841862844, + "loss": 2.6942, + "step": 12238 + }, + { + "epoch": 0.36292737894018917, + "grad_norm": 0.13685786724090576, + "learning_rate": 0.0007179254383974213, + "loss": 2.704, + "step": 12239 + }, + { + "epoch": 0.36295703229250065, + "grad_norm": 0.1470584273338318, + "learning_rate": 0.0007178830906787911, + "loss": 2.6884, + "step": 12240 + }, + { + "epoch": 0.3629866856448121, + "grad_norm": 0.1177017092704773, + "learning_rate": 0.0007178407410307687, + "loss": 2.712, + "step": 12241 + }, + { + "epoch": 0.3630163389971236, + "grad_norm": 0.11272092908620834, + "learning_rate": 0.0007177983894537292, + "loss": 2.7021, + "step": 12242 + }, + { + "epoch": 0.3630459923494351, + "grad_norm": 0.1275244951248169, + "learning_rate": 0.0007177560359480477, + "loss": 2.7187, + "step": 12243 + }, + { + "epoch": 0.3630756457017466, + "grad_norm": 0.12656378746032715, + "learning_rate": 0.0007177136805140989, + "loss": 2.721, + "step": 12244 + }, + { + "epoch": 0.3631052990540581, + "grad_norm": 0.1132194995880127, + "learning_rate": 0.000717671323152258, + "loss": 2.6739, + "step": 12245 + }, + { + "epoch": 0.36313495240636956, + "grad_norm": 0.11106804758310318, + "learning_rate": 0.0007176289638629003, + "loss": 2.6926, + "step": 12246 + }, + { + "epoch": 0.36316460575868104, + "grad_norm": 0.1208367720246315, + "learning_rate": 0.0007175866026464009, + "loss": 2.7393, + "step": 12247 + }, + { + "epoch": 0.3631942591109925, + "grad_norm": 0.13963502645492554, + "learning_rate": 0.0007175442395031347, + "loss": 2.7209, + "step": 12248 + }, + { + "epoch": 0.363223912463304, + "grad_norm": 0.13681480288505554, + "learning_rate": 0.000717501874433477, + "loss": 2.7202, + "step": 12249 + }, + { + "epoch": 0.36325356581561546, + "grad_norm": 0.16259481012821198, + "learning_rate": 0.0007174595074378028, + "loss": 2.7081, + "step": 12250 + }, + { + "epoch": 0.36328321916792694, + "grad_norm": 0.17199525237083435, + "learning_rate": 0.0007174171385164872, + "loss": 2.7255, + "step": 12251 + }, + { + "epoch": 0.3633128725202384, + "grad_norm": 0.170657217502594, + "learning_rate": 0.0007173747676699055, + "loss": 2.7051, + "step": 12252 + }, + { + "epoch": 0.3633425258725499, + "grad_norm": 0.15310828387737274, + "learning_rate": 0.0007173323948984331, + "loss": 2.6915, + "step": 12253 + }, + { + "epoch": 0.36337217922486137, + "grad_norm": 0.14755600690841675, + "learning_rate": 0.0007172900202024451, + "loss": 2.7413, + "step": 12254 + }, + { + "epoch": 0.36340183257717285, + "grad_norm": 0.15760286152362823, + "learning_rate": 0.0007172476435823165, + "loss": 2.7264, + "step": 12255 + }, + { + "epoch": 0.3634314859294843, + "grad_norm": 0.12854629755020142, + "learning_rate": 0.0007172052650384228, + "loss": 2.7104, + "step": 12256 + }, + { + "epoch": 0.3634611392817958, + "grad_norm": 0.13493716716766357, + "learning_rate": 0.0007171628845711391, + "loss": 2.739, + "step": 12257 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 0.20998267829418182, + "learning_rate": 0.0007171205021808408, + "loss": 2.69, + "step": 12258 + }, + { + "epoch": 0.36352044598641875, + "grad_norm": 0.11739557981491089, + "learning_rate": 0.0007170781178679034, + "loss": 2.6895, + "step": 12259 + }, + { + "epoch": 0.3635500993387302, + "grad_norm": 0.12245844304561615, + "learning_rate": 0.0007170357316327018, + "loss": 2.7076, + "step": 12260 + }, + { + "epoch": 0.3635797526910417, + "grad_norm": 0.11120549589395523, + "learning_rate": 0.0007169933434756115, + "loss": 2.6782, + "step": 12261 + }, + { + "epoch": 0.3636094060433532, + "grad_norm": 0.12614910304546356, + "learning_rate": 0.000716950953397008, + "loss": 2.6886, + "step": 12262 + }, + { + "epoch": 0.36363905939566465, + "grad_norm": 0.13439622521400452, + "learning_rate": 0.0007169085613972666, + "loss": 2.706, + "step": 12263 + }, + { + "epoch": 0.3636687127479762, + "grad_norm": 0.12439143657684326, + "learning_rate": 0.0007168661674767626, + "loss": 2.6982, + "step": 12264 + }, + { + "epoch": 0.36369836610028766, + "grad_norm": 0.11476864665746689, + "learning_rate": 0.0007168237716358714, + "loss": 2.6771, + "step": 12265 + }, + { + "epoch": 0.36372801945259914, + "grad_norm": 0.1071864441037178, + "learning_rate": 0.0007167813738749686, + "loss": 2.7092, + "step": 12266 + }, + { + "epoch": 0.3637576728049106, + "grad_norm": 0.11139374226331711, + "learning_rate": 0.0007167389741944294, + "loss": 2.7267, + "step": 12267 + }, + { + "epoch": 0.3637873261572221, + "grad_norm": 0.10288768261671066, + "learning_rate": 0.0007166965725946297, + "loss": 2.6241, + "step": 12268 + }, + { + "epoch": 0.36381697950953357, + "grad_norm": 0.11240866780281067, + "learning_rate": 0.0007166541690759443, + "loss": 2.6549, + "step": 12269 + }, + { + "epoch": 0.36384663286184504, + "grad_norm": 0.09872224926948547, + "learning_rate": 0.0007166117636387492, + "loss": 2.6729, + "step": 12270 + }, + { + "epoch": 0.3638762862141565, + "grad_norm": 0.11529966443777084, + "learning_rate": 0.0007165693562834197, + "loss": 2.699, + "step": 12271 + }, + { + "epoch": 0.363905939566468, + "grad_norm": 0.11803987622261047, + "learning_rate": 0.0007165269470103314, + "loss": 2.6882, + "step": 12272 + }, + { + "epoch": 0.36393559291877947, + "grad_norm": 0.12157439440488815, + "learning_rate": 0.0007164845358198597, + "loss": 2.722, + "step": 12273 + }, + { + "epoch": 0.36396524627109095, + "grad_norm": 0.10572375357151031, + "learning_rate": 0.0007164421227123805, + "loss": 2.6909, + "step": 12274 + }, + { + "epoch": 0.3639948996234024, + "grad_norm": 0.11057260632514954, + "learning_rate": 0.000716399707688269, + "loss": 2.7224, + "step": 12275 + }, + { + "epoch": 0.3640245529757139, + "grad_norm": 0.13642209768295288, + "learning_rate": 0.0007163572907479011, + "loss": 2.7155, + "step": 12276 + }, + { + "epoch": 0.3640542063280254, + "grad_norm": 0.15515771508216858, + "learning_rate": 0.000716314871891652, + "loss": 2.7005, + "step": 12277 + }, + { + "epoch": 0.36408385968033685, + "grad_norm": 0.16145752370357513, + "learning_rate": 0.0007162724511198977, + "loss": 2.7097, + "step": 12278 + }, + { + "epoch": 0.36411351303264833, + "grad_norm": 0.14155061542987823, + "learning_rate": 0.0007162300284330137, + "loss": 2.7341, + "step": 12279 + }, + { + "epoch": 0.3641431663849598, + "grad_norm": 0.14606517553329468, + "learning_rate": 0.0007161876038313757, + "loss": 2.6838, + "step": 12280 + }, + { + "epoch": 0.3641728197372713, + "grad_norm": 0.17055709660053253, + "learning_rate": 0.0007161451773153595, + "loss": 2.6806, + "step": 12281 + }, + { + "epoch": 0.36420247308958276, + "grad_norm": 0.14219550788402557, + "learning_rate": 0.0007161027488853405, + "loss": 2.6844, + "step": 12282 + }, + { + "epoch": 0.36423212644189423, + "grad_norm": 0.12618188560009003, + "learning_rate": 0.0007160603185416945, + "loss": 2.6886, + "step": 12283 + }, + { + "epoch": 0.3642617797942057, + "grad_norm": 0.14106079936027527, + "learning_rate": 0.0007160178862847975, + "loss": 2.7231, + "step": 12284 + }, + { + "epoch": 0.36429143314651724, + "grad_norm": 0.1249663233757019, + "learning_rate": 0.0007159754521150249, + "loss": 2.7191, + "step": 12285 + }, + { + "epoch": 0.3643210864988287, + "grad_norm": 0.1315368413925171, + "learning_rate": 0.0007159330160327527, + "loss": 2.7015, + "step": 12286 + }, + { + "epoch": 0.3643507398511402, + "grad_norm": 0.12648379802703857, + "learning_rate": 0.0007158905780383566, + "loss": 2.6788, + "step": 12287 + }, + { + "epoch": 0.36438039320345167, + "grad_norm": 0.12347652018070221, + "learning_rate": 0.0007158481381322122, + "loss": 2.722, + "step": 12288 + }, + { + "epoch": 0.36441004655576315, + "grad_norm": 0.15441904962062836, + "learning_rate": 0.0007158056963146956, + "loss": 2.7165, + "step": 12289 + }, + { + "epoch": 0.3644396999080746, + "grad_norm": 0.15327483415603638, + "learning_rate": 0.0007157632525861823, + "loss": 2.6875, + "step": 12290 + }, + { + "epoch": 0.3644693532603861, + "grad_norm": 0.1351766139268875, + "learning_rate": 0.0007157208069470487, + "loss": 2.7103, + "step": 12291 + }, + { + "epoch": 0.3644990066126976, + "grad_norm": 0.13092167675495148, + "learning_rate": 0.0007156783593976701, + "loss": 2.7086, + "step": 12292 + }, + { + "epoch": 0.36452865996500905, + "grad_norm": 0.13124187290668488, + "learning_rate": 0.0007156359099384227, + "loss": 2.7163, + "step": 12293 + }, + { + "epoch": 0.3645583133173205, + "grad_norm": 0.14191550016403198, + "learning_rate": 0.0007155934585696824, + "loss": 2.7041, + "step": 12294 + }, + { + "epoch": 0.364587966669632, + "grad_norm": 0.13919082283973694, + "learning_rate": 0.0007155510052918248, + "loss": 2.71, + "step": 12295 + }, + { + "epoch": 0.3646176200219435, + "grad_norm": 0.1541503667831421, + "learning_rate": 0.0007155085501052261, + "loss": 2.7125, + "step": 12296 + }, + { + "epoch": 0.36464727337425495, + "grad_norm": 0.14622201025485992, + "learning_rate": 0.0007154660930102624, + "loss": 2.7063, + "step": 12297 + }, + { + "epoch": 0.36467692672656643, + "grad_norm": 0.1477542519569397, + "learning_rate": 0.0007154236340073093, + "loss": 2.6832, + "step": 12298 + }, + { + "epoch": 0.3647065800788779, + "grad_norm": 0.1438225656747818, + "learning_rate": 0.0007153811730967428, + "loss": 2.7158, + "step": 12299 + }, + { + "epoch": 0.3647362334311894, + "grad_norm": 0.11817949265241623, + "learning_rate": 0.0007153387102789392, + "loss": 2.7317, + "step": 12300 + }, + { + "epoch": 0.36476588678350086, + "grad_norm": 0.14576314389705658, + "learning_rate": 0.0007152962455542744, + "loss": 2.6715, + "step": 12301 + }, + { + "epoch": 0.36479554013581234, + "grad_norm": 0.15772050619125366, + "learning_rate": 0.0007152537789231244, + "loss": 2.7199, + "step": 12302 + }, + { + "epoch": 0.3648251934881238, + "grad_norm": 0.19111022353172302, + "learning_rate": 0.0007152113103858652, + "loss": 2.7013, + "step": 12303 + }, + { + "epoch": 0.3648548468404353, + "grad_norm": 0.1886843889951706, + "learning_rate": 0.0007151688399428728, + "loss": 2.7168, + "step": 12304 + }, + { + "epoch": 0.36488450019274676, + "grad_norm": 0.1807989776134491, + "learning_rate": 0.0007151263675945236, + "loss": 2.7176, + "step": 12305 + }, + { + "epoch": 0.3649141535450583, + "grad_norm": 0.1514381468296051, + "learning_rate": 0.0007150838933411934, + "loss": 2.7248, + "step": 12306 + }, + { + "epoch": 0.36494380689736977, + "grad_norm": 0.13869337737560272, + "learning_rate": 0.0007150414171832583, + "loss": 2.6853, + "step": 12307 + }, + { + "epoch": 0.36497346024968125, + "grad_norm": 0.16676075756549835, + "learning_rate": 0.0007149989391210947, + "loss": 2.7362, + "step": 12308 + }, + { + "epoch": 0.3650031136019927, + "grad_norm": 0.1451173722743988, + "learning_rate": 0.0007149564591550784, + "loss": 2.6857, + "step": 12309 + }, + { + "epoch": 0.3650327669543042, + "grad_norm": 0.14139510691165924, + "learning_rate": 0.000714913977285586, + "loss": 2.7125, + "step": 12310 + }, + { + "epoch": 0.3650624203066157, + "grad_norm": 0.16229085624217987, + "learning_rate": 0.0007148714935129932, + "loss": 2.7221, + "step": 12311 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 0.16005951166152954, + "learning_rate": 0.0007148290078376765, + "loss": 2.682, + "step": 12312 + }, + { + "epoch": 0.36512172701123863, + "grad_norm": 0.1294570416212082, + "learning_rate": 0.0007147865202600121, + "loss": 2.69, + "step": 12313 + }, + { + "epoch": 0.3651513803635501, + "grad_norm": 0.12910087406635284, + "learning_rate": 0.0007147440307803763, + "loss": 2.7048, + "step": 12314 + }, + { + "epoch": 0.3651810337158616, + "grad_norm": 0.1455027014017105, + "learning_rate": 0.0007147015393991451, + "loss": 2.7053, + "step": 12315 + }, + { + "epoch": 0.36521068706817306, + "grad_norm": 0.12012320011854172, + "learning_rate": 0.000714659046116695, + "loss": 2.6881, + "step": 12316 + }, + { + "epoch": 0.36524034042048453, + "grad_norm": 0.12442759424448013, + "learning_rate": 0.0007146165509334021, + "loss": 2.6998, + "step": 12317 + }, + { + "epoch": 0.365269993772796, + "grad_norm": 0.12050903588533401, + "learning_rate": 0.0007145740538496429, + "loss": 2.7015, + "step": 12318 + }, + { + "epoch": 0.3652996471251075, + "grad_norm": 0.13567954301834106, + "learning_rate": 0.0007145315548657937, + "loss": 2.6794, + "step": 12319 + }, + { + "epoch": 0.36532930047741896, + "grad_norm": 0.13538044691085815, + "learning_rate": 0.0007144890539822306, + "loss": 2.7023, + "step": 12320 + }, + { + "epoch": 0.36535895382973044, + "grad_norm": 0.1237097680568695, + "learning_rate": 0.0007144465511993302, + "loss": 2.7054, + "step": 12321 + }, + { + "epoch": 0.3653886071820419, + "grad_norm": 0.135212704539299, + "learning_rate": 0.0007144040465174686, + "loss": 2.7062, + "step": 12322 + }, + { + "epoch": 0.3654182605343534, + "grad_norm": 0.1295139491558075, + "learning_rate": 0.0007143615399370226, + "loss": 2.6732, + "step": 12323 + }, + { + "epoch": 0.36544791388666487, + "grad_norm": 0.1374434530735016, + "learning_rate": 0.0007143190314583683, + "loss": 2.6981, + "step": 12324 + }, + { + "epoch": 0.36547756723897634, + "grad_norm": 0.11678298562765121, + "learning_rate": 0.0007142765210818822, + "loss": 2.7067, + "step": 12325 + }, + { + "epoch": 0.3655072205912878, + "grad_norm": 0.11164011061191559, + "learning_rate": 0.0007142340088079406, + "loss": 2.6898, + "step": 12326 + }, + { + "epoch": 0.36553687394359935, + "grad_norm": 0.14518128335475922, + "learning_rate": 0.0007141914946369203, + "loss": 2.6822, + "step": 12327 + }, + { + "epoch": 0.3655665272959108, + "grad_norm": 0.14302827417850494, + "learning_rate": 0.0007141489785691973, + "loss": 2.6748, + "step": 12328 + }, + { + "epoch": 0.3655961806482223, + "grad_norm": 0.13148659467697144, + "learning_rate": 0.0007141064606051484, + "loss": 2.6862, + "step": 12329 + }, + { + "epoch": 0.3656258340005338, + "grad_norm": 0.13424155116081238, + "learning_rate": 0.0007140639407451502, + "loss": 2.7083, + "step": 12330 + }, + { + "epoch": 0.36565548735284525, + "grad_norm": 0.14270459115505219, + "learning_rate": 0.0007140214189895789, + "loss": 2.6776, + "step": 12331 + }, + { + "epoch": 0.36568514070515673, + "grad_norm": 0.1447209268808365, + "learning_rate": 0.0007139788953388113, + "loss": 2.699, + "step": 12332 + }, + { + "epoch": 0.3657147940574682, + "grad_norm": 0.12580420076847076, + "learning_rate": 0.0007139363697932238, + "loss": 2.6867, + "step": 12333 + }, + { + "epoch": 0.3657444474097797, + "grad_norm": 0.13930296897888184, + "learning_rate": 0.0007138938423531931, + "loss": 2.6794, + "step": 12334 + }, + { + "epoch": 0.36577410076209116, + "grad_norm": 0.15482863783836365, + "learning_rate": 0.0007138513130190957, + "loss": 2.7279, + "step": 12335 + }, + { + "epoch": 0.36580375411440264, + "grad_norm": 0.13714095950126648, + "learning_rate": 0.0007138087817913081, + "loss": 2.6738, + "step": 12336 + }, + { + "epoch": 0.3658334074667141, + "grad_norm": 0.1343267261981964, + "learning_rate": 0.0007137662486702072, + "loss": 2.7204, + "step": 12337 + }, + { + "epoch": 0.3658630608190256, + "grad_norm": 0.13483545184135437, + "learning_rate": 0.0007137237136561693, + "loss": 2.6775, + "step": 12338 + }, + { + "epoch": 0.36589271417133706, + "grad_norm": 0.12252858281135559, + "learning_rate": 0.0007136811767495712, + "loss": 2.6631, + "step": 12339 + }, + { + "epoch": 0.36592236752364854, + "grad_norm": 0.13373258709907532, + "learning_rate": 0.0007136386379507898, + "loss": 2.6418, + "step": 12340 + }, + { + "epoch": 0.36595202087596, + "grad_norm": 0.14703340828418732, + "learning_rate": 0.0007135960972602015, + "loss": 2.7204, + "step": 12341 + }, + { + "epoch": 0.3659816742282715, + "grad_norm": 0.14809578657150269, + "learning_rate": 0.0007135535546781831, + "loss": 2.705, + "step": 12342 + }, + { + "epoch": 0.36601132758058297, + "grad_norm": 0.14821970462799072, + "learning_rate": 0.0007135110102051112, + "loss": 2.7087, + "step": 12343 + }, + { + "epoch": 0.36604098093289444, + "grad_norm": 0.14397695660591125, + "learning_rate": 0.0007134684638413629, + "loss": 2.6935, + "step": 12344 + }, + { + "epoch": 0.3660706342852059, + "grad_norm": 0.137444406747818, + "learning_rate": 0.0007134259155873145, + "loss": 2.7313, + "step": 12345 + }, + { + "epoch": 0.3661002876375174, + "grad_norm": 0.11708635836839676, + "learning_rate": 0.0007133833654433431, + "loss": 2.6949, + "step": 12346 + }, + { + "epoch": 0.3661299409898289, + "grad_norm": 0.13679082691669464, + "learning_rate": 0.0007133408134098254, + "loss": 2.6838, + "step": 12347 + }, + { + "epoch": 0.3661595943421404, + "grad_norm": 0.1320647895336151, + "learning_rate": 0.000713298259487138, + "loss": 2.6851, + "step": 12348 + }, + { + "epoch": 0.3661892476944519, + "grad_norm": 0.12545490264892578, + "learning_rate": 0.000713255703675658, + "loss": 2.7016, + "step": 12349 + }, + { + "epoch": 0.36621890104676336, + "grad_norm": 0.1420414000749588, + "learning_rate": 0.0007132131459757622, + "loss": 2.7371, + "step": 12350 + }, + { + "epoch": 0.36624855439907483, + "grad_norm": 0.1364557147026062, + "learning_rate": 0.0007131705863878272, + "loss": 2.7316, + "step": 12351 + }, + { + "epoch": 0.3662782077513863, + "grad_norm": 0.12879958748817444, + "learning_rate": 0.0007131280249122304, + "loss": 2.7173, + "step": 12352 + }, + { + "epoch": 0.3663078611036978, + "grad_norm": 0.1403108388185501, + "learning_rate": 0.0007130854615493481, + "loss": 2.7016, + "step": 12353 + }, + { + "epoch": 0.36633751445600926, + "grad_norm": 0.12063764035701752, + "learning_rate": 0.0007130428962995577, + "loss": 2.6837, + "step": 12354 + }, + { + "epoch": 0.36636716780832074, + "grad_norm": 0.10460326075553894, + "learning_rate": 0.0007130003291632355, + "loss": 2.6896, + "step": 12355 + }, + { + "epoch": 0.3663968211606322, + "grad_norm": 0.14233645796775818, + "learning_rate": 0.0007129577601407591, + "loss": 2.7325, + "step": 12356 + }, + { + "epoch": 0.3664264745129437, + "grad_norm": 0.17798011004924774, + "learning_rate": 0.0007129151892325052, + "loss": 2.7041, + "step": 12357 + }, + { + "epoch": 0.36645612786525517, + "grad_norm": 0.14035822451114655, + "learning_rate": 0.0007128726164388506, + "loss": 2.6907, + "step": 12358 + }, + { + "epoch": 0.36648578121756664, + "grad_norm": 0.12554755806922913, + "learning_rate": 0.0007128300417601725, + "loss": 2.6898, + "step": 12359 + }, + { + "epoch": 0.3665154345698781, + "grad_norm": 0.11537068337202072, + "learning_rate": 0.0007127874651968479, + "loss": 2.733, + "step": 12360 + }, + { + "epoch": 0.3665450879221896, + "grad_norm": 0.11015701293945312, + "learning_rate": 0.0007127448867492536, + "loss": 2.6827, + "step": 12361 + }, + { + "epoch": 0.36657474127450107, + "grad_norm": 0.10743360221385956, + "learning_rate": 0.0007127023064177671, + "loss": 2.7451, + "step": 12362 + }, + { + "epoch": 0.36660439462681255, + "grad_norm": 0.15136542916297913, + "learning_rate": 0.0007126597242027651, + "loss": 2.6961, + "step": 12363 + }, + { + "epoch": 0.366634047979124, + "grad_norm": 0.12590013444423676, + "learning_rate": 0.0007126171401046245, + "loss": 2.7113, + "step": 12364 + }, + { + "epoch": 0.3666637013314355, + "grad_norm": 0.144495889544487, + "learning_rate": 0.0007125745541237228, + "loss": 2.6812, + "step": 12365 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 0.1242845430970192, + "learning_rate": 0.000712531966260437, + "loss": 2.7031, + "step": 12366 + }, + { + "epoch": 0.36672300803605845, + "grad_norm": 0.11195021867752075, + "learning_rate": 0.000712489376515144, + "loss": 2.6915, + "step": 12367 + }, + { + "epoch": 0.36675266138837, + "grad_norm": 0.13501432538032532, + "learning_rate": 0.0007124467848882212, + "loss": 2.7146, + "step": 12368 + }, + { + "epoch": 0.36678231474068146, + "grad_norm": 0.1489470899105072, + "learning_rate": 0.0007124041913800456, + "loss": 2.6595, + "step": 12369 + }, + { + "epoch": 0.36681196809299293, + "grad_norm": 0.15746000409126282, + "learning_rate": 0.0007123615959909945, + "loss": 2.7284, + "step": 12370 + }, + { + "epoch": 0.3668416214453044, + "grad_norm": 0.1681254655122757, + "learning_rate": 0.0007123189987214449, + "loss": 2.715, + "step": 12371 + }, + { + "epoch": 0.3668712747976159, + "grad_norm": 0.14304262399673462, + "learning_rate": 0.0007122763995717743, + "loss": 2.6977, + "step": 12372 + }, + { + "epoch": 0.36690092814992736, + "grad_norm": 0.1385205090045929, + "learning_rate": 0.0007122337985423596, + "loss": 2.7165, + "step": 12373 + }, + { + "epoch": 0.36693058150223884, + "grad_norm": 0.14598415791988373, + "learning_rate": 0.0007121911956335782, + "loss": 2.6903, + "step": 12374 + }, + { + "epoch": 0.3669602348545503, + "grad_norm": 0.12114731967449188, + "learning_rate": 0.0007121485908458074, + "loss": 2.6991, + "step": 12375 + }, + { + "epoch": 0.3669898882068618, + "grad_norm": 0.11673526465892792, + "learning_rate": 0.0007121059841794242, + "loss": 2.7131, + "step": 12376 + }, + { + "epoch": 0.36701954155917327, + "grad_norm": 0.12289958447217941, + "learning_rate": 0.0007120633756348064, + "loss": 2.7166, + "step": 12377 + }, + { + "epoch": 0.36704919491148474, + "grad_norm": 0.1356593817472458, + "learning_rate": 0.0007120207652123308, + "loss": 2.7176, + "step": 12378 + }, + { + "epoch": 0.3670788482637962, + "grad_norm": 0.14430631697177887, + "learning_rate": 0.0007119781529123751, + "loss": 2.7029, + "step": 12379 + }, + { + "epoch": 0.3671085016161077, + "grad_norm": 0.13613402843475342, + "learning_rate": 0.0007119355387353164, + "loss": 2.7204, + "step": 12380 + }, + { + "epoch": 0.3671381549684192, + "grad_norm": 0.1311829686164856, + "learning_rate": 0.0007118929226815321, + "loss": 2.7181, + "step": 12381 + }, + { + "epoch": 0.36716780832073065, + "grad_norm": 0.13342301547527313, + "learning_rate": 0.0007118503047513996, + "loss": 2.6525, + "step": 12382 + }, + { + "epoch": 0.3671974616730421, + "grad_norm": 0.1404253989458084, + "learning_rate": 0.0007118076849452964, + "loss": 2.6784, + "step": 12383 + }, + { + "epoch": 0.3672271150253536, + "grad_norm": 0.1446986198425293, + "learning_rate": 0.0007117650632635996, + "loss": 2.6975, + "step": 12384 + }, + { + "epoch": 0.3672567683776651, + "grad_norm": 0.12887832522392273, + "learning_rate": 0.000711722439706687, + "loss": 2.6913, + "step": 12385 + }, + { + "epoch": 0.36728642172997655, + "grad_norm": 0.13669437170028687, + "learning_rate": 0.0007116798142749358, + "loss": 2.663, + "step": 12386 + }, + { + "epoch": 0.36731607508228803, + "grad_norm": 0.1373642086982727, + "learning_rate": 0.0007116371869687233, + "loss": 2.6595, + "step": 12387 + }, + { + "epoch": 0.3673457284345995, + "grad_norm": 0.13523508608341217, + "learning_rate": 0.0007115945577884274, + "loss": 2.7005, + "step": 12388 + }, + { + "epoch": 0.36737538178691104, + "grad_norm": 0.15035590529441833, + "learning_rate": 0.0007115519267344252, + "loss": 2.6868, + "step": 12389 + }, + { + "epoch": 0.3674050351392225, + "grad_norm": 0.14928586781024933, + "learning_rate": 0.0007115092938070947, + "loss": 2.7034, + "step": 12390 + }, + { + "epoch": 0.367434688491534, + "grad_norm": 0.1340550184249878, + "learning_rate": 0.0007114666590068129, + "loss": 2.6996, + "step": 12391 + }, + { + "epoch": 0.36746434184384547, + "grad_norm": 0.1530541181564331, + "learning_rate": 0.0007114240223339575, + "loss": 2.7034, + "step": 12392 + }, + { + "epoch": 0.36749399519615694, + "grad_norm": 0.15489622950553894, + "learning_rate": 0.000711381383788906, + "loss": 2.6764, + "step": 12393 + }, + { + "epoch": 0.3675236485484684, + "grad_norm": 0.14497320353984833, + "learning_rate": 0.0007113387433720363, + "loss": 2.6929, + "step": 12394 + }, + { + "epoch": 0.3675533019007799, + "grad_norm": 0.1312527358531952, + "learning_rate": 0.0007112961010837256, + "loss": 2.6491, + "step": 12395 + }, + { + "epoch": 0.36758295525309137, + "grad_norm": 0.11006917804479599, + "learning_rate": 0.0007112534569243519, + "loss": 2.7331, + "step": 12396 + }, + { + "epoch": 0.36761260860540285, + "grad_norm": 0.12843526899814606, + "learning_rate": 0.0007112108108942922, + "loss": 2.6657, + "step": 12397 + }, + { + "epoch": 0.3676422619577143, + "grad_norm": 0.13036799430847168, + "learning_rate": 0.0007111681629939249, + "loss": 2.6668, + "step": 12398 + }, + { + "epoch": 0.3676719153100258, + "grad_norm": 0.13547281920909882, + "learning_rate": 0.000711125513223627, + "loss": 2.6939, + "step": 12399 + }, + { + "epoch": 0.3677015686623373, + "grad_norm": 0.13244912028312683, + "learning_rate": 0.0007110828615837765, + "loss": 2.7013, + "step": 12400 + }, + { + "epoch": 0.36773122201464875, + "grad_norm": 0.13782575726509094, + "learning_rate": 0.000711040208074751, + "loss": 2.6912, + "step": 12401 + }, + { + "epoch": 0.3677608753669602, + "grad_norm": 0.10875938087701797, + "learning_rate": 0.0007109975526969283, + "loss": 2.7366, + "step": 12402 + }, + { + "epoch": 0.3677905287192717, + "grad_norm": 0.13370032608509064, + "learning_rate": 0.0007109548954506859, + "loss": 2.7174, + "step": 12403 + }, + { + "epoch": 0.3678201820715832, + "grad_norm": 0.1503288298845291, + "learning_rate": 0.0007109122363364019, + "loss": 2.6978, + "step": 12404 + }, + { + "epoch": 0.36784983542389466, + "grad_norm": 0.14807933568954468, + "learning_rate": 0.0007108695753544537, + "loss": 2.7145, + "step": 12405 + }, + { + "epoch": 0.36787948877620613, + "grad_norm": 0.13172093033790588, + "learning_rate": 0.0007108269125052194, + "loss": 2.7356, + "step": 12406 + }, + { + "epoch": 0.3679091421285176, + "grad_norm": 0.1212669387459755, + "learning_rate": 0.0007107842477890764, + "loss": 2.7203, + "step": 12407 + }, + { + "epoch": 0.3679387954808291, + "grad_norm": 0.10632821917533875, + "learning_rate": 0.0007107415812064028, + "loss": 2.6796, + "step": 12408 + }, + { + "epoch": 0.36796844883314056, + "grad_norm": 0.1099192202091217, + "learning_rate": 0.0007106989127575763, + "loss": 2.6896, + "step": 12409 + }, + { + "epoch": 0.3679981021854521, + "grad_norm": 0.13039454817771912, + "learning_rate": 0.0007106562424429748, + "loss": 2.7121, + "step": 12410 + }, + { + "epoch": 0.36802775553776357, + "grad_norm": 0.14416174590587616, + "learning_rate": 0.000710613570262976, + "loss": 2.6927, + "step": 12411 + }, + { + "epoch": 0.36805740889007504, + "grad_norm": 0.12379377335309982, + "learning_rate": 0.000710570896217958, + "loss": 2.6836, + "step": 12412 + }, + { + "epoch": 0.3680870622423865, + "grad_norm": 0.1489727795124054, + "learning_rate": 0.0007105282203082985, + "loss": 2.7162, + "step": 12413 + }, + { + "epoch": 0.368116715594698, + "grad_norm": 0.16104115545749664, + "learning_rate": 0.0007104855425343755, + "loss": 2.6973, + "step": 12414 + }, + { + "epoch": 0.3681463689470095, + "grad_norm": 0.1391471028327942, + "learning_rate": 0.0007104428628965668, + "loss": 2.6575, + "step": 12415 + }, + { + "epoch": 0.36817602229932095, + "grad_norm": 0.15771062672138214, + "learning_rate": 0.0007104001813952506, + "loss": 2.6418, + "step": 12416 + }, + { + "epoch": 0.3682056756516324, + "grad_norm": 0.14248304069042206, + "learning_rate": 0.0007103574980308046, + "loss": 2.7307, + "step": 12417 + }, + { + "epoch": 0.3682353290039439, + "grad_norm": 0.16505947709083557, + "learning_rate": 0.000710314812803607, + "loss": 2.6894, + "step": 12418 + }, + { + "epoch": 0.3682649823562554, + "grad_norm": 0.17761851847171783, + "learning_rate": 0.0007102721257140353, + "loss": 2.6939, + "step": 12419 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 0.13247276842594147, + "learning_rate": 0.0007102294367624681, + "loss": 2.7233, + "step": 12420 + }, + { + "epoch": 0.36832428906087833, + "grad_norm": 0.1331360936164856, + "learning_rate": 0.000710186745949283, + "loss": 2.7094, + "step": 12421 + }, + { + "epoch": 0.3683539424131898, + "grad_norm": 0.15289393067359924, + "learning_rate": 0.0007101440532748583, + "loss": 2.7152, + "step": 12422 + }, + { + "epoch": 0.3683835957655013, + "grad_norm": 0.15332826972007751, + "learning_rate": 0.0007101013587395719, + "loss": 2.7169, + "step": 12423 + }, + { + "epoch": 0.36841324911781276, + "grad_norm": 0.17039529979228973, + "learning_rate": 0.000710058662343802, + "loss": 2.688, + "step": 12424 + }, + { + "epoch": 0.36844290247012423, + "grad_norm": 0.15237149596214294, + "learning_rate": 0.0007100159640879265, + "loss": 2.7045, + "step": 12425 + }, + { + "epoch": 0.3684725558224357, + "grad_norm": 0.15446875989437103, + "learning_rate": 0.0007099732639723234, + "loss": 2.6897, + "step": 12426 + }, + { + "epoch": 0.3685022091747472, + "grad_norm": 0.11493123322725296, + "learning_rate": 0.0007099305619973713, + "loss": 2.7046, + "step": 12427 + }, + { + "epoch": 0.36853186252705866, + "grad_norm": 0.12308736890554428, + "learning_rate": 0.0007098878581634479, + "loss": 2.684, + "step": 12428 + }, + { + "epoch": 0.36856151587937014, + "grad_norm": 0.13969942927360535, + "learning_rate": 0.0007098451524709315, + "loss": 2.7276, + "step": 12429 + }, + { + "epoch": 0.3685911692316816, + "grad_norm": 0.13435670733451843, + "learning_rate": 0.0007098024449202003, + "loss": 2.7169, + "step": 12430 + }, + { + "epoch": 0.36862082258399315, + "grad_norm": 0.1365072876214981, + "learning_rate": 0.0007097597355116324, + "loss": 2.7051, + "step": 12431 + }, + { + "epoch": 0.3686504759363046, + "grad_norm": 0.14159591495990753, + "learning_rate": 0.000709717024245606, + "loss": 2.6907, + "step": 12432 + }, + { + "epoch": 0.3686801292886161, + "grad_norm": 0.13591794669628143, + "learning_rate": 0.0007096743111224995, + "loss": 2.6774, + "step": 12433 + }, + { + "epoch": 0.3687097826409276, + "grad_norm": 0.11949165910482407, + "learning_rate": 0.0007096315961426908, + "loss": 2.7029, + "step": 12434 + }, + { + "epoch": 0.36873943599323905, + "grad_norm": 0.12778213620185852, + "learning_rate": 0.0007095888793065585, + "loss": 2.6684, + "step": 12435 + }, + { + "epoch": 0.3687690893455505, + "grad_norm": 0.1544579118490219, + "learning_rate": 0.0007095461606144805, + "loss": 2.683, + "step": 12436 + }, + { + "epoch": 0.368798742697862, + "grad_norm": 0.14415962994098663, + "learning_rate": 0.0007095034400668354, + "loss": 2.6815, + "step": 12437 + }, + { + "epoch": 0.3688283960501735, + "grad_norm": 0.13790780305862427, + "learning_rate": 0.0007094607176640014, + "loss": 2.6809, + "step": 12438 + }, + { + "epoch": 0.36885804940248496, + "grad_norm": 0.13322019577026367, + "learning_rate": 0.0007094179934063567, + "loss": 2.7118, + "step": 12439 + }, + { + "epoch": 0.36888770275479643, + "grad_norm": 0.14752456545829773, + "learning_rate": 0.0007093752672942799, + "loss": 2.6636, + "step": 12440 + }, + { + "epoch": 0.3689173561071079, + "grad_norm": 0.12819157540798187, + "learning_rate": 0.000709332539328149, + "loss": 2.688, + "step": 12441 + }, + { + "epoch": 0.3689470094594194, + "grad_norm": 0.12385381758213043, + "learning_rate": 0.0007092898095083426, + "loss": 2.6805, + "step": 12442 + }, + { + "epoch": 0.36897666281173086, + "grad_norm": 0.11410713195800781, + "learning_rate": 0.000709247077835239, + "loss": 2.709, + "step": 12443 + }, + { + "epoch": 0.36900631616404234, + "grad_norm": 0.12453701347112656, + "learning_rate": 0.0007092043443092166, + "loss": 2.7114, + "step": 12444 + }, + { + "epoch": 0.3690359695163538, + "grad_norm": 0.1364016830921173, + "learning_rate": 0.0007091616089306539, + "loss": 2.697, + "step": 12445 + }, + { + "epoch": 0.3690656228686653, + "grad_norm": 0.15398846566677094, + "learning_rate": 0.000709118871699929, + "loss": 2.7015, + "step": 12446 + }, + { + "epoch": 0.36909527622097676, + "grad_norm": 0.1515074074268341, + "learning_rate": 0.0007090761326174208, + "loss": 2.6549, + "step": 12447 + }, + { + "epoch": 0.36912492957328824, + "grad_norm": 0.14064078032970428, + "learning_rate": 0.0007090333916835076, + "loss": 2.6767, + "step": 12448 + }, + { + "epoch": 0.3691545829255997, + "grad_norm": 0.12568923830986023, + "learning_rate": 0.0007089906488985677, + "loss": 2.7111, + "step": 12449 + }, + { + "epoch": 0.3691842362779112, + "grad_norm": 0.1459796279668808, + "learning_rate": 0.00070894790426298, + "loss": 2.7647, + "step": 12450 + }, + { + "epoch": 0.36921388963022267, + "grad_norm": 0.16072186827659607, + "learning_rate": 0.0007089051577771225, + "loss": 2.741, + "step": 12451 + }, + { + "epoch": 0.3692435429825342, + "grad_norm": 0.17070871591567993, + "learning_rate": 0.0007088624094413739, + "loss": 2.6788, + "step": 12452 + }, + { + "epoch": 0.3692731963348457, + "grad_norm": 0.15541765093803406, + "learning_rate": 0.0007088196592561129, + "loss": 2.6987, + "step": 12453 + }, + { + "epoch": 0.36930284968715715, + "grad_norm": 0.14772474765777588, + "learning_rate": 0.000708776907221718, + "loss": 2.7228, + "step": 12454 + }, + { + "epoch": 0.36933250303946863, + "grad_norm": 0.16174259781837463, + "learning_rate": 0.0007087341533385678, + "loss": 2.6737, + "step": 12455 + }, + { + "epoch": 0.3693621563917801, + "grad_norm": 0.15383148193359375, + "learning_rate": 0.0007086913976070408, + "loss": 2.6943, + "step": 12456 + }, + { + "epoch": 0.3693918097440916, + "grad_norm": 0.15005074441432953, + "learning_rate": 0.0007086486400275158, + "loss": 2.6703, + "step": 12457 + }, + { + "epoch": 0.36942146309640306, + "grad_norm": 0.16635899245738983, + "learning_rate": 0.000708605880600371, + "loss": 2.7325, + "step": 12458 + }, + { + "epoch": 0.36945111644871453, + "grad_norm": 0.1597994714975357, + "learning_rate": 0.0007085631193259853, + "loss": 2.6869, + "step": 12459 + }, + { + "epoch": 0.369480769801026, + "grad_norm": 0.13413012027740479, + "learning_rate": 0.0007085203562047376, + "loss": 2.7141, + "step": 12460 + }, + { + "epoch": 0.3695104231533375, + "grad_norm": 0.12416408210992813, + "learning_rate": 0.0007084775912370064, + "loss": 2.6637, + "step": 12461 + }, + { + "epoch": 0.36954007650564896, + "grad_norm": 0.13714627921581268, + "learning_rate": 0.0007084348244231702, + "loss": 2.6792, + "step": 12462 + }, + { + "epoch": 0.36956972985796044, + "grad_norm": 0.14569735527038574, + "learning_rate": 0.0007083920557636079, + "loss": 2.6791, + "step": 12463 + }, + { + "epoch": 0.3695993832102719, + "grad_norm": 0.11473850160837173, + "learning_rate": 0.0007083492852586981, + "loss": 2.7103, + "step": 12464 + }, + { + "epoch": 0.3696290365625834, + "grad_norm": 0.1421249508857727, + "learning_rate": 0.0007083065129088196, + "loss": 2.7018, + "step": 12465 + }, + { + "epoch": 0.36965868991489487, + "grad_norm": 0.14679040014743805, + "learning_rate": 0.0007082637387143514, + "loss": 2.7321, + "step": 12466 + }, + { + "epoch": 0.36968834326720634, + "grad_norm": 0.12720105051994324, + "learning_rate": 0.0007082209626756718, + "loss": 2.6953, + "step": 12467 + }, + { + "epoch": 0.3697179966195178, + "grad_norm": 0.1355837881565094, + "learning_rate": 0.00070817818479316, + "loss": 2.6734, + "step": 12468 + }, + { + "epoch": 0.3697476499718293, + "grad_norm": 0.13750490546226501, + "learning_rate": 0.0007081354050671946, + "loss": 2.6937, + "step": 12469 + }, + { + "epoch": 0.36977730332414077, + "grad_norm": 0.14106665551662445, + "learning_rate": 0.0007080926234981544, + "loss": 2.7285, + "step": 12470 + }, + { + "epoch": 0.36980695667645225, + "grad_norm": 0.1357242912054062, + "learning_rate": 0.0007080498400864183, + "loss": 2.7076, + "step": 12471 + }, + { + "epoch": 0.3698366100287638, + "grad_norm": 0.14604422450065613, + "learning_rate": 0.0007080070548323652, + "loss": 2.7068, + "step": 12472 + }, + { + "epoch": 0.36986626338107526, + "grad_norm": 0.15321385860443115, + "learning_rate": 0.0007079642677363739, + "loss": 2.6615, + "step": 12473 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 0.14061082899570465, + "learning_rate": 0.0007079214787988233, + "loss": 2.6662, + "step": 12474 + }, + { + "epoch": 0.3699255700856982, + "grad_norm": 0.14499446749687195, + "learning_rate": 0.0007078786880200923, + "loss": 2.731, + "step": 12475 + }, + { + "epoch": 0.3699552234380097, + "grad_norm": 0.11050962656736374, + "learning_rate": 0.0007078358954005599, + "loss": 2.6758, + "step": 12476 + }, + { + "epoch": 0.36998487679032116, + "grad_norm": 0.1285177618265152, + "learning_rate": 0.0007077931009406049, + "loss": 2.7111, + "step": 12477 + }, + { + "epoch": 0.37001453014263264, + "grad_norm": 0.13281448185443878, + "learning_rate": 0.0007077503046406064, + "loss": 2.7127, + "step": 12478 + }, + { + "epoch": 0.3700441834949441, + "grad_norm": 0.12249010801315308, + "learning_rate": 0.0007077075065009433, + "loss": 2.7021, + "step": 12479 + }, + { + "epoch": 0.3700738368472556, + "grad_norm": 0.1261296272277832, + "learning_rate": 0.0007076647065219944, + "loss": 2.7221, + "step": 12480 + }, + { + "epoch": 0.37010349019956706, + "grad_norm": 0.12818825244903564, + "learning_rate": 0.0007076219047041392, + "loss": 2.6963, + "step": 12481 + }, + { + "epoch": 0.37013314355187854, + "grad_norm": 0.11898855865001678, + "learning_rate": 0.0007075791010477562, + "loss": 2.719, + "step": 12482 + }, + { + "epoch": 0.37016279690419, + "grad_norm": 0.11768113076686859, + "learning_rate": 0.0007075362955532246, + "loss": 2.6702, + "step": 12483 + }, + { + "epoch": 0.3701924502565015, + "grad_norm": 0.10976040363311768, + "learning_rate": 0.0007074934882209234, + "loss": 2.6884, + "step": 12484 + }, + { + "epoch": 0.37022210360881297, + "grad_norm": 0.10762664675712585, + "learning_rate": 0.0007074506790512319, + "loss": 2.7056, + "step": 12485 + }, + { + "epoch": 0.37025175696112445, + "grad_norm": 0.13441386818885803, + "learning_rate": 0.000707407868044529, + "loss": 2.7308, + "step": 12486 + }, + { + "epoch": 0.3702814103134359, + "grad_norm": 0.1539892703294754, + "learning_rate": 0.0007073650552011938, + "loss": 2.7051, + "step": 12487 + }, + { + "epoch": 0.3703110636657474, + "grad_norm": 0.17275485396385193, + "learning_rate": 0.0007073222405216056, + "loss": 2.6871, + "step": 12488 + }, + { + "epoch": 0.3703407170180589, + "grad_norm": 0.1431562304496765, + "learning_rate": 0.0007072794240061432, + "loss": 2.7034, + "step": 12489 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.1335800588130951, + "learning_rate": 0.0007072366056551859, + "loss": 2.6893, + "step": 12490 + }, + { + "epoch": 0.3704000237226818, + "grad_norm": 0.14545272290706635, + "learning_rate": 0.000707193785469113, + "loss": 2.7023, + "step": 12491 + }, + { + "epoch": 0.3704296770749933, + "grad_norm": 0.15752160549163818, + "learning_rate": 0.0007071509634483035, + "loss": 2.7231, + "step": 12492 + }, + { + "epoch": 0.37045933042730483, + "grad_norm": 0.14886535704135895, + "learning_rate": 0.0007071081395931365, + "loss": 2.691, + "step": 12493 + }, + { + "epoch": 0.3704889837796163, + "grad_norm": 0.14719681441783905, + "learning_rate": 0.0007070653139039918, + "loss": 2.6944, + "step": 12494 + }, + { + "epoch": 0.3705186371319278, + "grad_norm": 0.12748835980892181, + "learning_rate": 0.0007070224863812479, + "loss": 2.7081, + "step": 12495 + }, + { + "epoch": 0.37054829048423926, + "grad_norm": 0.1195182129740715, + "learning_rate": 0.0007069796570252845, + "loss": 2.6832, + "step": 12496 + }, + { + "epoch": 0.37057794383655074, + "grad_norm": 0.13245761394500732, + "learning_rate": 0.0007069368258364804, + "loss": 2.7013, + "step": 12497 + }, + { + "epoch": 0.3706075971888622, + "grad_norm": 0.137197807431221, + "learning_rate": 0.0007068939928152153, + "loss": 2.7002, + "step": 12498 + }, + { + "epoch": 0.3706372505411737, + "grad_norm": 0.1336437165737152, + "learning_rate": 0.0007068511579618686, + "loss": 2.6833, + "step": 12499 + }, + { + "epoch": 0.37066690389348517, + "grad_norm": 0.13458694517612457, + "learning_rate": 0.0007068083212768192, + "loss": 2.7038, + "step": 12500 + }, + { + "epoch": 0.37069655724579664, + "grad_norm": 0.16745160520076752, + "learning_rate": 0.0007067654827604468, + "loss": 2.6745, + "step": 12501 + }, + { + "epoch": 0.3707262105981081, + "grad_norm": 0.1852053701877594, + "learning_rate": 0.0007067226424131304, + "loss": 2.6687, + "step": 12502 + }, + { + "epoch": 0.3707558639504196, + "grad_norm": 0.17564597725868225, + "learning_rate": 0.0007066798002352495, + "loss": 2.7028, + "step": 12503 + }, + { + "epoch": 0.37078551730273107, + "grad_norm": 0.1422189325094223, + "learning_rate": 0.0007066369562271836, + "loss": 2.6884, + "step": 12504 + }, + { + "epoch": 0.37081517065504255, + "grad_norm": 0.1781432330608368, + "learning_rate": 0.000706594110389312, + "loss": 2.6927, + "step": 12505 + }, + { + "epoch": 0.370844824007354, + "grad_norm": 0.18608854711055756, + "learning_rate": 0.000706551262722014, + "loss": 2.6715, + "step": 12506 + }, + { + "epoch": 0.3708744773596655, + "grad_norm": 0.1380702406167984, + "learning_rate": 0.0007065084132256692, + "loss": 2.6894, + "step": 12507 + }, + { + "epoch": 0.370904130711977, + "grad_norm": 0.15747271478176117, + "learning_rate": 0.0007064655619006568, + "loss": 2.7092, + "step": 12508 + }, + { + "epoch": 0.37093378406428845, + "grad_norm": 0.14814983308315277, + "learning_rate": 0.0007064227087473564, + "loss": 2.6924, + "step": 12509 + }, + { + "epoch": 0.37096343741659993, + "grad_norm": 0.1269969344139099, + "learning_rate": 0.0007063798537661477, + "loss": 2.7119, + "step": 12510 + }, + { + "epoch": 0.3709930907689114, + "grad_norm": 0.14726832509040833, + "learning_rate": 0.0007063369969574099, + "loss": 2.646, + "step": 12511 + }, + { + "epoch": 0.3710227441212229, + "grad_norm": 0.1297677606344223, + "learning_rate": 0.0007062941383215224, + "loss": 2.6816, + "step": 12512 + }, + { + "epoch": 0.37105239747353436, + "grad_norm": 0.10801912844181061, + "learning_rate": 0.0007062512778588651, + "loss": 2.7295, + "step": 12513 + }, + { + "epoch": 0.3710820508258459, + "grad_norm": 0.12279608845710754, + "learning_rate": 0.0007062084155698173, + "loss": 2.6875, + "step": 12514 + }, + { + "epoch": 0.37111170417815736, + "grad_norm": 0.12454010546207428, + "learning_rate": 0.0007061655514547585, + "loss": 2.6884, + "step": 12515 + }, + { + "epoch": 0.37114135753046884, + "grad_norm": 0.11983701586723328, + "learning_rate": 0.0007061226855140685, + "loss": 2.6484, + "step": 12516 + }, + { + "epoch": 0.3711710108827803, + "grad_norm": 0.12128328531980515, + "learning_rate": 0.0007060798177481266, + "loss": 2.7028, + "step": 12517 + }, + { + "epoch": 0.3712006642350918, + "grad_norm": 0.12870468199253082, + "learning_rate": 0.0007060369481573126, + "loss": 2.6577, + "step": 12518 + }, + { + "epoch": 0.37123031758740327, + "grad_norm": 0.14437180757522583, + "learning_rate": 0.000705994076742006, + "loss": 2.7143, + "step": 12519 + }, + { + "epoch": 0.37125997093971475, + "grad_norm": 0.1548263132572174, + "learning_rate": 0.0007059512035025865, + "loss": 2.7188, + "step": 12520 + }, + { + "epoch": 0.3712896242920262, + "grad_norm": 0.15025976300239563, + "learning_rate": 0.0007059083284394338, + "loss": 2.683, + "step": 12521 + }, + { + "epoch": 0.3713192776443377, + "grad_norm": 0.14226140081882477, + "learning_rate": 0.0007058654515529276, + "loss": 2.6932, + "step": 12522 + }, + { + "epoch": 0.3713489309966492, + "grad_norm": 0.11881101131439209, + "learning_rate": 0.0007058225728434472, + "loss": 2.6846, + "step": 12523 + }, + { + "epoch": 0.37137858434896065, + "grad_norm": 0.11646575480699539, + "learning_rate": 0.0007057796923113727, + "loss": 2.6879, + "step": 12524 + }, + { + "epoch": 0.3714082377012721, + "grad_norm": 0.11124148219823837, + "learning_rate": 0.0007057368099570838, + "loss": 2.6944, + "step": 12525 + }, + { + "epoch": 0.3714378910535836, + "grad_norm": 0.09947595000267029, + "learning_rate": 0.0007056939257809602, + "loss": 2.7024, + "step": 12526 + }, + { + "epoch": 0.3714675444058951, + "grad_norm": 0.10712945461273193, + "learning_rate": 0.0007056510397833815, + "loss": 2.6975, + "step": 12527 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 0.11637038737535477, + "learning_rate": 0.0007056081519647275, + "loss": 2.7275, + "step": 12528 + }, + { + "epoch": 0.37152685111051803, + "grad_norm": 0.10957442969083786, + "learning_rate": 0.000705565262325378, + "loss": 2.6778, + "step": 12529 + }, + { + "epoch": 0.3715565044628295, + "grad_norm": 0.11504249274730682, + "learning_rate": 0.0007055223708657128, + "loss": 2.7055, + "step": 12530 + }, + { + "epoch": 0.371586157815141, + "grad_norm": 0.12389072775840759, + "learning_rate": 0.0007054794775861118, + "loss": 2.681, + "step": 12531 + }, + { + "epoch": 0.37161581116745246, + "grad_norm": 0.1286022663116455, + "learning_rate": 0.0007054365824869549, + "loss": 2.6794, + "step": 12532 + }, + { + "epoch": 0.37164546451976394, + "grad_norm": 0.12880370020866394, + "learning_rate": 0.0007053936855686216, + "loss": 2.7114, + "step": 12533 + }, + { + "epoch": 0.3716751178720754, + "grad_norm": 0.11553388088941574, + "learning_rate": 0.0007053507868314919, + "loss": 2.6971, + "step": 12534 + }, + { + "epoch": 0.37170477122438694, + "grad_norm": 0.12792743742465973, + "learning_rate": 0.0007053078862759459, + "loss": 2.6922, + "step": 12535 + }, + { + "epoch": 0.3717344245766984, + "grad_norm": 0.14532552659511566, + "learning_rate": 0.0007052649839023632, + "loss": 2.682, + "step": 12536 + }, + { + "epoch": 0.3717640779290099, + "grad_norm": 0.1520797163248062, + "learning_rate": 0.0007052220797111239, + "loss": 2.6697, + "step": 12537 + }, + { + "epoch": 0.37179373128132137, + "grad_norm": 0.13914629817008972, + "learning_rate": 0.0007051791737026079, + "loss": 2.7241, + "step": 12538 + }, + { + "epoch": 0.37182338463363285, + "grad_norm": 0.1317148357629776, + "learning_rate": 0.000705136265877195, + "loss": 2.7038, + "step": 12539 + }, + { + "epoch": 0.3718530379859443, + "grad_norm": 0.12807562947273254, + "learning_rate": 0.0007050933562352654, + "loss": 2.7124, + "step": 12540 + }, + { + "epoch": 0.3718826913382558, + "grad_norm": 0.11516687273979187, + "learning_rate": 0.0007050504447771988, + "loss": 2.6801, + "step": 12541 + }, + { + "epoch": 0.3719123446905673, + "grad_norm": 0.11807885766029358, + "learning_rate": 0.0007050075315033753, + "loss": 2.6739, + "step": 12542 + }, + { + "epoch": 0.37194199804287875, + "grad_norm": 0.12847495079040527, + "learning_rate": 0.000704964616414175, + "loss": 2.6966, + "step": 12543 + }, + { + "epoch": 0.37197165139519023, + "grad_norm": 0.1489812433719635, + "learning_rate": 0.0007049216995099779, + "loss": 2.6687, + "step": 12544 + }, + { + "epoch": 0.3720013047475017, + "grad_norm": 0.15739986300468445, + "learning_rate": 0.0007048787807911637, + "loss": 2.6906, + "step": 12545 + }, + { + "epoch": 0.3720309580998132, + "grad_norm": 0.16559049487113953, + "learning_rate": 0.000704835860258113, + "loss": 2.6758, + "step": 12546 + }, + { + "epoch": 0.37206061145212466, + "grad_norm": 0.1604478657245636, + "learning_rate": 0.0007047929379112055, + "loss": 2.7162, + "step": 12547 + }, + { + "epoch": 0.37209026480443613, + "grad_norm": 0.1560119092464447, + "learning_rate": 0.0007047500137508214, + "loss": 2.6871, + "step": 12548 + }, + { + "epoch": 0.3721199181567476, + "grad_norm": 0.13751013576984406, + "learning_rate": 0.0007047070877773407, + "loss": 2.68, + "step": 12549 + }, + { + "epoch": 0.3721495715090591, + "grad_norm": 0.11417040973901749, + "learning_rate": 0.0007046641599911437, + "loss": 2.7035, + "step": 12550 + }, + { + "epoch": 0.37217922486137056, + "grad_norm": 0.1145642027258873, + "learning_rate": 0.0007046212303926105, + "loss": 2.6866, + "step": 12551 + }, + { + "epoch": 0.37220887821368204, + "grad_norm": 0.13104699552059174, + "learning_rate": 0.000704578298982121, + "loss": 2.6583, + "step": 12552 + }, + { + "epoch": 0.3722385315659935, + "grad_norm": 0.1285112053155899, + "learning_rate": 0.0007045353657600558, + "loss": 2.7042, + "step": 12553 + }, + { + "epoch": 0.372268184918305, + "grad_norm": 0.11857575178146362, + "learning_rate": 0.0007044924307267945, + "loss": 2.7093, + "step": 12554 + }, + { + "epoch": 0.37229783827061647, + "grad_norm": 0.13537481427192688, + "learning_rate": 0.0007044494938827178, + "loss": 2.6908, + "step": 12555 + }, + { + "epoch": 0.372327491622928, + "grad_norm": 0.127435564994812, + "learning_rate": 0.0007044065552282057, + "loss": 2.6923, + "step": 12556 + }, + { + "epoch": 0.3723571449752395, + "grad_norm": 0.12471667677164078, + "learning_rate": 0.0007043636147636385, + "loss": 2.662, + "step": 12557 + }, + { + "epoch": 0.37238679832755095, + "grad_norm": 0.14129069447517395, + "learning_rate": 0.0007043206724893965, + "loss": 2.7022, + "step": 12558 + }, + { + "epoch": 0.3724164516798624, + "grad_norm": 0.13226792216300964, + "learning_rate": 0.0007042777284058599, + "loss": 2.6556, + "step": 12559 + }, + { + "epoch": 0.3724461050321739, + "grad_norm": 0.12410301715135574, + "learning_rate": 0.0007042347825134088, + "loss": 2.6941, + "step": 12560 + }, + { + "epoch": 0.3724757583844854, + "grad_norm": 0.1236988827586174, + "learning_rate": 0.0007041918348124236, + "loss": 2.7074, + "step": 12561 + }, + { + "epoch": 0.37250541173679685, + "grad_norm": 0.1423174887895584, + "learning_rate": 0.0007041488853032847, + "loss": 2.7171, + "step": 12562 + }, + { + "epoch": 0.37253506508910833, + "grad_norm": 0.16309311985969543, + "learning_rate": 0.0007041059339863724, + "loss": 2.6879, + "step": 12563 + }, + { + "epoch": 0.3725647184414198, + "grad_norm": 0.15677893161773682, + "learning_rate": 0.0007040629808620671, + "loss": 2.6852, + "step": 12564 + }, + { + "epoch": 0.3725943717937313, + "grad_norm": 0.16193099319934845, + "learning_rate": 0.000704020025930749, + "loss": 2.73, + "step": 12565 + }, + { + "epoch": 0.37262402514604276, + "grad_norm": 0.14465513825416565, + "learning_rate": 0.0007039770691927986, + "loss": 2.6788, + "step": 12566 + }, + { + "epoch": 0.37265367849835423, + "grad_norm": 0.1317552924156189, + "learning_rate": 0.0007039341106485961, + "loss": 2.6991, + "step": 12567 + }, + { + "epoch": 0.3726833318506657, + "grad_norm": 0.12689636647701263, + "learning_rate": 0.0007038911502985221, + "loss": 2.6821, + "step": 12568 + }, + { + "epoch": 0.3727129852029772, + "grad_norm": 0.14239229261875153, + "learning_rate": 0.000703848188142957, + "loss": 2.7139, + "step": 12569 + }, + { + "epoch": 0.37274263855528866, + "grad_norm": 0.16195380687713623, + "learning_rate": 0.0007038052241822814, + "loss": 2.7485, + "step": 12570 + }, + { + "epoch": 0.37277229190760014, + "grad_norm": 0.15137436985969543, + "learning_rate": 0.0007037622584168754, + "loss": 2.7078, + "step": 12571 + }, + { + "epoch": 0.3728019452599116, + "grad_norm": 0.1401340216398239, + "learning_rate": 0.0007037192908471195, + "loss": 2.6721, + "step": 12572 + }, + { + "epoch": 0.3728315986122231, + "grad_norm": 0.1388619989156723, + "learning_rate": 0.0007036763214733944, + "loss": 2.6915, + "step": 12573 + }, + { + "epoch": 0.37286125196453457, + "grad_norm": 0.13512668013572693, + "learning_rate": 0.0007036333502960804, + "loss": 2.7017, + "step": 12574 + }, + { + "epoch": 0.37289090531684604, + "grad_norm": 0.12934516370296478, + "learning_rate": 0.0007035903773155584, + "loss": 2.6851, + "step": 12575 + }, + { + "epoch": 0.3729205586691576, + "grad_norm": 0.14595231413841248, + "learning_rate": 0.0007035474025322086, + "loss": 2.73, + "step": 12576 + }, + { + "epoch": 0.37295021202146905, + "grad_norm": 0.14878690242767334, + "learning_rate": 0.0007035044259464115, + "loss": 2.6927, + "step": 12577 + }, + { + "epoch": 0.37297986537378053, + "grad_norm": 0.15476468205451965, + "learning_rate": 0.0007034614475585478, + "loss": 2.6912, + "step": 12578 + }, + { + "epoch": 0.373009518726092, + "grad_norm": 0.15186135470867157, + "learning_rate": 0.000703418467368998, + "loss": 2.7048, + "step": 12579 + }, + { + "epoch": 0.3730391720784035, + "grad_norm": 0.13037146627902985, + "learning_rate": 0.0007033754853781429, + "loss": 2.7346, + "step": 12580 + }, + { + "epoch": 0.37306882543071496, + "grad_norm": 0.12294259667396545, + "learning_rate": 0.0007033325015863628, + "loss": 2.7135, + "step": 12581 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 0.1315775066614151, + "learning_rate": 0.0007032895159940387, + "loss": 2.6913, + "step": 12582 + }, + { + "epoch": 0.3731281321353379, + "grad_norm": 0.13415993750095367, + "learning_rate": 0.0007032465286015508, + "loss": 2.7124, + "step": 12583 + }, + { + "epoch": 0.3731577854876494, + "grad_norm": 0.09956087172031403, + "learning_rate": 0.0007032035394092802, + "loss": 2.6546, + "step": 12584 + }, + { + "epoch": 0.37318743883996086, + "grad_norm": 0.11340435594320297, + "learning_rate": 0.0007031605484176073, + "loss": 2.7212, + "step": 12585 + }, + { + "epoch": 0.37321709219227234, + "grad_norm": 0.11626876145601273, + "learning_rate": 0.0007031175556269129, + "loss": 2.7062, + "step": 12586 + }, + { + "epoch": 0.3732467455445838, + "grad_norm": 0.111286461353302, + "learning_rate": 0.0007030745610375777, + "loss": 2.7032, + "step": 12587 + }, + { + "epoch": 0.3732763988968953, + "grad_norm": 0.1252567619085312, + "learning_rate": 0.0007030315646499823, + "loss": 2.7086, + "step": 12588 + }, + { + "epoch": 0.37330605224920677, + "grad_norm": 0.11120111495256424, + "learning_rate": 0.0007029885664645076, + "loss": 2.6951, + "step": 12589 + }, + { + "epoch": 0.37333570560151824, + "grad_norm": 0.12252108007669449, + "learning_rate": 0.0007029455664815343, + "loss": 2.6943, + "step": 12590 + }, + { + "epoch": 0.3733653589538297, + "grad_norm": 0.11772345751523972, + "learning_rate": 0.0007029025647014432, + "loss": 2.6904, + "step": 12591 + }, + { + "epoch": 0.3733950123061412, + "grad_norm": 0.1191711276769638, + "learning_rate": 0.0007028595611246151, + "loss": 2.6895, + "step": 12592 + }, + { + "epoch": 0.37342466565845267, + "grad_norm": 0.13087745010852814, + "learning_rate": 0.0007028165557514308, + "loss": 2.6928, + "step": 12593 + }, + { + "epoch": 0.37345431901076415, + "grad_norm": 0.12400483340024948, + "learning_rate": 0.0007027735485822708, + "loss": 2.7114, + "step": 12594 + }, + { + "epoch": 0.3734839723630756, + "grad_norm": 0.12790754437446594, + "learning_rate": 0.0007027305396175165, + "loss": 2.6918, + "step": 12595 + }, + { + "epoch": 0.3735136257153871, + "grad_norm": 0.1412445306777954, + "learning_rate": 0.0007026875288575484, + "loss": 2.6753, + "step": 12596 + }, + { + "epoch": 0.37354327906769863, + "grad_norm": 0.16395707428455353, + "learning_rate": 0.0007026445163027475, + "loss": 2.7037, + "step": 12597 + }, + { + "epoch": 0.3735729324200101, + "grad_norm": 0.18516957759857178, + "learning_rate": 0.0007026015019534946, + "loss": 2.7113, + "step": 12598 + }, + { + "epoch": 0.3736025857723216, + "grad_norm": 0.20811223983764648, + "learning_rate": 0.0007025584858101706, + "loss": 2.7118, + "step": 12599 + }, + { + "epoch": 0.37363223912463306, + "grad_norm": 0.17420785129070282, + "learning_rate": 0.0007025154678731563, + "loss": 2.7228, + "step": 12600 + }, + { + "epoch": 0.37366189247694453, + "grad_norm": 0.1564183384180069, + "learning_rate": 0.0007024724481428328, + "loss": 2.7181, + "step": 12601 + }, + { + "epoch": 0.373691545829256, + "grad_norm": 0.15588997304439545, + "learning_rate": 0.0007024294266195812, + "loss": 2.7151, + "step": 12602 + }, + { + "epoch": 0.3737211991815675, + "grad_norm": 0.1526336967945099, + "learning_rate": 0.0007023864033037822, + "loss": 2.6798, + "step": 12603 + }, + { + "epoch": 0.37375085253387896, + "grad_norm": 0.1609184741973877, + "learning_rate": 0.0007023433781958168, + "loss": 2.7063, + "step": 12604 + }, + { + "epoch": 0.37378050588619044, + "grad_norm": 0.14421267807483673, + "learning_rate": 0.0007023003512960661, + "loss": 2.7225, + "step": 12605 + }, + { + "epoch": 0.3738101592385019, + "grad_norm": 0.13619858026504517, + "learning_rate": 0.0007022573226049112, + "loss": 2.7063, + "step": 12606 + }, + { + "epoch": 0.3738398125908134, + "grad_norm": 0.15832079946994781, + "learning_rate": 0.0007022142921227328, + "loss": 2.6547, + "step": 12607 + }, + { + "epoch": 0.37386946594312487, + "grad_norm": 0.14592663943767548, + "learning_rate": 0.0007021712598499122, + "loss": 2.6859, + "step": 12608 + }, + { + "epoch": 0.37389911929543634, + "grad_norm": 0.13699862360954285, + "learning_rate": 0.0007021282257868304, + "loss": 2.6976, + "step": 12609 + }, + { + "epoch": 0.3739287726477478, + "grad_norm": 0.11869127303361893, + "learning_rate": 0.0007020851899338685, + "loss": 2.6791, + "step": 12610 + }, + { + "epoch": 0.3739584260000593, + "grad_norm": 0.13133005797863007, + "learning_rate": 0.0007020421522914074, + "loss": 2.7241, + "step": 12611 + }, + { + "epoch": 0.3739880793523708, + "grad_norm": 0.1407025158405304, + "learning_rate": 0.0007019991128598285, + "loss": 2.6722, + "step": 12612 + }, + { + "epoch": 0.37401773270468225, + "grad_norm": 0.15545892715454102, + "learning_rate": 0.0007019560716395128, + "loss": 2.6764, + "step": 12613 + }, + { + "epoch": 0.3740473860569937, + "grad_norm": 0.1295865923166275, + "learning_rate": 0.0007019130286308414, + "loss": 2.724, + "step": 12614 + }, + { + "epoch": 0.3740770394093052, + "grad_norm": 0.1126253753900528, + "learning_rate": 0.0007018699838341955, + "loss": 2.7032, + "step": 12615 + }, + { + "epoch": 0.3741066927616167, + "grad_norm": 0.14094921946525574, + "learning_rate": 0.0007018269372499562, + "loss": 2.7004, + "step": 12616 + }, + { + "epoch": 0.37413634611392815, + "grad_norm": 0.12843725085258484, + "learning_rate": 0.0007017838888785046, + "loss": 2.7358, + "step": 12617 + }, + { + "epoch": 0.3741659994662397, + "grad_norm": 0.14020609855651855, + "learning_rate": 0.0007017408387202222, + "loss": 2.704, + "step": 12618 + }, + { + "epoch": 0.37419565281855116, + "grad_norm": 0.13667131960391998, + "learning_rate": 0.0007016977867754899, + "loss": 2.6977, + "step": 12619 + }, + { + "epoch": 0.37422530617086264, + "grad_norm": 0.12138042598962784, + "learning_rate": 0.0007016547330446892, + "loss": 2.6571, + "step": 12620 + }, + { + "epoch": 0.3742549595231741, + "grad_norm": 0.11753318458795547, + "learning_rate": 0.0007016116775282012, + "loss": 2.7133, + "step": 12621 + }, + { + "epoch": 0.3742846128754856, + "grad_norm": 0.1518828421831131, + "learning_rate": 0.0007015686202264072, + "loss": 2.7121, + "step": 12622 + }, + { + "epoch": 0.37431426622779707, + "grad_norm": 0.14648830890655518, + "learning_rate": 0.0007015255611396885, + "loss": 2.7122, + "step": 12623 + }, + { + "epoch": 0.37434391958010854, + "grad_norm": 0.15163761377334595, + "learning_rate": 0.0007014825002684262, + "loss": 2.6791, + "step": 12624 + }, + { + "epoch": 0.37437357293242, + "grad_norm": 0.1338958442211151, + "learning_rate": 0.000701439437613002, + "loss": 2.7112, + "step": 12625 + }, + { + "epoch": 0.3744032262847315, + "grad_norm": 0.1264297515153885, + "learning_rate": 0.000701396373173797, + "loss": 2.7029, + "step": 12626 + }, + { + "epoch": 0.37443287963704297, + "grad_norm": 0.13345268368721008, + "learning_rate": 0.0007013533069511923, + "loss": 2.7203, + "step": 12627 + }, + { + "epoch": 0.37446253298935445, + "grad_norm": 0.1256655603647232, + "learning_rate": 0.0007013102389455696, + "loss": 2.7353, + "step": 12628 + }, + { + "epoch": 0.3744921863416659, + "grad_norm": 0.12946385145187378, + "learning_rate": 0.0007012671691573102, + "loss": 2.6959, + "step": 12629 + }, + { + "epoch": 0.3745218396939774, + "grad_norm": 0.12107854336500168, + "learning_rate": 0.0007012240975867956, + "loss": 2.6924, + "step": 12630 + }, + { + "epoch": 0.3745514930462889, + "grad_norm": 0.16295108199119568, + "learning_rate": 0.000701181024234407, + "loss": 2.7008, + "step": 12631 + }, + { + "epoch": 0.37458114639860035, + "grad_norm": 0.17843571305274963, + "learning_rate": 0.0007011379491005258, + "loss": 2.709, + "step": 12632 + }, + { + "epoch": 0.3746107997509118, + "grad_norm": 0.1546166092157364, + "learning_rate": 0.0007010948721855336, + "loss": 2.7284, + "step": 12633 + }, + { + "epoch": 0.3746404531032233, + "grad_norm": 0.1198478490114212, + "learning_rate": 0.0007010517934898118, + "loss": 2.6669, + "step": 12634 + }, + { + "epoch": 0.3746701064555348, + "grad_norm": 0.15241043269634247, + "learning_rate": 0.0007010087130137419, + "loss": 2.6979, + "step": 12635 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 0.1627461463212967, + "learning_rate": 0.0007009656307577054, + "loss": 2.7099, + "step": 12636 + }, + { + "epoch": 0.37472941316015773, + "grad_norm": 0.13620947301387787, + "learning_rate": 0.0007009225467220836, + "loss": 2.7127, + "step": 12637 + }, + { + "epoch": 0.3747590665124692, + "grad_norm": 0.11277705430984497, + "learning_rate": 0.0007008794609072582, + "loss": 2.685, + "step": 12638 + }, + { + "epoch": 0.37478871986478074, + "grad_norm": 0.12084994465112686, + "learning_rate": 0.0007008363733136106, + "loss": 2.681, + "step": 12639 + }, + { + "epoch": 0.3748183732170922, + "grad_norm": 0.13084939122200012, + "learning_rate": 0.0007007932839415226, + "loss": 2.7382, + "step": 12640 + }, + { + "epoch": 0.3748480265694037, + "grad_norm": 0.13427786529064178, + "learning_rate": 0.0007007501927913755, + "loss": 2.6956, + "step": 12641 + }, + { + "epoch": 0.37487767992171517, + "grad_norm": 0.12470145523548126, + "learning_rate": 0.0007007070998635511, + "loss": 2.7074, + "step": 12642 + }, + { + "epoch": 0.37490733327402664, + "grad_norm": 0.1300458461046219, + "learning_rate": 0.0007006640051584308, + "loss": 2.6903, + "step": 12643 + }, + { + "epoch": 0.3749369866263381, + "grad_norm": 0.12566784024238586, + "learning_rate": 0.0007006209086763963, + "loss": 2.6796, + "step": 12644 + }, + { + "epoch": 0.3749666399786496, + "grad_norm": 0.12184654921293259, + "learning_rate": 0.0007005778104178292, + "loss": 2.6842, + "step": 12645 + }, + { + "epoch": 0.3749962933309611, + "grad_norm": 0.1438111960887909, + "learning_rate": 0.0007005347103831112, + "loss": 2.6797, + "step": 12646 + }, + { + "epoch": 0.37502594668327255, + "grad_norm": 0.1594569832086563, + "learning_rate": 0.0007004916085726239, + "loss": 2.7073, + "step": 12647 + }, + { + "epoch": 0.375055600035584, + "grad_norm": 0.16427096724510193, + "learning_rate": 0.0007004485049867489, + "loss": 2.6884, + "step": 12648 + }, + { + "epoch": 0.3750852533878955, + "grad_norm": 0.12925617396831512, + "learning_rate": 0.0007004053996258682, + "loss": 2.6871, + "step": 12649 + }, + { + "epoch": 0.375114906740207, + "grad_norm": 0.11521384119987488, + "learning_rate": 0.0007003622924903631, + "loss": 2.7171, + "step": 12650 + }, + { + "epoch": 0.37514456009251845, + "grad_norm": 0.13279548287391663, + "learning_rate": 0.0007003191835806155, + "loss": 2.7192, + "step": 12651 + }, + { + "epoch": 0.37517421344482993, + "grad_norm": 0.1601591855287552, + "learning_rate": 0.0007002760728970072, + "loss": 2.7116, + "step": 12652 + }, + { + "epoch": 0.3752038667971414, + "grad_norm": 0.12765902280807495, + "learning_rate": 0.0007002329604399199, + "loss": 2.7083, + "step": 12653 + }, + { + "epoch": 0.3752335201494529, + "grad_norm": 0.12221627682447433, + "learning_rate": 0.0007001898462097354, + "loss": 2.676, + "step": 12654 + }, + { + "epoch": 0.37526317350176436, + "grad_norm": 0.14404283463954926, + "learning_rate": 0.0007001467302068354, + "loss": 2.6992, + "step": 12655 + }, + { + "epoch": 0.37529282685407583, + "grad_norm": 0.12971141934394836, + "learning_rate": 0.0007001036124316018, + "loss": 2.687, + "step": 12656 + }, + { + "epoch": 0.3753224802063873, + "grad_norm": 0.126461923122406, + "learning_rate": 0.0007000604928844163, + "loss": 2.689, + "step": 12657 + }, + { + "epoch": 0.3753521335586988, + "grad_norm": 0.12082094699144363, + "learning_rate": 0.0007000173715656608, + "loss": 2.6802, + "step": 12658 + }, + { + "epoch": 0.37538178691101026, + "grad_norm": 0.12909114360809326, + "learning_rate": 0.0006999742484757172, + "loss": 2.7069, + "step": 12659 + }, + { + "epoch": 0.3754114402633218, + "grad_norm": 0.15400837361812592, + "learning_rate": 0.0006999311236149672, + "loss": 2.6867, + "step": 12660 + }, + { + "epoch": 0.37544109361563327, + "grad_norm": 0.16983410716056824, + "learning_rate": 0.000699887996983793, + "loss": 2.6734, + "step": 12661 + }, + { + "epoch": 0.37547074696794475, + "grad_norm": 0.15191896259784698, + "learning_rate": 0.0006998448685825761, + "loss": 2.6728, + "step": 12662 + }, + { + "epoch": 0.3755004003202562, + "grad_norm": 0.15388132631778717, + "learning_rate": 0.0006998017384116987, + "loss": 2.6968, + "step": 12663 + }, + { + "epoch": 0.3755300536725677, + "grad_norm": 0.14346522092819214, + "learning_rate": 0.0006997586064715426, + "loss": 2.6873, + "step": 12664 + }, + { + "epoch": 0.3755597070248792, + "grad_norm": 0.14499612152576447, + "learning_rate": 0.0006997154727624895, + "loss": 2.6615, + "step": 12665 + }, + { + "epoch": 0.37558936037719065, + "grad_norm": 0.12361830472946167, + "learning_rate": 0.0006996723372849218, + "loss": 2.7095, + "step": 12666 + }, + { + "epoch": 0.3756190137295021, + "grad_norm": 0.13337565958499908, + "learning_rate": 0.0006996292000392213, + "loss": 2.6863, + "step": 12667 + }, + { + "epoch": 0.3756486670818136, + "grad_norm": 0.1433238834142685, + "learning_rate": 0.00069958606102577, + "loss": 2.7161, + "step": 12668 + }, + { + "epoch": 0.3756783204341251, + "grad_norm": 0.12826849520206451, + "learning_rate": 0.0006995429202449498, + "loss": 2.6684, + "step": 12669 + }, + { + "epoch": 0.37570797378643656, + "grad_norm": 0.14382639527320862, + "learning_rate": 0.0006994997776971428, + "loss": 2.7043, + "step": 12670 + }, + { + "epoch": 0.37573762713874803, + "grad_norm": 0.14512480795383453, + "learning_rate": 0.000699456633382731, + "loss": 2.66, + "step": 12671 + }, + { + "epoch": 0.3757672804910595, + "grad_norm": 0.10863585025072098, + "learning_rate": 0.0006994134873020965, + "loss": 2.7065, + "step": 12672 + }, + { + "epoch": 0.375796933843371, + "grad_norm": 0.1347842663526535, + "learning_rate": 0.0006993703394556214, + "loss": 2.712, + "step": 12673 + }, + { + "epoch": 0.37582658719568246, + "grad_norm": 0.10333802551031113, + "learning_rate": 0.0006993271898436877, + "loss": 2.7136, + "step": 12674 + }, + { + "epoch": 0.37585624054799394, + "grad_norm": 0.13236111402511597, + "learning_rate": 0.0006992840384666774, + "loss": 2.6712, + "step": 12675 + }, + { + "epoch": 0.3758858939003054, + "grad_norm": 0.11688944697380066, + "learning_rate": 0.0006992408853249729, + "loss": 2.7083, + "step": 12676 + }, + { + "epoch": 0.3759155472526169, + "grad_norm": 0.13199201226234436, + "learning_rate": 0.000699197730418956, + "loss": 2.7028, + "step": 12677 + }, + { + "epoch": 0.37594520060492836, + "grad_norm": 0.14077652990818024, + "learning_rate": 0.000699154573749009, + "loss": 2.698, + "step": 12678 + }, + { + "epoch": 0.37597485395723984, + "grad_norm": 0.13752292096614838, + "learning_rate": 0.0006991114153155143, + "loss": 2.6927, + "step": 12679 + }, + { + "epoch": 0.3760045073095514, + "grad_norm": 0.14869779348373413, + "learning_rate": 0.0006990682551188536, + "loss": 2.7071, + "step": 12680 + }, + { + "epoch": 0.37603416066186285, + "grad_norm": 0.17709572613239288, + "learning_rate": 0.0006990250931594096, + "loss": 2.708, + "step": 12681 + }, + { + "epoch": 0.3760638140141743, + "grad_norm": 0.18264508247375488, + "learning_rate": 0.0006989819294375639, + "loss": 2.6869, + "step": 12682 + }, + { + "epoch": 0.3760934673664858, + "grad_norm": 0.14294975996017456, + "learning_rate": 0.0006989387639536993, + "loss": 2.6977, + "step": 12683 + }, + { + "epoch": 0.3761231207187973, + "grad_norm": 0.11888759583234787, + "learning_rate": 0.0006988955967081977, + "loss": 2.7185, + "step": 12684 + }, + { + "epoch": 0.37615277407110875, + "grad_norm": 0.14008815586566925, + "learning_rate": 0.0006988524277014415, + "loss": 2.6976, + "step": 12685 + }, + { + "epoch": 0.37618242742342023, + "grad_norm": 0.1591191440820694, + "learning_rate": 0.0006988092569338128, + "loss": 2.6804, + "step": 12686 + }, + { + "epoch": 0.3762120807757317, + "grad_norm": 0.1380414068698883, + "learning_rate": 0.0006987660844056941, + "loss": 2.73, + "step": 12687 + }, + { + "epoch": 0.3762417341280432, + "grad_norm": 0.12650758028030396, + "learning_rate": 0.0006987229101174676, + "loss": 2.6985, + "step": 12688 + }, + { + "epoch": 0.37627138748035466, + "grad_norm": 0.14042875170707703, + "learning_rate": 0.0006986797340695157, + "loss": 2.7009, + "step": 12689 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 0.15487973392009735, + "learning_rate": 0.0006986365562622205, + "loss": 2.7516, + "step": 12690 + }, + { + "epoch": 0.3763306941849776, + "grad_norm": 0.12965354323387146, + "learning_rate": 0.0006985933766959645, + "loss": 2.7369, + "step": 12691 + }, + { + "epoch": 0.3763603475372891, + "grad_norm": 0.11938483268022537, + "learning_rate": 0.0006985501953711302, + "loss": 2.7196, + "step": 12692 + }, + { + "epoch": 0.37639000088960056, + "grad_norm": 0.13071602582931519, + "learning_rate": 0.0006985070122880998, + "loss": 2.6779, + "step": 12693 + }, + { + "epoch": 0.37641965424191204, + "grad_norm": 0.13417406380176544, + "learning_rate": 0.0006984638274472556, + "loss": 2.6936, + "step": 12694 + }, + { + "epoch": 0.3764493075942235, + "grad_norm": 0.13229864835739136, + "learning_rate": 0.0006984206408489804, + "loss": 2.6707, + "step": 12695 + }, + { + "epoch": 0.376478960946535, + "grad_norm": 0.12851859629154205, + "learning_rate": 0.0006983774524936563, + "loss": 2.7195, + "step": 12696 + }, + { + "epoch": 0.37650861429884647, + "grad_norm": 0.12484963983297348, + "learning_rate": 0.0006983342623816655, + "loss": 2.6598, + "step": 12697 + }, + { + "epoch": 0.37653826765115794, + "grad_norm": 0.11695152521133423, + "learning_rate": 0.000698291070513391, + "loss": 2.7272, + "step": 12698 + }, + { + "epoch": 0.3765679210034694, + "grad_norm": 0.12373955547809601, + "learning_rate": 0.0006982478768892151, + "loss": 2.7375, + "step": 12699 + }, + { + "epoch": 0.3765975743557809, + "grad_norm": 0.14035801589488983, + "learning_rate": 0.00069820468150952, + "loss": 2.7007, + "step": 12700 + }, + { + "epoch": 0.3766272277080924, + "grad_norm": 0.15560755133628845, + "learning_rate": 0.0006981614843746888, + "loss": 2.6989, + "step": 12701 + }, + { + "epoch": 0.3766568810604039, + "grad_norm": 0.1382312774658203, + "learning_rate": 0.0006981182854851034, + "loss": 2.6605, + "step": 12702 + }, + { + "epoch": 0.3766865344127154, + "grad_norm": 0.11509452760219574, + "learning_rate": 0.0006980750848411465, + "loss": 2.701, + "step": 12703 + }, + { + "epoch": 0.37671618776502686, + "grad_norm": 0.12401776760816574, + "learning_rate": 0.0006980318824432008, + "loss": 2.7013, + "step": 12704 + }, + { + "epoch": 0.37674584111733833, + "grad_norm": 0.13331404328346252, + "learning_rate": 0.0006979886782916487, + "loss": 2.7045, + "step": 12705 + }, + { + "epoch": 0.3767754944696498, + "grad_norm": 0.1438903510570526, + "learning_rate": 0.000697945472386873, + "loss": 2.6965, + "step": 12706 + }, + { + "epoch": 0.3768051478219613, + "grad_norm": 0.12877893447875977, + "learning_rate": 0.0006979022647292562, + "loss": 2.6722, + "step": 12707 + }, + { + "epoch": 0.37683480117427276, + "grad_norm": 0.1343892365694046, + "learning_rate": 0.0006978590553191808, + "loss": 2.6821, + "step": 12708 + }, + { + "epoch": 0.37686445452658424, + "grad_norm": 0.124875009059906, + "learning_rate": 0.0006978158441570295, + "loss": 2.6813, + "step": 12709 + }, + { + "epoch": 0.3768941078788957, + "grad_norm": 0.12674380838871002, + "learning_rate": 0.0006977726312431849, + "loss": 2.7105, + "step": 12710 + }, + { + "epoch": 0.3769237612312072, + "grad_norm": 0.1311209797859192, + "learning_rate": 0.0006977294165780298, + "loss": 2.673, + "step": 12711 + }, + { + "epoch": 0.37695341458351866, + "grad_norm": 0.12060048431158066, + "learning_rate": 0.0006976862001619467, + "loss": 2.673, + "step": 12712 + }, + { + "epoch": 0.37698306793583014, + "grad_norm": 0.12983094155788422, + "learning_rate": 0.0006976429819953183, + "loss": 2.698, + "step": 12713 + }, + { + "epoch": 0.3770127212881416, + "grad_norm": 0.13995832204818726, + "learning_rate": 0.0006975997620785276, + "loss": 2.686, + "step": 12714 + }, + { + "epoch": 0.3770423746404531, + "grad_norm": 0.13072073459625244, + "learning_rate": 0.0006975565404119569, + "loss": 2.6994, + "step": 12715 + }, + { + "epoch": 0.37707202799276457, + "grad_norm": 0.12860891222953796, + "learning_rate": 0.0006975133169959892, + "loss": 2.7164, + "step": 12716 + }, + { + "epoch": 0.37710168134507605, + "grad_norm": 0.12274423986673355, + "learning_rate": 0.0006974700918310072, + "loss": 2.7085, + "step": 12717 + }, + { + "epoch": 0.3771313346973875, + "grad_norm": 0.12567245960235596, + "learning_rate": 0.0006974268649173936, + "loss": 2.673, + "step": 12718 + }, + { + "epoch": 0.377160988049699, + "grad_norm": 0.11244115233421326, + "learning_rate": 0.0006973836362555311, + "loss": 2.6802, + "step": 12719 + }, + { + "epoch": 0.3771906414020105, + "grad_norm": 0.1115075871348381, + "learning_rate": 0.0006973404058458028, + "loss": 2.6624, + "step": 12720 + }, + { + "epoch": 0.37722029475432195, + "grad_norm": 0.11536287516355515, + "learning_rate": 0.0006972971736885912, + "loss": 2.6916, + "step": 12721 + }, + { + "epoch": 0.3772499481066335, + "grad_norm": 0.11853083968162537, + "learning_rate": 0.0006972539397842795, + "loss": 2.7295, + "step": 12722 + }, + { + "epoch": 0.37727960145894496, + "grad_norm": 0.11583499610424042, + "learning_rate": 0.00069721070413325, + "loss": 2.6737, + "step": 12723 + }, + { + "epoch": 0.37730925481125643, + "grad_norm": 0.135806605219841, + "learning_rate": 0.000697167466735886, + "loss": 2.7296, + "step": 12724 + }, + { + "epoch": 0.3773389081635679, + "grad_norm": 0.1758502572774887, + "learning_rate": 0.0006971242275925704, + "loss": 2.6861, + "step": 12725 + }, + { + "epoch": 0.3773685615158794, + "grad_norm": 0.15810683369636536, + "learning_rate": 0.0006970809867036856, + "loss": 2.6989, + "step": 12726 + }, + { + "epoch": 0.37739821486819086, + "grad_norm": 0.1522175371646881, + "learning_rate": 0.000697037744069615, + "loss": 2.7101, + "step": 12727 + }, + { + "epoch": 0.37742786822050234, + "grad_norm": 0.1627587229013443, + "learning_rate": 0.0006969944996907416, + "loss": 2.719, + "step": 12728 + }, + { + "epoch": 0.3774575215728138, + "grad_norm": 0.17805209755897522, + "learning_rate": 0.0006969512535674479, + "loss": 2.7084, + "step": 12729 + }, + { + "epoch": 0.3774871749251253, + "grad_norm": 0.16707009077072144, + "learning_rate": 0.0006969080057001168, + "loss": 2.6963, + "step": 12730 + }, + { + "epoch": 0.37751682827743677, + "grad_norm": 0.1443750560283661, + "learning_rate": 0.0006968647560891317, + "loss": 2.6968, + "step": 12731 + }, + { + "epoch": 0.37754648162974824, + "grad_norm": 0.12579353153705597, + "learning_rate": 0.0006968215047348753, + "loss": 2.6689, + "step": 12732 + }, + { + "epoch": 0.3775761349820597, + "grad_norm": 0.14392384886741638, + "learning_rate": 0.0006967782516377309, + "loss": 2.6965, + "step": 12733 + }, + { + "epoch": 0.3776057883343712, + "grad_norm": 0.12937849760055542, + "learning_rate": 0.0006967349967980813, + "loss": 2.691, + "step": 12734 + }, + { + "epoch": 0.37763544168668267, + "grad_norm": 0.14200997352600098, + "learning_rate": 0.0006966917402163093, + "loss": 2.6647, + "step": 12735 + }, + { + "epoch": 0.37766509503899415, + "grad_norm": 0.14578792452812195, + "learning_rate": 0.0006966484818927983, + "loss": 2.6971, + "step": 12736 + }, + { + "epoch": 0.3776947483913056, + "grad_norm": 0.12933745980262756, + "learning_rate": 0.0006966052218279313, + "loss": 2.6962, + "step": 12737 + }, + { + "epoch": 0.3777244017436171, + "grad_norm": 0.15652300417423248, + "learning_rate": 0.0006965619600220912, + "loss": 2.6671, + "step": 12738 + }, + { + "epoch": 0.3777540550959286, + "grad_norm": 0.14168916642665863, + "learning_rate": 0.0006965186964756614, + "loss": 2.7018, + "step": 12739 + }, + { + "epoch": 0.37778370844824005, + "grad_norm": 0.12074583023786545, + "learning_rate": 0.0006964754311890247, + "loss": 2.6997, + "step": 12740 + }, + { + "epoch": 0.37781336180055153, + "grad_norm": 0.11237810552120209, + "learning_rate": 0.0006964321641625643, + "loss": 2.7042, + "step": 12741 + }, + { + "epoch": 0.377843015152863, + "grad_norm": 0.1133054569363594, + "learning_rate": 0.0006963888953966633, + "loss": 2.6928, + "step": 12742 + }, + { + "epoch": 0.37787266850517454, + "grad_norm": 0.11663765460252762, + "learning_rate": 0.0006963456248917049, + "loss": 2.6957, + "step": 12743 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 0.121483713388443, + "learning_rate": 0.0006963023526480725, + "loss": 2.6762, + "step": 12744 + }, + { + "epoch": 0.3779319752097975, + "grad_norm": 0.11034536361694336, + "learning_rate": 0.000696259078666149, + "loss": 2.671, + "step": 12745 + }, + { + "epoch": 0.37796162856210896, + "grad_norm": 0.11505523324012756, + "learning_rate": 0.0006962158029463175, + "loss": 2.6977, + "step": 12746 + }, + { + "epoch": 0.37799128191442044, + "grad_norm": 0.11409887671470642, + "learning_rate": 0.0006961725254889616, + "loss": 2.7073, + "step": 12747 + }, + { + "epoch": 0.3780209352667319, + "grad_norm": 0.1025494635105133, + "learning_rate": 0.0006961292462944643, + "loss": 2.7035, + "step": 12748 + }, + { + "epoch": 0.3780505886190434, + "grad_norm": 0.10229341685771942, + "learning_rate": 0.0006960859653632088, + "loss": 2.6762, + "step": 12749 + }, + { + "epoch": 0.37808024197135487, + "grad_norm": 0.11484333127737045, + "learning_rate": 0.0006960426826955784, + "loss": 2.7131, + "step": 12750 + }, + { + "epoch": 0.37810989532366635, + "grad_norm": 0.10824193060398102, + "learning_rate": 0.0006959993982919564, + "loss": 2.7191, + "step": 12751 + }, + { + "epoch": 0.3781395486759778, + "grad_norm": 0.11651889979839325, + "learning_rate": 0.000695956112152726, + "loss": 2.7158, + "step": 12752 + }, + { + "epoch": 0.3781692020282893, + "grad_norm": 0.1117197796702385, + "learning_rate": 0.0006959128242782708, + "loss": 2.7207, + "step": 12753 + }, + { + "epoch": 0.3781988553806008, + "grad_norm": 0.11097689718008041, + "learning_rate": 0.0006958695346689737, + "loss": 2.6774, + "step": 12754 + }, + { + "epoch": 0.37822850873291225, + "grad_norm": 0.1266440749168396, + "learning_rate": 0.0006958262433252183, + "loss": 2.6867, + "step": 12755 + }, + { + "epoch": 0.3782581620852237, + "grad_norm": 0.1426561027765274, + "learning_rate": 0.000695782950247388, + "loss": 2.6898, + "step": 12756 + }, + { + "epoch": 0.3782878154375352, + "grad_norm": 0.13533154129981995, + "learning_rate": 0.0006957396554358661, + "loss": 2.6633, + "step": 12757 + }, + { + "epoch": 0.3783174687898467, + "grad_norm": 0.1387983113527298, + "learning_rate": 0.0006956963588910358, + "loss": 2.6807, + "step": 12758 + }, + { + "epoch": 0.37834712214215815, + "grad_norm": 0.14608623087406158, + "learning_rate": 0.0006956530606132807, + "loss": 2.6621, + "step": 12759 + }, + { + "epoch": 0.37837677549446963, + "grad_norm": 0.15375475585460663, + "learning_rate": 0.0006956097606029842, + "loss": 2.7108, + "step": 12760 + }, + { + "epoch": 0.3784064288467811, + "grad_norm": 0.169559508562088, + "learning_rate": 0.0006955664588605298, + "loss": 2.7057, + "step": 12761 + }, + { + "epoch": 0.3784360821990926, + "grad_norm": 0.14299479126930237, + "learning_rate": 0.0006955231553863006, + "loss": 2.7369, + "step": 12762 + }, + { + "epoch": 0.37846573555140406, + "grad_norm": 0.14011697471141815, + "learning_rate": 0.0006954798501806803, + "loss": 2.6916, + "step": 12763 + }, + { + "epoch": 0.3784953889037156, + "grad_norm": 0.1438596099615097, + "learning_rate": 0.0006954365432440526, + "loss": 2.7352, + "step": 12764 + }, + { + "epoch": 0.37852504225602707, + "grad_norm": 0.16338366270065308, + "learning_rate": 0.0006953932345768006, + "loss": 2.6691, + "step": 12765 + }, + { + "epoch": 0.37855469560833854, + "grad_norm": 0.17784874141216278, + "learning_rate": 0.0006953499241793082, + "loss": 2.7023, + "step": 12766 + }, + { + "epoch": 0.37858434896065, + "grad_norm": 0.1812797337770462, + "learning_rate": 0.0006953066120519584, + "loss": 2.6644, + "step": 12767 + }, + { + "epoch": 0.3786140023129615, + "grad_norm": 0.14690342545509338, + "learning_rate": 0.0006952632981951351, + "loss": 2.6494, + "step": 12768 + }, + { + "epoch": 0.37864365566527297, + "grad_norm": 0.14335152506828308, + "learning_rate": 0.0006952199826092218, + "loss": 2.691, + "step": 12769 + }, + { + "epoch": 0.37867330901758445, + "grad_norm": 0.1792469024658203, + "learning_rate": 0.0006951766652946021, + "loss": 2.7381, + "step": 12770 + }, + { + "epoch": 0.3787029623698959, + "grad_norm": 0.16358637809753418, + "learning_rate": 0.0006951333462516595, + "loss": 2.681, + "step": 12771 + }, + { + "epoch": 0.3787326157222074, + "grad_norm": 0.12634538114070892, + "learning_rate": 0.0006950900254807777, + "loss": 2.6811, + "step": 12772 + }, + { + "epoch": 0.3787622690745189, + "grad_norm": 0.1320657730102539, + "learning_rate": 0.00069504670298234, + "loss": 2.741, + "step": 12773 + }, + { + "epoch": 0.37879192242683035, + "grad_norm": 0.12553183734416962, + "learning_rate": 0.0006950033787567304, + "loss": 2.7388, + "step": 12774 + }, + { + "epoch": 0.37882157577914183, + "grad_norm": 0.13512566685676575, + "learning_rate": 0.0006949600528043324, + "loss": 2.7027, + "step": 12775 + }, + { + "epoch": 0.3788512291314533, + "grad_norm": 0.12897604703903198, + "learning_rate": 0.0006949167251255297, + "loss": 2.7238, + "step": 12776 + }, + { + "epoch": 0.3788808824837648, + "grad_norm": 0.12052123248577118, + "learning_rate": 0.000694873395720706, + "loss": 2.7041, + "step": 12777 + }, + { + "epoch": 0.37891053583607626, + "grad_norm": 0.1272313892841339, + "learning_rate": 0.0006948300645902448, + "loss": 2.6714, + "step": 12778 + }, + { + "epoch": 0.37894018918838773, + "grad_norm": 0.14486797153949738, + "learning_rate": 0.0006947867317345301, + "loss": 2.6906, + "step": 12779 + }, + { + "epoch": 0.3789698425406992, + "grad_norm": 0.12063923478126526, + "learning_rate": 0.0006947433971539454, + "loss": 2.6739, + "step": 12780 + }, + { + "epoch": 0.3789994958930107, + "grad_norm": 0.12603579461574554, + "learning_rate": 0.0006947000608488743, + "loss": 2.688, + "step": 12781 + }, + { + "epoch": 0.37902914924532216, + "grad_norm": 0.10982402414083481, + "learning_rate": 0.0006946567228197009, + "loss": 2.6889, + "step": 12782 + }, + { + "epoch": 0.37905880259763364, + "grad_norm": 0.11659304797649384, + "learning_rate": 0.0006946133830668089, + "loss": 2.7112, + "step": 12783 + }, + { + "epoch": 0.37908845594994517, + "grad_norm": 0.1210961565375328, + "learning_rate": 0.0006945700415905819, + "loss": 2.7069, + "step": 12784 + }, + { + "epoch": 0.37911810930225665, + "grad_norm": 0.1154000461101532, + "learning_rate": 0.0006945266983914038, + "loss": 2.718, + "step": 12785 + }, + { + "epoch": 0.3791477626545681, + "grad_norm": 0.1291070431470871, + "learning_rate": 0.0006944833534696582, + "loss": 2.6551, + "step": 12786 + }, + { + "epoch": 0.3791774160068796, + "grad_norm": 0.12754030525684357, + "learning_rate": 0.0006944400068257294, + "loss": 2.685, + "step": 12787 + }, + { + "epoch": 0.3792070693591911, + "grad_norm": 0.12465610355138779, + "learning_rate": 0.0006943966584600007, + "loss": 2.681, + "step": 12788 + }, + { + "epoch": 0.37923672271150255, + "grad_norm": 0.11157987266778946, + "learning_rate": 0.0006943533083728565, + "loss": 2.7197, + "step": 12789 + }, + { + "epoch": 0.379266376063814, + "grad_norm": 0.12832345068454742, + "learning_rate": 0.0006943099565646802, + "loss": 2.7359, + "step": 12790 + }, + { + "epoch": 0.3792960294161255, + "grad_norm": 0.12821891903877258, + "learning_rate": 0.000694266603035856, + "loss": 2.6598, + "step": 12791 + }, + { + "epoch": 0.379325682768437, + "grad_norm": 0.11374243348836899, + "learning_rate": 0.0006942232477867676, + "loss": 2.7071, + "step": 12792 + }, + { + "epoch": 0.37935533612074845, + "grad_norm": 0.130894735455513, + "learning_rate": 0.000694179890817799, + "loss": 2.6819, + "step": 12793 + }, + { + "epoch": 0.37938498947305993, + "grad_norm": 0.12655028700828552, + "learning_rate": 0.0006941365321293342, + "loss": 2.6827, + "step": 12794 + }, + { + "epoch": 0.3794146428253714, + "grad_norm": 0.15257516503334045, + "learning_rate": 0.0006940931717217572, + "loss": 2.7313, + "step": 12795 + }, + { + "epoch": 0.3794442961776829, + "grad_norm": 0.14895358681678772, + "learning_rate": 0.0006940498095954516, + "loss": 2.6947, + "step": 12796 + }, + { + "epoch": 0.37947394952999436, + "grad_norm": 0.14811618626117706, + "learning_rate": 0.0006940064457508018, + "loss": 2.7022, + "step": 12797 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 0.13572828471660614, + "learning_rate": 0.0006939630801881915, + "loss": 2.7027, + "step": 12798 + }, + { + "epoch": 0.3795332562346173, + "grad_norm": 0.13262896239757538, + "learning_rate": 0.0006939197129080051, + "loss": 2.7077, + "step": 12799 + }, + { + "epoch": 0.3795629095869288, + "grad_norm": 0.1567045897245407, + "learning_rate": 0.0006938763439106261, + "loss": 2.6571, + "step": 12800 + }, + { + "epoch": 0.37959256293924026, + "grad_norm": 0.1583845317363739, + "learning_rate": 0.0006938329731964387, + "loss": 2.7111, + "step": 12801 + }, + { + "epoch": 0.37962221629155174, + "grad_norm": 0.1553465574979782, + "learning_rate": 0.000693789600765827, + "loss": 2.6582, + "step": 12802 + }, + { + "epoch": 0.3796518696438632, + "grad_norm": 0.13321296870708466, + "learning_rate": 0.0006937462266191754, + "loss": 2.7158, + "step": 12803 + }, + { + "epoch": 0.3796815229961747, + "grad_norm": 0.13472595810890198, + "learning_rate": 0.0006937028507568678, + "loss": 2.7122, + "step": 12804 + }, + { + "epoch": 0.3797111763484862, + "grad_norm": 0.1428159922361374, + "learning_rate": 0.000693659473179288, + "loss": 2.7133, + "step": 12805 + }, + { + "epoch": 0.3797408297007977, + "grad_norm": 0.13989092409610748, + "learning_rate": 0.0006936160938868204, + "loss": 2.6634, + "step": 12806 + }, + { + "epoch": 0.3797704830531092, + "grad_norm": 0.1476353108882904, + "learning_rate": 0.0006935727128798488, + "loss": 2.7053, + "step": 12807 + }, + { + "epoch": 0.37980013640542065, + "grad_norm": 0.13502554595470428, + "learning_rate": 0.0006935293301587579, + "loss": 2.6957, + "step": 12808 + }, + { + "epoch": 0.37982978975773213, + "grad_norm": 0.12432479858398438, + "learning_rate": 0.0006934859457239314, + "loss": 2.6752, + "step": 12809 + }, + { + "epoch": 0.3798594431100436, + "grad_norm": 0.126759871840477, + "learning_rate": 0.0006934425595757538, + "loss": 2.7089, + "step": 12810 + }, + { + "epoch": 0.3798890964623551, + "grad_norm": 0.12100420147180557, + "learning_rate": 0.000693399171714609, + "loss": 2.6905, + "step": 12811 + }, + { + "epoch": 0.37991874981466656, + "grad_norm": 0.11658567935228348, + "learning_rate": 0.0006933557821408815, + "loss": 2.713, + "step": 12812 + }, + { + "epoch": 0.37994840316697803, + "grad_norm": 0.12089411169290543, + "learning_rate": 0.0006933123908549552, + "loss": 2.6996, + "step": 12813 + }, + { + "epoch": 0.3799780565192895, + "grad_norm": 0.1431732028722763, + "learning_rate": 0.0006932689978572144, + "loss": 2.69, + "step": 12814 + }, + { + "epoch": 0.380007709871601, + "grad_norm": 0.12887534499168396, + "learning_rate": 0.0006932256031480438, + "loss": 2.6932, + "step": 12815 + }, + { + "epoch": 0.38003736322391246, + "grad_norm": 0.10876510292291641, + "learning_rate": 0.0006931822067278271, + "loss": 2.6903, + "step": 12816 + }, + { + "epoch": 0.38006701657622394, + "grad_norm": 0.12665878236293793, + "learning_rate": 0.0006931388085969488, + "loss": 2.6867, + "step": 12817 + }, + { + "epoch": 0.3800966699285354, + "grad_norm": 0.13160689175128937, + "learning_rate": 0.0006930954087557931, + "loss": 2.6879, + "step": 12818 + }, + { + "epoch": 0.3801263232808469, + "grad_norm": 0.13301372528076172, + "learning_rate": 0.0006930520072047446, + "loss": 2.7085, + "step": 12819 + }, + { + "epoch": 0.38015597663315837, + "grad_norm": 0.15983635187149048, + "learning_rate": 0.0006930086039441873, + "loss": 2.6804, + "step": 12820 + }, + { + "epoch": 0.38018562998546984, + "grad_norm": 0.14795663952827454, + "learning_rate": 0.0006929651989745057, + "loss": 2.6965, + "step": 12821 + }, + { + "epoch": 0.3802152833377813, + "grad_norm": 0.14200371503829956, + "learning_rate": 0.0006929217922960842, + "loss": 2.7498, + "step": 12822 + }, + { + "epoch": 0.3802449366900928, + "grad_norm": 0.1284748911857605, + "learning_rate": 0.0006928783839093071, + "loss": 2.7074, + "step": 12823 + }, + { + "epoch": 0.38027459004240427, + "grad_norm": 0.12531344592571259, + "learning_rate": 0.0006928349738145588, + "loss": 2.6745, + "step": 12824 + }, + { + "epoch": 0.38030424339471575, + "grad_norm": 0.12786908447742462, + "learning_rate": 0.0006927915620122235, + "loss": 2.6688, + "step": 12825 + }, + { + "epoch": 0.3803338967470273, + "grad_norm": 0.11671125888824463, + "learning_rate": 0.0006927481485026861, + "loss": 2.6755, + "step": 12826 + }, + { + "epoch": 0.38036355009933875, + "grad_norm": 0.12933024764060974, + "learning_rate": 0.0006927047332863308, + "loss": 2.6867, + "step": 12827 + }, + { + "epoch": 0.38039320345165023, + "grad_norm": 0.12387717515230179, + "learning_rate": 0.0006926613163635419, + "loss": 2.7433, + "step": 12828 + }, + { + "epoch": 0.3804228568039617, + "grad_norm": 0.15125881135463715, + "learning_rate": 0.0006926178977347039, + "loss": 2.6889, + "step": 12829 + }, + { + "epoch": 0.3804525101562732, + "grad_norm": 0.1548687219619751, + "learning_rate": 0.0006925744774002015, + "loss": 2.7027, + "step": 12830 + }, + { + "epoch": 0.38048216350858466, + "grad_norm": 0.1569553017616272, + "learning_rate": 0.000692531055360419, + "loss": 2.7094, + "step": 12831 + }, + { + "epoch": 0.38051181686089613, + "grad_norm": 0.17069271206855774, + "learning_rate": 0.0006924876316157409, + "loss": 2.6824, + "step": 12832 + }, + { + "epoch": 0.3805414702132076, + "grad_norm": 0.16423213481903076, + "learning_rate": 0.0006924442061665518, + "loss": 2.7042, + "step": 12833 + }, + { + "epoch": 0.3805711235655191, + "grad_norm": 0.14518748223781586, + "learning_rate": 0.0006924007790132362, + "loss": 2.6988, + "step": 12834 + }, + { + "epoch": 0.38060077691783056, + "grad_norm": 0.13482818007469177, + "learning_rate": 0.0006923573501561786, + "loss": 2.6978, + "step": 12835 + }, + { + "epoch": 0.38063043027014204, + "grad_norm": 0.1512729525566101, + "learning_rate": 0.0006923139195957639, + "loss": 2.7086, + "step": 12836 + }, + { + "epoch": 0.3806600836224535, + "grad_norm": 0.16506147384643555, + "learning_rate": 0.0006922704873323763, + "loss": 2.7139, + "step": 12837 + }, + { + "epoch": 0.380689736974765, + "grad_norm": 0.1488388180732727, + "learning_rate": 0.0006922270533664006, + "loss": 2.7301, + "step": 12838 + }, + { + "epoch": 0.38071939032707647, + "grad_norm": 0.1361658126115799, + "learning_rate": 0.0006921836176982211, + "loss": 2.7216, + "step": 12839 + }, + { + "epoch": 0.38074904367938794, + "grad_norm": 0.13375312089920044, + "learning_rate": 0.0006921401803282228, + "loss": 2.6996, + "step": 12840 + }, + { + "epoch": 0.3807786970316994, + "grad_norm": 0.1414918601512909, + "learning_rate": 0.0006920967412567903, + "loss": 2.6796, + "step": 12841 + }, + { + "epoch": 0.3808083503840109, + "grad_norm": 0.12181328237056732, + "learning_rate": 0.0006920533004843082, + "loss": 2.6634, + "step": 12842 + }, + { + "epoch": 0.3808380037363224, + "grad_norm": 0.10839494317770004, + "learning_rate": 0.0006920098580111611, + "loss": 2.6704, + "step": 12843 + }, + { + "epoch": 0.38086765708863385, + "grad_norm": 0.10444261133670807, + "learning_rate": 0.0006919664138377339, + "loss": 2.6867, + "step": 12844 + }, + { + "epoch": 0.3808973104409453, + "grad_norm": 0.10392380505800247, + "learning_rate": 0.0006919229679644109, + "loss": 2.686, + "step": 12845 + }, + { + "epoch": 0.3809269637932568, + "grad_norm": 0.10788558423519135, + "learning_rate": 0.0006918795203915771, + "loss": 2.6777, + "step": 12846 + }, + { + "epoch": 0.38095661714556833, + "grad_norm": 0.11956681311130524, + "learning_rate": 0.0006918360711196173, + "loss": 2.6879, + "step": 12847 + }, + { + "epoch": 0.3809862704978798, + "grad_norm": 0.1230972558259964, + "learning_rate": 0.0006917926201489163, + "loss": 2.6867, + "step": 12848 + }, + { + "epoch": 0.3810159238501913, + "grad_norm": 0.1385984718799591, + "learning_rate": 0.0006917491674798586, + "loss": 2.6828, + "step": 12849 + }, + { + "epoch": 0.38104557720250276, + "grad_norm": 0.15275409817695618, + "learning_rate": 0.000691705713112829, + "loss": 2.7018, + "step": 12850 + }, + { + "epoch": 0.38107523055481424, + "grad_norm": 0.13766273856163025, + "learning_rate": 0.0006916622570482125, + "loss": 2.7122, + "step": 12851 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 0.10434705018997192, + "learning_rate": 0.0006916187992863939, + "loss": 2.736, + "step": 12852 + }, + { + "epoch": 0.3811345372594372, + "grad_norm": 0.11857867985963821, + "learning_rate": 0.0006915753398277578, + "loss": 2.7111, + "step": 12853 + }, + { + "epoch": 0.38116419061174867, + "grad_norm": 0.11727552860975266, + "learning_rate": 0.0006915318786726893, + "loss": 2.7145, + "step": 12854 + }, + { + "epoch": 0.38119384396406014, + "grad_norm": 0.1011839210987091, + "learning_rate": 0.0006914884158215731, + "loss": 2.6743, + "step": 12855 + }, + { + "epoch": 0.3812234973163716, + "grad_norm": 0.10124900192022324, + "learning_rate": 0.000691444951274794, + "loss": 2.6735, + "step": 12856 + }, + { + "epoch": 0.3812531506686831, + "grad_norm": 0.14676226675510406, + "learning_rate": 0.0006914014850327372, + "loss": 2.749, + "step": 12857 + }, + { + "epoch": 0.38128280402099457, + "grad_norm": 0.10431423038244247, + "learning_rate": 0.0006913580170957871, + "loss": 2.7092, + "step": 12858 + }, + { + "epoch": 0.38131245737330605, + "grad_norm": 0.11423026770353317, + "learning_rate": 0.0006913145474643292, + "loss": 2.69, + "step": 12859 + }, + { + "epoch": 0.3813421107256175, + "grad_norm": 0.12109913676977158, + "learning_rate": 0.000691271076138748, + "loss": 2.689, + "step": 12860 + }, + { + "epoch": 0.381371764077929, + "grad_norm": 0.10842842608690262, + "learning_rate": 0.0006912276031194286, + "loss": 2.674, + "step": 12861 + }, + { + "epoch": 0.3814014174302405, + "grad_norm": 0.10613143444061279, + "learning_rate": 0.000691184128406756, + "loss": 2.708, + "step": 12862 + }, + { + "epoch": 0.38143107078255195, + "grad_norm": 0.12378356605768204, + "learning_rate": 0.0006911406520011151, + "loss": 2.6906, + "step": 12863 + }, + { + "epoch": 0.3814607241348634, + "grad_norm": 0.12144013494253159, + "learning_rate": 0.000691097173902891, + "loss": 2.7066, + "step": 12864 + }, + { + "epoch": 0.3814903774871749, + "grad_norm": 0.13024555146694183, + "learning_rate": 0.0006910536941124684, + "loss": 2.7105, + "step": 12865 + }, + { + "epoch": 0.3815200308394864, + "grad_norm": 0.14017868041992188, + "learning_rate": 0.0006910102126302328, + "loss": 2.6945, + "step": 12866 + }, + { + "epoch": 0.38154968419179786, + "grad_norm": 0.13240963220596313, + "learning_rate": 0.0006909667294565688, + "loss": 2.6958, + "step": 12867 + }, + { + "epoch": 0.3815793375441094, + "grad_norm": 0.1414223164319992, + "learning_rate": 0.0006909232445918617, + "loss": 2.6948, + "step": 12868 + }, + { + "epoch": 0.38160899089642086, + "grad_norm": 0.16203266382217407, + "learning_rate": 0.0006908797580364965, + "loss": 2.7134, + "step": 12869 + }, + { + "epoch": 0.38163864424873234, + "grad_norm": 0.16004009544849396, + "learning_rate": 0.0006908362697908584, + "loss": 2.6856, + "step": 12870 + }, + { + "epoch": 0.3816682976010438, + "grad_norm": 0.13690322637557983, + "learning_rate": 0.0006907927798553322, + "loss": 2.6948, + "step": 12871 + }, + { + "epoch": 0.3816979509533553, + "grad_norm": 0.1399059146642685, + "learning_rate": 0.0006907492882303032, + "loss": 2.7283, + "step": 12872 + }, + { + "epoch": 0.38172760430566677, + "grad_norm": 0.1406751424074173, + "learning_rate": 0.0006907057949161565, + "loss": 2.6777, + "step": 12873 + }, + { + "epoch": 0.38175725765797824, + "grad_norm": 0.12817882001399994, + "learning_rate": 0.0006906622999132774, + "loss": 2.699, + "step": 12874 + }, + { + "epoch": 0.3817869110102897, + "grad_norm": 0.12390442937612534, + "learning_rate": 0.0006906188032220509, + "loss": 2.6778, + "step": 12875 + }, + { + "epoch": 0.3818165643626012, + "grad_norm": 0.14217804372310638, + "learning_rate": 0.000690575304842862, + "loss": 2.6857, + "step": 12876 + }, + { + "epoch": 0.3818462177149127, + "grad_norm": 0.14122949540615082, + "learning_rate": 0.0006905318047760961, + "loss": 2.6813, + "step": 12877 + }, + { + "epoch": 0.38187587106722415, + "grad_norm": 0.12070295959711075, + "learning_rate": 0.0006904883030221384, + "loss": 2.7012, + "step": 12878 + }, + { + "epoch": 0.3819055244195356, + "grad_norm": 0.12530635297298431, + "learning_rate": 0.0006904447995813741, + "loss": 2.6913, + "step": 12879 + }, + { + "epoch": 0.3819351777718471, + "grad_norm": 0.1404714435338974, + "learning_rate": 0.0006904012944541885, + "loss": 2.7241, + "step": 12880 + }, + { + "epoch": 0.3819648311241586, + "grad_norm": 0.14280308783054352, + "learning_rate": 0.0006903577876409666, + "loss": 2.6777, + "step": 12881 + }, + { + "epoch": 0.38199448447647005, + "grad_norm": 0.12637092173099518, + "learning_rate": 0.0006903142791420939, + "loss": 2.6941, + "step": 12882 + }, + { + "epoch": 0.38202413782878153, + "grad_norm": 0.13159017264842987, + "learning_rate": 0.0006902707689579555, + "loss": 2.7191, + "step": 12883 + }, + { + "epoch": 0.382053791181093, + "grad_norm": 0.13767756521701813, + "learning_rate": 0.000690227257088937, + "loss": 2.7051, + "step": 12884 + }, + { + "epoch": 0.3820834445334045, + "grad_norm": 0.139752596616745, + "learning_rate": 0.0006901837435354231, + "loss": 2.6573, + "step": 12885 + }, + { + "epoch": 0.38211309788571596, + "grad_norm": 0.15204428136348724, + "learning_rate": 0.0006901402282977998, + "loss": 2.7152, + "step": 12886 + }, + { + "epoch": 0.38214275123802743, + "grad_norm": 0.17552155256271362, + "learning_rate": 0.0006900967113764522, + "loss": 2.7058, + "step": 12887 + }, + { + "epoch": 0.38217240459033897, + "grad_norm": 0.1538781076669693, + "learning_rate": 0.0006900531927717655, + "loss": 2.6923, + "step": 12888 + }, + { + "epoch": 0.38220205794265044, + "grad_norm": 0.11971081793308258, + "learning_rate": 0.000690009672484125, + "loss": 2.7207, + "step": 12889 + }, + { + "epoch": 0.3822317112949619, + "grad_norm": 0.15388639271259308, + "learning_rate": 0.0006899661505139164, + "loss": 2.6895, + "step": 12890 + }, + { + "epoch": 0.3822613646472734, + "grad_norm": 0.15715859830379486, + "learning_rate": 0.0006899226268615249, + "loss": 2.715, + "step": 12891 + }, + { + "epoch": 0.38229101799958487, + "grad_norm": 0.14712564647197723, + "learning_rate": 0.0006898791015273359, + "loss": 2.696, + "step": 12892 + }, + { + "epoch": 0.38232067135189635, + "grad_norm": 0.1498367339372635, + "learning_rate": 0.0006898355745117349, + "loss": 2.6948, + "step": 12893 + }, + { + "epoch": 0.3823503247042078, + "grad_norm": 0.15469180047512054, + "learning_rate": 0.0006897920458151074, + "loss": 2.6953, + "step": 12894 + }, + { + "epoch": 0.3823799780565193, + "grad_norm": 0.17099390923976898, + "learning_rate": 0.0006897485154378386, + "loss": 2.704, + "step": 12895 + }, + { + "epoch": 0.3824096314088308, + "grad_norm": 0.12924927473068237, + "learning_rate": 0.0006897049833803142, + "loss": 2.7041, + "step": 12896 + }, + { + "epoch": 0.38243928476114225, + "grad_norm": 0.12570038437843323, + "learning_rate": 0.0006896614496429195, + "loss": 2.689, + "step": 12897 + }, + { + "epoch": 0.3824689381134537, + "grad_norm": 0.12349706143140793, + "learning_rate": 0.0006896179142260403, + "loss": 2.6414, + "step": 12898 + }, + { + "epoch": 0.3824985914657652, + "grad_norm": 0.12854419648647308, + "learning_rate": 0.0006895743771300618, + "loss": 2.705, + "step": 12899 + }, + { + "epoch": 0.3825282448180767, + "grad_norm": 0.13759280741214752, + "learning_rate": 0.0006895308383553697, + "loss": 2.6972, + "step": 12900 + }, + { + "epoch": 0.38255789817038816, + "grad_norm": 0.12886030972003937, + "learning_rate": 0.0006894872979023494, + "loss": 2.7086, + "step": 12901 + }, + { + "epoch": 0.38258755152269963, + "grad_norm": 0.11378064006567001, + "learning_rate": 0.0006894437557713866, + "loss": 2.6953, + "step": 12902 + }, + { + "epoch": 0.3826172048750111, + "grad_norm": 0.1127883791923523, + "learning_rate": 0.0006894002119628669, + "loss": 2.6811, + "step": 12903 + }, + { + "epoch": 0.3826468582273226, + "grad_norm": 0.12173373252153397, + "learning_rate": 0.0006893566664771758, + "loss": 2.7322, + "step": 12904 + }, + { + "epoch": 0.38267651157963406, + "grad_norm": 0.12866471707820892, + "learning_rate": 0.0006893131193146987, + "loss": 2.7121, + "step": 12905 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 0.12447098642587662, + "learning_rate": 0.0006892695704758217, + "loss": 2.6849, + "step": 12906 + }, + { + "epoch": 0.382735818284257, + "grad_norm": 0.12312145531177521, + "learning_rate": 0.0006892260199609301, + "loss": 2.7189, + "step": 12907 + }, + { + "epoch": 0.3827654716365685, + "grad_norm": 0.11176155507564545, + "learning_rate": 0.0006891824677704097, + "loss": 2.6747, + "step": 12908 + }, + { + "epoch": 0.38279512498888, + "grad_norm": 0.12345394492149353, + "learning_rate": 0.0006891389139046459, + "loss": 2.6406, + "step": 12909 + }, + { + "epoch": 0.3828247783411915, + "grad_norm": 0.11930522322654724, + "learning_rate": 0.0006890953583640246, + "loss": 2.6629, + "step": 12910 + }, + { + "epoch": 0.382854431693503, + "grad_norm": 0.11343703418970108, + "learning_rate": 0.0006890518011489314, + "loss": 2.7042, + "step": 12911 + }, + { + "epoch": 0.38288408504581445, + "grad_norm": 0.13862527906894684, + "learning_rate": 0.0006890082422597521, + "loss": 2.6607, + "step": 12912 + }, + { + "epoch": 0.3829137383981259, + "grad_norm": 0.157679483294487, + "learning_rate": 0.0006889646816968725, + "loss": 2.6673, + "step": 12913 + }, + { + "epoch": 0.3829433917504374, + "grad_norm": 0.15384922921657562, + "learning_rate": 0.000688921119460678, + "loss": 2.6833, + "step": 12914 + }, + { + "epoch": 0.3829730451027489, + "grad_norm": 0.1249953955411911, + "learning_rate": 0.0006888775555515547, + "loss": 2.6899, + "step": 12915 + }, + { + "epoch": 0.38300269845506035, + "grad_norm": 0.11210070550441742, + "learning_rate": 0.0006888339899698881, + "loss": 2.7108, + "step": 12916 + }, + { + "epoch": 0.38303235180737183, + "grad_norm": 0.12026350200176239, + "learning_rate": 0.0006887904227160642, + "loss": 2.703, + "step": 12917 + }, + { + "epoch": 0.3830620051596833, + "grad_norm": 0.15120013058185577, + "learning_rate": 0.0006887468537904686, + "loss": 2.7244, + "step": 12918 + }, + { + "epoch": 0.3830916585119948, + "grad_norm": 0.1626242846250534, + "learning_rate": 0.0006887032831934874, + "loss": 2.7023, + "step": 12919 + }, + { + "epoch": 0.38312131186430626, + "grad_norm": 0.1542404592037201, + "learning_rate": 0.0006886597109255062, + "loss": 2.6666, + "step": 12920 + }, + { + "epoch": 0.38315096521661773, + "grad_norm": 0.13175781071186066, + "learning_rate": 0.0006886161369869107, + "loss": 2.6798, + "step": 12921 + }, + { + "epoch": 0.3831806185689292, + "grad_norm": 0.11712311208248138, + "learning_rate": 0.0006885725613780871, + "loss": 2.6884, + "step": 12922 + }, + { + "epoch": 0.3832102719212407, + "grad_norm": 0.1434205323457718, + "learning_rate": 0.000688528984099421, + "loss": 2.6884, + "step": 12923 + }, + { + "epoch": 0.38323992527355216, + "grad_norm": 0.1436719447374344, + "learning_rate": 0.0006884854051512984, + "loss": 2.6877, + "step": 12924 + }, + { + "epoch": 0.38326957862586364, + "grad_norm": 0.12965412437915802, + "learning_rate": 0.0006884418245341052, + "loss": 2.7097, + "step": 12925 + }, + { + "epoch": 0.3832992319781751, + "grad_norm": 0.12759728729724884, + "learning_rate": 0.0006883982422482273, + "loss": 2.695, + "step": 12926 + }, + { + "epoch": 0.3833288853304866, + "grad_norm": 0.13066889345645905, + "learning_rate": 0.0006883546582940506, + "loss": 2.6907, + "step": 12927 + }, + { + "epoch": 0.38335853868279807, + "grad_norm": 0.12792140245437622, + "learning_rate": 0.0006883110726719612, + "loss": 2.6734, + "step": 12928 + }, + { + "epoch": 0.38338819203510954, + "grad_norm": 0.1288881003856659, + "learning_rate": 0.0006882674853823448, + "loss": 2.6845, + "step": 12929 + }, + { + "epoch": 0.3834178453874211, + "grad_norm": 0.1348947435617447, + "learning_rate": 0.0006882238964255875, + "loss": 2.7266, + "step": 12930 + }, + { + "epoch": 0.38344749873973255, + "grad_norm": 0.14747384190559387, + "learning_rate": 0.0006881803058020752, + "loss": 2.7005, + "step": 12931 + }, + { + "epoch": 0.383477152092044, + "grad_norm": 0.14179247617721558, + "learning_rate": 0.0006881367135121942, + "loss": 2.7043, + "step": 12932 + }, + { + "epoch": 0.3835068054443555, + "grad_norm": 0.1448289304971695, + "learning_rate": 0.0006880931195563303, + "loss": 2.7018, + "step": 12933 + }, + { + "epoch": 0.383536458796667, + "grad_norm": 0.14361897110939026, + "learning_rate": 0.0006880495239348694, + "loss": 2.702, + "step": 12934 + }, + { + "epoch": 0.38356611214897846, + "grad_norm": 0.1388169229030609, + "learning_rate": 0.0006880059266481977, + "loss": 2.6874, + "step": 12935 + }, + { + "epoch": 0.38359576550128993, + "grad_norm": 0.12973786890506744, + "learning_rate": 0.0006879623276967013, + "loss": 2.6875, + "step": 12936 + }, + { + "epoch": 0.3836254188536014, + "grad_norm": 0.1439528912305832, + "learning_rate": 0.0006879187270807663, + "loss": 2.6784, + "step": 12937 + }, + { + "epoch": 0.3836550722059129, + "grad_norm": 0.14492879807949066, + "learning_rate": 0.0006878751248007787, + "loss": 2.7111, + "step": 12938 + }, + { + "epoch": 0.38368472555822436, + "grad_norm": 0.11493740230798721, + "learning_rate": 0.0006878315208571244, + "loss": 2.7139, + "step": 12939 + }, + { + "epoch": 0.38371437891053584, + "grad_norm": 0.10399851202964783, + "learning_rate": 0.0006877879152501899, + "loss": 2.6465, + "step": 12940 + }, + { + "epoch": 0.3837440322628473, + "grad_norm": 0.13276298344135284, + "learning_rate": 0.0006877443079803614, + "loss": 2.6902, + "step": 12941 + }, + { + "epoch": 0.3837736856151588, + "grad_norm": 0.10717113316059113, + "learning_rate": 0.0006877006990480246, + "loss": 2.709, + "step": 12942 + }, + { + "epoch": 0.38380333896747026, + "grad_norm": 0.11228213459253311, + "learning_rate": 0.0006876570884535657, + "loss": 2.7054, + "step": 12943 + }, + { + "epoch": 0.38383299231978174, + "grad_norm": 0.12524865567684174, + "learning_rate": 0.0006876134761973713, + "loss": 2.6935, + "step": 12944 + }, + { + "epoch": 0.3838626456720932, + "grad_norm": 0.11198647320270538, + "learning_rate": 0.0006875698622798274, + "loss": 2.696, + "step": 12945 + }, + { + "epoch": 0.3838922990244047, + "grad_norm": 0.11394570022821426, + "learning_rate": 0.0006875262467013201, + "loss": 2.6912, + "step": 12946 + }, + { + "epoch": 0.38392195237671617, + "grad_norm": 0.12059742212295532, + "learning_rate": 0.0006874826294622357, + "loss": 2.7166, + "step": 12947 + }, + { + "epoch": 0.38395160572902765, + "grad_norm": 0.14213994145393372, + "learning_rate": 0.0006874390105629604, + "loss": 2.6836, + "step": 12948 + }, + { + "epoch": 0.3839812590813391, + "grad_norm": 0.130066379904747, + "learning_rate": 0.0006873953900038805, + "loss": 2.6981, + "step": 12949 + }, + { + "epoch": 0.3840109124336506, + "grad_norm": 0.13086570799350739, + "learning_rate": 0.0006873517677853823, + "loss": 2.6837, + "step": 12950 + }, + { + "epoch": 0.38404056578596213, + "grad_norm": 0.1506010890007019, + "learning_rate": 0.0006873081439078521, + "loss": 2.6767, + "step": 12951 + }, + { + "epoch": 0.3840702191382736, + "grad_norm": 0.1241583600640297, + "learning_rate": 0.000687264518371676, + "loss": 2.6633, + "step": 12952 + }, + { + "epoch": 0.3840998724905851, + "grad_norm": 0.12340249121189117, + "learning_rate": 0.0006872208911772405, + "loss": 2.6979, + "step": 12953 + }, + { + "epoch": 0.38412952584289656, + "grad_norm": 0.1390572190284729, + "learning_rate": 0.0006871772623249319, + "loss": 2.7138, + "step": 12954 + }, + { + "epoch": 0.38415917919520803, + "grad_norm": 0.12753914296627045, + "learning_rate": 0.0006871336318151365, + "loss": 2.6917, + "step": 12955 + }, + { + "epoch": 0.3841888325475195, + "grad_norm": 0.12714837491512299, + "learning_rate": 0.0006870899996482405, + "loss": 2.6839, + "step": 12956 + }, + { + "epoch": 0.384218485899831, + "grad_norm": 0.1226317286491394, + "learning_rate": 0.0006870463658246306, + "loss": 2.7127, + "step": 12957 + }, + { + "epoch": 0.38424813925214246, + "grad_norm": 0.12572729587554932, + "learning_rate": 0.0006870027303446931, + "loss": 2.696, + "step": 12958 + }, + { + "epoch": 0.38427779260445394, + "grad_norm": 0.1234983503818512, + "learning_rate": 0.0006869590932088143, + "loss": 2.6998, + "step": 12959 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 0.13323970139026642, + "learning_rate": 0.0006869154544173805, + "loss": 2.7055, + "step": 12960 + }, + { + "epoch": 0.3843370993090769, + "grad_norm": 0.14705538749694824, + "learning_rate": 0.0006868718139707784, + "loss": 2.7034, + "step": 12961 + }, + { + "epoch": 0.38436675266138837, + "grad_norm": 0.16250549256801605, + "learning_rate": 0.0006868281718693943, + "loss": 2.7076, + "step": 12962 + }, + { + "epoch": 0.38439640601369984, + "grad_norm": 0.16933999955654144, + "learning_rate": 0.0006867845281136145, + "loss": 2.6739, + "step": 12963 + }, + { + "epoch": 0.3844260593660113, + "grad_norm": 0.15075097978115082, + "learning_rate": 0.0006867408827038259, + "loss": 2.7233, + "step": 12964 + }, + { + "epoch": 0.3844557127183228, + "grad_norm": 0.13193055987358093, + "learning_rate": 0.0006866972356404145, + "loss": 2.6811, + "step": 12965 + }, + { + "epoch": 0.38448536607063427, + "grad_norm": 0.14527271687984467, + "learning_rate": 0.0006866535869237671, + "loss": 2.7223, + "step": 12966 + }, + { + "epoch": 0.38451501942294575, + "grad_norm": 0.14184515178203583, + "learning_rate": 0.0006866099365542703, + "loss": 2.7015, + "step": 12967 + }, + { + "epoch": 0.3845446727752572, + "grad_norm": 0.1340636909008026, + "learning_rate": 0.0006865662845323104, + "loss": 2.672, + "step": 12968 + }, + { + "epoch": 0.3845743261275687, + "grad_norm": 0.15237165987491608, + "learning_rate": 0.0006865226308582739, + "loss": 2.6494, + "step": 12969 + }, + { + "epoch": 0.3846039794798802, + "grad_norm": 0.15743188560009003, + "learning_rate": 0.0006864789755325476, + "loss": 2.6949, + "step": 12970 + }, + { + "epoch": 0.38463363283219165, + "grad_norm": 0.14788447320461273, + "learning_rate": 0.0006864353185555179, + "loss": 2.7076, + "step": 12971 + }, + { + "epoch": 0.3846632861845032, + "grad_norm": 0.12652096152305603, + "learning_rate": 0.0006863916599275714, + "loss": 2.6877, + "step": 12972 + }, + { + "epoch": 0.38469293953681466, + "grad_norm": 0.11595975607633591, + "learning_rate": 0.000686347999649095, + "loss": 2.6851, + "step": 12973 + }, + { + "epoch": 0.38472259288912614, + "grad_norm": 0.12267570197582245, + "learning_rate": 0.000686304337720475, + "loss": 2.7142, + "step": 12974 + }, + { + "epoch": 0.3847522462414376, + "grad_norm": 0.1530127078294754, + "learning_rate": 0.0006862606741420979, + "loss": 2.6758, + "step": 12975 + }, + { + "epoch": 0.3847818995937491, + "grad_norm": 0.14280718564987183, + "learning_rate": 0.0006862170089143506, + "loss": 2.7297, + "step": 12976 + }, + { + "epoch": 0.38481155294606056, + "grad_norm": 0.1331898272037506, + "learning_rate": 0.0006861733420376199, + "loss": 2.6758, + "step": 12977 + }, + { + "epoch": 0.38484120629837204, + "grad_norm": 0.14431294798851013, + "learning_rate": 0.0006861296735122923, + "loss": 2.7029, + "step": 12978 + }, + { + "epoch": 0.3848708596506835, + "grad_norm": 0.1504025012254715, + "learning_rate": 0.0006860860033387544, + "loss": 2.7063, + "step": 12979 + }, + { + "epoch": 0.384900513002995, + "grad_norm": 0.148835688829422, + "learning_rate": 0.0006860423315173932, + "loss": 2.663, + "step": 12980 + }, + { + "epoch": 0.38493016635530647, + "grad_norm": 0.12624730169773102, + "learning_rate": 0.0006859986580485948, + "loss": 2.6979, + "step": 12981 + }, + { + "epoch": 0.38495981970761795, + "grad_norm": 0.1286839097738266, + "learning_rate": 0.0006859549829327466, + "loss": 2.6702, + "step": 12982 + }, + { + "epoch": 0.3849894730599294, + "grad_norm": 0.14145182073116302, + "learning_rate": 0.0006859113061702351, + "loss": 2.6988, + "step": 12983 + }, + { + "epoch": 0.3850191264122409, + "grad_norm": 0.11596526950597763, + "learning_rate": 0.0006858676277614472, + "loss": 2.6737, + "step": 12984 + }, + { + "epoch": 0.3850487797645524, + "grad_norm": 0.11400441825389862, + "learning_rate": 0.0006858239477067694, + "loss": 2.7278, + "step": 12985 + }, + { + "epoch": 0.38507843311686385, + "grad_norm": 0.1144244596362114, + "learning_rate": 0.0006857802660065886, + "loss": 2.6904, + "step": 12986 + }, + { + "epoch": 0.3851080864691753, + "grad_norm": 0.12237491458654404, + "learning_rate": 0.0006857365826612918, + "loss": 2.682, + "step": 12987 + }, + { + "epoch": 0.3851377398214868, + "grad_norm": 0.10361232608556747, + "learning_rate": 0.0006856928976712656, + "loss": 2.7045, + "step": 12988 + }, + { + "epoch": 0.3851673931737983, + "grad_norm": 0.10750555992126465, + "learning_rate": 0.0006856492110368969, + "loss": 2.7193, + "step": 12989 + }, + { + "epoch": 0.38519704652610975, + "grad_norm": 0.13096235692501068, + "learning_rate": 0.0006856055227585726, + "loss": 2.6743, + "step": 12990 + }, + { + "epoch": 0.38522669987842123, + "grad_norm": 0.14592906832695007, + "learning_rate": 0.0006855618328366795, + "loss": 2.6966, + "step": 12991 + }, + { + "epoch": 0.38525635323073276, + "grad_norm": 0.13943952322006226, + "learning_rate": 0.0006855181412716045, + "loss": 2.7029, + "step": 12992 + }, + { + "epoch": 0.38528600658304424, + "grad_norm": 0.15330199897289276, + "learning_rate": 0.0006854744480637346, + "loss": 2.7047, + "step": 12993 + }, + { + "epoch": 0.3853156599353557, + "grad_norm": 0.14393635094165802, + "learning_rate": 0.0006854307532134566, + "loss": 2.7211, + "step": 12994 + }, + { + "epoch": 0.3853453132876672, + "grad_norm": 0.14588147401809692, + "learning_rate": 0.0006853870567211574, + "loss": 2.6747, + "step": 12995 + }, + { + "epoch": 0.38537496663997867, + "grad_norm": 0.1333330124616623, + "learning_rate": 0.0006853433585872241, + "loss": 2.6941, + "step": 12996 + }, + { + "epoch": 0.38540461999229014, + "grad_norm": 0.12042661011219025, + "learning_rate": 0.0006852996588120434, + "loss": 2.6807, + "step": 12997 + }, + { + "epoch": 0.3854342733446016, + "grad_norm": 0.11244811117649078, + "learning_rate": 0.0006852559573960026, + "loss": 2.6939, + "step": 12998 + }, + { + "epoch": 0.3854639266969131, + "grad_norm": 0.13737984001636505, + "learning_rate": 0.0006852122543394883, + "loss": 2.6963, + "step": 12999 + }, + { + "epoch": 0.38549358004922457, + "grad_norm": 0.12776997685432434, + "learning_rate": 0.0006851685496428877, + "loss": 2.6687, + "step": 13000 + }, + { + "epoch": 0.38552323340153605, + "grad_norm": 0.12597043812274933, + "learning_rate": 0.0006851248433065879, + "loss": 2.6435, + "step": 13001 + }, + { + "epoch": 0.3855528867538475, + "grad_norm": 0.12667430937290192, + "learning_rate": 0.000685081135330976, + "loss": 2.7122, + "step": 13002 + }, + { + "epoch": 0.385582540106159, + "grad_norm": 0.12767744064331055, + "learning_rate": 0.0006850374257164387, + "loss": 2.7182, + "step": 13003 + }, + { + "epoch": 0.3856121934584705, + "grad_norm": 0.12515494227409363, + "learning_rate": 0.0006849937144633632, + "loss": 2.684, + "step": 13004 + }, + { + "epoch": 0.38564184681078195, + "grad_norm": 0.12480246275663376, + "learning_rate": 0.0006849500015721366, + "loss": 2.676, + "step": 13005 + }, + { + "epoch": 0.38567150016309343, + "grad_norm": 0.14137989282608032, + "learning_rate": 0.0006849062870431462, + "loss": 2.6885, + "step": 13006 + }, + { + "epoch": 0.3857011535154049, + "grad_norm": 0.12161242216825485, + "learning_rate": 0.0006848625708767787, + "loss": 2.643, + "step": 13007 + }, + { + "epoch": 0.3857308068677164, + "grad_norm": 0.11566410958766937, + "learning_rate": 0.0006848188530734214, + "loss": 2.6983, + "step": 13008 + }, + { + "epoch": 0.38576046022002786, + "grad_norm": 0.12283608317375183, + "learning_rate": 0.0006847751336334614, + "loss": 2.7025, + "step": 13009 + }, + { + "epoch": 0.38579011357233933, + "grad_norm": 0.13566596806049347, + "learning_rate": 0.0006847314125572859, + "loss": 2.6769, + "step": 13010 + }, + { + "epoch": 0.3858197669246508, + "grad_norm": 0.13083219528198242, + "learning_rate": 0.000684687689845282, + "loss": 2.684, + "step": 13011 + }, + { + "epoch": 0.3858494202769623, + "grad_norm": 0.1299288123846054, + "learning_rate": 0.0006846439654978371, + "loss": 2.707, + "step": 13012 + }, + { + "epoch": 0.3858790736292738, + "grad_norm": 0.12331988662481308, + "learning_rate": 0.0006846002395153382, + "loss": 2.6841, + "step": 13013 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 0.13164784014225006, + "learning_rate": 0.0006845565118981723, + "loss": 2.6688, + "step": 13014 + }, + { + "epoch": 0.38593838033389677, + "grad_norm": 0.1332237422466278, + "learning_rate": 0.0006845127826467268, + "loss": 2.6953, + "step": 13015 + }, + { + "epoch": 0.38596803368620825, + "grad_norm": 0.1289331316947937, + "learning_rate": 0.000684469051761389, + "loss": 2.694, + "step": 13016 + }, + { + "epoch": 0.3859976870385197, + "grad_norm": 0.12193144112825394, + "learning_rate": 0.0006844253192425463, + "loss": 2.6685, + "step": 13017 + }, + { + "epoch": 0.3860273403908312, + "grad_norm": 0.10962407290935516, + "learning_rate": 0.0006843815850905854, + "loss": 2.675, + "step": 13018 + }, + { + "epoch": 0.3860569937431427, + "grad_norm": 0.14448967576026917, + "learning_rate": 0.0006843378493058941, + "loss": 2.6582, + "step": 13019 + }, + { + "epoch": 0.38608664709545415, + "grad_norm": 0.13075511157512665, + "learning_rate": 0.0006842941118888593, + "loss": 2.7238, + "step": 13020 + }, + { + "epoch": 0.3861163004477656, + "grad_norm": 0.1292884349822998, + "learning_rate": 0.0006842503728398686, + "loss": 2.7119, + "step": 13021 + }, + { + "epoch": 0.3861459538000771, + "grad_norm": 0.12895171344280243, + "learning_rate": 0.0006842066321593094, + "loss": 2.7139, + "step": 13022 + }, + { + "epoch": 0.3861756071523886, + "grad_norm": 0.12030114978551865, + "learning_rate": 0.0006841628898475688, + "loss": 2.6989, + "step": 13023 + }, + { + "epoch": 0.38620526050470005, + "grad_norm": 0.15296724438667297, + "learning_rate": 0.0006841191459050341, + "loss": 2.6855, + "step": 13024 + }, + { + "epoch": 0.38623491385701153, + "grad_norm": 0.15645742416381836, + "learning_rate": 0.0006840754003320928, + "loss": 2.6896, + "step": 13025 + }, + { + "epoch": 0.386264567209323, + "grad_norm": 0.14926233887672424, + "learning_rate": 0.0006840316531291321, + "loss": 2.684, + "step": 13026 + }, + { + "epoch": 0.3862942205616345, + "grad_norm": 0.14984595775604248, + "learning_rate": 0.0006839879042965398, + "loss": 2.6753, + "step": 13027 + }, + { + "epoch": 0.38632387391394596, + "grad_norm": 0.16552771627902985, + "learning_rate": 0.0006839441538347029, + "loss": 2.7204, + "step": 13028 + }, + { + "epoch": 0.38635352726625743, + "grad_norm": 0.1760391891002655, + "learning_rate": 0.0006839004017440089, + "loss": 2.7239, + "step": 13029 + }, + { + "epoch": 0.3863831806185689, + "grad_norm": 0.14371958374977112, + "learning_rate": 0.0006838566480248453, + "loss": 2.6762, + "step": 13030 + }, + { + "epoch": 0.3864128339708804, + "grad_norm": 0.1378575712442398, + "learning_rate": 0.0006838128926775995, + "loss": 2.6858, + "step": 13031 + }, + { + "epoch": 0.38644248732319186, + "grad_norm": 0.14035506546497345, + "learning_rate": 0.0006837691357026589, + "loss": 2.686, + "step": 13032 + }, + { + "epoch": 0.38647214067550334, + "grad_norm": 0.13708841800689697, + "learning_rate": 0.0006837253771004113, + "loss": 2.7052, + "step": 13033 + }, + { + "epoch": 0.38650179402781487, + "grad_norm": 0.1509784758090973, + "learning_rate": 0.0006836816168712438, + "loss": 2.7239, + "step": 13034 + }, + { + "epoch": 0.38653144738012635, + "grad_norm": 0.13765473663806915, + "learning_rate": 0.000683637855015544, + "loss": 2.6968, + "step": 13035 + }, + { + "epoch": 0.3865611007324378, + "grad_norm": 0.13477823138237, + "learning_rate": 0.0006835940915336996, + "loss": 2.6962, + "step": 13036 + }, + { + "epoch": 0.3865907540847493, + "grad_norm": 0.1517278105020523, + "learning_rate": 0.000683550326426098, + "loss": 2.695, + "step": 13037 + }, + { + "epoch": 0.3866204074370608, + "grad_norm": 0.12333323061466217, + "learning_rate": 0.0006835065596931265, + "loss": 2.671, + "step": 13038 + }, + { + "epoch": 0.38665006078937225, + "grad_norm": 0.13173240423202515, + "learning_rate": 0.0006834627913351733, + "loss": 2.6907, + "step": 13039 + }, + { + "epoch": 0.38667971414168373, + "grad_norm": 0.13089033961296082, + "learning_rate": 0.0006834190213526254, + "loss": 2.6988, + "step": 13040 + }, + { + "epoch": 0.3867093674939952, + "grad_norm": 0.15905976295471191, + "learning_rate": 0.0006833752497458705, + "loss": 2.6456, + "step": 13041 + }, + { + "epoch": 0.3867390208463067, + "grad_norm": 0.14419764280319214, + "learning_rate": 0.0006833314765152963, + "loss": 2.6981, + "step": 13042 + }, + { + "epoch": 0.38676867419861816, + "grad_norm": 0.11650888621807098, + "learning_rate": 0.0006832877016612906, + "loss": 2.6922, + "step": 13043 + }, + { + "epoch": 0.38679832755092963, + "grad_norm": 0.12667523324489594, + "learning_rate": 0.0006832439251842408, + "loss": 2.6746, + "step": 13044 + }, + { + "epoch": 0.3868279809032411, + "grad_norm": 0.13021935522556305, + "learning_rate": 0.0006832001470845346, + "loss": 2.7141, + "step": 13045 + }, + { + "epoch": 0.3868576342555526, + "grad_norm": 0.11897242069244385, + "learning_rate": 0.0006831563673625594, + "loss": 2.6963, + "step": 13046 + }, + { + "epoch": 0.38688728760786406, + "grad_norm": 0.11213845759630203, + "learning_rate": 0.0006831125860187033, + "loss": 2.6895, + "step": 13047 + }, + { + "epoch": 0.38691694096017554, + "grad_norm": 0.13644950091838837, + "learning_rate": 0.0006830688030533538, + "loss": 2.6775, + "step": 13048 + }, + { + "epoch": 0.386946594312487, + "grad_norm": 0.13742707669734955, + "learning_rate": 0.0006830250184668987, + "loss": 2.6646, + "step": 13049 + }, + { + "epoch": 0.3869762476647985, + "grad_norm": 0.12565015256404877, + "learning_rate": 0.0006829812322597256, + "loss": 2.6686, + "step": 13050 + }, + { + "epoch": 0.38700590101710997, + "grad_norm": 0.10593602061271667, + "learning_rate": 0.0006829374444322222, + "loss": 2.7164, + "step": 13051 + }, + { + "epoch": 0.38703555436942144, + "grad_norm": 0.12176983058452606, + "learning_rate": 0.0006828936549847765, + "loss": 2.6941, + "step": 13052 + }, + { + "epoch": 0.3870652077217329, + "grad_norm": 0.1301674097776413, + "learning_rate": 0.0006828498639177758, + "loss": 2.6916, + "step": 13053 + }, + { + "epoch": 0.3870948610740444, + "grad_norm": 0.10586622357368469, + "learning_rate": 0.0006828060712316084, + "loss": 2.6889, + "step": 13054 + }, + { + "epoch": 0.3871245144263559, + "grad_norm": 0.11832330375909805, + "learning_rate": 0.0006827622769266619, + "loss": 2.6944, + "step": 13055 + }, + { + "epoch": 0.3871541677786674, + "grad_norm": 0.1142486035823822, + "learning_rate": 0.000682718481003324, + "loss": 2.6922, + "step": 13056 + }, + { + "epoch": 0.3871838211309789, + "grad_norm": 0.1071300208568573, + "learning_rate": 0.0006826746834619826, + "loss": 2.6602, + "step": 13057 + }, + { + "epoch": 0.38721347448329035, + "grad_norm": 0.10896514356136322, + "learning_rate": 0.0006826308843030255, + "loss": 2.686, + "step": 13058 + }, + { + "epoch": 0.38724312783560183, + "grad_norm": 0.11886845529079437, + "learning_rate": 0.0006825870835268404, + "loss": 2.7032, + "step": 13059 + }, + { + "epoch": 0.3872727811879133, + "grad_norm": 0.12331976741552353, + "learning_rate": 0.0006825432811338157, + "loss": 2.6921, + "step": 13060 + }, + { + "epoch": 0.3873024345402248, + "grad_norm": 0.1349438577890396, + "learning_rate": 0.0006824994771243387, + "loss": 2.6704, + "step": 13061 + }, + { + "epoch": 0.38733208789253626, + "grad_norm": 0.11424781382083893, + "learning_rate": 0.0006824556714987975, + "loss": 2.7053, + "step": 13062 + }, + { + "epoch": 0.38736174124484773, + "grad_norm": 0.12124038487672806, + "learning_rate": 0.00068241186425758, + "loss": 2.6658, + "step": 13063 + }, + { + "epoch": 0.3873913945971592, + "grad_norm": 0.14256615936756134, + "learning_rate": 0.0006823680554010742, + "loss": 2.6582, + "step": 13064 + }, + { + "epoch": 0.3874210479494707, + "grad_norm": 0.15148645639419556, + "learning_rate": 0.0006823242449296678, + "loss": 2.7001, + "step": 13065 + }, + { + "epoch": 0.38745070130178216, + "grad_norm": 0.1292271465063095, + "learning_rate": 0.0006822804328437491, + "loss": 2.7002, + "step": 13066 + }, + { + "epoch": 0.38748035465409364, + "grad_norm": 0.14957183599472046, + "learning_rate": 0.0006822366191437058, + "loss": 2.7027, + "step": 13067 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 0.16727682948112488, + "learning_rate": 0.000682192803829926, + "loss": 2.6975, + "step": 13068 + }, + { + "epoch": 0.3875396613587166, + "grad_norm": 0.1847495436668396, + "learning_rate": 0.0006821489869027976, + "loss": 2.6839, + "step": 13069 + }, + { + "epoch": 0.38756931471102807, + "grad_norm": 0.15967318415641785, + "learning_rate": 0.0006821051683627087, + "loss": 2.6575, + "step": 13070 + }, + { + "epoch": 0.38759896806333954, + "grad_norm": 0.15164393186569214, + "learning_rate": 0.0006820613482100473, + "loss": 2.7035, + "step": 13071 + }, + { + "epoch": 0.387628621415651, + "grad_norm": 0.14654704928398132, + "learning_rate": 0.0006820175264452013, + "loss": 2.707, + "step": 13072 + }, + { + "epoch": 0.3876582747679625, + "grad_norm": 0.15024733543395996, + "learning_rate": 0.000681973703068559, + "loss": 2.6951, + "step": 13073 + }, + { + "epoch": 0.387687928120274, + "grad_norm": 0.14415550231933594, + "learning_rate": 0.0006819298780805082, + "loss": 2.6917, + "step": 13074 + }, + { + "epoch": 0.38771758147258545, + "grad_norm": 0.12911206483840942, + "learning_rate": 0.0006818860514814371, + "loss": 2.6807, + "step": 13075 + }, + { + "epoch": 0.387747234824897, + "grad_norm": 0.11546048521995544, + "learning_rate": 0.0006818422232717339, + "loss": 2.6476, + "step": 13076 + }, + { + "epoch": 0.38777688817720846, + "grad_norm": 0.1273551732301712, + "learning_rate": 0.0006817983934517866, + "loss": 2.6982, + "step": 13077 + }, + { + "epoch": 0.38780654152951993, + "grad_norm": 0.11138725280761719, + "learning_rate": 0.0006817545620219833, + "loss": 2.6874, + "step": 13078 + }, + { + "epoch": 0.3878361948818314, + "grad_norm": 0.12213661521673203, + "learning_rate": 0.0006817107289827121, + "loss": 2.6849, + "step": 13079 + }, + { + "epoch": 0.3878658482341429, + "grad_norm": 0.129232257604599, + "learning_rate": 0.0006816668943343612, + "loss": 2.7088, + "step": 13080 + }, + { + "epoch": 0.38789550158645436, + "grad_norm": 0.14251269400119781, + "learning_rate": 0.0006816230580773188, + "loss": 2.7419, + "step": 13081 + }, + { + "epoch": 0.38792515493876584, + "grad_norm": 0.12657012045383453, + "learning_rate": 0.0006815792202119731, + "loss": 2.6571, + "step": 13082 + }, + { + "epoch": 0.3879548082910773, + "grad_norm": 0.11767958849668503, + "learning_rate": 0.0006815353807387121, + "loss": 2.7073, + "step": 13083 + }, + { + "epoch": 0.3879844616433888, + "grad_norm": 0.13586784899234772, + "learning_rate": 0.0006814915396579244, + "loss": 2.6714, + "step": 13084 + }, + { + "epoch": 0.38801411499570027, + "grad_norm": 0.1307426244020462, + "learning_rate": 0.0006814476969699976, + "loss": 2.714, + "step": 13085 + }, + { + "epoch": 0.38804376834801174, + "grad_norm": 0.10733244568109512, + "learning_rate": 0.0006814038526753205, + "loss": 2.6288, + "step": 13086 + }, + { + "epoch": 0.3880734217003232, + "grad_norm": 0.11722220480442047, + "learning_rate": 0.0006813600067742811, + "loss": 2.6948, + "step": 13087 + }, + { + "epoch": 0.3881030750526347, + "grad_norm": 0.12955304980278015, + "learning_rate": 0.0006813161592672678, + "loss": 2.6828, + "step": 13088 + }, + { + "epoch": 0.38813272840494617, + "grad_norm": 0.1169920563697815, + "learning_rate": 0.0006812723101546687, + "loss": 2.6838, + "step": 13089 + }, + { + "epoch": 0.38816238175725765, + "grad_norm": 0.10165832936763763, + "learning_rate": 0.0006812284594368723, + "loss": 2.6992, + "step": 13090 + }, + { + "epoch": 0.3881920351095691, + "grad_norm": 0.10822558403015137, + "learning_rate": 0.0006811846071142667, + "loss": 2.6994, + "step": 13091 + }, + { + "epoch": 0.3882216884618806, + "grad_norm": 0.13949254155158997, + "learning_rate": 0.0006811407531872402, + "loss": 2.7071, + "step": 13092 + }, + { + "epoch": 0.3882513418141921, + "grad_norm": 0.1560579091310501, + "learning_rate": 0.0006810968976561814, + "loss": 2.712, + "step": 13093 + }, + { + "epoch": 0.38828099516650355, + "grad_norm": 0.16659009456634521, + "learning_rate": 0.0006810530405214785, + "loss": 2.6999, + "step": 13094 + }, + { + "epoch": 0.388310648518815, + "grad_norm": 0.1386948823928833, + "learning_rate": 0.0006810091817835197, + "loss": 2.6845, + "step": 13095 + }, + { + "epoch": 0.3883403018711265, + "grad_norm": 0.14408652484416962, + "learning_rate": 0.0006809653214426936, + "loss": 2.6823, + "step": 13096 + }, + { + "epoch": 0.38836995522343803, + "grad_norm": 0.14977137744426727, + "learning_rate": 0.0006809214594993884, + "loss": 2.6781, + "step": 13097 + }, + { + "epoch": 0.3883996085757495, + "grad_norm": 0.11086760461330414, + "learning_rate": 0.0006808775959539928, + "loss": 2.673, + "step": 13098 + }, + { + "epoch": 0.388429261928061, + "grad_norm": 0.11997310817241669, + "learning_rate": 0.0006808337308068951, + "loss": 2.6992, + "step": 13099 + }, + { + "epoch": 0.38845891528037246, + "grad_norm": 0.13727673888206482, + "learning_rate": 0.0006807898640584834, + "loss": 2.6885, + "step": 13100 + }, + { + "epoch": 0.38848856863268394, + "grad_norm": 0.1402924805879593, + "learning_rate": 0.0006807459957091466, + "loss": 2.6873, + "step": 13101 + }, + { + "epoch": 0.3885182219849954, + "grad_norm": 0.14366358518600464, + "learning_rate": 0.0006807021257592729, + "loss": 2.7059, + "step": 13102 + }, + { + "epoch": 0.3885478753373069, + "grad_norm": 0.1276565045118332, + "learning_rate": 0.000680658254209251, + "loss": 2.7009, + "step": 13103 + }, + { + "epoch": 0.38857752868961837, + "grad_norm": 0.10155022889375687, + "learning_rate": 0.0006806143810594692, + "loss": 2.7465, + "step": 13104 + }, + { + "epoch": 0.38860718204192984, + "grad_norm": 0.11166515201330185, + "learning_rate": 0.0006805705063103161, + "loss": 2.6711, + "step": 13105 + }, + { + "epoch": 0.3886368353942413, + "grad_norm": 0.1336405724287033, + "learning_rate": 0.00068052662996218, + "loss": 2.7208, + "step": 13106 + }, + { + "epoch": 0.3886664887465528, + "grad_norm": 0.13390135765075684, + "learning_rate": 0.0006804827520154496, + "loss": 2.6981, + "step": 13107 + }, + { + "epoch": 0.3886961420988643, + "grad_norm": 0.1256657987833023, + "learning_rate": 0.0006804388724705136, + "loss": 2.7256, + "step": 13108 + }, + { + "epoch": 0.38872579545117575, + "grad_norm": 0.13872836530208588, + "learning_rate": 0.0006803949913277603, + "loss": 2.6497, + "step": 13109 + }, + { + "epoch": 0.3887554488034872, + "grad_norm": 0.14185449481010437, + "learning_rate": 0.0006803511085875785, + "loss": 2.6829, + "step": 13110 + }, + { + "epoch": 0.3887851021557987, + "grad_norm": 0.1321304887533188, + "learning_rate": 0.0006803072242503567, + "loss": 2.688, + "step": 13111 + }, + { + "epoch": 0.3888147555081102, + "grad_norm": 0.12998540699481964, + "learning_rate": 0.0006802633383164833, + "loss": 2.6883, + "step": 13112 + }, + { + "epoch": 0.38884440886042165, + "grad_norm": 0.1143157109618187, + "learning_rate": 0.0006802194507863472, + "loss": 2.6703, + "step": 13113 + }, + { + "epoch": 0.38887406221273313, + "grad_norm": 0.12949703633785248, + "learning_rate": 0.0006801755616603369, + "loss": 2.691, + "step": 13114 + }, + { + "epoch": 0.3889037155650446, + "grad_norm": 0.14149150252342224, + "learning_rate": 0.0006801316709388412, + "loss": 2.719, + "step": 13115 + }, + { + "epoch": 0.3889333689173561, + "grad_norm": 0.12275724858045578, + "learning_rate": 0.0006800877786222486, + "loss": 2.6963, + "step": 13116 + }, + { + "epoch": 0.3889630222696676, + "grad_norm": 0.11025889962911606, + "learning_rate": 0.0006800438847109476, + "loss": 2.7246, + "step": 13117 + }, + { + "epoch": 0.3889926756219791, + "grad_norm": 0.10894546657800674, + "learning_rate": 0.0006799999892053273, + "loss": 2.6802, + "step": 13118 + }, + { + "epoch": 0.38902232897429057, + "grad_norm": 0.12086587399244308, + "learning_rate": 0.000679956092105776, + "loss": 2.6924, + "step": 13119 + }, + { + "epoch": 0.38905198232660204, + "grad_norm": 0.11617963761091232, + "learning_rate": 0.0006799121934126829, + "loss": 2.659, + "step": 13120 + }, + { + "epoch": 0.3890816356789135, + "grad_norm": 0.11862888187170029, + "learning_rate": 0.0006798682931264363, + "loss": 2.6763, + "step": 13121 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 0.11109455674886703, + "learning_rate": 0.0006798243912474251, + "loss": 2.6868, + "step": 13122 + }, + { + "epoch": 0.38914094238353647, + "grad_norm": 0.1270124912261963, + "learning_rate": 0.000679780487776038, + "loss": 2.6788, + "step": 13123 + }, + { + "epoch": 0.38917059573584795, + "grad_norm": 0.15211956202983856, + "learning_rate": 0.0006797365827126638, + "loss": 2.6947, + "step": 13124 + }, + { + "epoch": 0.3892002490881594, + "grad_norm": 0.152733713388443, + "learning_rate": 0.0006796926760576914, + "loss": 2.7275, + "step": 13125 + }, + { + "epoch": 0.3892299024404709, + "grad_norm": 0.14295914769172668, + "learning_rate": 0.0006796487678115095, + "loss": 2.6799, + "step": 13126 + }, + { + "epoch": 0.3892595557927824, + "grad_norm": 0.1511671394109726, + "learning_rate": 0.000679604857974507, + "loss": 2.6672, + "step": 13127 + }, + { + "epoch": 0.38928920914509385, + "grad_norm": 0.15714287757873535, + "learning_rate": 0.0006795609465470724, + "loss": 2.7084, + "step": 13128 + }, + { + "epoch": 0.3893188624974053, + "grad_norm": 0.14920087158679962, + "learning_rate": 0.000679517033529595, + "loss": 2.6968, + "step": 13129 + }, + { + "epoch": 0.3893485158497168, + "grad_norm": 0.14379069209098816, + "learning_rate": 0.0006794731189224634, + "loss": 2.661, + "step": 13130 + }, + { + "epoch": 0.3893781692020283, + "grad_norm": 0.12974980473518372, + "learning_rate": 0.0006794292027260667, + "loss": 2.705, + "step": 13131 + }, + { + "epoch": 0.38940782255433976, + "grad_norm": 0.13363873958587646, + "learning_rate": 0.0006793852849407933, + "loss": 2.7233, + "step": 13132 + }, + { + "epoch": 0.38943747590665123, + "grad_norm": 0.1371014416217804, + "learning_rate": 0.0006793413655670327, + "loss": 2.7015, + "step": 13133 + }, + { + "epoch": 0.3894671292589627, + "grad_norm": 0.11761022359132767, + "learning_rate": 0.0006792974446051732, + "loss": 2.6701, + "step": 13134 + }, + { + "epoch": 0.3894967826112742, + "grad_norm": 0.13393338024616241, + "learning_rate": 0.0006792535220556044, + "loss": 2.6643, + "step": 13135 + }, + { + "epoch": 0.38952643596358566, + "grad_norm": 0.13312460482120514, + "learning_rate": 0.0006792095979187147, + "loss": 2.6926, + "step": 13136 + }, + { + "epoch": 0.38955608931589714, + "grad_norm": 0.13289465010166168, + "learning_rate": 0.0006791656721948932, + "loss": 2.6701, + "step": 13137 + }, + { + "epoch": 0.38958574266820867, + "grad_norm": 0.13359935581684113, + "learning_rate": 0.000679121744884529, + "loss": 2.7044, + "step": 13138 + }, + { + "epoch": 0.38961539602052014, + "grad_norm": 0.13656283915042877, + "learning_rate": 0.000679077815988011, + "loss": 2.704, + "step": 13139 + }, + { + "epoch": 0.3896450493728316, + "grad_norm": 0.13725793361663818, + "learning_rate": 0.0006790338855057282, + "loss": 2.6803, + "step": 13140 + }, + { + "epoch": 0.3896747027251431, + "grad_norm": 0.14465470612049103, + "learning_rate": 0.0006789899534380697, + "loss": 2.6907, + "step": 13141 + }, + { + "epoch": 0.3897043560774546, + "grad_norm": 0.1688813716173172, + "learning_rate": 0.0006789460197854242, + "loss": 2.7161, + "step": 13142 + }, + { + "epoch": 0.38973400942976605, + "grad_norm": 0.15400271117687225, + "learning_rate": 0.0006789020845481813, + "loss": 2.6798, + "step": 13143 + }, + { + "epoch": 0.3897636627820775, + "grad_norm": 0.11261529475450516, + "learning_rate": 0.0006788581477267295, + "loss": 2.6934, + "step": 13144 + }, + { + "epoch": 0.389793316134389, + "grad_norm": 0.1310034841299057, + "learning_rate": 0.0006788142093214582, + "loss": 2.6904, + "step": 13145 + }, + { + "epoch": 0.3898229694867005, + "grad_norm": 0.14795148372650146, + "learning_rate": 0.0006787702693327563, + "loss": 2.684, + "step": 13146 + }, + { + "epoch": 0.38985262283901195, + "grad_norm": 0.15855754911899567, + "learning_rate": 0.000678726327761013, + "loss": 2.7079, + "step": 13147 + }, + { + "epoch": 0.38988227619132343, + "grad_norm": 0.15115095674991608, + "learning_rate": 0.0006786823846066176, + "loss": 2.7023, + "step": 13148 + }, + { + "epoch": 0.3899119295436349, + "grad_norm": 0.13691362738609314, + "learning_rate": 0.0006786384398699588, + "loss": 2.6757, + "step": 13149 + }, + { + "epoch": 0.3899415828959464, + "grad_norm": 0.13555195927619934, + "learning_rate": 0.0006785944935514259, + "loss": 2.7277, + "step": 13150 + }, + { + "epoch": 0.38997123624825786, + "grad_norm": 0.1417165994644165, + "learning_rate": 0.0006785505456514082, + "loss": 2.694, + "step": 13151 + }, + { + "epoch": 0.39000088960056933, + "grad_norm": 0.12297403067350388, + "learning_rate": 0.0006785065961702947, + "loss": 2.6803, + "step": 13152 + }, + { + "epoch": 0.3900305429528808, + "grad_norm": 0.11390387266874313, + "learning_rate": 0.0006784626451084748, + "loss": 2.6604, + "step": 13153 + }, + { + "epoch": 0.3900601963051923, + "grad_norm": 0.11886770278215408, + "learning_rate": 0.0006784186924663375, + "loss": 2.723, + "step": 13154 + }, + { + "epoch": 0.39008984965750376, + "grad_norm": 0.12029707431793213, + "learning_rate": 0.000678374738244272, + "loss": 2.6777, + "step": 13155 + }, + { + "epoch": 0.39011950300981524, + "grad_norm": 0.12026051431894302, + "learning_rate": 0.0006783307824426674, + "loss": 2.6727, + "step": 13156 + }, + { + "epoch": 0.3901491563621267, + "grad_norm": 0.11865660548210144, + "learning_rate": 0.0006782868250619134, + "loss": 2.6448, + "step": 13157 + }, + { + "epoch": 0.3901788097144382, + "grad_norm": 0.11317840963602066, + "learning_rate": 0.0006782428661023988, + "loss": 2.7021, + "step": 13158 + }, + { + "epoch": 0.3902084630667497, + "grad_norm": 0.13458259403705597, + "learning_rate": 0.0006781989055645132, + "loss": 2.7207, + "step": 13159 + }, + { + "epoch": 0.3902381164190612, + "grad_norm": 0.13011449575424194, + "learning_rate": 0.0006781549434486456, + "loss": 2.7445, + "step": 13160 + }, + { + "epoch": 0.3902677697713727, + "grad_norm": 0.13394305109977722, + "learning_rate": 0.0006781109797551854, + "loss": 2.6821, + "step": 13161 + }, + { + "epoch": 0.39029742312368415, + "grad_norm": 0.10490253567695618, + "learning_rate": 0.0006780670144845218, + "loss": 2.6818, + "step": 13162 + }, + { + "epoch": 0.3903270764759956, + "grad_norm": 0.11306396871805191, + "learning_rate": 0.0006780230476370443, + "loss": 2.7271, + "step": 13163 + }, + { + "epoch": 0.3903567298283071, + "grad_norm": 0.12632957100868225, + "learning_rate": 0.0006779790792131421, + "loss": 2.6797, + "step": 13164 + }, + { + "epoch": 0.3903863831806186, + "grad_norm": 0.12062744051218033, + "learning_rate": 0.0006779351092132047, + "loss": 2.6682, + "step": 13165 + }, + { + "epoch": 0.39041603653293006, + "grad_norm": 0.12255249172449112, + "learning_rate": 0.0006778911376376215, + "loss": 2.6878, + "step": 13166 + }, + { + "epoch": 0.39044568988524153, + "grad_norm": 0.1358552873134613, + "learning_rate": 0.0006778471644867815, + "loss": 2.6531, + "step": 13167 + }, + { + "epoch": 0.390475343237553, + "grad_norm": 0.1448705643415451, + "learning_rate": 0.0006778031897610744, + "loss": 2.7053, + "step": 13168 + }, + { + "epoch": 0.3905049965898645, + "grad_norm": 0.13573220372200012, + "learning_rate": 0.0006777592134608895, + "loss": 2.6902, + "step": 13169 + }, + { + "epoch": 0.39053464994217596, + "grad_norm": 0.11601053178310394, + "learning_rate": 0.0006777152355866163, + "loss": 2.6963, + "step": 13170 + }, + { + "epoch": 0.39056430329448744, + "grad_norm": 0.11824368685483932, + "learning_rate": 0.0006776712561386442, + "loss": 2.698, + "step": 13171 + }, + { + "epoch": 0.3905939566467989, + "grad_norm": 0.14818242192268372, + "learning_rate": 0.0006776272751173627, + "loss": 2.7039, + "step": 13172 + }, + { + "epoch": 0.3906236099991104, + "grad_norm": 0.16149784624576569, + "learning_rate": 0.000677583292523161, + "loss": 2.6877, + "step": 13173 + }, + { + "epoch": 0.39065326335142186, + "grad_norm": 0.1273520290851593, + "learning_rate": 0.0006775393083564288, + "loss": 2.6869, + "step": 13174 + }, + { + "epoch": 0.39068291670373334, + "grad_norm": 0.15158702433109283, + "learning_rate": 0.0006774953226175557, + "loss": 2.6886, + "step": 13175 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 0.15303358435630798, + "learning_rate": 0.0006774513353069309, + "loss": 2.7086, + "step": 13176 + }, + { + "epoch": 0.3907422234083563, + "grad_norm": 0.11673186719417572, + "learning_rate": 0.0006774073464249442, + "loss": 2.7011, + "step": 13177 + }, + { + "epoch": 0.39077187676066777, + "grad_norm": 0.1313604861497879, + "learning_rate": 0.0006773633559719849, + "loss": 2.6879, + "step": 13178 + }, + { + "epoch": 0.39080153011297925, + "grad_norm": 0.16616831719875336, + "learning_rate": 0.0006773193639484426, + "loss": 2.7053, + "step": 13179 + }, + { + "epoch": 0.3908311834652908, + "grad_norm": 0.1850876659154892, + "learning_rate": 0.0006772753703547069, + "loss": 2.7107, + "step": 13180 + }, + { + "epoch": 0.39086083681760225, + "grad_norm": 0.15959711372852325, + "learning_rate": 0.0006772313751911677, + "loss": 2.6979, + "step": 13181 + }, + { + "epoch": 0.39089049016991373, + "grad_norm": 0.14699435234069824, + "learning_rate": 0.0006771873784582138, + "loss": 2.6381, + "step": 13182 + }, + { + "epoch": 0.3909201435222252, + "grad_norm": 0.11763191223144531, + "learning_rate": 0.0006771433801562354, + "loss": 2.6844, + "step": 13183 + }, + { + "epoch": 0.3909497968745367, + "grad_norm": 0.11821941286325455, + "learning_rate": 0.000677099380285622, + "loss": 2.6999, + "step": 13184 + }, + { + "epoch": 0.39097945022684816, + "grad_norm": 0.12377136945724487, + "learning_rate": 0.0006770553788467632, + "loss": 2.7242, + "step": 13185 + }, + { + "epoch": 0.39100910357915963, + "grad_norm": 0.12244571000337601, + "learning_rate": 0.0006770113758400487, + "loss": 2.6644, + "step": 13186 + }, + { + "epoch": 0.3910387569314711, + "grad_norm": 0.13155105710029602, + "learning_rate": 0.000676967371265868, + "loss": 2.6576, + "step": 13187 + }, + { + "epoch": 0.3910684102837826, + "grad_norm": 0.13058115541934967, + "learning_rate": 0.0006769233651246108, + "loss": 2.6906, + "step": 13188 + }, + { + "epoch": 0.39109806363609406, + "grad_norm": 0.1490708887577057, + "learning_rate": 0.0006768793574166668, + "loss": 2.6709, + "step": 13189 + }, + { + "epoch": 0.39112771698840554, + "grad_norm": 0.14034847915172577, + "learning_rate": 0.0006768353481424259, + "loss": 2.6567, + "step": 13190 + }, + { + "epoch": 0.391157370340717, + "grad_norm": 0.1120927631855011, + "learning_rate": 0.0006767913373022776, + "loss": 2.6706, + "step": 13191 + }, + { + "epoch": 0.3911870236930285, + "grad_norm": 0.12587490677833557, + "learning_rate": 0.0006767473248966116, + "loss": 2.6872, + "step": 13192 + }, + { + "epoch": 0.39121667704533997, + "grad_norm": 0.13441021740436554, + "learning_rate": 0.0006767033109258176, + "loss": 2.7321, + "step": 13193 + }, + { + "epoch": 0.39124633039765144, + "grad_norm": 0.11943621188402176, + "learning_rate": 0.0006766592953902856, + "loss": 2.6556, + "step": 13194 + }, + { + "epoch": 0.3912759837499629, + "grad_norm": 0.12812921404838562, + "learning_rate": 0.0006766152782904051, + "loss": 2.6832, + "step": 13195 + }, + { + "epoch": 0.3913056371022744, + "grad_norm": 0.12236762046813965, + "learning_rate": 0.0006765712596265661, + "loss": 2.6821, + "step": 13196 + }, + { + "epoch": 0.39133529045458587, + "grad_norm": 0.12683293223381042, + "learning_rate": 0.0006765272393991583, + "loss": 2.6887, + "step": 13197 + }, + { + "epoch": 0.39136494380689735, + "grad_norm": 0.1273951232433319, + "learning_rate": 0.0006764832176085714, + "loss": 2.657, + "step": 13198 + }, + { + "epoch": 0.3913945971592088, + "grad_norm": 0.10211847722530365, + "learning_rate": 0.0006764391942551954, + "loss": 2.7017, + "step": 13199 + }, + { + "epoch": 0.3914242505115203, + "grad_norm": 0.13282884657382965, + "learning_rate": 0.00067639516933942, + "loss": 2.6553, + "step": 13200 + }, + { + "epoch": 0.39145390386383183, + "grad_norm": 0.12890955805778503, + "learning_rate": 0.0006763511428616351, + "loss": 2.7164, + "step": 13201 + }, + { + "epoch": 0.3914835572161433, + "grad_norm": 0.11814027279615402, + "learning_rate": 0.0006763071148222306, + "loss": 2.6955, + "step": 13202 + }, + { + "epoch": 0.3915132105684548, + "grad_norm": 0.11004193872213364, + "learning_rate": 0.0006762630852215962, + "loss": 2.6761, + "step": 13203 + }, + { + "epoch": 0.39154286392076626, + "grad_norm": 0.11072026938199997, + "learning_rate": 0.0006762190540601222, + "loss": 2.7067, + "step": 13204 + }, + { + "epoch": 0.39157251727307774, + "grad_norm": 0.12103566527366638, + "learning_rate": 0.000676175021338198, + "loss": 2.6959, + "step": 13205 + }, + { + "epoch": 0.3916021706253892, + "grad_norm": 0.11735857278108597, + "learning_rate": 0.0006761309870562138, + "loss": 2.7059, + "step": 13206 + }, + { + "epoch": 0.3916318239777007, + "grad_norm": 0.13262468576431274, + "learning_rate": 0.0006760869512145595, + "loss": 2.7023, + "step": 13207 + }, + { + "epoch": 0.39166147733001216, + "grad_norm": 0.15935052931308746, + "learning_rate": 0.000676042913813625, + "loss": 2.6902, + "step": 13208 + }, + { + "epoch": 0.39169113068232364, + "grad_norm": 0.19166986644268036, + "learning_rate": 0.0006759988748538003, + "loss": 2.6736, + "step": 13209 + }, + { + "epoch": 0.3917207840346351, + "grad_norm": 0.17144201695919037, + "learning_rate": 0.0006759548343354754, + "loss": 2.7019, + "step": 13210 + }, + { + "epoch": 0.3917504373869466, + "grad_norm": 0.1477343738079071, + "learning_rate": 0.0006759107922590402, + "loss": 2.7213, + "step": 13211 + }, + { + "epoch": 0.39178009073925807, + "grad_norm": 0.1851940155029297, + "learning_rate": 0.0006758667486248846, + "loss": 2.6867, + "step": 13212 + }, + { + "epoch": 0.39180974409156955, + "grad_norm": 0.1806229203939438, + "learning_rate": 0.000675822703433399, + "loss": 2.6644, + "step": 13213 + }, + { + "epoch": 0.391839397443881, + "grad_norm": 0.16275519132614136, + "learning_rate": 0.0006757786566849729, + "loss": 2.725, + "step": 13214 + }, + { + "epoch": 0.3918690507961925, + "grad_norm": 0.15952761471271515, + "learning_rate": 0.0006757346083799969, + "loss": 2.6959, + "step": 13215 + }, + { + "epoch": 0.391898704148504, + "grad_norm": 0.14648708701133728, + "learning_rate": 0.0006756905585188607, + "loss": 2.6891, + "step": 13216 + }, + { + "epoch": 0.39192835750081545, + "grad_norm": 0.13712918758392334, + "learning_rate": 0.0006756465071019543, + "loss": 2.6886, + "step": 13217 + }, + { + "epoch": 0.3919580108531269, + "grad_norm": 0.1446031779050827, + "learning_rate": 0.000675602454129668, + "loss": 2.656, + "step": 13218 + }, + { + "epoch": 0.3919876642054384, + "grad_norm": 0.11317650228738785, + "learning_rate": 0.0006755583996023919, + "loss": 2.706, + "step": 13219 + }, + { + "epoch": 0.3920173175577499, + "grad_norm": 0.13052335381507874, + "learning_rate": 0.0006755143435205161, + "loss": 2.7156, + "step": 13220 + }, + { + "epoch": 0.3920469709100614, + "grad_norm": 0.14229604601860046, + "learning_rate": 0.0006754702858844303, + "loss": 2.675, + "step": 13221 + }, + { + "epoch": 0.3920766242623729, + "grad_norm": 0.1213744580745697, + "learning_rate": 0.0006754262266945254, + "loss": 2.6759, + "step": 13222 + }, + { + "epoch": 0.39210627761468436, + "grad_norm": 0.1279689371585846, + "learning_rate": 0.0006753821659511909, + "loss": 2.717, + "step": 13223 + }, + { + "epoch": 0.39213593096699584, + "grad_norm": 0.12350346893072128, + "learning_rate": 0.0006753381036548175, + "loss": 2.6833, + "step": 13224 + }, + { + "epoch": 0.3921655843193073, + "grad_norm": 0.12485314160585403, + "learning_rate": 0.000675294039805795, + "loss": 2.6766, + "step": 13225 + }, + { + "epoch": 0.3921952376716188, + "grad_norm": 0.11843733489513397, + "learning_rate": 0.0006752499744045135, + "loss": 2.7004, + "step": 13226 + }, + { + "epoch": 0.39222489102393027, + "grad_norm": 0.10580603033304214, + "learning_rate": 0.0006752059074513634, + "loss": 2.6871, + "step": 13227 + }, + { + "epoch": 0.39225454437624174, + "grad_norm": 0.11739449203014374, + "learning_rate": 0.0006751618389467351, + "loss": 2.6996, + "step": 13228 + }, + { + "epoch": 0.3922841977285532, + "grad_norm": 0.1147506833076477, + "learning_rate": 0.0006751177688910186, + "loss": 2.6774, + "step": 13229 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 0.11330052465200424, + "learning_rate": 0.0006750736972846042, + "loss": 2.7073, + "step": 13230 + }, + { + "epoch": 0.39234350443317617, + "grad_norm": 0.13596072793006897, + "learning_rate": 0.0006750296241278821, + "loss": 2.6904, + "step": 13231 + }, + { + "epoch": 0.39237315778548765, + "grad_norm": 0.1379956156015396, + "learning_rate": 0.0006749855494212427, + "loss": 2.6854, + "step": 13232 + }, + { + "epoch": 0.3924028111377991, + "grad_norm": 0.1448609083890915, + "learning_rate": 0.0006749414731650762, + "loss": 2.7216, + "step": 13233 + }, + { + "epoch": 0.3924324644901106, + "grad_norm": 0.1302390843629837, + "learning_rate": 0.0006748973953597727, + "loss": 2.6909, + "step": 13234 + }, + { + "epoch": 0.3924621178424221, + "grad_norm": 0.1288163661956787, + "learning_rate": 0.0006748533160057232, + "loss": 2.6865, + "step": 13235 + }, + { + "epoch": 0.39249177119473355, + "grad_norm": 0.14010198414325714, + "learning_rate": 0.0006748092351033173, + "loss": 2.7126, + "step": 13236 + }, + { + "epoch": 0.39252142454704503, + "grad_norm": 0.14892363548278809, + "learning_rate": 0.0006747651526529456, + "loss": 2.6737, + "step": 13237 + }, + { + "epoch": 0.3925510778993565, + "grad_norm": 0.14099565148353577, + "learning_rate": 0.0006747210686549987, + "loss": 2.6576, + "step": 13238 + }, + { + "epoch": 0.392580731251668, + "grad_norm": 0.12017449736595154, + "learning_rate": 0.0006746769831098664, + "loss": 2.6953, + "step": 13239 + }, + { + "epoch": 0.39261038460397946, + "grad_norm": 0.12940965592861176, + "learning_rate": 0.0006746328960179396, + "loss": 2.6781, + "step": 13240 + }, + { + "epoch": 0.39264003795629093, + "grad_norm": 0.1414375603199005, + "learning_rate": 0.0006745888073796086, + "loss": 2.6833, + "step": 13241 + }, + { + "epoch": 0.39266969130860246, + "grad_norm": 0.12418517470359802, + "learning_rate": 0.0006745447171952637, + "loss": 2.7227, + "step": 13242 + }, + { + "epoch": 0.39269934466091394, + "grad_norm": 0.11626549065113068, + "learning_rate": 0.0006745006254652953, + "loss": 2.6654, + "step": 13243 + }, + { + "epoch": 0.3927289980132254, + "grad_norm": 0.16364869475364685, + "learning_rate": 0.000674456532190094, + "loss": 2.6894, + "step": 13244 + }, + { + "epoch": 0.3927586513655369, + "grad_norm": 0.1603481024503708, + "learning_rate": 0.0006744124373700501, + "loss": 2.6869, + "step": 13245 + }, + { + "epoch": 0.39278830471784837, + "grad_norm": 0.12804675102233887, + "learning_rate": 0.0006743683410055543, + "loss": 2.6998, + "step": 13246 + }, + { + "epoch": 0.39281795807015985, + "grad_norm": 0.1473168283700943, + "learning_rate": 0.0006743242430969965, + "loss": 2.7115, + "step": 13247 + }, + { + "epoch": 0.3928476114224713, + "grad_norm": 0.13641056418418884, + "learning_rate": 0.0006742801436447679, + "loss": 2.7277, + "step": 13248 + }, + { + "epoch": 0.3928772647747828, + "grad_norm": 0.12867407500743866, + "learning_rate": 0.0006742360426492587, + "loss": 2.7097, + "step": 13249 + }, + { + "epoch": 0.3929069181270943, + "grad_norm": 0.1286880224943161, + "learning_rate": 0.0006741919401108594, + "loss": 2.6771, + "step": 13250 + }, + { + "epoch": 0.39293657147940575, + "grad_norm": 0.12142255157232285, + "learning_rate": 0.0006741478360299607, + "loss": 2.6684, + "step": 13251 + }, + { + "epoch": 0.3929662248317172, + "grad_norm": 0.12844176590442657, + "learning_rate": 0.0006741037304069529, + "loss": 2.714, + "step": 13252 + }, + { + "epoch": 0.3929958781840287, + "grad_norm": 0.12683314085006714, + "learning_rate": 0.0006740596232422266, + "loss": 2.6589, + "step": 13253 + }, + { + "epoch": 0.3930255315363402, + "grad_norm": 0.11642986536026001, + "learning_rate": 0.0006740155145361726, + "loss": 2.7116, + "step": 13254 + }, + { + "epoch": 0.39305518488865165, + "grad_norm": 0.12633396685123444, + "learning_rate": 0.0006739714042891812, + "loss": 2.6676, + "step": 13255 + }, + { + "epoch": 0.39308483824096313, + "grad_norm": 0.1526196449995041, + "learning_rate": 0.0006739272925016433, + "loss": 2.671, + "step": 13256 + }, + { + "epoch": 0.3931144915932746, + "grad_norm": 0.15736359357833862, + "learning_rate": 0.0006738831791739493, + "loss": 2.7004, + "step": 13257 + }, + { + "epoch": 0.3931441449455861, + "grad_norm": 0.1503107249736786, + "learning_rate": 0.00067383906430649, + "loss": 2.6976, + "step": 13258 + }, + { + "epoch": 0.39317379829789756, + "grad_norm": 0.16286374628543854, + "learning_rate": 0.0006737949478996559, + "loss": 2.6879, + "step": 13259 + }, + { + "epoch": 0.39320345165020903, + "grad_norm": 0.16259269416332245, + "learning_rate": 0.0006737508299538375, + "loss": 2.6846, + "step": 13260 + }, + { + "epoch": 0.3932331050025205, + "grad_norm": 0.14971189200878143, + "learning_rate": 0.0006737067104694258, + "loss": 2.6912, + "step": 13261 + }, + { + "epoch": 0.393262758354832, + "grad_norm": 0.12501263618469238, + "learning_rate": 0.0006736625894468116, + "loss": 2.679, + "step": 13262 + }, + { + "epoch": 0.3932924117071435, + "grad_norm": 0.12529125809669495, + "learning_rate": 0.0006736184668863852, + "loss": 2.6811, + "step": 13263 + }, + { + "epoch": 0.393322065059455, + "grad_norm": 0.12117592245340347, + "learning_rate": 0.0006735743427885375, + "loss": 2.695, + "step": 13264 + }, + { + "epoch": 0.39335171841176647, + "grad_norm": 0.12834584712982178, + "learning_rate": 0.0006735302171536591, + "loss": 2.6648, + "step": 13265 + }, + { + "epoch": 0.39338137176407795, + "grad_norm": 0.11455351114273071, + "learning_rate": 0.0006734860899821408, + "loss": 2.6739, + "step": 13266 + }, + { + "epoch": 0.3934110251163894, + "grad_norm": 0.11512768268585205, + "learning_rate": 0.0006734419612743736, + "loss": 2.6928, + "step": 13267 + }, + { + "epoch": 0.3934406784687009, + "grad_norm": 0.12481331080198288, + "learning_rate": 0.0006733978310307479, + "loss": 2.6846, + "step": 13268 + }, + { + "epoch": 0.3934703318210124, + "grad_norm": 0.11197403818368912, + "learning_rate": 0.0006733536992516546, + "loss": 2.698, + "step": 13269 + }, + { + "epoch": 0.39349998517332385, + "grad_norm": 0.11313886940479279, + "learning_rate": 0.0006733095659374847, + "loss": 2.6677, + "step": 13270 + }, + { + "epoch": 0.39352963852563533, + "grad_norm": 0.1100691482424736, + "learning_rate": 0.0006732654310886288, + "loss": 2.7079, + "step": 13271 + }, + { + "epoch": 0.3935592918779468, + "grad_norm": 0.11447305977344513, + "learning_rate": 0.0006732212947054777, + "loss": 2.7036, + "step": 13272 + }, + { + "epoch": 0.3935889452302583, + "grad_norm": 0.13590772449970245, + "learning_rate": 0.0006731771567884223, + "loss": 2.6844, + "step": 13273 + }, + { + "epoch": 0.39361859858256976, + "grad_norm": 0.14371289312839508, + "learning_rate": 0.0006731330173378535, + "loss": 2.713, + "step": 13274 + }, + { + "epoch": 0.39364825193488123, + "grad_norm": 0.1751418560743332, + "learning_rate": 0.000673088876354162, + "loss": 2.6608, + "step": 13275 + }, + { + "epoch": 0.3936779052871927, + "grad_norm": 0.1549123376607895, + "learning_rate": 0.000673044733837739, + "loss": 2.6916, + "step": 13276 + }, + { + "epoch": 0.3937075586395042, + "grad_norm": 0.12397680431604385, + "learning_rate": 0.000673000589788975, + "loss": 2.6893, + "step": 13277 + }, + { + "epoch": 0.39373721199181566, + "grad_norm": 0.12943431735038757, + "learning_rate": 0.0006729564442082612, + "loss": 2.6903, + "step": 13278 + }, + { + "epoch": 0.39376686534412714, + "grad_norm": 0.11256329715251923, + "learning_rate": 0.0006729122970959884, + "loss": 2.6671, + "step": 13279 + }, + { + "epoch": 0.3937965186964386, + "grad_norm": 0.11770607531070709, + "learning_rate": 0.0006728681484525474, + "loss": 2.6624, + "step": 13280 + }, + { + "epoch": 0.3938261720487501, + "grad_norm": 0.12141755223274231, + "learning_rate": 0.0006728239982783294, + "loss": 2.6853, + "step": 13281 + }, + { + "epoch": 0.39385582540106157, + "grad_norm": 0.12079830467700958, + "learning_rate": 0.0006727798465737252, + "loss": 2.7305, + "step": 13282 + }, + { + "epoch": 0.39388547875337304, + "grad_norm": 0.1398785412311554, + "learning_rate": 0.0006727356933391257, + "loss": 2.708, + "step": 13283 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 0.17065533995628357, + "learning_rate": 0.0006726915385749223, + "loss": 2.7287, + "step": 13284 + }, + { + "epoch": 0.39394478545799605, + "grad_norm": 0.1970251202583313, + "learning_rate": 0.0006726473822815055, + "loss": 2.678, + "step": 13285 + }, + { + "epoch": 0.3939744388103075, + "grad_norm": 0.18951645493507385, + "learning_rate": 0.0006726032244592663, + "loss": 2.7018, + "step": 13286 + }, + { + "epoch": 0.394004092162619, + "grad_norm": 0.17670688033103943, + "learning_rate": 0.000672559065108596, + "loss": 2.6833, + "step": 13287 + }, + { + "epoch": 0.3940337455149305, + "grad_norm": 0.16862380504608154, + "learning_rate": 0.0006725149042298857, + "loss": 2.6743, + "step": 13288 + }, + { + "epoch": 0.39406339886724195, + "grad_norm": 0.16504420340061188, + "learning_rate": 0.0006724707418235262, + "loss": 2.6752, + "step": 13289 + }, + { + "epoch": 0.39409305221955343, + "grad_norm": 0.14826661348342896, + "learning_rate": 0.0006724265778899088, + "loss": 2.6515, + "step": 13290 + }, + { + "epoch": 0.3941227055718649, + "grad_norm": 0.1466597616672516, + "learning_rate": 0.0006723824124294244, + "loss": 2.6932, + "step": 13291 + }, + { + "epoch": 0.3941523589241764, + "grad_norm": 0.12596529722213745, + "learning_rate": 0.0006723382454424641, + "loss": 2.6685, + "step": 13292 + }, + { + "epoch": 0.39418201227648786, + "grad_norm": 0.1311635971069336, + "learning_rate": 0.000672294076929419, + "loss": 2.7013, + "step": 13293 + }, + { + "epoch": 0.39421166562879933, + "grad_norm": 0.14344483613967896, + "learning_rate": 0.0006722499068906804, + "loss": 2.6706, + "step": 13294 + }, + { + "epoch": 0.3942413189811108, + "grad_norm": 0.12312573194503784, + "learning_rate": 0.0006722057353266394, + "loss": 2.7097, + "step": 13295 + }, + { + "epoch": 0.3942709723334223, + "grad_norm": 0.12869775295257568, + "learning_rate": 0.0006721615622376869, + "loss": 2.6843, + "step": 13296 + }, + { + "epoch": 0.39430062568573376, + "grad_norm": 0.12067270278930664, + "learning_rate": 0.0006721173876242142, + "loss": 2.6611, + "step": 13297 + }, + { + "epoch": 0.39433027903804524, + "grad_norm": 0.14324699342250824, + "learning_rate": 0.0006720732114866124, + "loss": 2.6669, + "step": 13298 + }, + { + "epoch": 0.3943599323903567, + "grad_norm": 0.14259670674800873, + "learning_rate": 0.0006720290338252729, + "loss": 2.6624, + "step": 13299 + }, + { + "epoch": 0.3943895857426682, + "grad_norm": 0.15179121494293213, + "learning_rate": 0.0006719848546405869, + "loss": 2.7278, + "step": 13300 + }, + { + "epoch": 0.39441923909497967, + "grad_norm": 0.12216110527515411, + "learning_rate": 0.0006719406739329454, + "loss": 2.6906, + "step": 13301 + }, + { + "epoch": 0.39444889244729114, + "grad_norm": 0.12770552933216095, + "learning_rate": 0.0006718964917027396, + "loss": 2.689, + "step": 13302 + }, + { + "epoch": 0.3944785457996026, + "grad_norm": 0.11384662240743637, + "learning_rate": 0.000671852307950361, + "loss": 2.6805, + "step": 13303 + }, + { + "epoch": 0.3945081991519141, + "grad_norm": 0.12030345946550369, + "learning_rate": 0.0006718081226762007, + "loss": 2.7224, + "step": 13304 + }, + { + "epoch": 0.39453785250422563, + "grad_norm": 0.12221145629882812, + "learning_rate": 0.0006717639358806499, + "loss": 2.667, + "step": 13305 + }, + { + "epoch": 0.3945675058565371, + "grad_norm": 0.10272995382547379, + "learning_rate": 0.0006717197475640999, + "loss": 2.6899, + "step": 13306 + }, + { + "epoch": 0.3945971592088486, + "grad_norm": 0.13363924622535706, + "learning_rate": 0.0006716755577269423, + "loss": 2.7081, + "step": 13307 + }, + { + "epoch": 0.39462681256116006, + "grad_norm": 0.11459245532751083, + "learning_rate": 0.000671631366369568, + "loss": 2.6831, + "step": 13308 + }, + { + "epoch": 0.39465646591347153, + "grad_norm": 0.10682201385498047, + "learning_rate": 0.0006715871734923685, + "loss": 2.6893, + "step": 13309 + }, + { + "epoch": 0.394686119265783, + "grad_norm": 0.11930854618549347, + "learning_rate": 0.0006715429790957352, + "loss": 2.7072, + "step": 13310 + }, + { + "epoch": 0.3947157726180945, + "grad_norm": 0.11996374279260635, + "learning_rate": 0.0006714987831800593, + "loss": 2.6932, + "step": 13311 + }, + { + "epoch": 0.39474542597040596, + "grad_norm": 0.1239248514175415, + "learning_rate": 0.0006714545857457322, + "loss": 2.6511, + "step": 13312 + }, + { + "epoch": 0.39477507932271744, + "grad_norm": 0.11810582131147385, + "learning_rate": 0.0006714103867931455, + "loss": 2.6708, + "step": 13313 + }, + { + "epoch": 0.3948047326750289, + "grad_norm": 0.1388993263244629, + "learning_rate": 0.0006713661863226902, + "loss": 2.6973, + "step": 13314 + }, + { + "epoch": 0.3948343860273404, + "grad_norm": 0.14282813668251038, + "learning_rate": 0.000671321984334758, + "loss": 2.6787, + "step": 13315 + }, + { + "epoch": 0.39486403937965187, + "grad_norm": 0.1359241008758545, + "learning_rate": 0.0006712777808297402, + "loss": 2.6906, + "step": 13316 + }, + { + "epoch": 0.39489369273196334, + "grad_norm": 0.12459985166788101, + "learning_rate": 0.0006712335758080283, + "loss": 2.6953, + "step": 13317 + }, + { + "epoch": 0.3949233460842748, + "grad_norm": 0.11437604576349258, + "learning_rate": 0.0006711893692700136, + "loss": 2.7048, + "step": 13318 + }, + { + "epoch": 0.3949529994365863, + "grad_norm": 0.12882235646247864, + "learning_rate": 0.0006711451612160877, + "loss": 2.6457, + "step": 13319 + }, + { + "epoch": 0.39498265278889777, + "grad_norm": 0.16774293780326843, + "learning_rate": 0.0006711009516466421, + "loss": 2.6605, + "step": 13320 + }, + { + "epoch": 0.39501230614120925, + "grad_norm": 0.17039421200752258, + "learning_rate": 0.0006710567405620681, + "loss": 2.6749, + "step": 13321 + }, + { + "epoch": 0.3950419594935207, + "grad_norm": 0.15788348019123077, + "learning_rate": 0.0006710125279627574, + "loss": 2.6856, + "step": 13322 + }, + { + "epoch": 0.3950716128458322, + "grad_norm": 0.14791472256183624, + "learning_rate": 0.0006709683138491014, + "loss": 2.6806, + "step": 13323 + }, + { + "epoch": 0.3951012661981437, + "grad_norm": 0.123735710978508, + "learning_rate": 0.0006709240982214914, + "loss": 2.6911, + "step": 13324 + }, + { + "epoch": 0.3951309195504552, + "grad_norm": 0.138624370098114, + "learning_rate": 0.0006708798810803194, + "loss": 2.6327, + "step": 13325 + }, + { + "epoch": 0.3951605729027667, + "grad_norm": 0.1346742957830429, + "learning_rate": 0.0006708356624259768, + "loss": 2.6625, + "step": 13326 + }, + { + "epoch": 0.39519022625507816, + "grad_norm": 0.13928723335266113, + "learning_rate": 0.0006707914422588548, + "loss": 2.6468, + "step": 13327 + }, + { + "epoch": 0.39521987960738963, + "grad_norm": 0.12808160483837128, + "learning_rate": 0.0006707472205793456, + "loss": 2.7014, + "step": 13328 + }, + { + "epoch": 0.3952495329597011, + "grad_norm": 0.11377613991498947, + "learning_rate": 0.0006707029973878402, + "loss": 2.6635, + "step": 13329 + }, + { + "epoch": 0.3952791863120126, + "grad_norm": 0.13294823467731476, + "learning_rate": 0.0006706587726847306, + "loss": 2.6751, + "step": 13330 + }, + { + "epoch": 0.39530883966432406, + "grad_norm": 0.13707688450813293, + "learning_rate": 0.0006706145464704081, + "loss": 2.682, + "step": 13331 + }, + { + "epoch": 0.39533849301663554, + "grad_norm": 0.11699295043945312, + "learning_rate": 0.0006705703187452646, + "loss": 2.7186, + "step": 13332 + }, + { + "epoch": 0.395368146368947, + "grad_norm": 0.13340407609939575, + "learning_rate": 0.0006705260895096917, + "loss": 2.668, + "step": 13333 + }, + { + "epoch": 0.3953977997212585, + "grad_norm": 0.11881014704704285, + "learning_rate": 0.0006704818587640811, + "loss": 2.7087, + "step": 13334 + }, + { + "epoch": 0.39542745307356997, + "grad_norm": 0.12416695058345795, + "learning_rate": 0.0006704376265088242, + "loss": 2.6797, + "step": 13335 + }, + { + "epoch": 0.39545710642588144, + "grad_norm": 0.13029509782791138, + "learning_rate": 0.0006703933927443129, + "loss": 2.6896, + "step": 13336 + }, + { + "epoch": 0.3954867597781929, + "grad_norm": 0.10955746471881866, + "learning_rate": 0.0006703491574709387, + "loss": 2.6812, + "step": 13337 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 0.11309413611888885, + "learning_rate": 0.0006703049206890938, + "loss": 2.6498, + "step": 13338 + }, + { + "epoch": 0.3955460664828159, + "grad_norm": 0.09875118732452393, + "learning_rate": 0.0006702606823991694, + "loss": 2.6586, + "step": 13339 + }, + { + "epoch": 0.39557571983512735, + "grad_norm": 0.1122947409749031, + "learning_rate": 0.0006702164426015575, + "loss": 2.7168, + "step": 13340 + }, + { + "epoch": 0.3956053731874388, + "grad_norm": 0.12544013559818268, + "learning_rate": 0.0006701722012966497, + "loss": 2.6576, + "step": 13341 + }, + { + "epoch": 0.3956350265397503, + "grad_norm": 0.13658669590950012, + "learning_rate": 0.0006701279584848379, + "loss": 2.682, + "step": 13342 + }, + { + "epoch": 0.3956646798920618, + "grad_norm": 0.13246335089206696, + "learning_rate": 0.0006700837141665138, + "loss": 2.6916, + "step": 13343 + }, + { + "epoch": 0.39569433324437325, + "grad_norm": 0.1126830130815506, + "learning_rate": 0.0006700394683420693, + "loss": 2.7085, + "step": 13344 + }, + { + "epoch": 0.39572398659668473, + "grad_norm": 0.1251719892024994, + "learning_rate": 0.000669995221011896, + "loss": 2.6547, + "step": 13345 + }, + { + "epoch": 0.39575363994899626, + "grad_norm": 0.1415484994649887, + "learning_rate": 0.0006699509721763859, + "loss": 2.6939, + "step": 13346 + }, + { + "epoch": 0.39578329330130774, + "grad_norm": 0.14923128485679626, + "learning_rate": 0.0006699067218359308, + "loss": 2.6611, + "step": 13347 + }, + { + "epoch": 0.3958129466536192, + "grad_norm": 0.15126104652881622, + "learning_rate": 0.0006698624699909225, + "loss": 2.6728, + "step": 13348 + }, + { + "epoch": 0.3958426000059307, + "grad_norm": 0.13468632102012634, + "learning_rate": 0.0006698182166417528, + "loss": 2.6885, + "step": 13349 + }, + { + "epoch": 0.39587225335824217, + "grad_norm": 0.11832839995622635, + "learning_rate": 0.0006697739617888137, + "loss": 2.6557, + "step": 13350 + }, + { + "epoch": 0.39590190671055364, + "grad_norm": 0.12452578544616699, + "learning_rate": 0.000669729705432497, + "loss": 2.7373, + "step": 13351 + }, + { + "epoch": 0.3959315600628651, + "grad_norm": 0.13362963497638702, + "learning_rate": 0.0006696854475731947, + "loss": 2.7321, + "step": 13352 + }, + { + "epoch": 0.3959612134151766, + "grad_norm": 0.12257282435894012, + "learning_rate": 0.0006696411882112986, + "loss": 2.7033, + "step": 13353 + }, + { + "epoch": 0.39599086676748807, + "grad_norm": 0.13508377969264984, + "learning_rate": 0.0006695969273472007, + "loss": 2.6478, + "step": 13354 + }, + { + "epoch": 0.39602052011979955, + "grad_norm": 0.14054395258426666, + "learning_rate": 0.0006695526649812928, + "loss": 2.6781, + "step": 13355 + }, + { + "epoch": 0.396050173472111, + "grad_norm": 0.14333461225032806, + "learning_rate": 0.000669508401113967, + "loss": 2.7006, + "step": 13356 + }, + { + "epoch": 0.3960798268244225, + "grad_norm": 0.13566049933433533, + "learning_rate": 0.0006694641357456152, + "loss": 2.6741, + "step": 13357 + }, + { + "epoch": 0.396109480176734, + "grad_norm": 0.12023026496171951, + "learning_rate": 0.0006694198688766293, + "loss": 2.6683, + "step": 13358 + }, + { + "epoch": 0.39613913352904545, + "grad_norm": 0.1265224665403366, + "learning_rate": 0.0006693756005074016, + "loss": 2.6859, + "step": 13359 + }, + { + "epoch": 0.3961687868813569, + "grad_norm": 0.135967418551445, + "learning_rate": 0.0006693313306383236, + "loss": 2.7149, + "step": 13360 + }, + { + "epoch": 0.3961984402336684, + "grad_norm": 0.14300765097141266, + "learning_rate": 0.0006692870592697879, + "loss": 2.7224, + "step": 13361 + }, + { + "epoch": 0.3962280935859799, + "grad_norm": 0.15325415134429932, + "learning_rate": 0.0006692427864021861, + "loss": 2.6527, + "step": 13362 + }, + { + "epoch": 0.39625774693829136, + "grad_norm": 0.14120709896087646, + "learning_rate": 0.0006691985120359103, + "loss": 2.6809, + "step": 13363 + }, + { + "epoch": 0.39628740029060283, + "grad_norm": 0.13308900594711304, + "learning_rate": 0.0006691542361713527, + "loss": 2.7156, + "step": 13364 + }, + { + "epoch": 0.3963170536429143, + "grad_norm": 0.13896270096302032, + "learning_rate": 0.0006691099588089052, + "loss": 2.6958, + "step": 13365 + }, + { + "epoch": 0.3963467069952258, + "grad_norm": 0.15156398713588715, + "learning_rate": 0.0006690656799489602, + "loss": 2.6765, + "step": 13366 + }, + { + "epoch": 0.3963763603475373, + "grad_norm": 0.1463850736618042, + "learning_rate": 0.0006690213995919096, + "loss": 2.6784, + "step": 13367 + }, + { + "epoch": 0.3964060136998488, + "grad_norm": 0.1148986741900444, + "learning_rate": 0.0006689771177381453, + "loss": 2.6742, + "step": 13368 + }, + { + "epoch": 0.39643566705216027, + "grad_norm": 0.13036519289016724, + "learning_rate": 0.0006689328343880597, + "loss": 2.6738, + "step": 13369 + }, + { + "epoch": 0.39646532040447174, + "grad_norm": 0.1373690366744995, + "learning_rate": 0.0006688885495420447, + "loss": 2.7031, + "step": 13370 + }, + { + "epoch": 0.3964949737567832, + "grad_norm": 0.12305987626314163, + "learning_rate": 0.0006688442632004929, + "loss": 2.6694, + "step": 13371 + }, + { + "epoch": 0.3965246271090947, + "grad_norm": 0.11503439396619797, + "learning_rate": 0.000668799975363796, + "loss": 2.6613, + "step": 13372 + }, + { + "epoch": 0.3965542804614062, + "grad_norm": 0.11877801269292831, + "learning_rate": 0.0006687556860323464, + "loss": 2.6833, + "step": 13373 + }, + { + "epoch": 0.39658393381371765, + "grad_norm": 0.1271257847547531, + "learning_rate": 0.0006687113952065361, + "loss": 2.6896, + "step": 13374 + }, + { + "epoch": 0.3966135871660291, + "grad_norm": 0.13076934218406677, + "learning_rate": 0.0006686671028867576, + "loss": 2.6884, + "step": 13375 + }, + { + "epoch": 0.3966432405183406, + "grad_norm": 0.1412576586008072, + "learning_rate": 0.0006686228090734029, + "loss": 2.6785, + "step": 13376 + }, + { + "epoch": 0.3966728938706521, + "grad_norm": 0.1321491301059723, + "learning_rate": 0.0006685785137668642, + "loss": 2.676, + "step": 13377 + }, + { + "epoch": 0.39670254722296355, + "grad_norm": 0.13368423283100128, + "learning_rate": 0.0006685342169675339, + "loss": 2.6878, + "step": 13378 + }, + { + "epoch": 0.39673220057527503, + "grad_norm": 0.1322784125804901, + "learning_rate": 0.0006684899186758042, + "loss": 2.6926, + "step": 13379 + }, + { + "epoch": 0.3967618539275865, + "grad_norm": 0.14020788669586182, + "learning_rate": 0.0006684456188920673, + "loss": 2.6899, + "step": 13380 + }, + { + "epoch": 0.396791507279898, + "grad_norm": 0.13727979362010956, + "learning_rate": 0.0006684013176167155, + "loss": 2.6828, + "step": 13381 + }, + { + "epoch": 0.39682116063220946, + "grad_norm": 0.13549424707889557, + "learning_rate": 0.0006683570148501413, + "loss": 2.6578, + "step": 13382 + }, + { + "epoch": 0.39685081398452093, + "grad_norm": 0.13024179637432098, + "learning_rate": 0.0006683127105927367, + "loss": 2.7043, + "step": 13383 + }, + { + "epoch": 0.3968804673368324, + "grad_norm": 0.12512248754501343, + "learning_rate": 0.0006682684048448941, + "loss": 2.6789, + "step": 13384 + }, + { + "epoch": 0.3969101206891439, + "grad_norm": 0.10818202793598175, + "learning_rate": 0.000668224097607006, + "loss": 2.7123, + "step": 13385 + }, + { + "epoch": 0.39693977404145536, + "grad_norm": 0.11394574493169785, + "learning_rate": 0.0006681797888794645, + "loss": 2.6364, + "step": 13386 + }, + { + "epoch": 0.39696942739376684, + "grad_norm": 0.11154496669769287, + "learning_rate": 0.0006681354786626622, + "loss": 2.6951, + "step": 13387 + }, + { + "epoch": 0.39699908074607837, + "grad_norm": 0.12452396005392075, + "learning_rate": 0.0006680911669569915, + "loss": 2.6862, + "step": 13388 + }, + { + "epoch": 0.39702873409838985, + "grad_norm": 0.13554859161376953, + "learning_rate": 0.0006680468537628444, + "loss": 2.7185, + "step": 13389 + }, + { + "epoch": 0.3970583874507013, + "grad_norm": 0.1311749368906021, + "learning_rate": 0.0006680025390806138, + "loss": 2.6856, + "step": 13390 + }, + { + "epoch": 0.3970880408030128, + "grad_norm": 0.1476934403181076, + "learning_rate": 0.0006679582229106917, + "loss": 2.7007, + "step": 13391 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 0.1430981308221817, + "learning_rate": 0.0006679139052534708, + "loss": 2.6868, + "step": 13392 + }, + { + "epoch": 0.39714734750763575, + "grad_norm": 0.12022066861391068, + "learning_rate": 0.0006678695861093435, + "loss": 2.6833, + "step": 13393 + }, + { + "epoch": 0.3971770008599472, + "grad_norm": 0.16754133999347687, + "learning_rate": 0.0006678252654787022, + "loss": 2.7174, + "step": 13394 + }, + { + "epoch": 0.3972066542122587, + "grad_norm": 0.19471219182014465, + "learning_rate": 0.0006677809433619393, + "loss": 2.6866, + "step": 13395 + }, + { + "epoch": 0.3972363075645702, + "grad_norm": 0.1828286349773407, + "learning_rate": 0.0006677366197594474, + "loss": 2.6677, + "step": 13396 + }, + { + "epoch": 0.39726596091688166, + "grad_norm": 0.17274942994117737, + "learning_rate": 0.0006676922946716188, + "loss": 2.6921, + "step": 13397 + }, + { + "epoch": 0.39729561426919313, + "grad_norm": 0.1524747759103775, + "learning_rate": 0.0006676479680988462, + "loss": 2.6822, + "step": 13398 + }, + { + "epoch": 0.3973252676215046, + "grad_norm": 0.14401470124721527, + "learning_rate": 0.0006676036400415222, + "loss": 2.6946, + "step": 13399 + }, + { + "epoch": 0.3973549209738161, + "grad_norm": 0.15499690175056458, + "learning_rate": 0.0006675593105000392, + "loss": 2.6956, + "step": 13400 + }, + { + "epoch": 0.39738457432612756, + "grad_norm": 0.12860004603862762, + "learning_rate": 0.0006675149794747897, + "loss": 2.6663, + "step": 13401 + }, + { + "epoch": 0.39741422767843904, + "grad_norm": 0.1317172348499298, + "learning_rate": 0.000667470646966166, + "loss": 2.6904, + "step": 13402 + }, + { + "epoch": 0.3974438810307505, + "grad_norm": 0.15072843432426453, + "learning_rate": 0.0006674263129745612, + "loss": 2.6938, + "step": 13403 + }, + { + "epoch": 0.397473534383062, + "grad_norm": 0.13605323433876038, + "learning_rate": 0.0006673819775003679, + "loss": 2.6874, + "step": 13404 + }, + { + "epoch": 0.39750318773537346, + "grad_norm": 0.12281712144613266, + "learning_rate": 0.0006673376405439783, + "loss": 2.6698, + "step": 13405 + }, + { + "epoch": 0.39753284108768494, + "grad_norm": 0.12978912889957428, + "learning_rate": 0.0006672933021057851, + "loss": 2.6781, + "step": 13406 + }, + { + "epoch": 0.3975624944399964, + "grad_norm": 0.13008631765842438, + "learning_rate": 0.000667248962186181, + "loss": 2.6775, + "step": 13407 + }, + { + "epoch": 0.3975921477923079, + "grad_norm": 0.13412940502166748, + "learning_rate": 0.0006672046207855585, + "loss": 2.6832, + "step": 13408 + }, + { + "epoch": 0.3976218011446194, + "grad_norm": 0.13580676913261414, + "learning_rate": 0.0006671602779043107, + "loss": 2.7101, + "step": 13409 + }, + { + "epoch": 0.3976514544969309, + "grad_norm": 0.13294236361980438, + "learning_rate": 0.0006671159335428298, + "loss": 2.7141, + "step": 13410 + }, + { + "epoch": 0.3976811078492424, + "grad_norm": 0.12919981777668, + "learning_rate": 0.0006670715877015085, + "loss": 2.7186, + "step": 13411 + }, + { + "epoch": 0.39771076120155385, + "grad_norm": 0.1353205144405365, + "learning_rate": 0.0006670272403807397, + "loss": 2.6933, + "step": 13412 + }, + { + "epoch": 0.39774041455386533, + "grad_norm": 0.1404150426387787, + "learning_rate": 0.0006669828915809161, + "loss": 2.7082, + "step": 13413 + }, + { + "epoch": 0.3977700679061768, + "grad_norm": 0.12762989103794098, + "learning_rate": 0.0006669385413024302, + "loss": 2.6685, + "step": 13414 + }, + { + "epoch": 0.3977997212584883, + "grad_norm": 0.13717395067214966, + "learning_rate": 0.000666894189545675, + "loss": 2.6917, + "step": 13415 + }, + { + "epoch": 0.39782937461079976, + "grad_norm": 0.14701418578624725, + "learning_rate": 0.0006668498363110429, + "loss": 2.6558, + "step": 13416 + }, + { + "epoch": 0.39785902796311123, + "grad_norm": 0.13672876358032227, + "learning_rate": 0.0006668054815989271, + "loss": 2.676, + "step": 13417 + }, + { + "epoch": 0.3978886813154227, + "grad_norm": 0.12604095041751862, + "learning_rate": 0.0006667611254097199, + "loss": 2.6715, + "step": 13418 + }, + { + "epoch": 0.3979183346677342, + "grad_norm": 0.12322293221950531, + "learning_rate": 0.0006667167677438145, + "loss": 2.6799, + "step": 13419 + }, + { + "epoch": 0.39794798802004566, + "grad_norm": 0.11615461111068726, + "learning_rate": 0.0006666724086016034, + "loss": 2.6684, + "step": 13420 + }, + { + "epoch": 0.39797764137235714, + "grad_norm": 0.12645158171653748, + "learning_rate": 0.0006666280479834796, + "loss": 2.6704, + "step": 13421 + }, + { + "epoch": 0.3980072947246686, + "grad_norm": 0.12105458974838257, + "learning_rate": 0.0006665836858898357, + "loss": 2.69, + "step": 13422 + }, + { + "epoch": 0.3980369480769801, + "grad_norm": 0.10641931742429733, + "learning_rate": 0.0006665393223210648, + "loss": 2.6806, + "step": 13423 + }, + { + "epoch": 0.39806660142929157, + "grad_norm": 0.14063236117362976, + "learning_rate": 0.0006664949572775596, + "loss": 2.7001, + "step": 13424 + }, + { + "epoch": 0.39809625478160304, + "grad_norm": 0.13335582613945007, + "learning_rate": 0.0006664505907597129, + "loss": 2.7214, + "step": 13425 + }, + { + "epoch": 0.3981259081339145, + "grad_norm": 0.14461135864257812, + "learning_rate": 0.0006664062227679177, + "loss": 2.7295, + "step": 13426 + }, + { + "epoch": 0.398155561486226, + "grad_norm": 0.13231761753559113, + "learning_rate": 0.0006663618533025668, + "loss": 2.6788, + "step": 13427 + }, + { + "epoch": 0.39818521483853747, + "grad_norm": 0.12120900303125381, + "learning_rate": 0.0006663174823640531, + "loss": 2.6984, + "step": 13428 + }, + { + "epoch": 0.398214868190849, + "grad_norm": 0.13534049689769745, + "learning_rate": 0.0006662731099527695, + "loss": 2.6636, + "step": 13429 + }, + { + "epoch": 0.3982445215431605, + "grad_norm": 0.1410641372203827, + "learning_rate": 0.0006662287360691091, + "loss": 2.7189, + "step": 13430 + }, + { + "epoch": 0.39827417489547196, + "grad_norm": 0.12611907720565796, + "learning_rate": 0.0006661843607134649, + "loss": 2.6872, + "step": 13431 + }, + { + "epoch": 0.39830382824778343, + "grad_norm": 0.12077043950557709, + "learning_rate": 0.0006661399838862294, + "loss": 2.6709, + "step": 13432 + }, + { + "epoch": 0.3983334816000949, + "grad_norm": 0.14115965366363525, + "learning_rate": 0.0006660956055877959, + "loss": 2.6686, + "step": 13433 + }, + { + "epoch": 0.3983631349524064, + "grad_norm": 0.15871016681194305, + "learning_rate": 0.0006660512258185572, + "loss": 2.6801, + "step": 13434 + }, + { + "epoch": 0.39839278830471786, + "grad_norm": 0.14144441485404968, + "learning_rate": 0.0006660068445789064, + "loss": 2.7023, + "step": 13435 + }, + { + "epoch": 0.39842244165702934, + "grad_norm": 0.11221971362829208, + "learning_rate": 0.0006659624618692366, + "loss": 2.7307, + "step": 13436 + }, + { + "epoch": 0.3984520950093408, + "grad_norm": 0.14085735380649567, + "learning_rate": 0.0006659180776899407, + "loss": 2.674, + "step": 13437 + }, + { + "epoch": 0.3984817483616523, + "grad_norm": 0.14639513194561005, + "learning_rate": 0.0006658736920414117, + "loss": 2.6872, + "step": 13438 + }, + { + "epoch": 0.39851140171396376, + "grad_norm": 0.1253231018781662, + "learning_rate": 0.0006658293049240427, + "loss": 2.6484, + "step": 13439 + }, + { + "epoch": 0.39854105506627524, + "grad_norm": 0.12077003717422485, + "learning_rate": 0.0006657849163382268, + "loss": 2.7042, + "step": 13440 + }, + { + "epoch": 0.3985707084185867, + "grad_norm": 0.12362583726644516, + "learning_rate": 0.0006657405262843568, + "loss": 2.6957, + "step": 13441 + }, + { + "epoch": 0.3986003617708982, + "grad_norm": 0.14364129304885864, + "learning_rate": 0.0006656961347628262, + "loss": 2.6827, + "step": 13442 + }, + { + "epoch": 0.39863001512320967, + "grad_norm": 0.15742073953151703, + "learning_rate": 0.0006656517417740279, + "loss": 2.6748, + "step": 13443 + }, + { + "epoch": 0.39865966847552115, + "grad_norm": 0.1830052137374878, + "learning_rate": 0.0006656073473183548, + "loss": 2.676, + "step": 13444 + }, + { + "epoch": 0.3986893218278326, + "grad_norm": 0.1519022285938263, + "learning_rate": 0.0006655629513962004, + "loss": 2.6765, + "step": 13445 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 0.12483038753271103, + "learning_rate": 0.0006655185540079576, + "loss": 2.7034, + "step": 13446 + }, + { + "epoch": 0.3987486285324556, + "grad_norm": 0.16057881712913513, + "learning_rate": 0.0006654741551540195, + "loss": 2.6795, + "step": 13447 + }, + { + "epoch": 0.39877828188476705, + "grad_norm": 0.16692477464675903, + "learning_rate": 0.0006654297548347794, + "loss": 2.7122, + "step": 13448 + }, + { + "epoch": 0.3988079352370785, + "grad_norm": 0.13144929707050323, + "learning_rate": 0.0006653853530506305, + "loss": 2.6547, + "step": 13449 + }, + { + "epoch": 0.39883758858939006, + "grad_norm": 0.11132882535457611, + "learning_rate": 0.0006653409498019658, + "loss": 2.6699, + "step": 13450 + }, + { + "epoch": 0.39886724194170153, + "grad_norm": 0.12452065944671631, + "learning_rate": 0.0006652965450891787, + "loss": 2.6746, + "step": 13451 + }, + { + "epoch": 0.398896895294013, + "grad_norm": 0.13753317296504974, + "learning_rate": 0.0006652521389126623, + "loss": 2.7054, + "step": 13452 + }, + { + "epoch": 0.3989265486463245, + "grad_norm": 0.15508471429347992, + "learning_rate": 0.0006652077312728098, + "loss": 2.709, + "step": 13453 + }, + { + "epoch": 0.39895620199863596, + "grad_norm": 0.1617075353860855, + "learning_rate": 0.0006651633221700145, + "loss": 2.6662, + "step": 13454 + }, + { + "epoch": 0.39898585535094744, + "grad_norm": 0.13992293179035187, + "learning_rate": 0.0006651189116046696, + "loss": 2.6936, + "step": 13455 + }, + { + "epoch": 0.3990155087032589, + "grad_norm": 0.12907934188842773, + "learning_rate": 0.0006650744995771685, + "loss": 2.6929, + "step": 13456 + }, + { + "epoch": 0.3990451620555704, + "grad_norm": 0.11356896162033081, + "learning_rate": 0.0006650300860879044, + "loss": 2.6942, + "step": 13457 + }, + { + "epoch": 0.39907481540788187, + "grad_norm": 0.12843412160873413, + "learning_rate": 0.0006649856711372704, + "loss": 2.6621, + "step": 13458 + }, + { + "epoch": 0.39910446876019334, + "grad_norm": 0.1254231333732605, + "learning_rate": 0.0006649412547256601, + "loss": 2.7261, + "step": 13459 + }, + { + "epoch": 0.3991341221125048, + "grad_norm": 0.11712376028299332, + "learning_rate": 0.0006648968368534666, + "loss": 2.7154, + "step": 13460 + }, + { + "epoch": 0.3991637754648163, + "grad_norm": 0.11915852129459381, + "learning_rate": 0.0006648524175210833, + "loss": 2.7032, + "step": 13461 + }, + { + "epoch": 0.39919342881712777, + "grad_norm": 0.12382975965738297, + "learning_rate": 0.0006648079967289035, + "loss": 2.6726, + "step": 13462 + }, + { + "epoch": 0.39922308216943925, + "grad_norm": 0.11191733926534653, + "learning_rate": 0.0006647635744773207, + "loss": 2.7155, + "step": 13463 + }, + { + "epoch": 0.3992527355217507, + "grad_norm": 0.12202712893486023, + "learning_rate": 0.0006647191507667282, + "loss": 2.7236, + "step": 13464 + }, + { + "epoch": 0.3992823888740622, + "grad_norm": 0.12909169495105743, + "learning_rate": 0.0006646747255975193, + "loss": 2.6982, + "step": 13465 + }, + { + "epoch": 0.3993120422263737, + "grad_norm": 0.1413642019033432, + "learning_rate": 0.0006646302989700874, + "loss": 2.6745, + "step": 13466 + }, + { + "epoch": 0.39934169557868515, + "grad_norm": 0.12441470474004745, + "learning_rate": 0.0006645858708848259, + "loss": 2.6976, + "step": 13467 + }, + { + "epoch": 0.39937134893099663, + "grad_norm": 0.12062110751867294, + "learning_rate": 0.0006645414413421283, + "loss": 2.7019, + "step": 13468 + }, + { + "epoch": 0.3994010022833081, + "grad_norm": 0.12229479104280472, + "learning_rate": 0.0006644970103423882, + "loss": 2.7207, + "step": 13469 + }, + { + "epoch": 0.3994306556356196, + "grad_norm": 0.1177549734711647, + "learning_rate": 0.0006644525778859985, + "loss": 2.719, + "step": 13470 + }, + { + "epoch": 0.3994603089879311, + "grad_norm": 0.11278124898672104, + "learning_rate": 0.0006644081439733532, + "loss": 2.7269, + "step": 13471 + }, + { + "epoch": 0.3994899623402426, + "grad_norm": 0.11363682150840759, + "learning_rate": 0.0006643637086048455, + "loss": 2.7041, + "step": 13472 + }, + { + "epoch": 0.39951961569255406, + "grad_norm": 0.10680648684501648, + "learning_rate": 0.0006643192717808689, + "loss": 2.6642, + "step": 13473 + }, + { + "epoch": 0.39954926904486554, + "grad_norm": 0.11104308068752289, + "learning_rate": 0.0006642748335018169, + "loss": 2.6804, + "step": 13474 + }, + { + "epoch": 0.399578922397177, + "grad_norm": 0.12242760509252548, + "learning_rate": 0.0006642303937680834, + "loss": 2.7016, + "step": 13475 + }, + { + "epoch": 0.3996085757494885, + "grad_norm": 0.1219712570309639, + "learning_rate": 0.0006641859525800614, + "loss": 2.644, + "step": 13476 + }, + { + "epoch": 0.39963822910179997, + "grad_norm": 0.12881676852703094, + "learning_rate": 0.0006641415099381445, + "loss": 2.678, + "step": 13477 + }, + { + "epoch": 0.39966788245411144, + "grad_norm": 0.12109798938035965, + "learning_rate": 0.0006640970658427263, + "loss": 2.7012, + "step": 13478 + }, + { + "epoch": 0.3996975358064229, + "grad_norm": 0.12661658227443695, + "learning_rate": 0.0006640526202942006, + "loss": 2.7074, + "step": 13479 + }, + { + "epoch": 0.3997271891587344, + "grad_norm": 0.12641668319702148, + "learning_rate": 0.0006640081732929606, + "loss": 2.7049, + "step": 13480 + }, + { + "epoch": 0.3997568425110459, + "grad_norm": 0.1295899599790573, + "learning_rate": 0.0006639637248394001, + "loss": 2.72, + "step": 13481 + }, + { + "epoch": 0.39978649586335735, + "grad_norm": 0.1489659696817398, + "learning_rate": 0.0006639192749339129, + "loss": 2.7154, + "step": 13482 + }, + { + "epoch": 0.3998161492156688, + "grad_norm": 0.15514642000198364, + "learning_rate": 0.0006638748235768921, + "loss": 2.6911, + "step": 13483 + }, + { + "epoch": 0.3998458025679803, + "grad_norm": 0.16317595541477203, + "learning_rate": 0.0006638303707687319, + "loss": 2.6782, + "step": 13484 + }, + { + "epoch": 0.3998754559202918, + "grad_norm": 0.1421811431646347, + "learning_rate": 0.0006637859165098255, + "loss": 2.7185, + "step": 13485 + }, + { + "epoch": 0.39990510927260325, + "grad_norm": 0.1204211562871933, + "learning_rate": 0.0006637414608005666, + "loss": 2.6542, + "step": 13486 + }, + { + "epoch": 0.39993476262491473, + "grad_norm": 0.12214980274438858, + "learning_rate": 0.000663697003641349, + "loss": 2.7316, + "step": 13487 + }, + { + "epoch": 0.3999644159772262, + "grad_norm": 0.12687858939170837, + "learning_rate": 0.0006636525450325663, + "loss": 2.6943, + "step": 13488 + }, + { + "epoch": 0.3999940693295377, + "grad_norm": 0.12476370483636856, + "learning_rate": 0.0006636080849746123, + "loss": 2.6389, + "step": 13489 + }, + { + "epoch": 0.40002372268184916, + "grad_norm": 0.12926974892616272, + "learning_rate": 0.0006635636234678807, + "loss": 2.691, + "step": 13490 + }, + { + "epoch": 0.40005337603416063, + "grad_norm": 0.15382793545722961, + "learning_rate": 0.0006635191605127651, + "loss": 2.7161, + "step": 13491 + }, + { + "epoch": 0.40008302938647217, + "grad_norm": 0.136208176612854, + "learning_rate": 0.0006634746961096591, + "loss": 2.7597, + "step": 13492 + }, + { + "epoch": 0.40011268273878364, + "grad_norm": 0.10827893018722534, + "learning_rate": 0.0006634302302589568, + "loss": 2.6905, + "step": 13493 + }, + { + "epoch": 0.4001423360910951, + "grad_norm": 0.1186211034655571, + "learning_rate": 0.0006633857629610517, + "loss": 2.687, + "step": 13494 + }, + { + "epoch": 0.4001719894434066, + "grad_norm": 0.12825599312782288, + "learning_rate": 0.0006633412942163376, + "loss": 2.6793, + "step": 13495 + }, + { + "epoch": 0.40020164279571807, + "grad_norm": 0.12439081072807312, + "learning_rate": 0.0006632968240252083, + "loss": 2.6721, + "step": 13496 + }, + { + "epoch": 0.40023129614802955, + "grad_norm": 0.1397484689950943, + "learning_rate": 0.0006632523523880577, + "loss": 2.6477, + "step": 13497 + }, + { + "epoch": 0.400260949500341, + "grad_norm": 0.13947732746601105, + "learning_rate": 0.0006632078793052794, + "loss": 2.7106, + "step": 13498 + }, + { + "epoch": 0.4002906028526525, + "grad_norm": 0.15521611273288727, + "learning_rate": 0.0006631634047772672, + "loss": 2.6813, + "step": 13499 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 0.15999187529087067, + "learning_rate": 0.0006631189288044153, + "loss": 2.6898, + "step": 13500 + }, + { + "epoch": 0.40034990955727545, + "grad_norm": 0.13280875980854034, + "learning_rate": 0.0006630744513871171, + "loss": 2.7037, + "step": 13501 + }, + { + "epoch": 0.40037956290958693, + "grad_norm": 0.128646582365036, + "learning_rate": 0.0006630299725257667, + "loss": 2.7201, + "step": 13502 + }, + { + "epoch": 0.4004092162618984, + "grad_norm": 0.12373799085617065, + "learning_rate": 0.0006629854922207579, + "loss": 2.6886, + "step": 13503 + }, + { + "epoch": 0.4004388696142099, + "grad_norm": 0.11844196915626526, + "learning_rate": 0.0006629410104724846, + "loss": 2.6811, + "step": 13504 + }, + { + "epoch": 0.40046852296652136, + "grad_norm": 0.12720586359500885, + "learning_rate": 0.0006628965272813406, + "loss": 2.7264, + "step": 13505 + }, + { + "epoch": 0.40049817631883283, + "grad_norm": 0.1522335410118103, + "learning_rate": 0.00066285204264772, + "loss": 2.6794, + "step": 13506 + }, + { + "epoch": 0.4005278296711443, + "grad_norm": 0.1677793264389038, + "learning_rate": 0.0006628075565720166, + "loss": 2.671, + "step": 13507 + }, + { + "epoch": 0.4005574830234558, + "grad_norm": 0.1484198421239853, + "learning_rate": 0.0006627630690546243, + "loss": 2.6736, + "step": 13508 + }, + { + "epoch": 0.40058713637576726, + "grad_norm": 0.1270383596420288, + "learning_rate": 0.0006627185800959372, + "loss": 2.6823, + "step": 13509 + }, + { + "epoch": 0.40061678972807874, + "grad_norm": 0.16152317821979523, + "learning_rate": 0.000662674089696349, + "loss": 2.6987, + "step": 13510 + }, + { + "epoch": 0.4006464430803902, + "grad_norm": 0.1519172638654709, + "learning_rate": 0.0006626295978562538, + "loss": 2.6863, + "step": 13511 + }, + { + "epoch": 0.4006760964327017, + "grad_norm": 0.11555011570453644, + "learning_rate": 0.0006625851045760456, + "loss": 2.6637, + "step": 13512 + }, + { + "epoch": 0.4007057497850132, + "grad_norm": 0.12416227906942368, + "learning_rate": 0.0006625406098561186, + "loss": 2.7199, + "step": 13513 + }, + { + "epoch": 0.4007354031373247, + "grad_norm": 0.14853142201900482, + "learning_rate": 0.0006624961136968663, + "loss": 2.6608, + "step": 13514 + }, + { + "epoch": 0.4007650564896362, + "grad_norm": 0.13872063159942627, + "learning_rate": 0.0006624516160986833, + "loss": 2.6913, + "step": 13515 + }, + { + "epoch": 0.40079470984194765, + "grad_norm": 0.11764999479055405, + "learning_rate": 0.0006624071170619633, + "loss": 2.6976, + "step": 13516 + }, + { + "epoch": 0.4008243631942591, + "grad_norm": 0.12026296555995941, + "learning_rate": 0.0006623626165871002, + "loss": 2.6548, + "step": 13517 + }, + { + "epoch": 0.4008540165465706, + "grad_norm": 0.11811669170856476, + "learning_rate": 0.0006623181146744884, + "loss": 2.7226, + "step": 13518 + }, + { + "epoch": 0.4008836698988821, + "grad_norm": 0.13278914988040924, + "learning_rate": 0.0006622736113245218, + "loss": 2.7138, + "step": 13519 + }, + { + "epoch": 0.40091332325119355, + "grad_norm": 0.12877681851387024, + "learning_rate": 0.0006622291065375945, + "loss": 2.6948, + "step": 13520 + }, + { + "epoch": 0.40094297660350503, + "grad_norm": 0.13052628934383392, + "learning_rate": 0.0006621846003141007, + "loss": 2.712, + "step": 13521 + }, + { + "epoch": 0.4009726299558165, + "grad_norm": 0.13096076250076294, + "learning_rate": 0.0006621400926544344, + "loss": 2.6858, + "step": 13522 + }, + { + "epoch": 0.401002283308128, + "grad_norm": 0.1400444507598877, + "learning_rate": 0.0006620955835589897, + "loss": 2.7084, + "step": 13523 + }, + { + "epoch": 0.40103193666043946, + "grad_norm": 0.15597622096538544, + "learning_rate": 0.000662051073028161, + "loss": 2.6879, + "step": 13524 + }, + { + "epoch": 0.40106159001275093, + "grad_norm": 0.14953483641147614, + "learning_rate": 0.0006620065610623418, + "loss": 2.6968, + "step": 13525 + }, + { + "epoch": 0.4010912433650624, + "grad_norm": 0.12956778705120087, + "learning_rate": 0.000661962047661927, + "loss": 2.6937, + "step": 13526 + }, + { + "epoch": 0.4011208967173739, + "grad_norm": 0.13025346398353577, + "learning_rate": 0.0006619175328273104, + "loss": 2.7082, + "step": 13527 + }, + { + "epoch": 0.40115055006968536, + "grad_norm": 0.15125569701194763, + "learning_rate": 0.0006618730165588862, + "loss": 2.6755, + "step": 13528 + }, + { + "epoch": 0.40118020342199684, + "grad_norm": 0.1504228562116623, + "learning_rate": 0.0006618284988570488, + "loss": 2.6508, + "step": 13529 + }, + { + "epoch": 0.4012098567743083, + "grad_norm": 0.13664817810058594, + "learning_rate": 0.0006617839797221923, + "loss": 2.7127, + "step": 13530 + }, + { + "epoch": 0.4012395101266198, + "grad_norm": 0.16640131175518036, + "learning_rate": 0.0006617394591547106, + "loss": 2.6978, + "step": 13531 + }, + { + "epoch": 0.40126916347893127, + "grad_norm": 0.1508273333311081, + "learning_rate": 0.0006616949371549983, + "loss": 2.6893, + "step": 13532 + }, + { + "epoch": 0.4012988168312428, + "grad_norm": 0.12689165771007538, + "learning_rate": 0.0006616504137234498, + "loss": 2.7017, + "step": 13533 + }, + { + "epoch": 0.4013284701835543, + "grad_norm": 0.13122214376926422, + "learning_rate": 0.000661605888860459, + "loss": 2.6626, + "step": 13534 + }, + { + "epoch": 0.40135812353586575, + "grad_norm": 0.14762672781944275, + "learning_rate": 0.0006615613625664204, + "loss": 2.6741, + "step": 13535 + }, + { + "epoch": 0.40138777688817723, + "grad_norm": 0.14674824476242065, + "learning_rate": 0.0006615168348417281, + "loss": 2.7225, + "step": 13536 + }, + { + "epoch": 0.4014174302404887, + "grad_norm": 0.14821757376194, + "learning_rate": 0.0006614723056867765, + "loss": 2.7163, + "step": 13537 + }, + { + "epoch": 0.4014470835928002, + "grad_norm": 0.14477823674678802, + "learning_rate": 0.00066142777510196, + "loss": 2.6849, + "step": 13538 + }, + { + "epoch": 0.40147673694511166, + "grad_norm": 0.1325799971818924, + "learning_rate": 0.0006613832430876727, + "loss": 2.684, + "step": 13539 + }, + { + "epoch": 0.40150639029742313, + "grad_norm": 0.139644593000412, + "learning_rate": 0.0006613387096443093, + "loss": 2.6938, + "step": 13540 + }, + { + "epoch": 0.4015360436497346, + "grad_norm": 0.10983718186616898, + "learning_rate": 0.0006612941747722637, + "loss": 2.693, + "step": 13541 + }, + { + "epoch": 0.4015656970020461, + "grad_norm": 0.1188783198595047, + "learning_rate": 0.0006612496384719306, + "loss": 2.6831, + "step": 13542 + }, + { + "epoch": 0.40159535035435756, + "grad_norm": 0.10889097303152084, + "learning_rate": 0.0006612051007437043, + "loss": 2.6478, + "step": 13543 + }, + { + "epoch": 0.40162500370666904, + "grad_norm": 0.10845191776752472, + "learning_rate": 0.000661160561587979, + "loss": 2.6923, + "step": 13544 + }, + { + "epoch": 0.4016546570589805, + "grad_norm": 0.11685068160295486, + "learning_rate": 0.0006611160210051496, + "loss": 2.7107, + "step": 13545 + }, + { + "epoch": 0.401684310411292, + "grad_norm": 0.1315888911485672, + "learning_rate": 0.0006610714789956099, + "loss": 2.6923, + "step": 13546 + }, + { + "epoch": 0.40171396376360347, + "grad_norm": 0.12346453219652176, + "learning_rate": 0.0006610269355597547, + "loss": 2.6594, + "step": 13547 + }, + { + "epoch": 0.40174361711591494, + "grad_norm": 0.11124773323535919, + "learning_rate": 0.0006609823906979784, + "loss": 2.7167, + "step": 13548 + }, + { + "epoch": 0.4017732704682264, + "grad_norm": 0.1179022341966629, + "learning_rate": 0.0006609378444106753, + "loss": 2.7141, + "step": 13549 + }, + { + "epoch": 0.4018029238205379, + "grad_norm": 0.12659883499145508, + "learning_rate": 0.0006608932966982399, + "loss": 2.7097, + "step": 13550 + }, + { + "epoch": 0.40183257717284937, + "grad_norm": 0.1273094266653061, + "learning_rate": 0.000660848747561067, + "loss": 2.6938, + "step": 13551 + }, + { + "epoch": 0.40186223052516085, + "grad_norm": 0.12156069278717041, + "learning_rate": 0.0006608041969995505, + "loss": 2.6536, + "step": 13552 + }, + { + "epoch": 0.4018918838774723, + "grad_norm": 0.11750872433185577, + "learning_rate": 0.0006607596450140855, + "loss": 2.6948, + "step": 13553 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 0.13200438022613525, + "learning_rate": 0.0006607150916050662, + "loss": 2.6601, + "step": 13554 + }, + { + "epoch": 0.40195119058209533, + "grad_norm": 0.144075408577919, + "learning_rate": 0.000660670536772887, + "loss": 2.683, + "step": 13555 + }, + { + "epoch": 0.4019808439344068, + "grad_norm": 0.1709321290254593, + "learning_rate": 0.0006606259805179427, + "loss": 2.6909, + "step": 13556 + }, + { + "epoch": 0.4020104972867183, + "grad_norm": 0.17289119958877563, + "learning_rate": 0.0006605814228406279, + "loss": 2.6629, + "step": 13557 + }, + { + "epoch": 0.40204015063902976, + "grad_norm": 0.14364726841449738, + "learning_rate": 0.0006605368637413369, + "loss": 2.7279, + "step": 13558 + }, + { + "epoch": 0.40206980399134123, + "grad_norm": 0.1511363387107849, + "learning_rate": 0.0006604923032204645, + "loss": 2.687, + "step": 13559 + }, + { + "epoch": 0.4020994573436527, + "grad_norm": 0.15961970388889313, + "learning_rate": 0.0006604477412784051, + "loss": 2.7116, + "step": 13560 + }, + { + "epoch": 0.4021291106959642, + "grad_norm": 0.15440039336681366, + "learning_rate": 0.0006604031779155534, + "loss": 2.7105, + "step": 13561 + }, + { + "epoch": 0.40215876404827566, + "grad_norm": 0.12800171971321106, + "learning_rate": 0.0006603586131323043, + "loss": 2.6771, + "step": 13562 + }, + { + "epoch": 0.40218841740058714, + "grad_norm": 0.11919692158699036, + "learning_rate": 0.0006603140469290521, + "loss": 2.6663, + "step": 13563 + }, + { + "epoch": 0.4022180707528986, + "grad_norm": 0.13877637684345245, + "learning_rate": 0.0006602694793061912, + "loss": 2.7016, + "step": 13564 + }, + { + "epoch": 0.4022477241052101, + "grad_norm": 0.11953290551900864, + "learning_rate": 0.0006602249102641166, + "loss": 2.6943, + "step": 13565 + }, + { + "epoch": 0.40227737745752157, + "grad_norm": 0.1262921541929245, + "learning_rate": 0.0006601803398032231, + "loss": 2.691, + "step": 13566 + }, + { + "epoch": 0.40230703080983304, + "grad_norm": 0.13519132137298584, + "learning_rate": 0.0006601357679239052, + "loss": 2.6955, + "step": 13567 + }, + { + "epoch": 0.4023366841621445, + "grad_norm": 0.11855930089950562, + "learning_rate": 0.0006600911946265575, + "loss": 2.649, + "step": 13568 + }, + { + "epoch": 0.402366337514456, + "grad_norm": 0.1245110034942627, + "learning_rate": 0.0006600466199115748, + "loss": 2.6797, + "step": 13569 + }, + { + "epoch": 0.4023959908667675, + "grad_norm": 0.12420759350061417, + "learning_rate": 0.0006600020437793518, + "loss": 2.712, + "step": 13570 + }, + { + "epoch": 0.40242564421907895, + "grad_norm": 0.12519197165966034, + "learning_rate": 0.0006599574662302832, + "loss": 2.6724, + "step": 13571 + }, + { + "epoch": 0.4024552975713904, + "grad_norm": 0.11694561690092087, + "learning_rate": 0.000659912887264764, + "loss": 2.7199, + "step": 13572 + }, + { + "epoch": 0.4024849509237019, + "grad_norm": 0.11048740148544312, + "learning_rate": 0.0006598683068831885, + "loss": 2.6741, + "step": 13573 + }, + { + "epoch": 0.4025146042760134, + "grad_norm": 0.11249125003814697, + "learning_rate": 0.0006598237250859518, + "loss": 2.6935, + "step": 13574 + }, + { + "epoch": 0.4025442576283249, + "grad_norm": 0.12340123951435089, + "learning_rate": 0.0006597791418734485, + "loss": 2.6635, + "step": 13575 + }, + { + "epoch": 0.4025739109806364, + "grad_norm": 0.13067808747291565, + "learning_rate": 0.0006597345572460735, + "loss": 2.672, + "step": 13576 + }, + { + "epoch": 0.40260356433294786, + "grad_norm": 0.14010998606681824, + "learning_rate": 0.0006596899712042216, + "loss": 2.6788, + "step": 13577 + }, + { + "epoch": 0.40263321768525934, + "grad_norm": 0.14297597110271454, + "learning_rate": 0.0006596453837482876, + "loss": 2.7036, + "step": 13578 + }, + { + "epoch": 0.4026628710375708, + "grad_norm": 0.14785872399806976, + "learning_rate": 0.0006596007948786665, + "loss": 2.7148, + "step": 13579 + }, + { + "epoch": 0.4026925243898823, + "grad_norm": 0.15807579457759857, + "learning_rate": 0.0006595562045957527, + "loss": 2.6799, + "step": 13580 + }, + { + "epoch": 0.40272217774219377, + "grad_norm": 0.175831601023674, + "learning_rate": 0.0006595116128999414, + "loss": 2.6682, + "step": 13581 + }, + { + "epoch": 0.40275183109450524, + "grad_norm": 0.1584196239709854, + "learning_rate": 0.0006594670197916274, + "loss": 2.6851, + "step": 13582 + }, + { + "epoch": 0.4027814844468167, + "grad_norm": 0.12645089626312256, + "learning_rate": 0.0006594224252712055, + "loss": 2.6775, + "step": 13583 + }, + { + "epoch": 0.4028111377991282, + "grad_norm": 0.1328742653131485, + "learning_rate": 0.0006593778293390709, + "loss": 2.6905, + "step": 13584 + }, + { + "epoch": 0.40284079115143967, + "grad_norm": 0.1350339949131012, + "learning_rate": 0.000659333231995618, + "loss": 2.7286, + "step": 13585 + }, + { + "epoch": 0.40287044450375115, + "grad_norm": 0.12156488746404648, + "learning_rate": 0.000659288633241242, + "loss": 2.6606, + "step": 13586 + }, + { + "epoch": 0.4029000978560626, + "grad_norm": 0.11603865772485733, + "learning_rate": 0.0006592440330763379, + "loss": 2.6828, + "step": 13587 + }, + { + "epoch": 0.4029297512083741, + "grad_norm": 0.13093125820159912, + "learning_rate": 0.0006591994315013006, + "loss": 2.7158, + "step": 13588 + }, + { + "epoch": 0.4029594045606856, + "grad_norm": 0.12550465762615204, + "learning_rate": 0.0006591548285165249, + "loss": 2.6974, + "step": 13589 + }, + { + "epoch": 0.40298905791299705, + "grad_norm": 0.10330253094434738, + "learning_rate": 0.0006591102241224059, + "loss": 2.7021, + "step": 13590 + }, + { + "epoch": 0.4030187112653085, + "grad_norm": 0.11522835493087769, + "learning_rate": 0.0006590656183193387, + "loss": 2.685, + "step": 13591 + }, + { + "epoch": 0.40304836461762, + "grad_norm": 0.123235784471035, + "learning_rate": 0.0006590210111077179, + "loss": 2.6634, + "step": 13592 + }, + { + "epoch": 0.4030780179699315, + "grad_norm": 0.10824934393167496, + "learning_rate": 0.0006589764024879388, + "loss": 2.6935, + "step": 13593 + }, + { + "epoch": 0.40310767132224296, + "grad_norm": 0.10471804440021515, + "learning_rate": 0.0006589317924603965, + "loss": 2.6824, + "step": 13594 + }, + { + "epoch": 0.40313732467455443, + "grad_norm": 0.11371423304080963, + "learning_rate": 0.000658887181025486, + "loss": 2.6591, + "step": 13595 + }, + { + "epoch": 0.40316697802686596, + "grad_norm": 0.1172267273068428, + "learning_rate": 0.0006588425681836019, + "loss": 2.6687, + "step": 13596 + }, + { + "epoch": 0.40319663137917744, + "grad_norm": 0.12706103920936584, + "learning_rate": 0.0006587979539351399, + "loss": 2.7066, + "step": 13597 + }, + { + "epoch": 0.4032262847314889, + "grad_norm": 0.1061839759349823, + "learning_rate": 0.0006587533382804945, + "loss": 2.6797, + "step": 13598 + }, + { + "epoch": 0.4032559380838004, + "grad_norm": 0.09630344063043594, + "learning_rate": 0.0006587087212200612, + "loss": 2.657, + "step": 13599 + }, + { + "epoch": 0.40328559143611187, + "grad_norm": 0.1151069775223732, + "learning_rate": 0.0006586641027542348, + "loss": 2.6366, + "step": 13600 + }, + { + "epoch": 0.40331524478842334, + "grad_norm": 0.1261882334947586, + "learning_rate": 0.0006586194828834109, + "loss": 2.6946, + "step": 13601 + }, + { + "epoch": 0.4033448981407348, + "grad_norm": 0.13429778814315796, + "learning_rate": 0.0006585748616079838, + "loss": 2.6971, + "step": 13602 + }, + { + "epoch": 0.4033745514930463, + "grad_norm": 0.13415293395519257, + "learning_rate": 0.0006585302389283493, + "loss": 2.6974, + "step": 13603 + }, + { + "epoch": 0.40340420484535777, + "grad_norm": 0.11626259982585907, + "learning_rate": 0.0006584856148449023, + "loss": 2.681, + "step": 13604 + }, + { + "epoch": 0.40343385819766925, + "grad_norm": 0.1296842247247696, + "learning_rate": 0.000658440989358038, + "loss": 2.6908, + "step": 13605 + }, + { + "epoch": 0.4034635115499807, + "grad_norm": 0.14001040160655975, + "learning_rate": 0.0006583963624681515, + "loss": 2.6758, + "step": 13606 + }, + { + "epoch": 0.4034931649022922, + "grad_norm": 0.13526076078414917, + "learning_rate": 0.0006583517341756381, + "loss": 2.6759, + "step": 13607 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 0.13051737844944, + "learning_rate": 0.0006583071044808928, + "loss": 2.6597, + "step": 13608 + }, + { + "epoch": 0.40355247160691515, + "grad_norm": 0.11476711928844452, + "learning_rate": 0.0006582624733843109, + "loss": 2.6632, + "step": 13609 + }, + { + "epoch": 0.40358212495922663, + "grad_norm": 0.12422166764736176, + "learning_rate": 0.0006582178408862877, + "loss": 2.6979, + "step": 13610 + }, + { + "epoch": 0.4036117783115381, + "grad_norm": 0.14139927923679352, + "learning_rate": 0.0006581732069872183, + "loss": 2.6968, + "step": 13611 + }, + { + "epoch": 0.4036414316638496, + "grad_norm": 0.13474662601947784, + "learning_rate": 0.0006581285716874981, + "loss": 2.6888, + "step": 13612 + }, + { + "epoch": 0.40367108501616106, + "grad_norm": 0.1618843972682953, + "learning_rate": 0.0006580839349875223, + "loss": 2.714, + "step": 13613 + }, + { + "epoch": 0.40370073836847253, + "grad_norm": 0.11844760924577713, + "learning_rate": 0.000658039296887686, + "loss": 2.7127, + "step": 13614 + }, + { + "epoch": 0.403730391720784, + "grad_norm": 0.11787430942058563, + "learning_rate": 0.0006579946573883846, + "loss": 2.6661, + "step": 13615 + }, + { + "epoch": 0.4037600450730955, + "grad_norm": 0.11817890405654907, + "learning_rate": 0.0006579500164900135, + "loss": 2.6787, + "step": 13616 + }, + { + "epoch": 0.403789698425407, + "grad_norm": 0.12567463517189026, + "learning_rate": 0.0006579053741929678, + "loss": 2.7002, + "step": 13617 + }, + { + "epoch": 0.4038193517777185, + "grad_norm": 0.14774489402770996, + "learning_rate": 0.000657860730497643, + "loss": 2.6777, + "step": 13618 + }, + { + "epoch": 0.40384900513002997, + "grad_norm": 0.1478840857744217, + "learning_rate": 0.0006578160854044342, + "loss": 2.7012, + "step": 13619 + }, + { + "epoch": 0.40387865848234145, + "grad_norm": 0.15345944464206696, + "learning_rate": 0.0006577714389137369, + "loss": 2.6681, + "step": 13620 + }, + { + "epoch": 0.4039083118346529, + "grad_norm": 0.1819557100534439, + "learning_rate": 0.0006577267910259465, + "loss": 2.6449, + "step": 13621 + }, + { + "epoch": 0.4039379651869644, + "grad_norm": 0.18491660058498383, + "learning_rate": 0.0006576821417414582, + "loss": 2.6845, + "step": 13622 + }, + { + "epoch": 0.4039676185392759, + "grad_norm": 0.15903674066066742, + "learning_rate": 0.0006576374910606676, + "loss": 2.6757, + "step": 13623 + }, + { + "epoch": 0.40399727189158735, + "grad_norm": 0.12695854902267456, + "learning_rate": 0.0006575928389839698, + "loss": 2.692, + "step": 13624 + }, + { + "epoch": 0.4040269252438988, + "grad_norm": 0.13978873193264008, + "learning_rate": 0.0006575481855117606, + "loss": 2.7113, + "step": 13625 + }, + { + "epoch": 0.4040565785962103, + "grad_norm": 0.14661701023578644, + "learning_rate": 0.0006575035306444349, + "loss": 2.6805, + "step": 13626 + }, + { + "epoch": 0.4040862319485218, + "grad_norm": 0.1272648572921753, + "learning_rate": 0.0006574588743823886, + "loss": 2.6915, + "step": 13627 + }, + { + "epoch": 0.40411588530083326, + "grad_norm": 0.12505225837230682, + "learning_rate": 0.0006574142167260168, + "loss": 2.6879, + "step": 13628 + }, + { + "epoch": 0.40414553865314473, + "grad_norm": 0.11440305411815643, + "learning_rate": 0.0006573695576757152, + "loss": 2.6977, + "step": 13629 + }, + { + "epoch": 0.4041751920054562, + "grad_norm": 0.12050793319940567, + "learning_rate": 0.000657324897231879, + "loss": 2.6903, + "step": 13630 + }, + { + "epoch": 0.4042048453577677, + "grad_norm": 0.11199891567230225, + "learning_rate": 0.000657280235394904, + "loss": 2.6755, + "step": 13631 + }, + { + "epoch": 0.40423449871007916, + "grad_norm": 0.10684271901845932, + "learning_rate": 0.0006572355721651855, + "loss": 2.688, + "step": 13632 + }, + { + "epoch": 0.40426415206239064, + "grad_norm": 0.13332171738147736, + "learning_rate": 0.0006571909075431191, + "loss": 2.638, + "step": 13633 + }, + { + "epoch": 0.4042938054147021, + "grad_norm": 0.12981270253658295, + "learning_rate": 0.0006571462415291, + "loss": 2.6571, + "step": 13634 + }, + { + "epoch": 0.4043234587670136, + "grad_norm": 0.12106811255216599, + "learning_rate": 0.000657101574123524, + "loss": 2.7421, + "step": 13635 + }, + { + "epoch": 0.40435311211932506, + "grad_norm": 0.11563083529472351, + "learning_rate": 0.0006570569053267867, + "loss": 2.6867, + "step": 13636 + }, + { + "epoch": 0.4043827654716366, + "grad_norm": 0.1190134808421135, + "learning_rate": 0.0006570122351392835, + "loss": 2.702, + "step": 13637 + }, + { + "epoch": 0.40441241882394807, + "grad_norm": 0.13135790824890137, + "learning_rate": 0.0006569675635614099, + "loss": 2.6834, + "step": 13638 + }, + { + "epoch": 0.40444207217625955, + "grad_norm": 0.11922172456979752, + "learning_rate": 0.0006569228905935618, + "loss": 2.6581, + "step": 13639 + }, + { + "epoch": 0.404471725528571, + "grad_norm": 0.09784198552370071, + "learning_rate": 0.0006568782162361344, + "loss": 2.6595, + "step": 13640 + }, + { + "epoch": 0.4045013788808825, + "grad_norm": 0.11027384549379349, + "learning_rate": 0.0006568335404895235, + "loss": 2.6568, + "step": 13641 + }, + { + "epoch": 0.404531032233194, + "grad_norm": 0.10669766366481781, + "learning_rate": 0.0006567888633541247, + "loss": 2.6737, + "step": 13642 + }, + { + "epoch": 0.40456068558550545, + "grad_norm": 0.11809539049863815, + "learning_rate": 0.0006567441848303336, + "loss": 2.6688, + "step": 13643 + }, + { + "epoch": 0.40459033893781693, + "grad_norm": 0.1256769746541977, + "learning_rate": 0.0006566995049185461, + "loss": 2.6585, + "step": 13644 + }, + { + "epoch": 0.4046199922901284, + "grad_norm": 0.11003025621175766, + "learning_rate": 0.0006566548236191571, + "loss": 2.654, + "step": 13645 + }, + { + "epoch": 0.4046496456424399, + "grad_norm": 0.10664398223161697, + "learning_rate": 0.0006566101409325631, + "loss": 2.6964, + "step": 13646 + }, + { + "epoch": 0.40467929899475136, + "grad_norm": 0.08906293660402298, + "learning_rate": 0.0006565654568591592, + "loss": 2.6604, + "step": 13647 + }, + { + "epoch": 0.40470895234706283, + "grad_norm": 0.10166703909635544, + "learning_rate": 0.0006565207713993413, + "loss": 2.7144, + "step": 13648 + }, + { + "epoch": 0.4047386056993743, + "grad_norm": 0.10284969955682755, + "learning_rate": 0.0006564760845535054, + "loss": 2.6654, + "step": 13649 + }, + { + "epoch": 0.4047682590516858, + "grad_norm": 0.10433409363031387, + "learning_rate": 0.0006564313963220468, + "loss": 2.6985, + "step": 13650 + }, + { + "epoch": 0.40479791240399726, + "grad_norm": 0.106613390147686, + "learning_rate": 0.0006563867067053611, + "loss": 2.6488, + "step": 13651 + }, + { + "epoch": 0.40482756575630874, + "grad_norm": 0.12079675495624542, + "learning_rate": 0.0006563420157038444, + "loss": 2.686, + "step": 13652 + }, + { + "epoch": 0.4048572191086202, + "grad_norm": 0.15079569816589355, + "learning_rate": 0.0006562973233178923, + "loss": 2.6818, + "step": 13653 + }, + { + "epoch": 0.4048868724609317, + "grad_norm": 0.18000678718090057, + "learning_rate": 0.0006562526295479008, + "loss": 2.6668, + "step": 13654 + }, + { + "epoch": 0.40491652581324317, + "grad_norm": 0.17909622192382812, + "learning_rate": 0.0006562079343942652, + "loss": 2.6869, + "step": 13655 + }, + { + "epoch": 0.40494617916555464, + "grad_norm": 0.15287712216377258, + "learning_rate": 0.0006561632378573817, + "loss": 2.6397, + "step": 13656 + }, + { + "epoch": 0.4049758325178661, + "grad_norm": 0.12331438064575195, + "learning_rate": 0.0006561185399376457, + "loss": 2.6804, + "step": 13657 + }, + { + "epoch": 0.40500548587017765, + "grad_norm": 0.1253904104232788, + "learning_rate": 0.0006560738406354532, + "loss": 2.7011, + "step": 13658 + }, + { + "epoch": 0.4050351392224891, + "grad_norm": 0.13156788051128387, + "learning_rate": 0.0006560291399512003, + "loss": 2.7034, + "step": 13659 + }, + { + "epoch": 0.4050647925748006, + "grad_norm": 0.12171574681997299, + "learning_rate": 0.0006559844378852825, + "loss": 2.6628, + "step": 13660 + }, + { + "epoch": 0.4050944459271121, + "grad_norm": 0.13375473022460938, + "learning_rate": 0.0006559397344380958, + "loss": 2.7, + "step": 13661 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 0.14534248411655426, + "learning_rate": 0.0006558950296100358, + "loss": 2.6875, + "step": 13662 + }, + { + "epoch": 0.40515375263173503, + "grad_norm": 0.1613442450761795, + "learning_rate": 0.0006558503234014986, + "loss": 2.7104, + "step": 13663 + }, + { + "epoch": 0.4051834059840465, + "grad_norm": 0.16153620183467865, + "learning_rate": 0.0006558056158128802, + "loss": 2.7042, + "step": 13664 + }, + { + "epoch": 0.405213059336358, + "grad_norm": 0.1520097702741623, + "learning_rate": 0.0006557609068445761, + "loss": 2.6938, + "step": 13665 + }, + { + "epoch": 0.40524271268866946, + "grad_norm": 0.12399382144212723, + "learning_rate": 0.0006557161964969826, + "loss": 2.6857, + "step": 13666 + }, + { + "epoch": 0.40527236604098094, + "grad_norm": 0.13583190739154816, + "learning_rate": 0.0006556714847704954, + "loss": 2.6919, + "step": 13667 + }, + { + "epoch": 0.4053020193932924, + "grad_norm": 0.14145325124263763, + "learning_rate": 0.0006556267716655104, + "loss": 2.6481, + "step": 13668 + }, + { + "epoch": 0.4053316727456039, + "grad_norm": 0.12619492411613464, + "learning_rate": 0.0006555820571824237, + "loss": 2.6984, + "step": 13669 + }, + { + "epoch": 0.40536132609791536, + "grad_norm": 0.13065731525421143, + "learning_rate": 0.0006555373413216312, + "loss": 2.7196, + "step": 13670 + }, + { + "epoch": 0.40539097945022684, + "grad_norm": 0.11693119257688522, + "learning_rate": 0.0006554926240835288, + "loss": 2.7022, + "step": 13671 + }, + { + "epoch": 0.4054206328025383, + "grad_norm": 0.129006490111351, + "learning_rate": 0.0006554479054685126, + "loss": 2.6965, + "step": 13672 + }, + { + "epoch": 0.4054502861548498, + "grad_norm": 0.14632661640644073, + "learning_rate": 0.0006554031854769784, + "loss": 2.7323, + "step": 13673 + }, + { + "epoch": 0.40547993950716127, + "grad_norm": 0.1434922218322754, + "learning_rate": 0.0006553584641093225, + "loss": 2.6497, + "step": 13674 + }, + { + "epoch": 0.40550959285947275, + "grad_norm": 0.125082328915596, + "learning_rate": 0.0006553137413659405, + "loss": 2.689, + "step": 13675 + }, + { + "epoch": 0.4055392462117842, + "grad_norm": 0.135414719581604, + "learning_rate": 0.0006552690172472288, + "loss": 2.6891, + "step": 13676 + }, + { + "epoch": 0.4055688995640957, + "grad_norm": 0.1406935453414917, + "learning_rate": 0.0006552242917535834, + "loss": 2.6595, + "step": 13677 + }, + { + "epoch": 0.4055985529164072, + "grad_norm": 0.12266886979341507, + "learning_rate": 0.0006551795648854, + "loss": 2.6869, + "step": 13678 + }, + { + "epoch": 0.4056282062687187, + "grad_norm": 0.1412927657365799, + "learning_rate": 0.0006551348366430752, + "loss": 2.7295, + "step": 13679 + }, + { + "epoch": 0.4056578596210302, + "grad_norm": 0.14006653428077698, + "learning_rate": 0.0006550901070270044, + "loss": 2.6596, + "step": 13680 + }, + { + "epoch": 0.40568751297334166, + "grad_norm": 0.13344070315361023, + "learning_rate": 0.0006550453760375843, + "loss": 2.6873, + "step": 13681 + }, + { + "epoch": 0.40571716632565313, + "grad_norm": 0.126221165060997, + "learning_rate": 0.000655000643675211, + "loss": 2.6865, + "step": 13682 + }, + { + "epoch": 0.4057468196779646, + "grad_norm": 0.11180947721004486, + "learning_rate": 0.0006549559099402801, + "loss": 2.6663, + "step": 13683 + }, + { + "epoch": 0.4057764730302761, + "grad_norm": 0.12946221232414246, + "learning_rate": 0.0006549111748331882, + "loss": 2.6807, + "step": 13684 + }, + { + "epoch": 0.40580612638258756, + "grad_norm": 0.10545645654201508, + "learning_rate": 0.0006548664383543312, + "loss": 2.6345, + "step": 13685 + }, + { + "epoch": 0.40583577973489904, + "grad_norm": 0.11839236319065094, + "learning_rate": 0.000654821700504105, + "loss": 2.674, + "step": 13686 + }, + { + "epoch": 0.4058654330872105, + "grad_norm": 0.12676116824150085, + "learning_rate": 0.0006547769612829065, + "loss": 2.6548, + "step": 13687 + }, + { + "epoch": 0.405895086439522, + "grad_norm": 0.1396183967590332, + "learning_rate": 0.0006547322206911313, + "loss": 2.704, + "step": 13688 + }, + { + "epoch": 0.40592473979183347, + "grad_norm": 0.14590364694595337, + "learning_rate": 0.0006546874787291757, + "loss": 2.7031, + "step": 13689 + }, + { + "epoch": 0.40595439314414494, + "grad_norm": 0.12434148788452148, + "learning_rate": 0.0006546427353974359, + "loss": 2.7039, + "step": 13690 + }, + { + "epoch": 0.4059840464964564, + "grad_norm": 0.11853154003620148, + "learning_rate": 0.0006545979906963082, + "loss": 2.6876, + "step": 13691 + }, + { + "epoch": 0.4060136998487679, + "grad_norm": 0.1274670958518982, + "learning_rate": 0.0006545532446261887, + "loss": 2.6943, + "step": 13692 + }, + { + "epoch": 0.40604335320107937, + "grad_norm": 0.1350039541721344, + "learning_rate": 0.0006545084971874737, + "loss": 2.6994, + "step": 13693 + }, + { + "epoch": 0.40607300655339085, + "grad_norm": 0.12477980554103851, + "learning_rate": 0.0006544637483805595, + "loss": 2.7035, + "step": 13694 + }, + { + "epoch": 0.4061026599057023, + "grad_norm": 0.14636103808879852, + "learning_rate": 0.0006544189982058422, + "loss": 2.7061, + "step": 13695 + }, + { + "epoch": 0.4061323132580138, + "grad_norm": 0.13638979196548462, + "learning_rate": 0.0006543742466637183, + "loss": 2.6772, + "step": 13696 + }, + { + "epoch": 0.4061619666103253, + "grad_norm": 0.12543298304080963, + "learning_rate": 0.0006543294937545838, + "loss": 2.6968, + "step": 13697 + }, + { + "epoch": 0.40619161996263675, + "grad_norm": 0.11472618579864502, + "learning_rate": 0.0006542847394788351, + "loss": 2.7004, + "step": 13698 + }, + { + "epoch": 0.40622127331494823, + "grad_norm": 0.11945821344852448, + "learning_rate": 0.0006542399838368688, + "loss": 2.6916, + "step": 13699 + }, + { + "epoch": 0.40625092666725976, + "grad_norm": 0.13285794854164124, + "learning_rate": 0.0006541952268290807, + "loss": 2.6916, + "step": 13700 + }, + { + "epoch": 0.40628058001957124, + "grad_norm": 0.13945282995700836, + "learning_rate": 0.0006541504684558676, + "loss": 2.7077, + "step": 13701 + }, + { + "epoch": 0.4063102333718827, + "grad_norm": 0.1599428355693817, + "learning_rate": 0.0006541057087176256, + "loss": 2.6483, + "step": 13702 + }, + { + "epoch": 0.4063398867241942, + "grad_norm": 0.14484797418117523, + "learning_rate": 0.000654060947614751, + "loss": 2.7002, + "step": 13703 + }, + { + "epoch": 0.40636954007650566, + "grad_norm": 0.1166243627667427, + "learning_rate": 0.0006540161851476404, + "loss": 2.6737, + "step": 13704 + }, + { + "epoch": 0.40639919342881714, + "grad_norm": 0.15358662605285645, + "learning_rate": 0.0006539714213166899, + "loss": 2.6658, + "step": 13705 + }, + { + "epoch": 0.4064288467811286, + "grad_norm": 0.16827650368213654, + "learning_rate": 0.000653926656122296, + "loss": 2.7041, + "step": 13706 + }, + { + "epoch": 0.4064585001334401, + "grad_norm": 0.15368735790252686, + "learning_rate": 0.0006538818895648553, + "loss": 2.6969, + "step": 13707 + }, + { + "epoch": 0.40648815348575157, + "grad_norm": 0.14640840888023376, + "learning_rate": 0.000653837121644764, + "loss": 2.694, + "step": 13708 + }, + { + "epoch": 0.40651780683806304, + "grad_norm": 0.1413961499929428, + "learning_rate": 0.0006537923523624187, + "loss": 2.708, + "step": 13709 + }, + { + "epoch": 0.4065474601903745, + "grad_norm": 0.15546052157878876, + "learning_rate": 0.0006537475817182156, + "loss": 2.6866, + "step": 13710 + }, + { + "epoch": 0.406577113542686, + "grad_norm": 0.151102215051651, + "learning_rate": 0.0006537028097125513, + "loss": 2.6995, + "step": 13711 + }, + { + "epoch": 0.4066067668949975, + "grad_norm": 0.13491730391979218, + "learning_rate": 0.000653658036345822, + "loss": 2.6448, + "step": 13712 + }, + { + "epoch": 0.40663642024730895, + "grad_norm": 0.12312762439250946, + "learning_rate": 0.0006536132616184247, + "loss": 2.6796, + "step": 13713 + }, + { + "epoch": 0.4066660735996204, + "grad_norm": 0.14268451929092407, + "learning_rate": 0.0006535684855307556, + "loss": 2.6974, + "step": 13714 + }, + { + "epoch": 0.4066957269519319, + "grad_norm": 0.12715205550193787, + "learning_rate": 0.0006535237080832111, + "loss": 2.7226, + "step": 13715 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 0.11344905197620392, + "learning_rate": 0.0006534789292761879, + "loss": 2.7015, + "step": 13716 + }, + { + "epoch": 0.40675503365655485, + "grad_norm": 0.12517400085926056, + "learning_rate": 0.0006534341491100824, + "loss": 2.6703, + "step": 13717 + }, + { + "epoch": 0.40678468700886633, + "grad_norm": 0.12464260309934616, + "learning_rate": 0.0006533893675852911, + "loss": 2.6866, + "step": 13718 + }, + { + "epoch": 0.4068143403611778, + "grad_norm": 0.13411952555179596, + "learning_rate": 0.0006533445847022106, + "loss": 2.6492, + "step": 13719 + }, + { + "epoch": 0.4068439937134893, + "grad_norm": 0.1169314980506897, + "learning_rate": 0.0006532998004612376, + "loss": 2.6748, + "step": 13720 + }, + { + "epoch": 0.4068736470658008, + "grad_norm": 0.1290825754404068, + "learning_rate": 0.0006532550148627685, + "loss": 2.661, + "step": 13721 + }, + { + "epoch": 0.4069033004181123, + "grad_norm": 0.13779911398887634, + "learning_rate": 0.0006532102279071999, + "loss": 2.7149, + "step": 13722 + }, + { + "epoch": 0.40693295377042377, + "grad_norm": 0.12722159922122955, + "learning_rate": 0.0006531654395949284, + "loss": 2.7112, + "step": 13723 + }, + { + "epoch": 0.40696260712273524, + "grad_norm": 0.09728789329528809, + "learning_rate": 0.0006531206499263508, + "loss": 2.6926, + "step": 13724 + }, + { + "epoch": 0.4069922604750467, + "grad_norm": 0.11125493794679642, + "learning_rate": 0.0006530758589018635, + "loss": 2.6818, + "step": 13725 + }, + { + "epoch": 0.4070219138273582, + "grad_norm": 0.12066376954317093, + "learning_rate": 0.0006530310665218632, + "loss": 2.6826, + "step": 13726 + }, + { + "epoch": 0.40705156717966967, + "grad_norm": 0.10850758105516434, + "learning_rate": 0.0006529862727867465, + "loss": 2.6323, + "step": 13727 + }, + { + "epoch": 0.40708122053198115, + "grad_norm": 0.10972019284963608, + "learning_rate": 0.00065294147769691, + "loss": 2.6423, + "step": 13728 + }, + { + "epoch": 0.4071108738842926, + "grad_norm": 0.1312810629606247, + "learning_rate": 0.0006528966812527506, + "loss": 2.6742, + "step": 13729 + }, + { + "epoch": 0.4071405272366041, + "grad_norm": 0.1274038851261139, + "learning_rate": 0.0006528518834546649, + "loss": 2.6564, + "step": 13730 + }, + { + "epoch": 0.4071701805889156, + "grad_norm": 0.13678856194019318, + "learning_rate": 0.0006528070843030494, + "loss": 2.6727, + "step": 13731 + }, + { + "epoch": 0.40719983394122705, + "grad_norm": 0.15928781032562256, + "learning_rate": 0.0006527622837983009, + "loss": 2.6793, + "step": 13732 + }, + { + "epoch": 0.40722948729353853, + "grad_norm": 0.15720264613628387, + "learning_rate": 0.0006527174819408164, + "loss": 2.6442, + "step": 13733 + }, + { + "epoch": 0.40725914064585, + "grad_norm": 0.12400732189416885, + "learning_rate": 0.0006526726787309922, + "loss": 2.6679, + "step": 13734 + }, + { + "epoch": 0.4072887939981615, + "grad_norm": 0.11418417096138, + "learning_rate": 0.0006526278741692252, + "loss": 2.6677, + "step": 13735 + }, + { + "epoch": 0.40731844735047296, + "grad_norm": 0.13813738524913788, + "learning_rate": 0.0006525830682559122, + "loss": 2.6511, + "step": 13736 + }, + { + "epoch": 0.40734810070278443, + "grad_norm": 0.1320243775844574, + "learning_rate": 0.0006525382609914501, + "loss": 2.6858, + "step": 13737 + }, + { + "epoch": 0.4073777540550959, + "grad_norm": 0.13242025673389435, + "learning_rate": 0.0006524934523762353, + "loss": 2.6854, + "step": 13738 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.16715572774410248, + "learning_rate": 0.0006524486424106648, + "loss": 2.6359, + "step": 13739 + }, + { + "epoch": 0.40743706075971886, + "grad_norm": 0.15815138816833496, + "learning_rate": 0.0006524038310951356, + "loss": 2.673, + "step": 13740 + }, + { + "epoch": 0.4074667141120304, + "grad_norm": 0.1194697767496109, + "learning_rate": 0.0006523590184300441, + "loss": 2.6976, + "step": 13741 + }, + { + "epoch": 0.40749636746434187, + "grad_norm": 0.13450844585895538, + "learning_rate": 0.0006523142044157875, + "loss": 2.6917, + "step": 13742 + }, + { + "epoch": 0.40752602081665334, + "grad_norm": 0.126102015376091, + "learning_rate": 0.0006522693890527625, + "loss": 2.6757, + "step": 13743 + }, + { + "epoch": 0.4075556741689648, + "grad_norm": 0.13886302709579468, + "learning_rate": 0.0006522245723413658, + "loss": 2.709, + "step": 13744 + }, + { + "epoch": 0.4075853275212763, + "grad_norm": 0.14893558621406555, + "learning_rate": 0.0006521797542819944, + "loss": 2.6539, + "step": 13745 + }, + { + "epoch": 0.4076149808735878, + "grad_norm": 0.1415376365184784, + "learning_rate": 0.0006521349348750452, + "loss": 2.7165, + "step": 13746 + }, + { + "epoch": 0.40764463422589925, + "grad_norm": 0.12870417535305023, + "learning_rate": 0.000652090114120915, + "loss": 2.6738, + "step": 13747 + }, + { + "epoch": 0.4076742875782107, + "grad_norm": 0.11832724511623383, + "learning_rate": 0.0006520452920200008, + "loss": 2.6735, + "step": 13748 + }, + { + "epoch": 0.4077039409305222, + "grad_norm": 0.14524368941783905, + "learning_rate": 0.0006520004685726994, + "loss": 2.6699, + "step": 13749 + }, + { + "epoch": 0.4077335942828337, + "grad_norm": 0.15514464676380157, + "learning_rate": 0.0006519556437794078, + "loss": 2.7381, + "step": 13750 + }, + { + "epoch": 0.40776324763514515, + "grad_norm": 0.1255592554807663, + "learning_rate": 0.0006519108176405227, + "loss": 2.6708, + "step": 13751 + }, + { + "epoch": 0.40779290098745663, + "grad_norm": 0.12077595293521881, + "learning_rate": 0.0006518659901564414, + "loss": 2.6908, + "step": 13752 + }, + { + "epoch": 0.4078225543397681, + "grad_norm": 0.13942040503025055, + "learning_rate": 0.0006518211613275607, + "loss": 2.6935, + "step": 13753 + }, + { + "epoch": 0.4078522076920796, + "grad_norm": 0.12287125736474991, + "learning_rate": 0.0006517763311542776, + "loss": 2.6999, + "step": 13754 + }, + { + "epoch": 0.40788186104439106, + "grad_norm": 0.1331315040588379, + "learning_rate": 0.000651731499636989, + "loss": 2.6903, + "step": 13755 + }, + { + "epoch": 0.40791151439670253, + "grad_norm": 0.13311053812503815, + "learning_rate": 0.0006516866667760919, + "loss": 2.6545, + "step": 13756 + }, + { + "epoch": 0.407941167749014, + "grad_norm": 0.1303166300058365, + "learning_rate": 0.0006516418325719833, + "loss": 2.6734, + "step": 13757 + }, + { + "epoch": 0.4079708211013255, + "grad_norm": 0.1277812272310257, + "learning_rate": 0.0006515969970250601, + "loss": 2.674, + "step": 13758 + }, + { + "epoch": 0.40800047445363696, + "grad_norm": 0.1320481151342392, + "learning_rate": 0.0006515521601357197, + "loss": 2.6738, + "step": 13759 + }, + { + "epoch": 0.40803012780594844, + "grad_norm": 0.11565481871366501, + "learning_rate": 0.0006515073219043589, + "loss": 2.6765, + "step": 13760 + }, + { + "epoch": 0.4080597811582599, + "grad_norm": 0.13341425359249115, + "learning_rate": 0.0006514624823313746, + "loss": 2.6875, + "step": 13761 + }, + { + "epoch": 0.40808943451057145, + "grad_norm": 0.12126098573207855, + "learning_rate": 0.0006514176414171642, + "loss": 2.6888, + "step": 13762 + }, + { + "epoch": 0.4081190878628829, + "grad_norm": 0.11863750219345093, + "learning_rate": 0.0006513727991621246, + "loss": 2.6704, + "step": 13763 + }, + { + "epoch": 0.4081487412151944, + "grad_norm": 0.117739237844944, + "learning_rate": 0.0006513279555666527, + "loss": 2.695, + "step": 13764 + }, + { + "epoch": 0.4081783945675059, + "grad_norm": 0.11507673561573029, + "learning_rate": 0.0006512831106311459, + "loss": 2.6973, + "step": 13765 + }, + { + "epoch": 0.40820804791981735, + "grad_norm": 0.12939396500587463, + "learning_rate": 0.0006512382643560011, + "loss": 2.6777, + "step": 13766 + }, + { + "epoch": 0.40823770127212883, + "grad_norm": 0.12507571280002594, + "learning_rate": 0.0006511934167416156, + "loss": 2.6666, + "step": 13767 + }, + { + "epoch": 0.4082673546244403, + "grad_norm": 0.10850481688976288, + "learning_rate": 0.0006511485677883863, + "loss": 2.6851, + "step": 13768 + }, + { + "epoch": 0.4082970079767518, + "grad_norm": 0.1360655575990677, + "learning_rate": 0.0006511037174967107, + "loss": 2.7169, + "step": 13769 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 0.1729549914598465, + "learning_rate": 0.0006510588658669856, + "loss": 2.7015, + "step": 13770 + }, + { + "epoch": 0.40835631468137473, + "grad_norm": 0.1792670637369156, + "learning_rate": 0.0006510140128996084, + "loss": 2.7029, + "step": 13771 + }, + { + "epoch": 0.4083859680336862, + "grad_norm": 0.17827172577381134, + "learning_rate": 0.0006509691585949762, + "loss": 2.657, + "step": 13772 + }, + { + "epoch": 0.4084156213859977, + "grad_norm": 0.13917818665504456, + "learning_rate": 0.0006509243029534862, + "loss": 2.6514, + "step": 13773 + }, + { + "epoch": 0.40844527473830916, + "grad_norm": 0.13577309250831604, + "learning_rate": 0.0006508794459755354, + "loss": 2.6743, + "step": 13774 + }, + { + "epoch": 0.40847492809062064, + "grad_norm": 0.15460249781608582, + "learning_rate": 0.0006508345876615215, + "loss": 2.686, + "step": 13775 + }, + { + "epoch": 0.4085045814429321, + "grad_norm": 0.14496441185474396, + "learning_rate": 0.0006507897280118413, + "loss": 2.6504, + "step": 13776 + }, + { + "epoch": 0.4085342347952436, + "grad_norm": 0.15760383009910583, + "learning_rate": 0.000650744867026892, + "loss": 2.6831, + "step": 13777 + }, + { + "epoch": 0.40856388814755507, + "grad_norm": 0.13822989165782928, + "learning_rate": 0.0006507000047070711, + "loss": 2.7122, + "step": 13778 + }, + { + "epoch": 0.40859354149986654, + "grad_norm": 0.1484384387731552, + "learning_rate": 0.0006506551410527759, + "loss": 2.6515, + "step": 13779 + }, + { + "epoch": 0.408623194852178, + "grad_norm": 0.1373644471168518, + "learning_rate": 0.0006506102760644037, + "loss": 2.6932, + "step": 13780 + }, + { + "epoch": 0.4086528482044895, + "grad_norm": 0.12459941953420639, + "learning_rate": 0.0006505654097423515, + "loss": 2.7154, + "step": 13781 + }, + { + "epoch": 0.40868250155680097, + "grad_norm": 0.12213601917028427, + "learning_rate": 0.0006505205420870167, + "loss": 2.7448, + "step": 13782 + }, + { + "epoch": 0.4087121549091125, + "grad_norm": 0.13612771034240723, + "learning_rate": 0.0006504756730987966, + "loss": 2.6881, + "step": 13783 + }, + { + "epoch": 0.408741808261424, + "grad_norm": 0.12614494562149048, + "learning_rate": 0.0006504308027780887, + "loss": 2.6585, + "step": 13784 + }, + { + "epoch": 0.40877146161373545, + "grad_norm": 0.12314530462026596, + "learning_rate": 0.0006503859311252903, + "loss": 2.6701, + "step": 13785 + }, + { + "epoch": 0.40880111496604693, + "grad_norm": 0.12398508191108704, + "learning_rate": 0.0006503410581407986, + "loss": 2.6685, + "step": 13786 + }, + { + "epoch": 0.4088307683183584, + "grad_norm": 0.13430257141590118, + "learning_rate": 0.000650296183825011, + "loss": 2.6912, + "step": 13787 + }, + { + "epoch": 0.4088604216706699, + "grad_norm": 0.1305517703294754, + "learning_rate": 0.0006502513081783249, + "loss": 2.6735, + "step": 13788 + }, + { + "epoch": 0.40889007502298136, + "grad_norm": 0.1222628802061081, + "learning_rate": 0.0006502064312011377, + "loss": 2.6824, + "step": 13789 + }, + { + "epoch": 0.40891972837529283, + "grad_norm": 0.11551421135663986, + "learning_rate": 0.0006501615528938466, + "loss": 2.686, + "step": 13790 + }, + { + "epoch": 0.4089493817276043, + "grad_norm": 0.12195300310850143, + "learning_rate": 0.0006501166732568494, + "loss": 2.7028, + "step": 13791 + }, + { + "epoch": 0.4089790350799158, + "grad_norm": 0.09586959332227707, + "learning_rate": 0.0006500717922905433, + "loss": 2.685, + "step": 13792 + }, + { + "epoch": 0.40900868843222726, + "grad_norm": 0.10141245275735855, + "learning_rate": 0.0006500269099953256, + "loss": 2.6617, + "step": 13793 + }, + { + "epoch": 0.40903834178453874, + "grad_norm": 0.11198153346776962, + "learning_rate": 0.0006499820263715938, + "loss": 2.6841, + "step": 13794 + }, + { + "epoch": 0.4090679951368502, + "grad_norm": 0.11387629806995392, + "learning_rate": 0.0006499371414197454, + "loss": 2.6608, + "step": 13795 + }, + { + "epoch": 0.4090976484891617, + "grad_norm": 0.12244108319282532, + "learning_rate": 0.0006498922551401781, + "loss": 2.6528, + "step": 13796 + }, + { + "epoch": 0.40912730184147317, + "grad_norm": 0.12228551506996155, + "learning_rate": 0.000649847367533289, + "loss": 2.6951, + "step": 13797 + }, + { + "epoch": 0.40915695519378464, + "grad_norm": 0.11145424842834473, + "learning_rate": 0.0006498024785994758, + "loss": 2.6482, + "step": 13798 + }, + { + "epoch": 0.4091866085460961, + "grad_norm": 0.1439656913280487, + "learning_rate": 0.0006497575883391359, + "loss": 2.6886, + "step": 13799 + }, + { + "epoch": 0.4092162618984076, + "grad_norm": 0.18747639656066895, + "learning_rate": 0.0006497126967526668, + "loss": 2.6978, + "step": 13800 + }, + { + "epoch": 0.4092459152507191, + "grad_norm": 0.17958776652812958, + "learning_rate": 0.0006496678038404662, + "loss": 2.71, + "step": 13801 + }, + { + "epoch": 0.40927556860303055, + "grad_norm": 0.12494948506355286, + "learning_rate": 0.0006496229096029314, + "loss": 2.6905, + "step": 13802 + }, + { + "epoch": 0.409305221955342, + "grad_norm": 0.14986497163772583, + "learning_rate": 0.0006495780140404601, + "loss": 2.665, + "step": 13803 + }, + { + "epoch": 0.40933487530765356, + "grad_norm": 0.1658545881509781, + "learning_rate": 0.0006495331171534498, + "loss": 2.6776, + "step": 13804 + }, + { + "epoch": 0.40936452865996503, + "grad_norm": 0.13468892872333527, + "learning_rate": 0.0006494882189422981, + "loss": 2.6936, + "step": 13805 + }, + { + "epoch": 0.4093941820122765, + "grad_norm": 0.13957522809505463, + "learning_rate": 0.0006494433194074025, + "loss": 2.69, + "step": 13806 + }, + { + "epoch": 0.409423835364588, + "grad_norm": 0.1560104638338089, + "learning_rate": 0.0006493984185491607, + "loss": 2.6732, + "step": 13807 + }, + { + "epoch": 0.40945348871689946, + "grad_norm": 0.146762877702713, + "learning_rate": 0.0006493535163679704, + "loss": 2.6553, + "step": 13808 + }, + { + "epoch": 0.40948314206921094, + "grad_norm": 0.1252361685037613, + "learning_rate": 0.0006493086128642288, + "loss": 2.6981, + "step": 13809 + }, + { + "epoch": 0.4095127954215224, + "grad_norm": 0.1106487289071083, + "learning_rate": 0.0006492637080383339, + "loss": 2.6777, + "step": 13810 + }, + { + "epoch": 0.4095424487738339, + "grad_norm": 0.1432904601097107, + "learning_rate": 0.0006492188018906833, + "loss": 2.69, + "step": 13811 + }, + { + "epoch": 0.40957210212614537, + "grad_norm": 0.1489887535572052, + "learning_rate": 0.0006491738944216746, + "loss": 2.6732, + "step": 13812 + }, + { + "epoch": 0.40960175547845684, + "grad_norm": 0.13107344508171082, + "learning_rate": 0.0006491289856317055, + "loss": 2.6645, + "step": 13813 + }, + { + "epoch": 0.4096314088307683, + "grad_norm": 0.12371079623699188, + "learning_rate": 0.0006490840755211736, + "loss": 2.7104, + "step": 13814 + }, + { + "epoch": 0.4096610621830798, + "grad_norm": 0.11952613294124603, + "learning_rate": 0.0006490391640904766, + "loss": 2.6817, + "step": 13815 + }, + { + "epoch": 0.40969071553539127, + "grad_norm": 0.10956225544214249, + "learning_rate": 0.0006489942513400121, + "loss": 2.6841, + "step": 13816 + }, + { + "epoch": 0.40972036888770275, + "grad_norm": 0.12000447511672974, + "learning_rate": 0.000648949337270178, + "loss": 2.6841, + "step": 13817 + }, + { + "epoch": 0.4097500222400142, + "grad_norm": 0.1306486278772354, + "learning_rate": 0.0006489044218813722, + "loss": 2.6948, + "step": 13818 + }, + { + "epoch": 0.4097796755923257, + "grad_norm": 0.1307206004858017, + "learning_rate": 0.0006488595051739919, + "loss": 2.6939, + "step": 13819 + }, + { + "epoch": 0.4098093289446372, + "grad_norm": 0.12897154688835144, + "learning_rate": 0.0006488145871484352, + "loss": 2.662, + "step": 13820 + }, + { + "epoch": 0.40983898229694865, + "grad_norm": 0.10618124902248383, + "learning_rate": 0.0006487696678050998, + "loss": 2.6859, + "step": 13821 + }, + { + "epoch": 0.4098686356492601, + "grad_norm": 0.12219790369272232, + "learning_rate": 0.0006487247471443833, + "loss": 2.6719, + "step": 13822 + }, + { + "epoch": 0.4098982890015716, + "grad_norm": 0.1391894668340683, + "learning_rate": 0.000648679825166684, + "loss": 2.6677, + "step": 13823 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 0.13260577619075775, + "learning_rate": 0.0006486349018723992, + "loss": 2.6762, + "step": 13824 + }, + { + "epoch": 0.4099575957061946, + "grad_norm": 0.1306818425655365, + "learning_rate": 0.0006485899772619266, + "loss": 2.6697, + "step": 13825 + }, + { + "epoch": 0.4099872490585061, + "grad_norm": 0.12668952345848083, + "learning_rate": 0.0006485450513356644, + "loss": 2.6546, + "step": 13826 + }, + { + "epoch": 0.41001690241081756, + "grad_norm": 0.13263416290283203, + "learning_rate": 0.0006485001240940102, + "loss": 2.6786, + "step": 13827 + }, + { + "epoch": 0.41004655576312904, + "grad_norm": 0.11849340051412582, + "learning_rate": 0.0006484551955373621, + "loss": 2.6515, + "step": 13828 + }, + { + "epoch": 0.4100762091154405, + "grad_norm": 0.1180693507194519, + "learning_rate": 0.0006484102656661176, + "loss": 2.7207, + "step": 13829 + }, + { + "epoch": 0.410105862467752, + "grad_norm": 0.12604747712612152, + "learning_rate": 0.0006483653344806749, + "loss": 2.7379, + "step": 13830 + }, + { + "epoch": 0.41013551582006347, + "grad_norm": 0.11894894391298294, + "learning_rate": 0.0006483204019814315, + "loss": 2.6791, + "step": 13831 + }, + { + "epoch": 0.41016516917237494, + "grad_norm": 0.10349209606647491, + "learning_rate": 0.0006482754681687854, + "loss": 2.6935, + "step": 13832 + }, + { + "epoch": 0.4101948225246864, + "grad_norm": 0.1293584108352661, + "learning_rate": 0.0006482305330431349, + "loss": 2.6626, + "step": 13833 + }, + { + "epoch": 0.4102244758769979, + "grad_norm": 0.12429377436637878, + "learning_rate": 0.0006481855966048773, + "loss": 2.6682, + "step": 13834 + }, + { + "epoch": 0.41025412922930937, + "grad_norm": 0.130644753575325, + "learning_rate": 0.0006481406588544109, + "loss": 2.6896, + "step": 13835 + }, + { + "epoch": 0.41028378258162085, + "grad_norm": 0.11200260370969772, + "learning_rate": 0.0006480957197921336, + "loss": 2.6821, + "step": 13836 + }, + { + "epoch": 0.4103134359339323, + "grad_norm": 0.12088236212730408, + "learning_rate": 0.0006480507794184431, + "loss": 2.6651, + "step": 13837 + }, + { + "epoch": 0.4103430892862438, + "grad_norm": 0.13297726213932037, + "learning_rate": 0.0006480058377337377, + "loss": 2.6541, + "step": 13838 + }, + { + "epoch": 0.4103727426385553, + "grad_norm": 0.14842581748962402, + "learning_rate": 0.0006479608947384152, + "loss": 2.6394, + "step": 13839 + }, + { + "epoch": 0.41040239599086675, + "grad_norm": 0.13247980177402496, + "learning_rate": 0.0006479159504328736, + "loss": 2.6462, + "step": 13840 + }, + { + "epoch": 0.41043204934317823, + "grad_norm": 0.13646376132965088, + "learning_rate": 0.0006478710048175109, + "loss": 2.6373, + "step": 13841 + }, + { + "epoch": 0.4104617026954897, + "grad_norm": 0.12289762496948242, + "learning_rate": 0.0006478260578927249, + "loss": 2.6635, + "step": 13842 + }, + { + "epoch": 0.4104913560478012, + "grad_norm": 0.11701101809740067, + "learning_rate": 0.0006477811096589139, + "loss": 2.652, + "step": 13843 + }, + { + "epoch": 0.41052100940011266, + "grad_norm": 0.10863805562257767, + "learning_rate": 0.0006477361601164757, + "loss": 2.6845, + "step": 13844 + }, + { + "epoch": 0.4105506627524242, + "grad_norm": 0.12038055062294006, + "learning_rate": 0.0006476912092658085, + "loss": 2.6951, + "step": 13845 + }, + { + "epoch": 0.41058031610473567, + "grad_norm": 0.11086242645978928, + "learning_rate": 0.0006476462571073105, + "loss": 2.6607, + "step": 13846 + }, + { + "epoch": 0.41060996945704714, + "grad_norm": 0.10769634693861008, + "learning_rate": 0.0006476013036413792, + "loss": 2.6921, + "step": 13847 + }, + { + "epoch": 0.4106396228093586, + "grad_norm": 0.10955388844013214, + "learning_rate": 0.0006475563488684132, + "loss": 2.6316, + "step": 13848 + }, + { + "epoch": 0.4106692761616701, + "grad_norm": 0.09995950013399124, + "learning_rate": 0.0006475113927888103, + "loss": 2.6974, + "step": 13849 + }, + { + "epoch": 0.41069892951398157, + "grad_norm": 0.10966327786445618, + "learning_rate": 0.0006474664354029689, + "loss": 2.6494, + "step": 13850 + }, + { + "epoch": 0.41072858286629305, + "grad_norm": 0.12423920631408691, + "learning_rate": 0.0006474214767112869, + "loss": 2.6377, + "step": 13851 + }, + { + "epoch": 0.4107582362186045, + "grad_norm": 0.1191270649433136, + "learning_rate": 0.0006473765167141623, + "loss": 2.6566, + "step": 13852 + }, + { + "epoch": 0.410787889570916, + "grad_norm": 0.1175374761223793, + "learning_rate": 0.0006473315554119933, + "loss": 2.6521, + "step": 13853 + }, + { + "epoch": 0.4108175429232275, + "grad_norm": 0.1137089654803276, + "learning_rate": 0.0006472865928051781, + "loss": 2.6624, + "step": 13854 + }, + { + "epoch": 0.41084719627553895, + "grad_norm": 0.12697923183441162, + "learning_rate": 0.0006472416288941149, + "loss": 2.6795, + "step": 13855 + }, + { + "epoch": 0.4108768496278504, + "grad_norm": 0.14374099671840668, + "learning_rate": 0.0006471966636792018, + "loss": 2.6728, + "step": 13856 + }, + { + "epoch": 0.4109065029801619, + "grad_norm": 0.15830576419830322, + "learning_rate": 0.000647151697160837, + "loss": 2.6823, + "step": 13857 + }, + { + "epoch": 0.4109361563324734, + "grad_norm": 0.16191303730010986, + "learning_rate": 0.0006471067293394187, + "loss": 2.6567, + "step": 13858 + }, + { + "epoch": 0.41096580968478486, + "grad_norm": 0.16752511262893677, + "learning_rate": 0.0006470617602153449, + "loss": 2.6723, + "step": 13859 + }, + { + "epoch": 0.41099546303709633, + "grad_norm": 0.16746574640274048, + "learning_rate": 0.0006470167897890141, + "loss": 2.693, + "step": 13860 + }, + { + "epoch": 0.4110251163894078, + "grad_norm": 0.14624589681625366, + "learning_rate": 0.0006469718180608243, + "loss": 2.6483, + "step": 13861 + }, + { + "epoch": 0.4110547697417193, + "grad_norm": 0.15144875645637512, + "learning_rate": 0.0006469268450311739, + "loss": 2.7317, + "step": 13862 + }, + { + "epoch": 0.41108442309403076, + "grad_norm": 0.18437185883522034, + "learning_rate": 0.000646881870700461, + "loss": 2.7063, + "step": 13863 + }, + { + "epoch": 0.41111407644634224, + "grad_norm": 0.13183189928531647, + "learning_rate": 0.000646836895069084, + "loss": 2.7105, + "step": 13864 + }, + { + "epoch": 0.4111437297986537, + "grad_norm": 0.13391652703285217, + "learning_rate": 0.0006467919181374409, + "loss": 2.6838, + "step": 13865 + }, + { + "epoch": 0.41117338315096524, + "grad_norm": 0.1672421097755432, + "learning_rate": 0.0006467469399059304, + "loss": 2.692, + "step": 13866 + }, + { + "epoch": 0.4112030365032767, + "grad_norm": 0.1308574676513672, + "learning_rate": 0.0006467019603749504, + "loss": 2.6631, + "step": 13867 + }, + { + "epoch": 0.4112326898555882, + "grad_norm": 0.12591946125030518, + "learning_rate": 0.0006466569795448995, + "loss": 2.7409, + "step": 13868 + }, + { + "epoch": 0.41126234320789967, + "grad_norm": 0.14647124707698822, + "learning_rate": 0.0006466119974161759, + "loss": 2.671, + "step": 13869 + }, + { + "epoch": 0.41129199656021115, + "grad_norm": 0.10827726125717163, + "learning_rate": 0.0006465670139891777, + "loss": 2.6726, + "step": 13870 + }, + { + "epoch": 0.4113216499125226, + "grad_norm": 0.11328845471143723, + "learning_rate": 0.0006465220292643036, + "loss": 2.6564, + "step": 13871 + }, + { + "epoch": 0.4113513032648341, + "grad_norm": 0.1126711368560791, + "learning_rate": 0.0006464770432419518, + "loss": 2.6788, + "step": 13872 + }, + { + "epoch": 0.4113809566171456, + "grad_norm": 0.11686854809522629, + "learning_rate": 0.0006464320559225205, + "loss": 2.7188, + "step": 13873 + }, + { + "epoch": 0.41141060996945705, + "grad_norm": 0.11653249710798264, + "learning_rate": 0.0006463870673064083, + "loss": 2.6704, + "step": 13874 + }, + { + "epoch": 0.41144026332176853, + "grad_norm": 0.11118235439062119, + "learning_rate": 0.0006463420773940135, + "loss": 2.6826, + "step": 13875 + }, + { + "epoch": 0.41146991667408, + "grad_norm": 0.11745349317789078, + "learning_rate": 0.0006462970861857343, + "loss": 2.6889, + "step": 13876 + }, + { + "epoch": 0.4114995700263915, + "grad_norm": 0.11303942650556564, + "learning_rate": 0.0006462520936819695, + "loss": 2.7053, + "step": 13877 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 0.11550934612751007, + "learning_rate": 0.0006462070998831172, + "loss": 2.6958, + "step": 13878 + }, + { + "epoch": 0.41155887673101443, + "grad_norm": 0.1363796591758728, + "learning_rate": 0.0006461621047895761, + "loss": 2.7077, + "step": 13879 + }, + { + "epoch": 0.4115885300833259, + "grad_norm": 0.15292878448963165, + "learning_rate": 0.0006461171084017443, + "loss": 2.6829, + "step": 13880 + }, + { + "epoch": 0.4116181834356374, + "grad_norm": 0.1534990817308426, + "learning_rate": 0.0006460721107200205, + "loss": 2.6905, + "step": 13881 + }, + { + "epoch": 0.41164783678794886, + "grad_norm": 0.16601260006427765, + "learning_rate": 0.0006460271117448029, + "loss": 2.6623, + "step": 13882 + }, + { + "epoch": 0.41167749014026034, + "grad_norm": 0.15922510623931885, + "learning_rate": 0.0006459821114764904, + "loss": 2.6749, + "step": 13883 + }, + { + "epoch": 0.4117071434925718, + "grad_norm": 0.14214371144771576, + "learning_rate": 0.0006459371099154813, + "loss": 2.6493, + "step": 13884 + }, + { + "epoch": 0.4117367968448833, + "grad_norm": 0.11751709878444672, + "learning_rate": 0.0006458921070621739, + "loss": 2.7134, + "step": 13885 + }, + { + "epoch": 0.41176645019719477, + "grad_norm": 0.12757143378257751, + "learning_rate": 0.0006458471029169669, + "loss": 2.673, + "step": 13886 + }, + { + "epoch": 0.4117961035495063, + "grad_norm": 0.15896499156951904, + "learning_rate": 0.0006458020974802587, + "loss": 2.6546, + "step": 13887 + }, + { + "epoch": 0.4118257569018178, + "grad_norm": 0.15374431014060974, + "learning_rate": 0.0006457570907524478, + "loss": 2.6989, + "step": 13888 + }, + { + "epoch": 0.41185541025412925, + "grad_norm": 0.1337919980287552, + "learning_rate": 0.0006457120827339331, + "loss": 2.6934, + "step": 13889 + }, + { + "epoch": 0.4118850636064407, + "grad_norm": 0.1364782154560089, + "learning_rate": 0.0006456670734251127, + "loss": 2.663, + "step": 13890 + }, + { + "epoch": 0.4119147169587522, + "grad_norm": 0.1332981437444687, + "learning_rate": 0.0006456220628263856, + "loss": 2.6427, + "step": 13891 + }, + { + "epoch": 0.4119443703110637, + "grad_norm": 0.10287126153707504, + "learning_rate": 0.0006455770509381499, + "loss": 2.6834, + "step": 13892 + }, + { + "epoch": 0.41197402366337516, + "grad_norm": 0.14071518182754517, + "learning_rate": 0.0006455320377608043, + "loss": 2.6551, + "step": 13893 + }, + { + "epoch": 0.41200367701568663, + "grad_norm": 0.15156659483909607, + "learning_rate": 0.0006454870232947479, + "loss": 2.6826, + "step": 13894 + }, + { + "epoch": 0.4120333303679981, + "grad_norm": 0.1249346137046814, + "learning_rate": 0.0006454420075403788, + "loss": 2.647, + "step": 13895 + }, + { + "epoch": 0.4120629837203096, + "grad_norm": 0.15177229046821594, + "learning_rate": 0.0006453969904980957, + "loss": 2.717, + "step": 13896 + }, + { + "epoch": 0.41209263707262106, + "grad_norm": 0.14980976283550262, + "learning_rate": 0.0006453519721682972, + "loss": 2.6763, + "step": 13897 + }, + { + "epoch": 0.41212229042493254, + "grad_norm": 0.143419548869133, + "learning_rate": 0.0006453069525513822, + "loss": 2.6654, + "step": 13898 + }, + { + "epoch": 0.412151943777244, + "grad_norm": 0.14087356626987457, + "learning_rate": 0.0006452619316477491, + "loss": 2.6655, + "step": 13899 + }, + { + "epoch": 0.4121815971295555, + "grad_norm": 0.1491251438856125, + "learning_rate": 0.0006452169094577967, + "loss": 2.657, + "step": 13900 + }, + { + "epoch": 0.41221125048186696, + "grad_norm": 0.1570906639099121, + "learning_rate": 0.0006451718859819236, + "loss": 2.6832, + "step": 13901 + }, + { + "epoch": 0.41224090383417844, + "grad_norm": 0.130584254860878, + "learning_rate": 0.0006451268612205287, + "loss": 2.6951, + "step": 13902 + }, + { + "epoch": 0.4122705571864899, + "grad_norm": 0.13319431245326996, + "learning_rate": 0.0006450818351740104, + "loss": 2.6315, + "step": 13903 + }, + { + "epoch": 0.4123002105388014, + "grad_norm": 0.14590345323085785, + "learning_rate": 0.0006450368078427675, + "loss": 2.6423, + "step": 13904 + }, + { + "epoch": 0.41232986389111287, + "grad_norm": 0.12089316546916962, + "learning_rate": 0.0006449917792271989, + "loss": 2.7002, + "step": 13905 + }, + { + "epoch": 0.41235951724342434, + "grad_norm": 0.10879600048065186, + "learning_rate": 0.000644946749327703, + "loss": 2.7133, + "step": 13906 + }, + { + "epoch": 0.4123891705957358, + "grad_norm": 0.12891440093517303, + "learning_rate": 0.0006449017181446791, + "loss": 2.6901, + "step": 13907 + }, + { + "epoch": 0.41241882394804735, + "grad_norm": 0.1346965879201889, + "learning_rate": 0.0006448566856785253, + "loss": 2.6984, + "step": 13908 + }, + { + "epoch": 0.41244847730035883, + "grad_norm": 0.15262024104595184, + "learning_rate": 0.0006448116519296407, + "loss": 2.6729, + "step": 13909 + }, + { + "epoch": 0.4124781306526703, + "grad_norm": 0.1410692036151886, + "learning_rate": 0.0006447666168984242, + "loss": 2.673, + "step": 13910 + }, + { + "epoch": 0.4125077840049818, + "grad_norm": 0.10761778056621552, + "learning_rate": 0.0006447215805852745, + "loss": 2.677, + "step": 13911 + }, + { + "epoch": 0.41253743735729326, + "grad_norm": 0.13750773668289185, + "learning_rate": 0.0006446765429905903, + "loss": 2.6882, + "step": 13912 + }, + { + "epoch": 0.41256709070960473, + "grad_norm": 0.13578146696090698, + "learning_rate": 0.0006446315041147703, + "loss": 2.6864, + "step": 13913 + }, + { + "epoch": 0.4125967440619162, + "grad_norm": 0.13839854300022125, + "learning_rate": 0.0006445864639582136, + "loss": 2.7036, + "step": 13914 + }, + { + "epoch": 0.4126263974142277, + "grad_norm": 0.1261919140815735, + "learning_rate": 0.000644541422521319, + "loss": 2.6875, + "step": 13915 + }, + { + "epoch": 0.41265605076653916, + "grad_norm": 0.1484137773513794, + "learning_rate": 0.0006444963798044854, + "loss": 2.6614, + "step": 13916 + }, + { + "epoch": 0.41268570411885064, + "grad_norm": 0.1408783495426178, + "learning_rate": 0.0006444513358081114, + "loss": 2.6996, + "step": 13917 + }, + { + "epoch": 0.4127153574711621, + "grad_norm": 0.12557174265384674, + "learning_rate": 0.0006444062905325962, + "loss": 2.6735, + "step": 13918 + }, + { + "epoch": 0.4127450108234736, + "grad_norm": 0.11842531710863113, + "learning_rate": 0.0006443612439783382, + "loss": 2.6682, + "step": 13919 + }, + { + "epoch": 0.41277466417578507, + "grad_norm": 0.12333828210830688, + "learning_rate": 0.0006443161961457368, + "loss": 2.6904, + "step": 13920 + }, + { + "epoch": 0.41280431752809654, + "grad_norm": 0.12525223195552826, + "learning_rate": 0.0006442711470351907, + "loss": 2.6375, + "step": 13921 + }, + { + "epoch": 0.412833970880408, + "grad_norm": 0.13381582498550415, + "learning_rate": 0.000644226096647099, + "loss": 2.6541, + "step": 13922 + }, + { + "epoch": 0.4128636242327195, + "grad_norm": 0.13200564682483673, + "learning_rate": 0.0006441810449818602, + "loss": 2.7122, + "step": 13923 + }, + { + "epoch": 0.41289327758503097, + "grad_norm": 0.11447799205780029, + "learning_rate": 0.0006441359920398736, + "loss": 2.6629, + "step": 13924 + }, + { + "epoch": 0.41292293093734245, + "grad_norm": 0.1270982027053833, + "learning_rate": 0.000644090937821538, + "loss": 2.7043, + "step": 13925 + }, + { + "epoch": 0.4129525842896539, + "grad_norm": 0.12899385392665863, + "learning_rate": 0.0006440458823272524, + "loss": 2.6524, + "step": 13926 + }, + { + "epoch": 0.4129822376419654, + "grad_norm": 0.12444373965263367, + "learning_rate": 0.0006440008255574159, + "loss": 2.6801, + "step": 13927 + }, + { + "epoch": 0.4130118909942769, + "grad_norm": 0.137184739112854, + "learning_rate": 0.0006439557675124273, + "loss": 2.6832, + "step": 13928 + }, + { + "epoch": 0.4130415443465884, + "grad_norm": 0.1385057270526886, + "learning_rate": 0.0006439107081926857, + "loss": 2.6589, + "step": 13929 + }, + { + "epoch": 0.4130711976988999, + "grad_norm": 0.14981018006801605, + "learning_rate": 0.00064386564759859, + "loss": 2.6694, + "step": 13930 + }, + { + "epoch": 0.41310085105121136, + "grad_norm": 0.13046053051948547, + "learning_rate": 0.0006438205857305395, + "loss": 2.6554, + "step": 13931 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 0.14149929583072662, + "learning_rate": 0.0006437755225889328, + "loss": 2.7073, + "step": 13932 + }, + { + "epoch": 0.4131601577558343, + "grad_norm": 0.1405247151851654, + "learning_rate": 0.0006437304581741692, + "loss": 2.7308, + "step": 13933 + }, + { + "epoch": 0.4131898111081458, + "grad_norm": 0.13530534505844116, + "learning_rate": 0.0006436853924866479, + "loss": 2.666, + "step": 13934 + }, + { + "epoch": 0.41321946446045726, + "grad_norm": 0.13951337337493896, + "learning_rate": 0.0006436403255267676, + "loss": 2.6926, + "step": 13935 + }, + { + "epoch": 0.41324911781276874, + "grad_norm": 0.14084431529045105, + "learning_rate": 0.0006435952572949275, + "loss": 2.6708, + "step": 13936 + }, + { + "epoch": 0.4132787711650802, + "grad_norm": 0.14650171995162964, + "learning_rate": 0.0006435501877915269, + "loss": 2.6805, + "step": 13937 + }, + { + "epoch": 0.4133084245173917, + "grad_norm": 0.11761025339365005, + "learning_rate": 0.0006435051170169647, + "loss": 2.7031, + "step": 13938 + }, + { + "epoch": 0.41333807786970317, + "grad_norm": 0.12246038764715195, + "learning_rate": 0.0006434600449716401, + "loss": 2.6765, + "step": 13939 + }, + { + "epoch": 0.41336773122201464, + "grad_norm": 0.10904025286436081, + "learning_rate": 0.0006434149716559521, + "loss": 2.6999, + "step": 13940 + }, + { + "epoch": 0.4133973845743261, + "grad_norm": 0.11131055653095245, + "learning_rate": 0.0006433698970703001, + "loss": 2.6557, + "step": 13941 + }, + { + "epoch": 0.4134270379266376, + "grad_norm": 0.11259347200393677, + "learning_rate": 0.0006433248212150828, + "loss": 2.6591, + "step": 13942 + }, + { + "epoch": 0.4134566912789491, + "grad_norm": 0.12602196633815765, + "learning_rate": 0.0006432797440906997, + "loss": 2.6907, + "step": 13943 + }, + { + "epoch": 0.41348634463126055, + "grad_norm": 0.13303756713867188, + "learning_rate": 0.0006432346656975499, + "loss": 2.7219, + "step": 13944 + }, + { + "epoch": 0.413515997983572, + "grad_norm": 0.16744858026504517, + "learning_rate": 0.0006431895860360325, + "loss": 2.6894, + "step": 13945 + }, + { + "epoch": 0.4135456513358835, + "grad_norm": 0.18656335771083832, + "learning_rate": 0.0006431445051065468, + "loss": 2.6819, + "step": 13946 + }, + { + "epoch": 0.413575304688195, + "grad_norm": 0.14534756541252136, + "learning_rate": 0.0006430994229094919, + "loss": 2.6797, + "step": 13947 + }, + { + "epoch": 0.41360495804050645, + "grad_norm": 0.1259937286376953, + "learning_rate": 0.000643054339445267, + "loss": 2.6885, + "step": 13948 + }, + { + "epoch": 0.413634611392818, + "grad_norm": 0.13106970489025116, + "learning_rate": 0.0006430092547142716, + "loss": 2.681, + "step": 13949 + }, + { + "epoch": 0.41366426474512946, + "grad_norm": 0.15319830179214478, + "learning_rate": 0.0006429641687169046, + "loss": 2.6575, + "step": 13950 + }, + { + "epoch": 0.41369391809744094, + "grad_norm": 0.1304948627948761, + "learning_rate": 0.0006429190814535651, + "loss": 2.6372, + "step": 13951 + }, + { + "epoch": 0.4137235714497524, + "grad_norm": 0.11043114215135574, + "learning_rate": 0.0006428739929246527, + "loss": 2.6851, + "step": 13952 + }, + { + "epoch": 0.4137532248020639, + "grad_norm": 0.13064329326152802, + "learning_rate": 0.0006428289031305668, + "loss": 2.6877, + "step": 13953 + }, + { + "epoch": 0.41378287815437537, + "grad_norm": 0.12614738941192627, + "learning_rate": 0.0006427838120717062, + "loss": 2.6958, + "step": 13954 + }, + { + "epoch": 0.41381253150668684, + "grad_norm": 0.13289576768875122, + "learning_rate": 0.0006427387197484707, + "loss": 2.6879, + "step": 13955 + }, + { + "epoch": 0.4138421848589983, + "grad_norm": 0.15966477990150452, + "learning_rate": 0.0006426936261612591, + "loss": 2.697, + "step": 13956 + }, + { + "epoch": 0.4138718382113098, + "grad_norm": 0.15292920172214508, + "learning_rate": 0.000642648531310471, + "loss": 2.6664, + "step": 13957 + }, + { + "epoch": 0.41390149156362127, + "grad_norm": 0.12102013826370239, + "learning_rate": 0.0006426034351965055, + "loss": 2.6895, + "step": 13958 + }, + { + "epoch": 0.41393114491593275, + "grad_norm": 0.10885229706764221, + "learning_rate": 0.0006425583378197624, + "loss": 2.6813, + "step": 13959 + }, + { + "epoch": 0.4139607982682442, + "grad_norm": 0.12107551097869873, + "learning_rate": 0.0006425132391806406, + "loss": 2.703, + "step": 13960 + }, + { + "epoch": 0.4139904516205557, + "grad_norm": 0.11939080059528351, + "learning_rate": 0.0006424681392795397, + "loss": 2.6843, + "step": 13961 + }, + { + "epoch": 0.4140201049728672, + "grad_norm": 0.1264885812997818, + "learning_rate": 0.000642423038116859, + "loss": 2.706, + "step": 13962 + }, + { + "epoch": 0.41404975832517865, + "grad_norm": 0.12951785326004028, + "learning_rate": 0.0006423779356929978, + "loss": 2.7074, + "step": 13963 + }, + { + "epoch": 0.41407941167749013, + "grad_norm": 0.14675970375537872, + "learning_rate": 0.0006423328320083552, + "loss": 2.6954, + "step": 13964 + }, + { + "epoch": 0.4141090650298016, + "grad_norm": 0.1578073799610138, + "learning_rate": 0.0006422877270633314, + "loss": 2.735, + "step": 13965 + }, + { + "epoch": 0.4141387183821131, + "grad_norm": 0.13329088687896729, + "learning_rate": 0.0006422426208583252, + "loss": 2.6602, + "step": 13966 + }, + { + "epoch": 0.41416837173442456, + "grad_norm": 0.1190643385052681, + "learning_rate": 0.0006421975133937361, + "loss": 2.6778, + "step": 13967 + }, + { + "epoch": 0.41419802508673603, + "grad_norm": 0.1274256557226181, + "learning_rate": 0.0006421524046699639, + "loss": 2.6944, + "step": 13968 + }, + { + "epoch": 0.4142276784390475, + "grad_norm": 0.12184928357601166, + "learning_rate": 0.0006421072946874073, + "loss": 2.697, + "step": 13969 + }, + { + "epoch": 0.41425733179135904, + "grad_norm": 0.12390893697738647, + "learning_rate": 0.0006420621834464666, + "loss": 2.688, + "step": 13970 + }, + { + "epoch": 0.4142869851436705, + "grad_norm": 0.13169538974761963, + "learning_rate": 0.0006420170709475407, + "loss": 2.7125, + "step": 13971 + }, + { + "epoch": 0.414316638495982, + "grad_norm": 0.13373836874961853, + "learning_rate": 0.0006419719571910293, + "loss": 2.6771, + "step": 13972 + }, + { + "epoch": 0.41434629184829347, + "grad_norm": 0.10587545484304428, + "learning_rate": 0.0006419268421773319, + "loss": 2.6572, + "step": 13973 + }, + { + "epoch": 0.41437594520060494, + "grad_norm": 0.12892328202724457, + "learning_rate": 0.0006418817259068478, + "loss": 2.6508, + "step": 13974 + }, + { + "epoch": 0.4144055985529164, + "grad_norm": 0.13648153841495514, + "learning_rate": 0.0006418366083799767, + "loss": 2.7215, + "step": 13975 + }, + { + "epoch": 0.4144352519052279, + "grad_norm": 0.12475308775901794, + "learning_rate": 0.0006417914895971182, + "loss": 2.6872, + "step": 13976 + }, + { + "epoch": 0.4144649052575394, + "grad_norm": 0.11056362837553024, + "learning_rate": 0.0006417463695586718, + "loss": 2.6958, + "step": 13977 + }, + { + "epoch": 0.41449455860985085, + "grad_norm": 0.11930472403764725, + "learning_rate": 0.0006417012482650367, + "loss": 2.6453, + "step": 13978 + }, + { + "epoch": 0.4145242119621623, + "grad_norm": 0.11970286816358566, + "learning_rate": 0.0006416561257166129, + "loss": 2.7025, + "step": 13979 + }, + { + "epoch": 0.4145538653144738, + "grad_norm": 0.12065885215997696, + "learning_rate": 0.0006416110019137997, + "loss": 2.6588, + "step": 13980 + }, + { + "epoch": 0.4145835186667853, + "grad_norm": 0.13710777461528778, + "learning_rate": 0.0006415658768569968, + "loss": 2.7002, + "step": 13981 + }, + { + "epoch": 0.41461317201909675, + "grad_norm": 0.13142088055610657, + "learning_rate": 0.0006415207505466038, + "loss": 2.6825, + "step": 13982 + }, + { + "epoch": 0.41464282537140823, + "grad_norm": 0.13298197090625763, + "learning_rate": 0.0006414756229830203, + "loss": 2.6589, + "step": 13983 + }, + { + "epoch": 0.4146724787237197, + "grad_norm": 0.14854159951210022, + "learning_rate": 0.0006414304941666458, + "loss": 2.6799, + "step": 13984 + }, + { + "epoch": 0.4147021320760312, + "grad_norm": 0.1252245008945465, + "learning_rate": 0.00064138536409788, + "loss": 2.6882, + "step": 13985 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 0.11262739449739456, + "learning_rate": 0.0006413402327771225, + "loss": 2.6817, + "step": 13986 + }, + { + "epoch": 0.41476143878065413, + "grad_norm": 0.13012564182281494, + "learning_rate": 0.0006412951002047731, + "loss": 2.7416, + "step": 13987 + }, + { + "epoch": 0.4147910921329656, + "grad_norm": 0.11122997105121613, + "learning_rate": 0.0006412499663812313, + "loss": 2.6345, + "step": 13988 + }, + { + "epoch": 0.4148207454852771, + "grad_norm": 0.13015758991241455, + "learning_rate": 0.0006412048313068967, + "loss": 2.6784, + "step": 13989 + }, + { + "epoch": 0.41485039883758856, + "grad_norm": 0.13941894471645355, + "learning_rate": 0.0006411596949821691, + "loss": 2.6735, + "step": 13990 + }, + { + "epoch": 0.4148800521899001, + "grad_norm": 0.11169038712978363, + "learning_rate": 0.0006411145574074481, + "loss": 2.6996, + "step": 13991 + }, + { + "epoch": 0.41490970554221157, + "grad_norm": 0.10944006592035294, + "learning_rate": 0.0006410694185831337, + "loss": 2.6791, + "step": 13992 + }, + { + "epoch": 0.41493935889452305, + "grad_norm": 0.1207137405872345, + "learning_rate": 0.0006410242785096254, + "loss": 2.6937, + "step": 13993 + }, + { + "epoch": 0.4149690122468345, + "grad_norm": 0.13568700850009918, + "learning_rate": 0.0006409791371873228, + "loss": 2.6653, + "step": 13994 + }, + { + "epoch": 0.414998665599146, + "grad_norm": 0.13505002856254578, + "learning_rate": 0.0006409339946166257, + "loss": 2.6717, + "step": 13995 + }, + { + "epoch": 0.4150283189514575, + "grad_norm": 0.13619089126586914, + "learning_rate": 0.0006408888507979339, + "loss": 2.699, + "step": 13996 + }, + { + "epoch": 0.41505797230376895, + "grad_norm": 0.14552561938762665, + "learning_rate": 0.000640843705731647, + "loss": 2.7061, + "step": 13997 + }, + { + "epoch": 0.41508762565608043, + "grad_norm": 0.15050159394741058, + "learning_rate": 0.0006407985594181653, + "loss": 2.718, + "step": 13998 + }, + { + "epoch": 0.4151172790083919, + "grad_norm": 0.13275642693042755, + "learning_rate": 0.0006407534118578878, + "loss": 2.6505, + "step": 13999 + }, + { + "epoch": 0.4151469323607034, + "grad_norm": 0.13899274170398712, + "learning_rate": 0.0006407082630512148, + "loss": 2.7204, + "step": 14000 + }, + { + "epoch": 0.41517658571301486, + "grad_norm": 0.14272315800189972, + "learning_rate": 0.000640663112998546, + "loss": 2.6935, + "step": 14001 + }, + { + "epoch": 0.41520623906532633, + "grad_norm": 0.14862428605556488, + "learning_rate": 0.0006406179617002813, + "loss": 2.654, + "step": 14002 + }, + { + "epoch": 0.4152358924176378, + "grad_norm": 0.17497991025447845, + "learning_rate": 0.0006405728091568203, + "loss": 2.7072, + "step": 14003 + }, + { + "epoch": 0.4152655457699493, + "grad_norm": 0.19255703687667847, + "learning_rate": 0.0006405276553685629, + "loss": 2.6808, + "step": 14004 + }, + { + "epoch": 0.41529519912226076, + "grad_norm": 0.1511072814464569, + "learning_rate": 0.0006404825003359091, + "loss": 2.6621, + "step": 14005 + }, + { + "epoch": 0.41532485247457224, + "grad_norm": 0.11102467030286789, + "learning_rate": 0.0006404373440592586, + "loss": 2.6628, + "step": 14006 + }, + { + "epoch": 0.4153545058268837, + "grad_norm": 0.1384437531232834, + "learning_rate": 0.0006403921865390112, + "loss": 2.6638, + "step": 14007 + }, + { + "epoch": 0.4153841591791952, + "grad_norm": 0.1372210681438446, + "learning_rate": 0.0006403470277755671, + "loss": 2.6729, + "step": 14008 + }, + { + "epoch": 0.41541381253150667, + "grad_norm": 0.13339565694332123, + "learning_rate": 0.0006403018677693258, + "loss": 2.6666, + "step": 14009 + }, + { + "epoch": 0.41544346588381814, + "grad_norm": 0.15676242113113403, + "learning_rate": 0.0006402567065206875, + "loss": 2.7328, + "step": 14010 + }, + { + "epoch": 0.4154731192361296, + "grad_norm": 0.16064243018627167, + "learning_rate": 0.000640211544030052, + "loss": 2.6421, + "step": 14011 + }, + { + "epoch": 0.41550277258844115, + "grad_norm": 0.14311754703521729, + "learning_rate": 0.000640166380297819, + "loss": 2.6657, + "step": 14012 + }, + { + "epoch": 0.4155324259407526, + "grad_norm": 0.13036581873893738, + "learning_rate": 0.000640121215324389, + "loss": 2.6901, + "step": 14013 + }, + { + "epoch": 0.4155620792930641, + "grad_norm": 0.1273219734430313, + "learning_rate": 0.0006400760491101613, + "loss": 2.6895, + "step": 14014 + }, + { + "epoch": 0.4155917326453756, + "grad_norm": 0.13509435951709747, + "learning_rate": 0.0006400308816555362, + "loss": 2.6741, + "step": 14015 + }, + { + "epoch": 0.41562138599768705, + "grad_norm": 0.12881946563720703, + "learning_rate": 0.0006399857129609135, + "loss": 2.6279, + "step": 14016 + }, + { + "epoch": 0.41565103934999853, + "grad_norm": 0.10571354627609253, + "learning_rate": 0.0006399405430266935, + "loss": 2.6913, + "step": 14017 + }, + { + "epoch": 0.41568069270231, + "grad_norm": 0.11358248442411423, + "learning_rate": 0.0006398953718532758, + "loss": 2.6619, + "step": 14018 + }, + { + "epoch": 0.4157103460546215, + "grad_norm": 0.11295153200626373, + "learning_rate": 0.0006398501994410607, + "loss": 2.6865, + "step": 14019 + }, + { + "epoch": 0.41573999940693296, + "grad_norm": 0.11153729259967804, + "learning_rate": 0.0006398050257904482, + "loss": 2.6785, + "step": 14020 + }, + { + "epoch": 0.41576965275924443, + "grad_norm": 0.11607451736927032, + "learning_rate": 0.000639759850901838, + "loss": 2.666, + "step": 14021 + }, + { + "epoch": 0.4157993061115559, + "grad_norm": 0.1120966449379921, + "learning_rate": 0.0006397146747756304, + "loss": 2.6902, + "step": 14022 + }, + { + "epoch": 0.4158289594638674, + "grad_norm": 0.10877104103565216, + "learning_rate": 0.0006396694974122253, + "loss": 2.6682, + "step": 14023 + }, + { + "epoch": 0.41585861281617886, + "grad_norm": 0.12349046766757965, + "learning_rate": 0.0006396243188120228, + "loss": 2.6871, + "step": 14024 + }, + { + "epoch": 0.41588826616849034, + "grad_norm": 0.11121414601802826, + "learning_rate": 0.0006395791389754231, + "loss": 2.666, + "step": 14025 + }, + { + "epoch": 0.4159179195208018, + "grad_norm": 0.11164339631795883, + "learning_rate": 0.0006395339579028261, + "loss": 2.677, + "step": 14026 + }, + { + "epoch": 0.4159475728731133, + "grad_norm": 0.10049745440483093, + "learning_rate": 0.000639488775594632, + "loss": 2.6679, + "step": 14027 + }, + { + "epoch": 0.41597722622542477, + "grad_norm": 0.11590708792209625, + "learning_rate": 0.0006394435920512408, + "loss": 2.6889, + "step": 14028 + }, + { + "epoch": 0.41600687957773624, + "grad_norm": 0.12130962312221527, + "learning_rate": 0.0006393984072730525, + "loss": 2.6849, + "step": 14029 + }, + { + "epoch": 0.4160365329300477, + "grad_norm": 0.12043262273073196, + "learning_rate": 0.0006393532212604676, + "loss": 2.7111, + "step": 14030 + }, + { + "epoch": 0.4160661862823592, + "grad_norm": 0.1304318755865097, + "learning_rate": 0.0006393080340138861, + "loss": 2.7004, + "step": 14031 + }, + { + "epoch": 0.4160958396346707, + "grad_norm": 0.10830973833799362, + "learning_rate": 0.000639262845533708, + "loss": 2.6928, + "step": 14032 + }, + { + "epoch": 0.4161254929869822, + "grad_norm": 0.10127236694097519, + "learning_rate": 0.0006392176558203333, + "loss": 2.6906, + "step": 14033 + }, + { + "epoch": 0.4161551463392937, + "grad_norm": 0.11626655608415604, + "learning_rate": 0.0006391724648741625, + "loss": 2.6496, + "step": 14034 + }, + { + "epoch": 0.41618479969160516, + "grad_norm": 0.14186368882656097, + "learning_rate": 0.0006391272726955955, + "loss": 2.6995, + "step": 14035 + }, + { + "epoch": 0.41621445304391663, + "grad_norm": 0.15304043889045715, + "learning_rate": 0.0006390820792850328, + "loss": 2.6729, + "step": 14036 + }, + { + "epoch": 0.4162441063962281, + "grad_norm": 0.14133085310459137, + "learning_rate": 0.0006390368846428743, + "loss": 2.6595, + "step": 14037 + }, + { + "epoch": 0.4162737597485396, + "grad_norm": 0.13648490607738495, + "learning_rate": 0.0006389916887695204, + "loss": 2.6974, + "step": 14038 + }, + { + "epoch": 0.41630341310085106, + "grad_norm": 0.12696994841098785, + "learning_rate": 0.0006389464916653711, + "loss": 2.6983, + "step": 14039 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 0.1322292685508728, + "learning_rate": 0.000638901293330827, + "loss": 2.6786, + "step": 14040 + }, + { + "epoch": 0.416362719805474, + "grad_norm": 0.1439235508441925, + "learning_rate": 0.000638856093766288, + "loss": 2.6674, + "step": 14041 + }, + { + "epoch": 0.4163923731577855, + "grad_norm": 0.1383657157421112, + "learning_rate": 0.0006388108929721543, + "loss": 2.6779, + "step": 14042 + }, + { + "epoch": 0.41642202651009697, + "grad_norm": 0.13975542783737183, + "learning_rate": 0.0006387656909488264, + "loss": 2.6488, + "step": 14043 + }, + { + "epoch": 0.41645167986240844, + "grad_norm": 0.1239178404211998, + "learning_rate": 0.0006387204876967046, + "loss": 2.6843, + "step": 14044 + }, + { + "epoch": 0.4164813332147199, + "grad_norm": 0.12347512692213058, + "learning_rate": 0.0006386752832161889, + "loss": 2.6881, + "step": 14045 + }, + { + "epoch": 0.4165109865670314, + "grad_norm": 0.13616012036800385, + "learning_rate": 0.0006386300775076799, + "loss": 2.637, + "step": 14046 + }, + { + "epoch": 0.41654063991934287, + "grad_norm": 0.14866004884243011, + "learning_rate": 0.0006385848705715778, + "loss": 2.6366, + "step": 14047 + }, + { + "epoch": 0.41657029327165435, + "grad_norm": 0.14193129539489746, + "learning_rate": 0.0006385396624082828, + "loss": 2.6849, + "step": 14048 + }, + { + "epoch": 0.4165999466239658, + "grad_norm": 0.13197004795074463, + "learning_rate": 0.0006384944530181953, + "loss": 2.657, + "step": 14049 + }, + { + "epoch": 0.4166295999762773, + "grad_norm": 0.15765652060508728, + "learning_rate": 0.0006384492424017157, + "loss": 2.6892, + "step": 14050 + }, + { + "epoch": 0.4166592533285888, + "grad_norm": 0.1536930352449417, + "learning_rate": 0.0006384040305592442, + "loss": 2.67, + "step": 14051 + }, + { + "epoch": 0.41668890668090025, + "grad_norm": 0.15002790093421936, + "learning_rate": 0.0006383588174911813, + "loss": 2.7019, + "step": 14052 + }, + { + "epoch": 0.4167185600332118, + "grad_norm": 0.11741921305656433, + "learning_rate": 0.0006383136031979274, + "loss": 2.683, + "step": 14053 + }, + { + "epoch": 0.41674821338552326, + "grad_norm": 0.10647616535425186, + "learning_rate": 0.0006382683876798829, + "loss": 2.6699, + "step": 14054 + }, + { + "epoch": 0.41677786673783473, + "grad_norm": 0.13113324344158173, + "learning_rate": 0.0006382231709374477, + "loss": 2.6734, + "step": 14055 + }, + { + "epoch": 0.4168075200901462, + "grad_norm": 0.12002639472484589, + "learning_rate": 0.0006381779529710229, + "loss": 2.6878, + "step": 14056 + }, + { + "epoch": 0.4168371734424577, + "grad_norm": 0.10645417124032974, + "learning_rate": 0.0006381327337810084, + "loss": 2.6777, + "step": 14057 + }, + { + "epoch": 0.41686682679476916, + "grad_norm": 0.12249168753623962, + "learning_rate": 0.0006380875133678052, + "loss": 2.6745, + "step": 14058 + }, + { + "epoch": 0.41689648014708064, + "grad_norm": 0.1358649730682373, + "learning_rate": 0.0006380422917318131, + "loss": 2.688, + "step": 14059 + }, + { + "epoch": 0.4169261334993921, + "grad_norm": 0.12269578874111176, + "learning_rate": 0.0006379970688734327, + "loss": 2.6706, + "step": 14060 + }, + { + "epoch": 0.4169557868517036, + "grad_norm": 0.10899342596530914, + "learning_rate": 0.0006379518447930648, + "loss": 2.6938, + "step": 14061 + }, + { + "epoch": 0.41698544020401507, + "grad_norm": 0.12163891643285751, + "learning_rate": 0.0006379066194911095, + "loss": 2.6553, + "step": 14062 + }, + { + "epoch": 0.41701509355632654, + "grad_norm": 0.13441985845565796, + "learning_rate": 0.0006378613929679675, + "loss": 2.6963, + "step": 14063 + }, + { + "epoch": 0.417044746908638, + "grad_norm": 0.16540959477424622, + "learning_rate": 0.0006378161652240391, + "loss": 2.686, + "step": 14064 + }, + { + "epoch": 0.4170744002609495, + "grad_norm": 0.16645704209804535, + "learning_rate": 0.0006377709362597251, + "loss": 2.6808, + "step": 14065 + }, + { + "epoch": 0.41710405361326097, + "grad_norm": 0.1653047800064087, + "learning_rate": 0.0006377257060754257, + "loss": 2.6712, + "step": 14066 + }, + { + "epoch": 0.41713370696557245, + "grad_norm": 0.13849776983261108, + "learning_rate": 0.0006376804746715414, + "loss": 2.6512, + "step": 14067 + }, + { + "epoch": 0.4171633603178839, + "grad_norm": 0.13767707347869873, + "learning_rate": 0.0006376352420484728, + "loss": 2.7005, + "step": 14068 + }, + { + "epoch": 0.4171930136701954, + "grad_norm": 0.15964752435684204, + "learning_rate": 0.000637590008206621, + "loss": 2.6999, + "step": 14069 + }, + { + "epoch": 0.4172226670225069, + "grad_norm": 0.14093832671642303, + "learning_rate": 0.0006375447731463857, + "loss": 2.6974, + "step": 14070 + }, + { + "epoch": 0.41725232037481835, + "grad_norm": 0.11609850078821182, + "learning_rate": 0.0006374995368681678, + "loss": 2.646, + "step": 14071 + }, + { + "epoch": 0.41728197372712983, + "grad_norm": 0.14764179289340973, + "learning_rate": 0.000637454299372368, + "loss": 2.6999, + "step": 14072 + }, + { + "epoch": 0.4173116270794413, + "grad_norm": 0.15999984741210938, + "learning_rate": 0.0006374090606593867, + "loss": 2.6651, + "step": 14073 + }, + { + "epoch": 0.41734128043175284, + "grad_norm": 0.1333305686712265, + "learning_rate": 0.0006373638207296246, + "loss": 2.6448, + "step": 14074 + }, + { + "epoch": 0.4173709337840643, + "grad_norm": 0.1274852603673935, + "learning_rate": 0.0006373185795834823, + "loss": 2.6821, + "step": 14075 + }, + { + "epoch": 0.4174005871363758, + "grad_norm": 0.12572915852069855, + "learning_rate": 0.0006372733372213605, + "loss": 2.7133, + "step": 14076 + }, + { + "epoch": 0.41743024048868727, + "grad_norm": 0.12737895548343658, + "learning_rate": 0.0006372280936436597, + "loss": 2.6452, + "step": 14077 + }, + { + "epoch": 0.41745989384099874, + "grad_norm": 0.1317012906074524, + "learning_rate": 0.0006371828488507805, + "loss": 2.7156, + "step": 14078 + }, + { + "epoch": 0.4174895471933102, + "grad_norm": 0.14030255377292633, + "learning_rate": 0.0006371376028431237, + "loss": 2.6982, + "step": 14079 + }, + { + "epoch": 0.4175192005456217, + "grad_norm": 0.12230800837278366, + "learning_rate": 0.0006370923556210898, + "loss": 2.6743, + "step": 14080 + }, + { + "epoch": 0.41754885389793317, + "grad_norm": 0.1357859969139099, + "learning_rate": 0.0006370471071850797, + "loss": 2.6738, + "step": 14081 + }, + { + "epoch": 0.41757850725024465, + "grad_norm": 0.13779182732105255, + "learning_rate": 0.0006370018575354938, + "loss": 2.6852, + "step": 14082 + }, + { + "epoch": 0.4176081606025561, + "grad_norm": 0.12206099182367325, + "learning_rate": 0.000636956606672733, + "loss": 2.6732, + "step": 14083 + }, + { + "epoch": 0.4176378139548676, + "grad_norm": 0.11002122610807419, + "learning_rate": 0.000636911354597198, + "loss": 2.6591, + "step": 14084 + }, + { + "epoch": 0.4176674673071791, + "grad_norm": 0.12143689393997192, + "learning_rate": 0.0006368661013092893, + "loss": 2.6787, + "step": 14085 + }, + { + "epoch": 0.41769712065949055, + "grad_norm": 0.11262428760528564, + "learning_rate": 0.000636820846809408, + "loss": 2.6562, + "step": 14086 + }, + { + "epoch": 0.417726774011802, + "grad_norm": 0.11423199623823166, + "learning_rate": 0.0006367755910979543, + "loss": 2.6684, + "step": 14087 + }, + { + "epoch": 0.4177564273641135, + "grad_norm": 0.11762461811304092, + "learning_rate": 0.0006367303341753294, + "loss": 2.6843, + "step": 14088 + }, + { + "epoch": 0.417786080716425, + "grad_norm": 0.10806327313184738, + "learning_rate": 0.000636685076041934, + "loss": 2.6771, + "step": 14089 + }, + { + "epoch": 0.41781573406873646, + "grad_norm": 0.1375405341386795, + "learning_rate": 0.0006366398166981689, + "loss": 2.6988, + "step": 14090 + }, + { + "epoch": 0.41784538742104793, + "grad_norm": 0.12373777478933334, + "learning_rate": 0.0006365945561444346, + "loss": 2.7072, + "step": 14091 + }, + { + "epoch": 0.4178750407733594, + "grad_norm": 0.13013195991516113, + "learning_rate": 0.0006365492943811321, + "loss": 2.6919, + "step": 14092 + }, + { + "epoch": 0.4179046941256709, + "grad_norm": 0.1292905956506729, + "learning_rate": 0.0006365040314086622, + "loss": 2.684, + "step": 14093 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 0.1074700579047203, + "learning_rate": 0.0006364587672274255, + "loss": 2.7014, + "step": 14094 + }, + { + "epoch": 0.4179640008302939, + "grad_norm": 0.09340399503707886, + "learning_rate": 0.0006364135018378231, + "loss": 2.6815, + "step": 14095 + }, + { + "epoch": 0.41799365418260537, + "grad_norm": 0.10506145656108856, + "learning_rate": 0.0006363682352402558, + "loss": 2.7095, + "step": 14096 + }, + { + "epoch": 0.41802330753491684, + "grad_norm": 0.11951128393411636, + "learning_rate": 0.0006363229674351243, + "loss": 2.6798, + "step": 14097 + }, + { + "epoch": 0.4180529608872283, + "grad_norm": 0.1251915544271469, + "learning_rate": 0.0006362776984228295, + "loss": 2.6208, + "step": 14098 + }, + { + "epoch": 0.4180826142395398, + "grad_norm": 0.11721593886613846, + "learning_rate": 0.0006362324282037724, + "loss": 2.662, + "step": 14099 + }, + { + "epoch": 0.41811226759185127, + "grad_norm": 0.11440218240022659, + "learning_rate": 0.0006361871567783536, + "loss": 2.6393, + "step": 14100 + }, + { + "epoch": 0.41814192094416275, + "grad_norm": 0.12519825994968414, + "learning_rate": 0.0006361418841469743, + "loss": 2.6622, + "step": 14101 + }, + { + "epoch": 0.4181715742964742, + "grad_norm": 0.1396346539258957, + "learning_rate": 0.0006360966103100352, + "loss": 2.6997, + "step": 14102 + }, + { + "epoch": 0.4182012276487857, + "grad_norm": 0.1324690282344818, + "learning_rate": 0.0006360513352679372, + "loss": 2.6566, + "step": 14103 + }, + { + "epoch": 0.4182308810010972, + "grad_norm": 0.13387182354927063, + "learning_rate": 0.0006360060590210814, + "loss": 2.6639, + "step": 14104 + }, + { + "epoch": 0.41826053435340865, + "grad_norm": 0.14033512771129608, + "learning_rate": 0.0006359607815698685, + "loss": 2.661, + "step": 14105 + }, + { + "epoch": 0.41829018770572013, + "grad_norm": 0.16384464502334595, + "learning_rate": 0.0006359155029146995, + "loss": 2.6798, + "step": 14106 + }, + { + "epoch": 0.4183198410580316, + "grad_norm": 0.14079374074935913, + "learning_rate": 0.0006358702230559755, + "loss": 2.7125, + "step": 14107 + }, + { + "epoch": 0.4183494944103431, + "grad_norm": 0.11915639042854309, + "learning_rate": 0.0006358249419940972, + "loss": 2.6684, + "step": 14108 + }, + { + "epoch": 0.41837914776265456, + "grad_norm": 0.11200059950351715, + "learning_rate": 0.0006357796597294659, + "loss": 2.6401, + "step": 14109 + }, + { + "epoch": 0.41840880111496603, + "grad_norm": 0.10897757112979889, + "learning_rate": 0.0006357343762624823, + "loss": 2.6465, + "step": 14110 + }, + { + "epoch": 0.4184384544672775, + "grad_norm": 0.11924581974744797, + "learning_rate": 0.0006356890915935475, + "loss": 2.643, + "step": 14111 + }, + { + "epoch": 0.418468107819589, + "grad_norm": 0.10469887405633926, + "learning_rate": 0.0006356438057230626, + "loss": 2.609, + "step": 14112 + }, + { + "epoch": 0.41849776117190046, + "grad_norm": 0.10574007034301758, + "learning_rate": 0.0006355985186514284, + "loss": 2.663, + "step": 14113 + }, + { + "epoch": 0.41852741452421194, + "grad_norm": 0.13246653974056244, + "learning_rate": 0.0006355532303790461, + "loss": 2.6579, + "step": 14114 + }, + { + "epoch": 0.4185570678765234, + "grad_norm": 0.12967686355113983, + "learning_rate": 0.0006355079409063167, + "loss": 2.6862, + "step": 14115 + }, + { + "epoch": 0.41858672122883495, + "grad_norm": 0.10659307986497879, + "learning_rate": 0.0006354626502336412, + "loss": 2.6606, + "step": 14116 + }, + { + "epoch": 0.4186163745811464, + "grad_norm": 0.14153040945529938, + "learning_rate": 0.0006354173583614207, + "loss": 2.6779, + "step": 14117 + }, + { + "epoch": 0.4186460279334579, + "grad_norm": 0.14223013818264008, + "learning_rate": 0.0006353720652900561, + "loss": 2.6561, + "step": 14118 + }, + { + "epoch": 0.4186756812857694, + "grad_norm": 0.12886746227741241, + "learning_rate": 0.0006353267710199488, + "loss": 2.6674, + "step": 14119 + }, + { + "epoch": 0.41870533463808085, + "grad_norm": 0.15099969506263733, + "learning_rate": 0.0006352814755514997, + "loss": 2.6681, + "step": 14120 + }, + { + "epoch": 0.4187349879903923, + "grad_norm": 0.13171231746673584, + "learning_rate": 0.0006352361788851098, + "loss": 2.6852, + "step": 14121 + }, + { + "epoch": 0.4187646413427038, + "grad_norm": 0.12558145821094513, + "learning_rate": 0.0006351908810211804, + "loss": 2.6673, + "step": 14122 + }, + { + "epoch": 0.4187942946950153, + "grad_norm": 0.14143048226833344, + "learning_rate": 0.0006351455819601125, + "loss": 2.7, + "step": 14123 + }, + { + "epoch": 0.41882394804732676, + "grad_norm": 0.11323820054531097, + "learning_rate": 0.0006351002817023075, + "loss": 2.6958, + "step": 14124 + }, + { + "epoch": 0.41885360139963823, + "grad_norm": 0.11571969836950302, + "learning_rate": 0.0006350549802481661, + "loss": 2.6798, + "step": 14125 + }, + { + "epoch": 0.4188832547519497, + "grad_norm": 0.12522508203983307, + "learning_rate": 0.0006350096775980896, + "loss": 2.6708, + "step": 14126 + }, + { + "epoch": 0.4189129081042612, + "grad_norm": 0.11963922530412674, + "learning_rate": 0.0006349643737524793, + "loss": 2.6543, + "step": 14127 + }, + { + "epoch": 0.41894256145657266, + "grad_norm": 0.12282342463731766, + "learning_rate": 0.0006349190687117363, + "loss": 2.6825, + "step": 14128 + }, + { + "epoch": 0.41897221480888414, + "grad_norm": 0.13705652952194214, + "learning_rate": 0.0006348737624762619, + "loss": 2.6885, + "step": 14129 + }, + { + "epoch": 0.4190018681611956, + "grad_norm": 0.16200897097587585, + "learning_rate": 0.0006348284550464572, + "loss": 2.6879, + "step": 14130 + }, + { + "epoch": 0.4190315215135071, + "grad_norm": 0.15972258150577545, + "learning_rate": 0.0006347831464227233, + "loss": 2.7183, + "step": 14131 + }, + { + "epoch": 0.41906117486581856, + "grad_norm": 0.1427856683731079, + "learning_rate": 0.0006347378366054614, + "loss": 2.682, + "step": 14132 + }, + { + "epoch": 0.41909082821813004, + "grad_norm": 0.12045403569936752, + "learning_rate": 0.0006346925255950728, + "loss": 2.6733, + "step": 14133 + }, + { + "epoch": 0.4191204815704415, + "grad_norm": 0.1547458916902542, + "learning_rate": 0.0006346472133919591, + "loss": 2.6702, + "step": 14134 + }, + { + "epoch": 0.419150134922753, + "grad_norm": 0.14854902029037476, + "learning_rate": 0.0006346018999965209, + "loss": 2.6792, + "step": 14135 + }, + { + "epoch": 0.41917978827506447, + "grad_norm": 0.13763655722141266, + "learning_rate": 0.0006345565854091599, + "loss": 2.6825, + "step": 14136 + }, + { + "epoch": 0.419209441627376, + "grad_norm": 0.13355915248394012, + "learning_rate": 0.0006345112696302772, + "loss": 2.6669, + "step": 14137 + }, + { + "epoch": 0.4192390949796875, + "grad_norm": 0.1054554209113121, + "learning_rate": 0.0006344659526602742, + "loss": 2.6438, + "step": 14138 + }, + { + "epoch": 0.41926874833199895, + "grad_norm": 0.1130516529083252, + "learning_rate": 0.000634420634499552, + "loss": 2.6308, + "step": 14139 + }, + { + "epoch": 0.41929840168431043, + "grad_norm": 0.11611004918813705, + "learning_rate": 0.0006343753151485121, + "loss": 2.6882, + "step": 14140 + }, + { + "epoch": 0.4193280550366219, + "grad_norm": 0.12805680930614471, + "learning_rate": 0.0006343299946075556, + "loss": 2.6721, + "step": 14141 + }, + { + "epoch": 0.4193577083889334, + "grad_norm": 0.14741073548793793, + "learning_rate": 0.0006342846728770841, + "loss": 2.66, + "step": 14142 + }, + { + "epoch": 0.41938736174124486, + "grad_norm": 0.14344589412212372, + "learning_rate": 0.0006342393499574986, + "loss": 2.6909, + "step": 14143 + }, + { + "epoch": 0.41941701509355633, + "grad_norm": 0.13145817816257477, + "learning_rate": 0.0006341940258492007, + "loss": 2.6462, + "step": 14144 + }, + { + "epoch": 0.4194466684458678, + "grad_norm": 0.11299711465835571, + "learning_rate": 0.0006341487005525917, + "loss": 2.6818, + "step": 14145 + }, + { + "epoch": 0.4194763217981793, + "grad_norm": 0.11893327534198761, + "learning_rate": 0.0006341033740680729, + "loss": 2.6745, + "step": 14146 + }, + { + "epoch": 0.41950597515049076, + "grad_norm": 0.10680631548166275, + "learning_rate": 0.0006340580463960457, + "loss": 2.6494, + "step": 14147 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 0.10799460113048553, + "learning_rate": 0.0006340127175369115, + "loss": 2.7202, + "step": 14148 + }, + { + "epoch": 0.4195652818551137, + "grad_norm": 0.11313562095165253, + "learning_rate": 0.0006339673874910716, + "loss": 2.6637, + "step": 14149 + }, + { + "epoch": 0.4195949352074252, + "grad_norm": 0.10142260044813156, + "learning_rate": 0.0006339220562589276, + "loss": 2.6788, + "step": 14150 + }, + { + "epoch": 0.41962458855973667, + "grad_norm": 0.11228017508983612, + "learning_rate": 0.0006338767238408809, + "loss": 2.661, + "step": 14151 + }, + { + "epoch": 0.41965424191204814, + "grad_norm": 0.09305793792009354, + "learning_rate": 0.0006338313902373325, + "loss": 2.6684, + "step": 14152 + }, + { + "epoch": 0.4196838952643596, + "grad_norm": 0.12032756209373474, + "learning_rate": 0.0006337860554486844, + "loss": 2.6847, + "step": 14153 + }, + { + "epoch": 0.4197135486166711, + "grad_norm": 0.11367785930633545, + "learning_rate": 0.0006337407194753377, + "loss": 2.672, + "step": 14154 + }, + { + "epoch": 0.41974320196898257, + "grad_norm": 0.11406444758176804, + "learning_rate": 0.0006336953823176941, + "loss": 2.6455, + "step": 14155 + }, + { + "epoch": 0.41977285532129405, + "grad_norm": 0.1388792097568512, + "learning_rate": 0.0006336500439761549, + "loss": 2.6642, + "step": 14156 + }, + { + "epoch": 0.4198025086736055, + "grad_norm": 0.14137893915176392, + "learning_rate": 0.0006336047044511217, + "loss": 2.6418, + "step": 14157 + }, + { + "epoch": 0.41983216202591706, + "grad_norm": 0.13105061650276184, + "learning_rate": 0.0006335593637429957, + "loss": 2.7052, + "step": 14158 + }, + { + "epoch": 0.41986181537822853, + "grad_norm": 0.15462654829025269, + "learning_rate": 0.0006335140218521788, + "loss": 2.7009, + "step": 14159 + }, + { + "epoch": 0.41989146873054, + "grad_norm": 0.16886095702648163, + "learning_rate": 0.0006334686787790722, + "loss": 2.7028, + "step": 14160 + }, + { + "epoch": 0.4199211220828515, + "grad_norm": 0.158869206905365, + "learning_rate": 0.0006334233345240776, + "loss": 2.6609, + "step": 14161 + }, + { + "epoch": 0.41995077543516296, + "grad_norm": 0.15188829600811005, + "learning_rate": 0.0006333779890875966, + "loss": 2.637, + "step": 14162 + }, + { + "epoch": 0.41998042878747444, + "grad_norm": 0.13946856558322906, + "learning_rate": 0.0006333326424700304, + "loss": 2.6928, + "step": 14163 + }, + { + "epoch": 0.4200100821397859, + "grad_norm": 0.1328698694705963, + "learning_rate": 0.000633287294671781, + "loss": 2.6675, + "step": 14164 + }, + { + "epoch": 0.4200397354920974, + "grad_norm": 0.11849011480808258, + "learning_rate": 0.0006332419456932493, + "loss": 2.7064, + "step": 14165 + }, + { + "epoch": 0.42006938884440886, + "grad_norm": 0.14369408786296844, + "learning_rate": 0.0006331965955348375, + "loss": 2.6839, + "step": 14166 + }, + { + "epoch": 0.42009904219672034, + "grad_norm": 0.16346819698810577, + "learning_rate": 0.0006331512441969473, + "loss": 2.6807, + "step": 14167 + }, + { + "epoch": 0.4201286955490318, + "grad_norm": 0.1317630261182785, + "learning_rate": 0.0006331058916799797, + "loss": 2.6949, + "step": 14168 + }, + { + "epoch": 0.4201583489013433, + "grad_norm": 0.13168157637119293, + "learning_rate": 0.0006330605379843366, + "loss": 2.702, + "step": 14169 + }, + { + "epoch": 0.42018800225365477, + "grad_norm": 0.1265283226966858, + "learning_rate": 0.0006330151831104196, + "loss": 2.6485, + "step": 14170 + }, + { + "epoch": 0.42021765560596624, + "grad_norm": 0.12256040424108505, + "learning_rate": 0.0006329698270586302, + "loss": 2.6759, + "step": 14171 + }, + { + "epoch": 0.4202473089582777, + "grad_norm": 0.1507558822631836, + "learning_rate": 0.0006329244698293704, + "loss": 2.681, + "step": 14172 + }, + { + "epoch": 0.4202769623105892, + "grad_norm": 0.1268395632505417, + "learning_rate": 0.0006328791114230414, + "loss": 2.6718, + "step": 14173 + }, + { + "epoch": 0.4203066156629007, + "grad_norm": 0.12163352966308594, + "learning_rate": 0.0006328337518400453, + "loss": 2.6687, + "step": 14174 + }, + { + "epoch": 0.42033626901521215, + "grad_norm": 0.13785411417484283, + "learning_rate": 0.0006327883910807832, + "loss": 2.6858, + "step": 14175 + }, + { + "epoch": 0.4203659223675236, + "grad_norm": 0.12997014820575714, + "learning_rate": 0.0006327430291456573, + "loss": 2.6596, + "step": 14176 + }, + { + "epoch": 0.4203955757198351, + "grad_norm": 0.13656921684741974, + "learning_rate": 0.0006326976660350691, + "loss": 2.7098, + "step": 14177 + }, + { + "epoch": 0.42042522907214663, + "grad_norm": 0.14693865180015564, + "learning_rate": 0.0006326523017494202, + "loss": 2.6988, + "step": 14178 + }, + { + "epoch": 0.4204548824244581, + "grad_norm": 0.13236412405967712, + "learning_rate": 0.0006326069362891125, + "loss": 2.6912, + "step": 14179 + }, + { + "epoch": 0.4204845357767696, + "grad_norm": 0.12162786722183228, + "learning_rate": 0.0006325615696545476, + "loss": 2.6745, + "step": 14180 + }, + { + "epoch": 0.42051418912908106, + "grad_norm": 0.1310712844133377, + "learning_rate": 0.0006325162018461272, + "loss": 2.6813, + "step": 14181 + }, + { + "epoch": 0.42054384248139254, + "grad_norm": 0.11531872302293777, + "learning_rate": 0.0006324708328642531, + "loss": 2.6603, + "step": 14182 + }, + { + "epoch": 0.420573495833704, + "grad_norm": 0.12449506670236588, + "learning_rate": 0.0006324254627093271, + "loss": 2.6842, + "step": 14183 + }, + { + "epoch": 0.4206031491860155, + "grad_norm": 0.12449745833873749, + "learning_rate": 0.0006323800913817508, + "loss": 2.6736, + "step": 14184 + }, + { + "epoch": 0.42063280253832697, + "grad_norm": 0.14226122200489044, + "learning_rate": 0.000632334718881926, + "loss": 2.6894, + "step": 14185 + }, + { + "epoch": 0.42066245589063844, + "grad_norm": 0.12600448727607727, + "learning_rate": 0.0006322893452102548, + "loss": 2.6848, + "step": 14186 + }, + { + "epoch": 0.4206921092429499, + "grad_norm": 0.14326627552509308, + "learning_rate": 0.0006322439703671385, + "loss": 2.6709, + "step": 14187 + }, + { + "epoch": 0.4207217625952614, + "grad_norm": 0.16562676429748535, + "learning_rate": 0.0006321985943529793, + "loss": 2.6769, + "step": 14188 + }, + { + "epoch": 0.42075141594757287, + "grad_norm": 0.1559835821390152, + "learning_rate": 0.0006321532171681788, + "loss": 2.6838, + "step": 14189 + }, + { + "epoch": 0.42078106929988435, + "grad_norm": 0.1508663147687912, + "learning_rate": 0.000632107838813139, + "loss": 2.6988, + "step": 14190 + }, + { + "epoch": 0.4208107226521958, + "grad_norm": 0.11999964714050293, + "learning_rate": 0.0006320624592882614, + "loss": 2.6669, + "step": 14191 + }, + { + "epoch": 0.4208403760045073, + "grad_norm": 0.11800715327262878, + "learning_rate": 0.0006320170785939481, + "loss": 2.688, + "step": 14192 + }, + { + "epoch": 0.4208700293568188, + "grad_norm": 0.1308068335056305, + "learning_rate": 0.000631971696730601, + "loss": 2.6927, + "step": 14193 + }, + { + "epoch": 0.42089968270913025, + "grad_norm": 0.13753743469715118, + "learning_rate": 0.0006319263136986218, + "loss": 2.681, + "step": 14194 + }, + { + "epoch": 0.42092933606144173, + "grad_norm": 0.14162404835224152, + "learning_rate": 0.0006318809294984125, + "loss": 2.7144, + "step": 14195 + }, + { + "epoch": 0.4209589894137532, + "grad_norm": 0.13245204091072083, + "learning_rate": 0.000631835544130375, + "loss": 2.6584, + "step": 14196 + }, + { + "epoch": 0.4209886427660647, + "grad_norm": 0.12761728465557098, + "learning_rate": 0.0006317901575949109, + "loss": 2.6631, + "step": 14197 + }, + { + "epoch": 0.42101829611837616, + "grad_norm": 0.12697474658489227, + "learning_rate": 0.0006317447698924223, + "loss": 2.7082, + "step": 14198 + }, + { + "epoch": 0.4210479494706877, + "grad_norm": 0.1256260722875595, + "learning_rate": 0.0006316993810233114, + "loss": 2.6824, + "step": 14199 + }, + { + "epoch": 0.42107760282299916, + "grad_norm": 0.14004892110824585, + "learning_rate": 0.00063165399098798, + "loss": 2.6987, + "step": 14200 + }, + { + "epoch": 0.42110725617531064, + "grad_norm": 0.13346298038959503, + "learning_rate": 0.0006316085997868297, + "loss": 2.7035, + "step": 14201 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 0.1242302879691124, + "learning_rate": 0.0006315632074202626, + "loss": 2.6814, + "step": 14202 + }, + { + "epoch": 0.4211665628799336, + "grad_norm": 0.1260339915752411, + "learning_rate": 0.0006315178138886808, + "loss": 2.6737, + "step": 14203 + }, + { + "epoch": 0.42119621623224507, + "grad_norm": 0.11851991713047028, + "learning_rate": 0.000631472419192486, + "loss": 2.6898, + "step": 14204 + }, + { + "epoch": 0.42122586958455654, + "grad_norm": 0.11542539298534393, + "learning_rate": 0.0006314270233320806, + "loss": 2.6783, + "step": 14205 + }, + { + "epoch": 0.421255522936868, + "grad_norm": 0.123759426176548, + "learning_rate": 0.0006313816263078662, + "loss": 2.6594, + "step": 14206 + }, + { + "epoch": 0.4212851762891795, + "grad_norm": 0.12337324023246765, + "learning_rate": 0.000631336228120245, + "loss": 2.6763, + "step": 14207 + }, + { + "epoch": 0.421314829641491, + "grad_norm": 0.1284313052892685, + "learning_rate": 0.0006312908287696191, + "loss": 2.6604, + "step": 14208 + }, + { + "epoch": 0.42134448299380245, + "grad_norm": 0.1274505853652954, + "learning_rate": 0.0006312454282563902, + "loss": 2.7098, + "step": 14209 + }, + { + "epoch": 0.4213741363461139, + "grad_norm": 0.12644517421722412, + "learning_rate": 0.0006312000265809606, + "loss": 2.6742, + "step": 14210 + }, + { + "epoch": 0.4214037896984254, + "grad_norm": 0.13371862471103668, + "learning_rate": 0.0006311546237437321, + "loss": 2.6345, + "step": 14211 + }, + { + "epoch": 0.4214334430507369, + "grad_norm": 0.12721136212348938, + "learning_rate": 0.000631109219745107, + "loss": 2.6994, + "step": 14212 + }, + { + "epoch": 0.42146309640304835, + "grad_norm": 0.11237907409667969, + "learning_rate": 0.0006310638145854872, + "loss": 2.698, + "step": 14213 + }, + { + "epoch": 0.42149274975535983, + "grad_norm": 0.12343048304319382, + "learning_rate": 0.000631018408265275, + "loss": 2.6805, + "step": 14214 + }, + { + "epoch": 0.4215224031076713, + "grad_norm": 0.11805622279644012, + "learning_rate": 0.0006309730007848722, + "loss": 2.683, + "step": 14215 + }, + { + "epoch": 0.4215520564599828, + "grad_norm": 0.12682482600212097, + "learning_rate": 0.0006309275921446808, + "loss": 2.7051, + "step": 14216 + }, + { + "epoch": 0.42158170981229426, + "grad_norm": 0.11427690833806992, + "learning_rate": 0.0006308821823451035, + "loss": 2.6755, + "step": 14217 + }, + { + "epoch": 0.42161136316460573, + "grad_norm": 0.1296980232000351, + "learning_rate": 0.0006308367713865416, + "loss": 2.6665, + "step": 14218 + }, + { + "epoch": 0.4216410165169172, + "grad_norm": 0.13690824806690216, + "learning_rate": 0.0006307913592693979, + "loss": 2.6781, + "step": 14219 + }, + { + "epoch": 0.42167066986922874, + "grad_norm": 0.13852530717849731, + "learning_rate": 0.0006307459459940741, + "loss": 2.6816, + "step": 14220 + }, + { + "epoch": 0.4217003232215402, + "grad_norm": 0.11707315593957901, + "learning_rate": 0.0006307005315609726, + "loss": 2.7025, + "step": 14221 + }, + { + "epoch": 0.4217299765738517, + "grad_norm": 0.13402584195137024, + "learning_rate": 0.0006306551159704955, + "loss": 2.6861, + "step": 14222 + }, + { + "epoch": 0.42175962992616317, + "grad_norm": 0.12355323880910873, + "learning_rate": 0.0006306096992230448, + "loss": 2.6583, + "step": 14223 + }, + { + "epoch": 0.42178928327847465, + "grad_norm": 0.12350968271493912, + "learning_rate": 0.0006305642813190229, + "loss": 2.6876, + "step": 14224 + }, + { + "epoch": 0.4218189366307861, + "grad_norm": 0.11529877781867981, + "learning_rate": 0.0006305188622588318, + "loss": 2.6528, + "step": 14225 + }, + { + "epoch": 0.4218485899830976, + "grad_norm": 0.12828591465950012, + "learning_rate": 0.0006304734420428739, + "loss": 2.6623, + "step": 14226 + }, + { + "epoch": 0.4218782433354091, + "grad_norm": 0.1403450220823288, + "learning_rate": 0.0006304280206715511, + "loss": 2.6774, + "step": 14227 + }, + { + "epoch": 0.42190789668772055, + "grad_norm": 0.13123837113380432, + "learning_rate": 0.000630382598145266, + "loss": 2.6876, + "step": 14228 + }, + { + "epoch": 0.42193755004003203, + "grad_norm": 0.14009149372577667, + "learning_rate": 0.0006303371744644203, + "loss": 2.6615, + "step": 14229 + }, + { + "epoch": 0.4219672033923435, + "grad_norm": 0.1422194093465805, + "learning_rate": 0.0006302917496294168, + "loss": 2.6972, + "step": 14230 + }, + { + "epoch": 0.421996856744655, + "grad_norm": 0.14155213534832, + "learning_rate": 0.0006302463236406573, + "loss": 2.6911, + "step": 14231 + }, + { + "epoch": 0.42202651009696646, + "grad_norm": 0.13054370880126953, + "learning_rate": 0.0006302008964985444, + "loss": 2.681, + "step": 14232 + }, + { + "epoch": 0.42205616344927793, + "grad_norm": 0.11904478073120117, + "learning_rate": 0.0006301554682034803, + "loss": 2.7057, + "step": 14233 + }, + { + "epoch": 0.4220858168015894, + "grad_norm": 0.13351276516914368, + "learning_rate": 0.0006301100387558671, + "loss": 2.6998, + "step": 14234 + }, + { + "epoch": 0.4221154701539009, + "grad_norm": 0.15688903629779816, + "learning_rate": 0.0006300646081561071, + "loss": 2.6626, + "step": 14235 + }, + { + "epoch": 0.42214512350621236, + "grad_norm": 0.18237845599651337, + "learning_rate": 0.0006300191764046026, + "loss": 2.7065, + "step": 14236 + }, + { + "epoch": 0.42217477685852384, + "grad_norm": 0.16443653404712677, + "learning_rate": 0.0006299737435017562, + "loss": 2.6489, + "step": 14237 + }, + { + "epoch": 0.4222044302108353, + "grad_norm": 0.142892524600029, + "learning_rate": 0.0006299283094479699, + "loss": 2.6938, + "step": 14238 + }, + { + "epoch": 0.4222340835631468, + "grad_norm": 0.12685281038284302, + "learning_rate": 0.0006298828742436461, + "loss": 2.6729, + "step": 14239 + }, + { + "epoch": 0.42226373691545827, + "grad_norm": 0.12519896030426025, + "learning_rate": 0.0006298374378891871, + "loss": 2.6647, + "step": 14240 + }, + { + "epoch": 0.4222933902677698, + "grad_norm": 0.14212501049041748, + "learning_rate": 0.0006297920003849954, + "loss": 2.6576, + "step": 14241 + }, + { + "epoch": 0.4223230436200813, + "grad_norm": 0.12911947071552277, + "learning_rate": 0.0006297465617314731, + "loss": 2.6789, + "step": 14242 + }, + { + "epoch": 0.42235269697239275, + "grad_norm": 0.11712821573019028, + "learning_rate": 0.000629701121929023, + "loss": 2.6677, + "step": 14243 + }, + { + "epoch": 0.4223823503247042, + "grad_norm": 0.11962416768074036, + "learning_rate": 0.0006296556809780471, + "loss": 2.6823, + "step": 14244 + }, + { + "epoch": 0.4224120036770157, + "grad_norm": 0.13223087787628174, + "learning_rate": 0.0006296102388789477, + "loss": 2.6794, + "step": 14245 + }, + { + "epoch": 0.4224416570293272, + "grad_norm": 0.13894109427928925, + "learning_rate": 0.0006295647956321276, + "loss": 2.7004, + "step": 14246 + }, + { + "epoch": 0.42247131038163865, + "grad_norm": 0.12411034852266312, + "learning_rate": 0.0006295193512379888, + "loss": 2.689, + "step": 14247 + }, + { + "epoch": 0.42250096373395013, + "grad_norm": 0.12333064526319504, + "learning_rate": 0.0006294739056969341, + "loss": 2.6885, + "step": 14248 + }, + { + "epoch": 0.4225306170862616, + "grad_norm": 0.12031564116477966, + "learning_rate": 0.0006294284590093657, + "loss": 2.6853, + "step": 14249 + }, + { + "epoch": 0.4225602704385731, + "grad_norm": 0.111724354326725, + "learning_rate": 0.000629383011175686, + "loss": 2.6796, + "step": 14250 + }, + { + "epoch": 0.42258992379088456, + "grad_norm": 0.12204542011022568, + "learning_rate": 0.0006293375621962975, + "loss": 2.6822, + "step": 14251 + }, + { + "epoch": 0.42261957714319603, + "grad_norm": 0.13391904532909393, + "learning_rate": 0.0006292921120716029, + "loss": 2.687, + "step": 14252 + }, + { + "epoch": 0.4226492304955075, + "grad_norm": 0.12808191776275635, + "learning_rate": 0.0006292466608020043, + "loss": 2.6829, + "step": 14253 + }, + { + "epoch": 0.422678883847819, + "grad_norm": 0.13819842040538788, + "learning_rate": 0.0006292012083879044, + "loss": 2.6745, + "step": 14254 + }, + { + "epoch": 0.42270853720013046, + "grad_norm": 0.11937204748392105, + "learning_rate": 0.0006291557548297055, + "loss": 2.6282, + "step": 14255 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 0.1150941550731659, + "learning_rate": 0.0006291103001278102, + "loss": 2.6566, + "step": 14256 + }, + { + "epoch": 0.4227678439047534, + "grad_norm": 0.11204612255096436, + "learning_rate": 0.0006290648442826213, + "loss": 2.7193, + "step": 14257 + }, + { + "epoch": 0.4227974972570649, + "grad_norm": 0.11665309220552444, + "learning_rate": 0.0006290193872945408, + "loss": 2.6948, + "step": 14258 + }, + { + "epoch": 0.42282715060937637, + "grad_norm": 0.12829023599624634, + "learning_rate": 0.0006289739291639716, + "loss": 2.6836, + "step": 14259 + }, + { + "epoch": 0.42285680396168784, + "grad_norm": 0.12516508996486664, + "learning_rate": 0.0006289284698913161, + "loss": 2.6618, + "step": 14260 + }, + { + "epoch": 0.4228864573139993, + "grad_norm": 0.13921798765659332, + "learning_rate": 0.0006288830094769768, + "loss": 2.6456, + "step": 14261 + }, + { + "epoch": 0.42291611066631085, + "grad_norm": 0.14460912346839905, + "learning_rate": 0.0006288375479213564, + "loss": 2.6965, + "step": 14262 + }, + { + "epoch": 0.4229457640186223, + "grad_norm": 0.1559712290763855, + "learning_rate": 0.0006287920852248573, + "loss": 2.6842, + "step": 14263 + }, + { + "epoch": 0.4229754173709338, + "grad_norm": 0.17260603606700897, + "learning_rate": 0.0006287466213878824, + "loss": 2.7012, + "step": 14264 + }, + { + "epoch": 0.4230050707232453, + "grad_norm": 0.1718173623085022, + "learning_rate": 0.0006287011564108338, + "loss": 2.6438, + "step": 14265 + }, + { + "epoch": 0.42303472407555676, + "grad_norm": 0.13461723923683167, + "learning_rate": 0.0006286556902941145, + "loss": 2.6793, + "step": 14266 + }, + { + "epoch": 0.42306437742786823, + "grad_norm": 0.15240660309791565, + "learning_rate": 0.000628610223038127, + "loss": 2.6365, + "step": 14267 + }, + { + "epoch": 0.4230940307801797, + "grad_norm": 0.16740970313549042, + "learning_rate": 0.0006285647546432738, + "loss": 2.6848, + "step": 14268 + }, + { + "epoch": 0.4231236841324912, + "grad_norm": 0.1357571929693222, + "learning_rate": 0.0006285192851099577, + "loss": 2.675, + "step": 14269 + }, + { + "epoch": 0.42315333748480266, + "grad_norm": 0.14477381110191345, + "learning_rate": 0.0006284738144385812, + "loss": 2.6555, + "step": 14270 + }, + { + "epoch": 0.42318299083711414, + "grad_norm": 0.14369063079357147, + "learning_rate": 0.0006284283426295471, + "loss": 2.6753, + "step": 14271 + }, + { + "epoch": 0.4232126441894256, + "grad_norm": 0.13189859688282013, + "learning_rate": 0.0006283828696832581, + "loss": 2.647, + "step": 14272 + }, + { + "epoch": 0.4232422975417371, + "grad_norm": 0.11864487826824188, + "learning_rate": 0.0006283373956001167, + "loss": 2.689, + "step": 14273 + }, + { + "epoch": 0.42327195089404857, + "grad_norm": 0.11312282830476761, + "learning_rate": 0.0006282919203805255, + "loss": 2.687, + "step": 14274 + }, + { + "epoch": 0.42330160424636004, + "grad_norm": 0.12290363013744354, + "learning_rate": 0.0006282464440248872, + "loss": 2.6659, + "step": 14275 + }, + { + "epoch": 0.4233312575986715, + "grad_norm": 0.13260948657989502, + "learning_rate": 0.0006282009665336049, + "loss": 2.6635, + "step": 14276 + }, + { + "epoch": 0.423360910950983, + "grad_norm": 0.11343319714069366, + "learning_rate": 0.000628155487907081, + "loss": 2.6923, + "step": 14277 + }, + { + "epoch": 0.42339056430329447, + "grad_norm": 0.12619170546531677, + "learning_rate": 0.0006281100081457181, + "loss": 2.6895, + "step": 14278 + }, + { + "epoch": 0.42342021765560595, + "grad_norm": 0.1342933028936386, + "learning_rate": 0.0006280645272499193, + "loss": 2.6967, + "step": 14279 + }, + { + "epoch": 0.4234498710079174, + "grad_norm": 0.13010963797569275, + "learning_rate": 0.000628019045220087, + "loss": 2.6745, + "step": 14280 + }, + { + "epoch": 0.4234795243602289, + "grad_norm": 0.1361878365278244, + "learning_rate": 0.000627973562056624, + "loss": 2.648, + "step": 14281 + }, + { + "epoch": 0.42350917771254043, + "grad_norm": 0.12746956944465637, + "learning_rate": 0.0006279280777599332, + "loss": 2.7011, + "step": 14282 + }, + { + "epoch": 0.4235388310648519, + "grad_norm": 0.13932085037231445, + "learning_rate": 0.0006278825923304174, + "loss": 2.6729, + "step": 14283 + }, + { + "epoch": 0.4235684844171634, + "grad_norm": 0.1285223513841629, + "learning_rate": 0.0006278371057684793, + "loss": 2.6438, + "step": 14284 + }, + { + "epoch": 0.42359813776947486, + "grad_norm": 0.11938720941543579, + "learning_rate": 0.0006277916180745215, + "loss": 2.6825, + "step": 14285 + }, + { + "epoch": 0.42362779112178633, + "grad_norm": 0.14395658671855927, + "learning_rate": 0.0006277461292489473, + "loss": 2.6277, + "step": 14286 + }, + { + "epoch": 0.4236574444740978, + "grad_norm": 0.13743728399276733, + "learning_rate": 0.000627700639292159, + "loss": 2.6727, + "step": 14287 + }, + { + "epoch": 0.4236870978264093, + "grad_norm": 0.12190514802932739, + "learning_rate": 0.0006276551482045596, + "loss": 2.6943, + "step": 14288 + }, + { + "epoch": 0.42371675117872076, + "grad_norm": 0.12686686217784882, + "learning_rate": 0.000627609655986552, + "loss": 2.6692, + "step": 14289 + }, + { + "epoch": 0.42374640453103224, + "grad_norm": 0.15068450570106506, + "learning_rate": 0.0006275641626385389, + "loss": 2.6711, + "step": 14290 + }, + { + "epoch": 0.4237760578833437, + "grad_norm": 0.13691121339797974, + "learning_rate": 0.0006275186681609233, + "loss": 2.6681, + "step": 14291 + }, + { + "epoch": 0.4238057112356552, + "grad_norm": 0.09555702656507492, + "learning_rate": 0.0006274731725541081, + "loss": 2.6711, + "step": 14292 + }, + { + "epoch": 0.42383536458796667, + "grad_norm": 0.1352015882730484, + "learning_rate": 0.0006274276758184961, + "loss": 2.6919, + "step": 14293 + }, + { + "epoch": 0.42386501794027814, + "grad_norm": 0.14377570152282715, + "learning_rate": 0.0006273821779544899, + "loss": 2.6503, + "step": 14294 + }, + { + "epoch": 0.4238946712925896, + "grad_norm": 0.143875852227211, + "learning_rate": 0.0006273366789624928, + "loss": 2.7018, + "step": 14295 + }, + { + "epoch": 0.4239243246449011, + "grad_norm": 0.1517053246498108, + "learning_rate": 0.0006272911788429076, + "loss": 2.6656, + "step": 14296 + }, + { + "epoch": 0.42395397799721257, + "grad_norm": 0.14960190653800964, + "learning_rate": 0.0006272456775961371, + "loss": 2.6769, + "step": 14297 + }, + { + "epoch": 0.42398363134952405, + "grad_norm": 0.12608948349952698, + "learning_rate": 0.0006272001752225844, + "loss": 2.673, + "step": 14298 + }, + { + "epoch": 0.4240132847018355, + "grad_norm": 0.11537393927574158, + "learning_rate": 0.0006271546717226522, + "loss": 2.6676, + "step": 14299 + }, + { + "epoch": 0.424042938054147, + "grad_norm": 0.13885453343391418, + "learning_rate": 0.0006271091670967436, + "loss": 2.7305, + "step": 14300 + }, + { + "epoch": 0.4240725914064585, + "grad_norm": 0.13121555745601654, + "learning_rate": 0.0006270636613452614, + "loss": 2.6848, + "step": 14301 + }, + { + "epoch": 0.42410224475876995, + "grad_norm": 0.11974699795246124, + "learning_rate": 0.0006270181544686086, + "loss": 2.6588, + "step": 14302 + }, + { + "epoch": 0.4241318981110815, + "grad_norm": 0.12512549757957458, + "learning_rate": 0.0006269726464671885, + "loss": 2.651, + "step": 14303 + }, + { + "epoch": 0.42416155146339296, + "grad_norm": 0.11788394302129745, + "learning_rate": 0.0006269271373414039, + "loss": 2.6528, + "step": 14304 + }, + { + "epoch": 0.42419120481570444, + "grad_norm": 0.10043928772211075, + "learning_rate": 0.0006268816270916574, + "loss": 2.6691, + "step": 14305 + }, + { + "epoch": 0.4242208581680159, + "grad_norm": 0.11854471266269684, + "learning_rate": 0.0006268361157183524, + "loss": 2.6668, + "step": 14306 + }, + { + "epoch": 0.4242505115203274, + "grad_norm": 0.13579432666301727, + "learning_rate": 0.0006267906032218917, + "loss": 2.696, + "step": 14307 + }, + { + "epoch": 0.42428016487263887, + "grad_norm": 0.14048735797405243, + "learning_rate": 0.0006267450896026787, + "loss": 2.6943, + "step": 14308 + }, + { + "epoch": 0.42430981822495034, + "grad_norm": 0.11536160856485367, + "learning_rate": 0.0006266995748611162, + "loss": 2.6596, + "step": 14309 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 0.1304900348186493, + "learning_rate": 0.0006266540589976071, + "loss": 2.7021, + "step": 14310 + }, + { + "epoch": 0.4243691249295733, + "grad_norm": 0.1465945839881897, + "learning_rate": 0.0006266085420125546, + "loss": 2.6716, + "step": 14311 + }, + { + "epoch": 0.42439877828188477, + "grad_norm": 0.14006762206554413, + "learning_rate": 0.0006265630239063617, + "loss": 2.7004, + "step": 14312 + }, + { + "epoch": 0.42442843163419625, + "grad_norm": 0.14651986956596375, + "learning_rate": 0.0006265175046794313, + "loss": 2.6571, + "step": 14313 + }, + { + "epoch": 0.4244580849865077, + "grad_norm": 0.11895211040973663, + "learning_rate": 0.000626471984332167, + "loss": 2.6586, + "step": 14314 + }, + { + "epoch": 0.4244877383388192, + "grad_norm": 0.13205306231975555, + "learning_rate": 0.0006264264628649714, + "loss": 2.6387, + "step": 14315 + }, + { + "epoch": 0.4245173916911307, + "grad_norm": 0.12918947637081146, + "learning_rate": 0.0006263809402782479, + "loss": 2.6164, + "step": 14316 + }, + { + "epoch": 0.42454704504344215, + "grad_norm": 0.13443805277347565, + "learning_rate": 0.0006263354165723993, + "loss": 2.688, + "step": 14317 + }, + { + "epoch": 0.4245766983957536, + "grad_norm": 0.12733979523181915, + "learning_rate": 0.000626289891747829, + "loss": 2.6651, + "step": 14318 + }, + { + "epoch": 0.4246063517480651, + "grad_norm": 0.12732772529125214, + "learning_rate": 0.00062624436580494, + "loss": 2.6843, + "step": 14319 + }, + { + "epoch": 0.4246360051003766, + "grad_norm": 0.13186417520046234, + "learning_rate": 0.0006261988387441356, + "loss": 2.7008, + "step": 14320 + }, + { + "epoch": 0.42466565845268806, + "grad_norm": 0.13892561197280884, + "learning_rate": 0.0006261533105658187, + "loss": 2.6964, + "step": 14321 + }, + { + "epoch": 0.42469531180499953, + "grad_norm": 0.14183038473129272, + "learning_rate": 0.0006261077812703926, + "loss": 2.6736, + "step": 14322 + }, + { + "epoch": 0.424724965157311, + "grad_norm": 0.14164277911186218, + "learning_rate": 0.0006260622508582604, + "loss": 2.6761, + "step": 14323 + }, + { + "epoch": 0.42475461850962254, + "grad_norm": 0.13468699157238007, + "learning_rate": 0.0006260167193298254, + "loss": 2.6884, + "step": 14324 + }, + { + "epoch": 0.424784271861934, + "grad_norm": 0.11820966750383377, + "learning_rate": 0.0006259711866854906, + "loss": 2.7073, + "step": 14325 + }, + { + "epoch": 0.4248139252142455, + "grad_norm": 0.11159394681453705, + "learning_rate": 0.0006259256529256596, + "loss": 2.6709, + "step": 14326 + }, + { + "epoch": 0.42484357856655697, + "grad_norm": 0.12882620096206665, + "learning_rate": 0.0006258801180507351, + "loss": 2.6631, + "step": 14327 + }, + { + "epoch": 0.42487323191886844, + "grad_norm": 0.12495166808366776, + "learning_rate": 0.0006258345820611206, + "loss": 2.6677, + "step": 14328 + }, + { + "epoch": 0.4249028852711799, + "grad_norm": 0.12929928302764893, + "learning_rate": 0.0006257890449572192, + "loss": 2.6984, + "step": 14329 + }, + { + "epoch": 0.4249325386234914, + "grad_norm": 0.12479831278324127, + "learning_rate": 0.0006257435067394344, + "loss": 2.6562, + "step": 14330 + }, + { + "epoch": 0.42496219197580287, + "grad_norm": 0.13526101410388947, + "learning_rate": 0.0006256979674081692, + "loss": 2.6273, + "step": 14331 + }, + { + "epoch": 0.42499184532811435, + "grad_norm": 0.1250203251838684, + "learning_rate": 0.0006256524269638268, + "loss": 2.6788, + "step": 14332 + }, + { + "epoch": 0.4250214986804258, + "grad_norm": 0.1254330575466156, + "learning_rate": 0.0006256068854068107, + "loss": 2.683, + "step": 14333 + }, + { + "epoch": 0.4250511520327373, + "grad_norm": 0.10990308970212936, + "learning_rate": 0.000625561342737524, + "loss": 2.6922, + "step": 14334 + }, + { + "epoch": 0.4250808053850488, + "grad_norm": 0.11492335051298141, + "learning_rate": 0.00062551579895637, + "loss": 2.6637, + "step": 14335 + }, + { + "epoch": 0.42511045873736025, + "grad_norm": 0.11552633345127106, + "learning_rate": 0.0006254702540637523, + "loss": 2.704, + "step": 14336 + }, + { + "epoch": 0.42514011208967173, + "grad_norm": 0.10601513832807541, + "learning_rate": 0.0006254247080600738, + "loss": 2.6693, + "step": 14337 + }, + { + "epoch": 0.4251697654419832, + "grad_norm": 0.09938370436429977, + "learning_rate": 0.000625379160945738, + "loss": 2.6792, + "step": 14338 + }, + { + "epoch": 0.4251994187942947, + "grad_norm": 0.11064834147691727, + "learning_rate": 0.0006253336127211481, + "loss": 2.6765, + "step": 14339 + }, + { + "epoch": 0.42522907214660616, + "grad_norm": 0.1218935176730156, + "learning_rate": 0.0006252880633867077, + "loss": 2.6661, + "step": 14340 + }, + { + "epoch": 0.42525872549891763, + "grad_norm": 0.1299596130847931, + "learning_rate": 0.00062524251294282, + "loss": 2.6448, + "step": 14341 + }, + { + "epoch": 0.4252883788512291, + "grad_norm": 0.11796101927757263, + "learning_rate": 0.0006251969613898882, + "loss": 2.6854, + "step": 14342 + }, + { + "epoch": 0.4253180322035406, + "grad_norm": 0.12539589405059814, + "learning_rate": 0.000625151408728316, + "loss": 2.6747, + "step": 14343 + }, + { + "epoch": 0.42534768555585206, + "grad_norm": 0.12162655591964722, + "learning_rate": 0.0006251058549585065, + "loss": 2.6544, + "step": 14344 + }, + { + "epoch": 0.4253773389081636, + "grad_norm": 0.13016310334205627, + "learning_rate": 0.0006250603000808632, + "loss": 2.6593, + "step": 14345 + }, + { + "epoch": 0.42540699226047507, + "grad_norm": 0.17096342146396637, + "learning_rate": 0.0006250147440957894, + "loss": 2.6987, + "step": 14346 + }, + { + "epoch": 0.42543664561278655, + "grad_norm": 0.183268204331398, + "learning_rate": 0.0006249691870036886, + "loss": 2.6682, + "step": 14347 + }, + { + "epoch": 0.425466298965098, + "grad_norm": 0.15723948180675507, + "learning_rate": 0.0006249236288049644, + "loss": 2.6693, + "step": 14348 + }, + { + "epoch": 0.4254959523174095, + "grad_norm": 0.14246977865695953, + "learning_rate": 0.0006248780695000198, + "loss": 2.6784, + "step": 14349 + }, + { + "epoch": 0.425525605669721, + "grad_norm": 0.15412181615829468, + "learning_rate": 0.0006248325090892585, + "loss": 2.6913, + "step": 14350 + }, + { + "epoch": 0.42555525902203245, + "grad_norm": 0.1485002338886261, + "learning_rate": 0.0006247869475730839, + "loss": 2.6835, + "step": 14351 + }, + { + "epoch": 0.4255849123743439, + "grad_norm": 0.1276998519897461, + "learning_rate": 0.0006247413849518995, + "loss": 2.6621, + "step": 14352 + }, + { + "epoch": 0.4256145657266554, + "grad_norm": 0.13900801539421082, + "learning_rate": 0.0006246958212261087, + "loss": 2.6793, + "step": 14353 + }, + { + "epoch": 0.4256442190789669, + "grad_norm": 0.12501227855682373, + "learning_rate": 0.0006246502563961151, + "loss": 2.6836, + "step": 14354 + }, + { + "epoch": 0.42567387243127836, + "grad_norm": 0.1151905208826065, + "learning_rate": 0.0006246046904623219, + "loss": 2.6602, + "step": 14355 + }, + { + "epoch": 0.42570352578358983, + "grad_norm": 0.13331137597560883, + "learning_rate": 0.0006245591234251329, + "loss": 2.7197, + "step": 14356 + }, + { + "epoch": 0.4257331791359013, + "grad_norm": 0.11108139157295227, + "learning_rate": 0.0006245135552849514, + "loss": 2.6923, + "step": 14357 + }, + { + "epoch": 0.4257628324882128, + "grad_norm": 0.11545057594776154, + "learning_rate": 0.0006244679860421811, + "loss": 2.6417, + "step": 14358 + }, + { + "epoch": 0.42579248584052426, + "grad_norm": 0.11856374144554138, + "learning_rate": 0.0006244224156972254, + "loss": 2.6591, + "step": 14359 + }, + { + "epoch": 0.42582213919283574, + "grad_norm": 0.11673538386821747, + "learning_rate": 0.0006243768442504878, + "loss": 2.6936, + "step": 14360 + }, + { + "epoch": 0.4258517925451472, + "grad_norm": 0.12003067880868912, + "learning_rate": 0.000624331271702372, + "loss": 2.6635, + "step": 14361 + }, + { + "epoch": 0.4258814458974587, + "grad_norm": 0.13001711666584015, + "learning_rate": 0.0006242856980532813, + "loss": 2.6825, + "step": 14362 + }, + { + "epoch": 0.42591109924977016, + "grad_norm": 0.1185239776968956, + "learning_rate": 0.0006242401233036195, + "loss": 2.6751, + "step": 14363 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 0.11246813833713531, + "learning_rate": 0.0006241945474537901, + "loss": 2.669, + "step": 14364 + }, + { + "epoch": 0.4259704059543931, + "grad_norm": 0.10686305165290833, + "learning_rate": 0.0006241489705041965, + "loss": 2.6679, + "step": 14365 + }, + { + "epoch": 0.42600005930670465, + "grad_norm": 0.1277722865343094, + "learning_rate": 0.0006241033924552427, + "loss": 2.6802, + "step": 14366 + }, + { + "epoch": 0.4260297126590161, + "grad_norm": 0.13410033285617828, + "learning_rate": 0.0006240578133073319, + "loss": 2.6495, + "step": 14367 + }, + { + "epoch": 0.4260593660113276, + "grad_norm": 0.11366413533687592, + "learning_rate": 0.0006240122330608679, + "loss": 2.675, + "step": 14368 + }, + { + "epoch": 0.4260890193636391, + "grad_norm": 0.12799662351608276, + "learning_rate": 0.0006239666517162543, + "loss": 2.6567, + "step": 14369 + }, + { + "epoch": 0.42611867271595055, + "grad_norm": 0.15052801370620728, + "learning_rate": 0.0006239210692738948, + "loss": 2.6594, + "step": 14370 + }, + { + "epoch": 0.42614832606826203, + "grad_norm": 0.14516115188598633, + "learning_rate": 0.0006238754857341929, + "loss": 2.6586, + "step": 14371 + }, + { + "epoch": 0.4261779794205735, + "grad_norm": 0.13284461200237274, + "learning_rate": 0.0006238299010975522, + "loss": 2.6703, + "step": 14372 + }, + { + "epoch": 0.426207632772885, + "grad_norm": 0.10633096098899841, + "learning_rate": 0.0006237843153643765, + "loss": 2.6819, + "step": 14373 + }, + { + "epoch": 0.42623728612519646, + "grad_norm": 0.1302211731672287, + "learning_rate": 0.0006237387285350696, + "loss": 2.6653, + "step": 14374 + }, + { + "epoch": 0.42626693947750793, + "grad_norm": 0.15100184082984924, + "learning_rate": 0.0006236931406100349, + "loss": 2.6605, + "step": 14375 + }, + { + "epoch": 0.4262965928298194, + "grad_norm": 0.1403791308403015, + "learning_rate": 0.0006236475515896762, + "loss": 2.6819, + "step": 14376 + }, + { + "epoch": 0.4263262461821309, + "grad_norm": 0.1388823240995407, + "learning_rate": 0.0006236019614743973, + "loss": 2.6493, + "step": 14377 + }, + { + "epoch": 0.42635589953444236, + "grad_norm": 0.143352210521698, + "learning_rate": 0.0006235563702646017, + "loss": 2.6893, + "step": 14378 + }, + { + "epoch": 0.42638555288675384, + "grad_norm": 0.14815115928649902, + "learning_rate": 0.0006235107779606932, + "loss": 2.6945, + "step": 14379 + }, + { + "epoch": 0.4264152062390653, + "grad_norm": 0.15758243203163147, + "learning_rate": 0.0006234651845630758, + "loss": 2.6644, + "step": 14380 + }, + { + "epoch": 0.4264448595913768, + "grad_norm": 0.13837930560112, + "learning_rate": 0.0006234195900721528, + "loss": 2.7039, + "step": 14381 + }, + { + "epoch": 0.42647451294368827, + "grad_norm": 0.11568184196949005, + "learning_rate": 0.0006233739944883283, + "loss": 2.6613, + "step": 14382 + }, + { + "epoch": 0.42650416629599974, + "grad_norm": 0.10298798978328705, + "learning_rate": 0.0006233283978120057, + "loss": 2.6735, + "step": 14383 + }, + { + "epoch": 0.4265338196483112, + "grad_norm": 0.12062855064868927, + "learning_rate": 0.0006232828000435891, + "loss": 2.7091, + "step": 14384 + }, + { + "epoch": 0.4265634730006227, + "grad_norm": 0.125058114528656, + "learning_rate": 0.000623237201183482, + "loss": 2.6546, + "step": 14385 + }, + { + "epoch": 0.4265931263529342, + "grad_norm": 0.10581924021244049, + "learning_rate": 0.0006231916012320884, + "loss": 2.6552, + "step": 14386 + }, + { + "epoch": 0.4266227797052457, + "grad_norm": 0.11826737225055695, + "learning_rate": 0.0006231460001898121, + "loss": 2.6768, + "step": 14387 + }, + { + "epoch": 0.4266524330575572, + "grad_norm": 0.13433802127838135, + "learning_rate": 0.0006231003980570567, + "loss": 2.678, + "step": 14388 + }, + { + "epoch": 0.42668208640986865, + "grad_norm": 0.1396125704050064, + "learning_rate": 0.0006230547948342264, + "loss": 2.6884, + "step": 14389 + }, + { + "epoch": 0.42671173976218013, + "grad_norm": 0.13342566788196564, + "learning_rate": 0.0006230091905217246, + "loss": 2.7092, + "step": 14390 + }, + { + "epoch": 0.4267413931144916, + "grad_norm": 0.13280463218688965, + "learning_rate": 0.0006229635851199552, + "loss": 2.6802, + "step": 14391 + }, + { + "epoch": 0.4267710464668031, + "grad_norm": 0.13568523526191711, + "learning_rate": 0.0006229179786293223, + "loss": 2.6799, + "step": 14392 + }, + { + "epoch": 0.42680069981911456, + "grad_norm": 0.1585042029619217, + "learning_rate": 0.0006228723710502295, + "loss": 2.6798, + "step": 14393 + }, + { + "epoch": 0.42683035317142604, + "grad_norm": 0.1532175987958908, + "learning_rate": 0.0006228267623830809, + "loss": 2.6659, + "step": 14394 + }, + { + "epoch": 0.4268600065237375, + "grad_norm": 0.10277986526489258, + "learning_rate": 0.00062278115262828, + "loss": 2.6692, + "step": 14395 + }, + { + "epoch": 0.426889659876049, + "grad_norm": 0.11445024609565735, + "learning_rate": 0.0006227355417862311, + "loss": 2.6618, + "step": 14396 + }, + { + "epoch": 0.42691931322836046, + "grad_norm": 0.13120077550411224, + "learning_rate": 0.0006226899298573381, + "loss": 2.6585, + "step": 14397 + }, + { + "epoch": 0.42694896658067194, + "grad_norm": 0.11462272703647614, + "learning_rate": 0.0006226443168420045, + "loss": 2.6787, + "step": 14398 + }, + { + "epoch": 0.4269786199329834, + "grad_norm": 0.10041563957929611, + "learning_rate": 0.0006225987027406343, + "loss": 2.675, + "step": 14399 + }, + { + "epoch": 0.4270082732852949, + "grad_norm": 0.12275617569684982, + "learning_rate": 0.0006225530875536316, + "loss": 2.718, + "step": 14400 + }, + { + "epoch": 0.42703792663760637, + "grad_norm": 0.10179772973060608, + "learning_rate": 0.0006225074712814004, + "loss": 2.6918, + "step": 14401 + }, + { + "epoch": 0.42706757998991784, + "grad_norm": 0.13207724690437317, + "learning_rate": 0.0006224618539243445, + "loss": 2.6515, + "step": 14402 + }, + { + "epoch": 0.4270972333422293, + "grad_norm": 0.1595420092344284, + "learning_rate": 0.0006224162354828679, + "loss": 2.6463, + "step": 14403 + }, + { + "epoch": 0.4271268866945408, + "grad_norm": 0.12982076406478882, + "learning_rate": 0.0006223706159573742, + "loss": 2.6457, + "step": 14404 + }, + { + "epoch": 0.4271565400468523, + "grad_norm": 0.12436608225107193, + "learning_rate": 0.0006223249953482679, + "loss": 2.6901, + "step": 14405 + }, + { + "epoch": 0.42718619339916375, + "grad_norm": 0.14173974096775055, + "learning_rate": 0.0006222793736559529, + "loss": 2.6893, + "step": 14406 + }, + { + "epoch": 0.4272158467514753, + "grad_norm": 0.14330458641052246, + "learning_rate": 0.000622233750880833, + "loss": 2.6339, + "step": 14407 + }, + { + "epoch": 0.42724550010378676, + "grad_norm": 0.12380442023277283, + "learning_rate": 0.0006221881270233123, + "loss": 2.6765, + "step": 14408 + }, + { + "epoch": 0.42727515345609823, + "grad_norm": 0.1233564168214798, + "learning_rate": 0.0006221425020837947, + "loss": 2.6557, + "step": 14409 + }, + { + "epoch": 0.4273048068084097, + "grad_norm": 0.1394900530576706, + "learning_rate": 0.000622096876062684, + "loss": 2.7125, + "step": 14410 + }, + { + "epoch": 0.4273344601607212, + "grad_norm": 0.1244427040219307, + "learning_rate": 0.0006220512489603847, + "loss": 2.6349, + "step": 14411 + }, + { + "epoch": 0.42736411351303266, + "grad_norm": 0.12138484418392181, + "learning_rate": 0.0006220056207773008, + "loss": 2.6704, + "step": 14412 + }, + { + "epoch": 0.42739376686534414, + "grad_norm": 0.1263991743326187, + "learning_rate": 0.0006219599915138361, + "loss": 2.6425, + "step": 14413 + }, + { + "epoch": 0.4274234202176556, + "grad_norm": 0.13740117847919464, + "learning_rate": 0.0006219143611703948, + "loss": 2.6632, + "step": 14414 + }, + { + "epoch": 0.4274530735699671, + "grad_norm": 0.12053900212049484, + "learning_rate": 0.0006218687297473808, + "loss": 2.6497, + "step": 14415 + }, + { + "epoch": 0.42748272692227857, + "grad_norm": 0.13204017281532288, + "learning_rate": 0.0006218230972451983, + "loss": 2.724, + "step": 14416 + }, + { + "epoch": 0.42751238027459004, + "grad_norm": 0.12823863327503204, + "learning_rate": 0.0006217774636642512, + "loss": 2.6879, + "step": 14417 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 0.10689868777990341, + "learning_rate": 0.000621731829004944, + "loss": 2.6832, + "step": 14418 + }, + { + "epoch": 0.427571686979213, + "grad_norm": 0.10940694063901901, + "learning_rate": 0.0006216861932676806, + "loss": 2.6458, + "step": 14419 + }, + { + "epoch": 0.42760134033152447, + "grad_norm": 0.10926451534032822, + "learning_rate": 0.0006216405564528649, + "loss": 2.6751, + "step": 14420 + }, + { + "epoch": 0.42763099368383595, + "grad_norm": 0.11153265088796616, + "learning_rate": 0.0006215949185609012, + "loss": 2.6815, + "step": 14421 + }, + { + "epoch": 0.4276606470361474, + "grad_norm": 0.12734895944595337, + "learning_rate": 0.0006215492795921938, + "loss": 2.7047, + "step": 14422 + }, + { + "epoch": 0.4276903003884589, + "grad_norm": 0.12705011665821075, + "learning_rate": 0.0006215036395471465, + "loss": 2.6516, + "step": 14423 + }, + { + "epoch": 0.4277199537407704, + "grad_norm": 0.1280345916748047, + "learning_rate": 0.0006214579984261636, + "loss": 2.6681, + "step": 14424 + }, + { + "epoch": 0.42774960709308185, + "grad_norm": 0.14586831629276276, + "learning_rate": 0.0006214123562296493, + "loss": 2.6558, + "step": 14425 + }, + { + "epoch": 0.42777926044539333, + "grad_norm": 0.15817925333976746, + "learning_rate": 0.0006213667129580079, + "loss": 2.6466, + "step": 14426 + }, + { + "epoch": 0.4278089137977048, + "grad_norm": 0.1507922261953354, + "learning_rate": 0.0006213210686116433, + "loss": 2.6882, + "step": 14427 + }, + { + "epoch": 0.42783856715001634, + "grad_norm": 0.11961200833320618, + "learning_rate": 0.0006212754231909597, + "loss": 2.6893, + "step": 14428 + }, + { + "epoch": 0.4278682205023278, + "grad_norm": 0.1175781860947609, + "learning_rate": 0.0006212297766963617, + "loss": 2.6339, + "step": 14429 + }, + { + "epoch": 0.4278978738546393, + "grad_norm": 0.14337533712387085, + "learning_rate": 0.0006211841291282529, + "loss": 2.681, + "step": 14430 + }, + { + "epoch": 0.42792752720695076, + "grad_norm": 0.1895798295736313, + "learning_rate": 0.000621138480487038, + "loss": 2.6985, + "step": 14431 + }, + { + "epoch": 0.42795718055926224, + "grad_norm": 0.18877892196178436, + "learning_rate": 0.000621092830773121, + "loss": 2.6554, + "step": 14432 + }, + { + "epoch": 0.4279868339115737, + "grad_norm": 0.16764329373836517, + "learning_rate": 0.0006210471799869062, + "loss": 2.6555, + "step": 14433 + }, + { + "epoch": 0.4280164872638852, + "grad_norm": 0.11685286462306976, + "learning_rate": 0.000621001528128798, + "loss": 2.6847, + "step": 14434 + }, + { + "epoch": 0.42804614061619667, + "grad_norm": 0.15670979022979736, + "learning_rate": 0.0006209558751992004, + "loss": 2.6867, + "step": 14435 + }, + { + "epoch": 0.42807579396850814, + "grad_norm": 0.1577611267566681, + "learning_rate": 0.0006209102211985177, + "loss": 2.6675, + "step": 14436 + }, + { + "epoch": 0.4281054473208196, + "grad_norm": 0.12283267080783844, + "learning_rate": 0.0006208645661271542, + "loss": 2.6776, + "step": 14437 + }, + { + "epoch": 0.4281351006731311, + "grad_norm": 0.15094837546348572, + "learning_rate": 0.0006208189099855143, + "loss": 2.6859, + "step": 14438 + }, + { + "epoch": 0.4281647540254426, + "grad_norm": 0.14863112568855286, + "learning_rate": 0.0006207732527740022, + "loss": 2.6851, + "step": 14439 + }, + { + "epoch": 0.42819440737775405, + "grad_norm": 0.11843819171190262, + "learning_rate": 0.0006207275944930224, + "loss": 2.681, + "step": 14440 + }, + { + "epoch": 0.4282240607300655, + "grad_norm": 0.13006466627120972, + "learning_rate": 0.0006206819351429789, + "loss": 2.6778, + "step": 14441 + }, + { + "epoch": 0.428253714082377, + "grad_norm": 0.13915200531482697, + "learning_rate": 0.0006206362747242761, + "loss": 2.6682, + "step": 14442 + }, + { + "epoch": 0.4282833674346885, + "grad_norm": 0.13050144910812378, + "learning_rate": 0.0006205906132373182, + "loss": 2.6881, + "step": 14443 + }, + { + "epoch": 0.42831302078699995, + "grad_norm": 0.11241056025028229, + "learning_rate": 0.0006205449506825099, + "loss": 2.701, + "step": 14444 + }, + { + "epoch": 0.42834267413931143, + "grad_norm": 0.13457593321800232, + "learning_rate": 0.0006204992870602555, + "loss": 2.6996, + "step": 14445 + }, + { + "epoch": 0.4283723274916229, + "grad_norm": 0.12290447950363159, + "learning_rate": 0.0006204536223709591, + "loss": 2.6808, + "step": 14446 + }, + { + "epoch": 0.4284019808439344, + "grad_norm": 0.10392254590988159, + "learning_rate": 0.0006204079566150253, + "loss": 2.6765, + "step": 14447 + }, + { + "epoch": 0.42843163419624586, + "grad_norm": 0.11712373793125153, + "learning_rate": 0.0006203622897928583, + "loss": 2.6481, + "step": 14448 + }, + { + "epoch": 0.4284612875485574, + "grad_norm": 0.14060591161251068, + "learning_rate": 0.0006203166219048623, + "loss": 2.6877, + "step": 14449 + }, + { + "epoch": 0.42849094090086887, + "grad_norm": 0.11239676177501678, + "learning_rate": 0.0006202709529514424, + "loss": 2.649, + "step": 14450 + }, + { + "epoch": 0.42852059425318034, + "grad_norm": 0.13124942779541016, + "learning_rate": 0.0006202252829330024, + "loss": 2.7364, + "step": 14451 + }, + { + "epoch": 0.4285502476054918, + "grad_norm": 0.13653632998466492, + "learning_rate": 0.0006201796118499469, + "loss": 2.6439, + "step": 14452 + }, + { + "epoch": 0.4285799009578033, + "grad_norm": 0.11567702889442444, + "learning_rate": 0.0006201339397026802, + "loss": 2.6531, + "step": 14453 + }, + { + "epoch": 0.42860955431011477, + "grad_norm": 0.12062741816043854, + "learning_rate": 0.0006200882664916069, + "loss": 2.7043, + "step": 14454 + }, + { + "epoch": 0.42863920766242625, + "grad_norm": 0.13875019550323486, + "learning_rate": 0.0006200425922171315, + "loss": 2.6559, + "step": 14455 + }, + { + "epoch": 0.4286688610147377, + "grad_norm": 0.12875516712665558, + "learning_rate": 0.0006199969168796581, + "loss": 2.6429, + "step": 14456 + }, + { + "epoch": 0.4286985143670492, + "grad_norm": 0.11671818047761917, + "learning_rate": 0.0006199512404795916, + "loss": 2.6892, + "step": 14457 + }, + { + "epoch": 0.4287281677193607, + "grad_norm": 0.11682410538196564, + "learning_rate": 0.0006199055630173362, + "loss": 2.6631, + "step": 14458 + }, + { + "epoch": 0.42875782107167215, + "grad_norm": 0.12779903411865234, + "learning_rate": 0.0006198598844932965, + "loss": 2.6815, + "step": 14459 + }, + { + "epoch": 0.42878747442398363, + "grad_norm": 0.14822395145893097, + "learning_rate": 0.0006198142049078769, + "loss": 2.7053, + "step": 14460 + }, + { + "epoch": 0.4288171277762951, + "grad_norm": 0.13339193165302277, + "learning_rate": 0.000619768524261482, + "loss": 2.6595, + "step": 14461 + }, + { + "epoch": 0.4288467811286066, + "grad_norm": 0.11784645169973373, + "learning_rate": 0.0006197228425545162, + "loss": 2.6767, + "step": 14462 + }, + { + "epoch": 0.42887643448091806, + "grad_norm": 0.12176632881164551, + "learning_rate": 0.0006196771597873842, + "loss": 2.6846, + "step": 14463 + }, + { + "epoch": 0.42890608783322953, + "grad_norm": 0.11383196711540222, + "learning_rate": 0.0006196314759604902, + "loss": 2.6454, + "step": 14464 + }, + { + "epoch": 0.428935741185541, + "grad_norm": 0.10291951894760132, + "learning_rate": 0.0006195857910742391, + "loss": 2.6866, + "step": 14465 + }, + { + "epoch": 0.4289653945378525, + "grad_norm": 0.11025413870811462, + "learning_rate": 0.0006195401051290353, + "loss": 2.6666, + "step": 14466 + }, + { + "epoch": 0.42899504789016396, + "grad_norm": 0.10609709471464157, + "learning_rate": 0.0006194944181252834, + "loss": 2.6844, + "step": 14467 + }, + { + "epoch": 0.42902470124247544, + "grad_norm": 0.11318372189998627, + "learning_rate": 0.0006194487300633879, + "loss": 2.7027, + "step": 14468 + }, + { + "epoch": 0.4290543545947869, + "grad_norm": 0.10413994640111923, + "learning_rate": 0.0006194030409437531, + "loss": 2.6835, + "step": 14469 + }, + { + "epoch": 0.42908400794709844, + "grad_norm": 0.12371369451284409, + "learning_rate": 0.0006193573507667842, + "loss": 2.6781, + "step": 14470 + }, + { + "epoch": 0.4291136612994099, + "grad_norm": 0.15096953511238098, + "learning_rate": 0.0006193116595328853, + "loss": 2.6353, + "step": 14471 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 0.14254574477672577, + "learning_rate": 0.0006192659672424612, + "loss": 2.6695, + "step": 14472 + }, + { + "epoch": 0.4291729680040329, + "grad_norm": 0.14540770649909973, + "learning_rate": 0.0006192202738959168, + "loss": 2.6904, + "step": 14473 + }, + { + "epoch": 0.42920262135634435, + "grad_norm": 0.1463673859834671, + "learning_rate": 0.0006191745794936561, + "loss": 2.6496, + "step": 14474 + }, + { + "epoch": 0.4292322747086558, + "grad_norm": 0.1539728343486786, + "learning_rate": 0.000619128884036084, + "loss": 2.6851, + "step": 14475 + }, + { + "epoch": 0.4292619280609673, + "grad_norm": 0.1549113392829895, + "learning_rate": 0.0006190831875236051, + "loss": 2.6807, + "step": 14476 + }, + { + "epoch": 0.4292915814132788, + "grad_norm": 0.11336340010166168, + "learning_rate": 0.0006190374899566244, + "loss": 2.6563, + "step": 14477 + }, + { + "epoch": 0.42932123476559025, + "grad_norm": 0.12084852159023285, + "learning_rate": 0.0006189917913355463, + "loss": 2.6524, + "step": 14478 + }, + { + "epoch": 0.42935088811790173, + "grad_norm": 0.1467738002538681, + "learning_rate": 0.0006189460916607754, + "loss": 2.6692, + "step": 14479 + }, + { + "epoch": 0.4293805414702132, + "grad_norm": 0.13833999633789062, + "learning_rate": 0.0006189003909327163, + "loss": 2.6901, + "step": 14480 + }, + { + "epoch": 0.4294101948225247, + "grad_norm": 0.12129238992929459, + "learning_rate": 0.000618854689151774, + "loss": 2.6921, + "step": 14481 + }, + { + "epoch": 0.42943984817483616, + "grad_norm": 0.13756175339221954, + "learning_rate": 0.0006188089863183528, + "loss": 2.6884, + "step": 14482 + }, + { + "epoch": 0.42946950152714763, + "grad_norm": 0.15136536955833435, + "learning_rate": 0.000618763282432858, + "loss": 2.6776, + "step": 14483 + }, + { + "epoch": 0.4294991548794591, + "grad_norm": 0.1484767645597458, + "learning_rate": 0.0006187175774956937, + "loss": 2.6904, + "step": 14484 + }, + { + "epoch": 0.4295288082317706, + "grad_norm": 0.139641672372818, + "learning_rate": 0.0006186718715072649, + "loss": 2.6858, + "step": 14485 + }, + { + "epoch": 0.42955846158408206, + "grad_norm": 0.12674950063228607, + "learning_rate": 0.0006186261644679763, + "loss": 2.6472, + "step": 14486 + }, + { + "epoch": 0.42958811493639354, + "grad_norm": 0.11133188009262085, + "learning_rate": 0.0006185804563782327, + "loss": 2.6796, + "step": 14487 + }, + { + "epoch": 0.429617768288705, + "grad_norm": 0.11717075854539871, + "learning_rate": 0.0006185347472384388, + "loss": 2.6856, + "step": 14488 + }, + { + "epoch": 0.4296474216410165, + "grad_norm": 0.11774999648332596, + "learning_rate": 0.0006184890370489992, + "loss": 2.684, + "step": 14489 + }, + { + "epoch": 0.429677074993328, + "grad_norm": 0.11609099805355072, + "learning_rate": 0.0006184433258103191, + "loss": 2.6625, + "step": 14490 + }, + { + "epoch": 0.4297067283456395, + "grad_norm": 0.11403758078813553, + "learning_rate": 0.0006183976135228029, + "loss": 2.6465, + "step": 14491 + }, + { + "epoch": 0.429736381697951, + "grad_norm": 0.1024991124868393, + "learning_rate": 0.0006183519001868555, + "loss": 2.6746, + "step": 14492 + }, + { + "epoch": 0.42976603505026245, + "grad_norm": 0.12445172667503357, + "learning_rate": 0.0006183061858028818, + "loss": 2.6846, + "step": 14493 + }, + { + "epoch": 0.4297956884025739, + "grad_norm": 0.13714013993740082, + "learning_rate": 0.0006182604703712864, + "loss": 2.6693, + "step": 14494 + }, + { + "epoch": 0.4298253417548854, + "grad_norm": 0.11992862820625305, + "learning_rate": 0.0006182147538924742, + "loss": 2.6431, + "step": 14495 + }, + { + "epoch": 0.4298549951071969, + "grad_norm": 0.10038499534130096, + "learning_rate": 0.0006181690363668502, + "loss": 2.6402, + "step": 14496 + }, + { + "epoch": 0.42988464845950836, + "grad_norm": 0.12591619789600372, + "learning_rate": 0.000618123317794819, + "loss": 2.6529, + "step": 14497 + }, + { + "epoch": 0.42991430181181983, + "grad_norm": 0.14149078726768494, + "learning_rate": 0.0006180775981767856, + "loss": 2.67, + "step": 14498 + }, + { + "epoch": 0.4299439551641313, + "grad_norm": 0.13066232204437256, + "learning_rate": 0.0006180318775131548, + "loss": 2.6708, + "step": 14499 + }, + { + "epoch": 0.4299736085164428, + "grad_norm": 0.15104596316814423, + "learning_rate": 0.0006179861558043316, + "loss": 2.6781, + "step": 14500 + }, + { + "epoch": 0.43000326186875426, + "grad_norm": 0.1734541654586792, + "learning_rate": 0.0006179404330507205, + "loss": 2.6681, + "step": 14501 + }, + { + "epoch": 0.43003291522106574, + "grad_norm": 0.13835717737674713, + "learning_rate": 0.0006178947092527267, + "loss": 2.6881, + "step": 14502 + }, + { + "epoch": 0.4300625685733772, + "grad_norm": 0.13145466148853302, + "learning_rate": 0.000617848984410755, + "loss": 2.6433, + "step": 14503 + }, + { + "epoch": 0.4300922219256887, + "grad_norm": 0.11447901278734207, + "learning_rate": 0.0006178032585252102, + "loss": 2.7079, + "step": 14504 + }, + { + "epoch": 0.43012187527800017, + "grad_norm": 0.1192871630191803, + "learning_rate": 0.0006177575315964976, + "loss": 2.6775, + "step": 14505 + }, + { + "epoch": 0.43015152863031164, + "grad_norm": 0.12639184296131134, + "learning_rate": 0.0006177118036250217, + "loss": 2.6784, + "step": 14506 + }, + { + "epoch": 0.4301811819826231, + "grad_norm": 0.12718313932418823, + "learning_rate": 0.0006176660746111875, + "loss": 2.6819, + "step": 14507 + }, + { + "epoch": 0.4302108353349346, + "grad_norm": 0.12261077761650085, + "learning_rate": 0.0006176203445554002, + "loss": 2.6455, + "step": 14508 + }, + { + "epoch": 0.43024048868724607, + "grad_norm": 0.11942902207374573, + "learning_rate": 0.0006175746134580645, + "loss": 2.6442, + "step": 14509 + }, + { + "epoch": 0.43027014203955755, + "grad_norm": 0.11633344739675522, + "learning_rate": 0.0006175288813195852, + "loss": 2.6535, + "step": 14510 + }, + { + "epoch": 0.4302997953918691, + "grad_norm": 0.10905686020851135, + "learning_rate": 0.0006174831481403678, + "loss": 2.7011, + "step": 14511 + }, + { + "epoch": 0.43032944874418055, + "grad_norm": 0.11474285274744034, + "learning_rate": 0.0006174374139208168, + "loss": 2.6989, + "step": 14512 + }, + { + "epoch": 0.43035910209649203, + "grad_norm": 0.10556858777999878, + "learning_rate": 0.0006173916786613374, + "loss": 2.6725, + "step": 14513 + }, + { + "epoch": 0.4303887554488035, + "grad_norm": 0.11468864232301712, + "learning_rate": 0.0006173459423623344, + "loss": 2.6383, + "step": 14514 + }, + { + "epoch": 0.430418408801115, + "grad_norm": 0.12147495150566101, + "learning_rate": 0.0006173002050242129, + "loss": 2.6827, + "step": 14515 + }, + { + "epoch": 0.43044806215342646, + "grad_norm": 0.12092684209346771, + "learning_rate": 0.0006172544666473783, + "loss": 2.6501, + "step": 14516 + }, + { + "epoch": 0.43047771550573793, + "grad_norm": 0.12538915872573853, + "learning_rate": 0.000617208727232235, + "loss": 2.6575, + "step": 14517 + }, + { + "epoch": 0.4305073688580494, + "grad_norm": 0.13340243697166443, + "learning_rate": 0.0006171629867791884, + "loss": 2.6974, + "step": 14518 + }, + { + "epoch": 0.4305370222103609, + "grad_norm": 0.12257354706525803, + "learning_rate": 0.0006171172452886433, + "loss": 2.6783, + "step": 14519 + }, + { + "epoch": 0.43056667556267236, + "grad_norm": 0.12709656357765198, + "learning_rate": 0.0006170715027610049, + "loss": 2.6633, + "step": 14520 + }, + { + "epoch": 0.43059632891498384, + "grad_norm": 0.13973042368888855, + "learning_rate": 0.0006170257591966784, + "loss": 2.6901, + "step": 14521 + }, + { + "epoch": 0.4306259822672953, + "grad_norm": 0.15359024703502655, + "learning_rate": 0.0006169800145960686, + "loss": 2.691, + "step": 14522 + }, + { + "epoch": 0.4306556356196068, + "grad_norm": 0.15751907229423523, + "learning_rate": 0.0006169342689595808, + "loss": 2.7303, + "step": 14523 + }, + { + "epoch": 0.43068528897191827, + "grad_norm": 0.13889357447624207, + "learning_rate": 0.00061688852228762, + "loss": 2.6568, + "step": 14524 + }, + { + "epoch": 0.43071494232422974, + "grad_norm": 0.13888435065746307, + "learning_rate": 0.0006168427745805911, + "loss": 2.6347, + "step": 14525 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 0.13833312690258026, + "learning_rate": 0.0006167970258388994, + "loss": 2.6878, + "step": 14526 + }, + { + "epoch": 0.4307742490288527, + "grad_norm": 0.11083271354436874, + "learning_rate": 0.0006167512760629501, + "loss": 2.6436, + "step": 14527 + }, + { + "epoch": 0.43080390238116417, + "grad_norm": 0.13109448552131653, + "learning_rate": 0.0006167055252531482, + "loss": 2.6938, + "step": 14528 + }, + { + "epoch": 0.43083355573347565, + "grad_norm": 0.12013504654169083, + "learning_rate": 0.0006166597734098987, + "loss": 2.6726, + "step": 14529 + }, + { + "epoch": 0.4308632090857871, + "grad_norm": 0.09472161531448364, + "learning_rate": 0.0006166140205336071, + "loss": 2.6642, + "step": 14530 + }, + { + "epoch": 0.4308928624380986, + "grad_norm": 0.11275479942560196, + "learning_rate": 0.0006165682666246781, + "loss": 2.6841, + "step": 14531 + }, + { + "epoch": 0.43092251579041013, + "grad_norm": 0.12406150996685028, + "learning_rate": 0.0006165225116835173, + "loss": 2.6563, + "step": 14532 + }, + { + "epoch": 0.4309521691427216, + "grad_norm": 0.12462713569402695, + "learning_rate": 0.0006164767557105296, + "loss": 2.6118, + "step": 14533 + }, + { + "epoch": 0.4309818224950331, + "grad_norm": 0.11913636326789856, + "learning_rate": 0.00061643099870612, + "loss": 2.7049, + "step": 14534 + }, + { + "epoch": 0.43101147584734456, + "grad_norm": 0.12211921066045761, + "learning_rate": 0.0006163852406706942, + "loss": 2.6295, + "step": 14535 + }, + { + "epoch": 0.43104112919965604, + "grad_norm": 0.11523067951202393, + "learning_rate": 0.000616339481604657, + "loss": 2.6656, + "step": 14536 + }, + { + "epoch": 0.4310707825519675, + "grad_norm": 0.1216747984290123, + "learning_rate": 0.0006162937215084137, + "loss": 2.6816, + "step": 14537 + }, + { + "epoch": 0.431100435904279, + "grad_norm": 0.11262436956167221, + "learning_rate": 0.0006162479603823698, + "loss": 2.6691, + "step": 14538 + }, + { + "epoch": 0.43113008925659047, + "grad_norm": 0.1162080317735672, + "learning_rate": 0.00061620219822693, + "loss": 2.6334, + "step": 14539 + }, + { + "epoch": 0.43115974260890194, + "grad_norm": 0.10963069647550583, + "learning_rate": 0.0006161564350424997, + "loss": 2.7032, + "step": 14540 + }, + { + "epoch": 0.4311893959612134, + "grad_norm": 0.12495157122612, + "learning_rate": 0.0006161106708294843, + "loss": 2.6361, + "step": 14541 + }, + { + "epoch": 0.4312190493135249, + "grad_norm": 0.1356429159641266, + "learning_rate": 0.0006160649055882891, + "loss": 2.679, + "step": 14542 + }, + { + "epoch": 0.43124870266583637, + "grad_norm": 0.15370413661003113, + "learning_rate": 0.0006160191393193193, + "loss": 2.6709, + "step": 14543 + }, + { + "epoch": 0.43127835601814785, + "grad_norm": 0.16457216441631317, + "learning_rate": 0.0006159733720229799, + "loss": 2.6705, + "step": 14544 + }, + { + "epoch": 0.4313080093704593, + "grad_norm": 0.16288097202777863, + "learning_rate": 0.0006159276036996766, + "loss": 2.6596, + "step": 14545 + }, + { + "epoch": 0.4313376627227708, + "grad_norm": 0.14004220068454742, + "learning_rate": 0.0006158818343498143, + "loss": 2.6719, + "step": 14546 + }, + { + "epoch": 0.4313673160750823, + "grad_norm": 0.1471325308084488, + "learning_rate": 0.0006158360639737984, + "loss": 2.6566, + "step": 14547 + }, + { + "epoch": 0.43139696942739375, + "grad_norm": 0.14891736209392548, + "learning_rate": 0.0006157902925720345, + "loss": 2.6956, + "step": 14548 + }, + { + "epoch": 0.4314266227797052, + "grad_norm": 0.10633739084005356, + "learning_rate": 0.0006157445201449276, + "loss": 2.6584, + "step": 14549 + }, + { + "epoch": 0.4314562761320167, + "grad_norm": 0.12388759851455688, + "learning_rate": 0.000615698746692883, + "loss": 2.6397, + "step": 14550 + }, + { + "epoch": 0.4314859294843282, + "grad_norm": 0.13298457860946655, + "learning_rate": 0.0006156529722163062, + "loss": 2.678, + "step": 14551 + }, + { + "epoch": 0.43151558283663966, + "grad_norm": 0.12365921586751938, + "learning_rate": 0.0006156071967156025, + "loss": 2.637, + "step": 14552 + }, + { + "epoch": 0.4315452361889512, + "grad_norm": 0.11929935216903687, + "learning_rate": 0.0006155614201911771, + "loss": 2.6906, + "step": 14553 + }, + { + "epoch": 0.43157488954126266, + "grad_norm": 0.1277805119752884, + "learning_rate": 0.0006155156426434357, + "loss": 2.6875, + "step": 14554 + }, + { + "epoch": 0.43160454289357414, + "grad_norm": 0.12057378143072128, + "learning_rate": 0.0006154698640727834, + "loss": 2.6931, + "step": 14555 + }, + { + "epoch": 0.4316341962458856, + "grad_norm": 0.10814832150936127, + "learning_rate": 0.0006154240844796256, + "loss": 2.6786, + "step": 14556 + }, + { + "epoch": 0.4316638495981971, + "grad_norm": 0.11559777706861496, + "learning_rate": 0.0006153783038643678, + "loss": 2.7071, + "step": 14557 + }, + { + "epoch": 0.43169350295050857, + "grad_norm": 0.12386031448841095, + "learning_rate": 0.0006153325222274152, + "loss": 2.6801, + "step": 14558 + }, + { + "epoch": 0.43172315630282004, + "grad_norm": 0.13796159625053406, + "learning_rate": 0.0006152867395691732, + "loss": 2.682, + "step": 14559 + }, + { + "epoch": 0.4317528096551315, + "grad_norm": 0.13233943283557892, + "learning_rate": 0.0006152409558900475, + "loss": 2.6484, + "step": 14560 + }, + { + "epoch": 0.431782463007443, + "grad_norm": 0.11849396675825119, + "learning_rate": 0.0006151951711904435, + "loss": 2.6557, + "step": 14561 + }, + { + "epoch": 0.43181211635975447, + "grad_norm": 0.10687298327684402, + "learning_rate": 0.0006151493854707663, + "loss": 2.7035, + "step": 14562 + }, + { + "epoch": 0.43184176971206595, + "grad_norm": 0.11177587509155273, + "learning_rate": 0.0006151035987314215, + "loss": 2.7198, + "step": 14563 + }, + { + "epoch": 0.4318714230643774, + "grad_norm": 0.10256192833185196, + "learning_rate": 0.0006150578109728146, + "loss": 2.7017, + "step": 14564 + }, + { + "epoch": 0.4319010764166889, + "grad_norm": 0.1131531223654747, + "learning_rate": 0.000615012022195351, + "loss": 2.6601, + "step": 14565 + }, + { + "epoch": 0.4319307297690004, + "grad_norm": 0.1298665702342987, + "learning_rate": 0.0006149662323994363, + "loss": 2.6595, + "step": 14566 + }, + { + "epoch": 0.43196038312131185, + "grad_norm": 0.14801286160945892, + "learning_rate": 0.0006149204415854759, + "loss": 2.6745, + "step": 14567 + }, + { + "epoch": 0.43199003647362333, + "grad_norm": 0.11876139044761658, + "learning_rate": 0.0006148746497538752, + "loss": 2.697, + "step": 14568 + }, + { + "epoch": 0.4320196898259348, + "grad_norm": 0.09947793185710907, + "learning_rate": 0.0006148288569050398, + "loss": 2.6912, + "step": 14569 + }, + { + "epoch": 0.4320493431782463, + "grad_norm": 0.10472459346055984, + "learning_rate": 0.0006147830630393751, + "loss": 2.6754, + "step": 14570 + }, + { + "epoch": 0.43207899653055776, + "grad_norm": 0.11343928426504135, + "learning_rate": 0.0006147372681572868, + "loss": 2.6759, + "step": 14571 + }, + { + "epoch": 0.43210864988286923, + "grad_norm": 0.11956865340471268, + "learning_rate": 0.0006146914722591801, + "loss": 2.6849, + "step": 14572 + }, + { + "epoch": 0.4321383032351807, + "grad_norm": 0.12934263050556183, + "learning_rate": 0.0006146456753454608, + "loss": 2.6677, + "step": 14573 + }, + { + "epoch": 0.43216795658749224, + "grad_norm": 0.11791711300611496, + "learning_rate": 0.0006145998774165344, + "loss": 2.6822, + "step": 14574 + }, + { + "epoch": 0.4321976099398037, + "grad_norm": 0.1364143341779709, + "learning_rate": 0.0006145540784728063, + "loss": 2.7106, + "step": 14575 + }, + { + "epoch": 0.4322272632921152, + "grad_norm": 0.16490522027015686, + "learning_rate": 0.0006145082785146825, + "loss": 2.6784, + "step": 14576 + }, + { + "epoch": 0.43225691664442667, + "grad_norm": 0.16755197942256927, + "learning_rate": 0.000614462477542568, + "loss": 2.6717, + "step": 14577 + }, + { + "epoch": 0.43228656999673815, + "grad_norm": 0.1844746172428131, + "learning_rate": 0.0006144166755568685, + "loss": 2.6935, + "step": 14578 + }, + { + "epoch": 0.4323162233490496, + "grad_norm": 0.15463118255138397, + "learning_rate": 0.0006143708725579899, + "loss": 2.6965, + "step": 14579 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 0.13071681559085846, + "learning_rate": 0.0006143250685463374, + "loss": 2.6844, + "step": 14580 + }, + { + "epoch": 0.4323755300536726, + "grad_norm": 0.14899945259094238, + "learning_rate": 0.000614279263522317, + "loss": 2.6692, + "step": 14581 + }, + { + "epoch": 0.43240518340598405, + "grad_norm": 0.15139208734035492, + "learning_rate": 0.0006142334574863341, + "loss": 2.6679, + "step": 14582 + }, + { + "epoch": 0.4324348367582955, + "grad_norm": 0.12800130248069763, + "learning_rate": 0.0006141876504387942, + "loss": 2.6399, + "step": 14583 + }, + { + "epoch": 0.432464490110607, + "grad_norm": 0.11995365470647812, + "learning_rate": 0.0006141418423801031, + "loss": 2.6747, + "step": 14584 + }, + { + "epoch": 0.4324941434629185, + "grad_norm": 0.13323460519313812, + "learning_rate": 0.0006140960333106664, + "loss": 2.6668, + "step": 14585 + }, + { + "epoch": 0.43252379681522996, + "grad_norm": 0.12873409688472748, + "learning_rate": 0.0006140502232308897, + "loss": 2.644, + "step": 14586 + }, + { + "epoch": 0.43255345016754143, + "grad_norm": 0.12009520083665848, + "learning_rate": 0.0006140044121411787, + "loss": 2.6651, + "step": 14587 + }, + { + "epoch": 0.4325831035198529, + "grad_norm": 0.1277702897787094, + "learning_rate": 0.0006139586000419392, + "loss": 2.6713, + "step": 14588 + }, + { + "epoch": 0.4326127568721644, + "grad_norm": 0.10902108252048492, + "learning_rate": 0.0006139127869335766, + "loss": 2.6381, + "step": 14589 + }, + { + "epoch": 0.43264241022447586, + "grad_norm": 0.11936857551336288, + "learning_rate": 0.0006138669728164968, + "loss": 2.6921, + "step": 14590 + }, + { + "epoch": 0.43267206357678734, + "grad_norm": 0.1292334347963333, + "learning_rate": 0.0006138211576911051, + "loss": 2.6904, + "step": 14591 + }, + { + "epoch": 0.4327017169290988, + "grad_norm": 0.113996721804142, + "learning_rate": 0.000613775341557808, + "loss": 2.6816, + "step": 14592 + }, + { + "epoch": 0.4327313702814103, + "grad_norm": 0.11353731155395508, + "learning_rate": 0.0006137295244170105, + "loss": 2.709, + "step": 14593 + }, + { + "epoch": 0.4327610236337218, + "grad_norm": 0.11944229155778885, + "learning_rate": 0.0006136837062691186, + "loss": 2.6628, + "step": 14594 + }, + { + "epoch": 0.4327906769860333, + "grad_norm": 0.1232292652130127, + "learning_rate": 0.0006136378871145377, + "loss": 2.6566, + "step": 14595 + }, + { + "epoch": 0.43282033033834477, + "grad_norm": 0.13544756174087524, + "learning_rate": 0.0006135920669536741, + "loss": 2.6818, + "step": 14596 + }, + { + "epoch": 0.43284998369065625, + "grad_norm": 0.14761191606521606, + "learning_rate": 0.0006135462457869331, + "loss": 2.676, + "step": 14597 + }, + { + "epoch": 0.4328796370429677, + "grad_norm": 0.11885770410299301, + "learning_rate": 0.0006135004236147207, + "loss": 2.6863, + "step": 14598 + }, + { + "epoch": 0.4329092903952792, + "grad_norm": 0.11719832569360733, + "learning_rate": 0.0006134546004374425, + "loss": 2.6672, + "step": 14599 + }, + { + "epoch": 0.4329389437475907, + "grad_norm": 0.11014761030673981, + "learning_rate": 0.0006134087762555044, + "loss": 2.6582, + "step": 14600 + }, + { + "epoch": 0.43296859709990215, + "grad_norm": 0.11576149612665176, + "learning_rate": 0.0006133629510693121, + "loss": 2.6696, + "step": 14601 + }, + { + "epoch": 0.43299825045221363, + "grad_norm": 0.1211453229188919, + "learning_rate": 0.0006133171248792713, + "loss": 2.6761, + "step": 14602 + }, + { + "epoch": 0.4330279038045251, + "grad_norm": 0.12061599642038345, + "learning_rate": 0.000613271297685788, + "loss": 2.6599, + "step": 14603 + }, + { + "epoch": 0.4330575571568366, + "grad_norm": 0.1179540604352951, + "learning_rate": 0.0006132254694892679, + "loss": 2.6906, + "step": 14604 + }, + { + "epoch": 0.43308721050914806, + "grad_norm": 0.10989569127559662, + "learning_rate": 0.0006131796402901169, + "loss": 2.664, + "step": 14605 + }, + { + "epoch": 0.43311686386145953, + "grad_norm": 0.11858940124511719, + "learning_rate": 0.0006131338100887407, + "loss": 2.661, + "step": 14606 + }, + { + "epoch": 0.433146517213771, + "grad_norm": 0.1303597390651703, + "learning_rate": 0.0006130879788855452, + "loss": 2.6951, + "step": 14607 + }, + { + "epoch": 0.4331761705660825, + "grad_norm": 0.11296871304512024, + "learning_rate": 0.0006130421466809361, + "loss": 2.6862, + "step": 14608 + }, + { + "epoch": 0.43320582391839396, + "grad_norm": 0.13266636431217194, + "learning_rate": 0.0006129963134753197, + "loss": 2.6654, + "step": 14609 + }, + { + "epoch": 0.43323547727070544, + "grad_norm": 0.12214422225952148, + "learning_rate": 0.0006129504792691014, + "loss": 2.6621, + "step": 14610 + }, + { + "epoch": 0.4332651306230169, + "grad_norm": 0.13483846187591553, + "learning_rate": 0.0006129046440626871, + "loss": 2.6478, + "step": 14611 + }, + { + "epoch": 0.4332947839753284, + "grad_norm": 0.13280366361141205, + "learning_rate": 0.0006128588078564829, + "loss": 2.6851, + "step": 14612 + }, + { + "epoch": 0.43332443732763987, + "grad_norm": 0.13765673339366913, + "learning_rate": 0.0006128129706508946, + "loss": 2.6663, + "step": 14613 + }, + { + "epoch": 0.43335409067995134, + "grad_norm": 0.12647604942321777, + "learning_rate": 0.0006127671324463281, + "loss": 2.6487, + "step": 14614 + }, + { + "epoch": 0.4333837440322629, + "grad_norm": 0.14052413403987885, + "learning_rate": 0.0006127212932431893, + "loss": 2.6617, + "step": 14615 + }, + { + "epoch": 0.43341339738457435, + "grad_norm": 0.1374824345111847, + "learning_rate": 0.000612675453041884, + "loss": 2.6666, + "step": 14616 + }, + { + "epoch": 0.4334430507368858, + "grad_norm": 0.12836895883083344, + "learning_rate": 0.0006126296118428181, + "loss": 2.6419, + "step": 14617 + }, + { + "epoch": 0.4334727040891973, + "grad_norm": 0.11249434947967529, + "learning_rate": 0.0006125837696463978, + "loss": 2.6317, + "step": 14618 + }, + { + "epoch": 0.4335023574415088, + "grad_norm": 0.13093720376491547, + "learning_rate": 0.000612537926453029, + "loss": 2.6976, + "step": 14619 + }, + { + "epoch": 0.43353201079382025, + "grad_norm": 0.13811281323432922, + "learning_rate": 0.0006124920822631175, + "loss": 2.687, + "step": 14620 + }, + { + "epoch": 0.43356166414613173, + "grad_norm": 0.10862256586551666, + "learning_rate": 0.0006124462370770692, + "loss": 2.7023, + "step": 14621 + }, + { + "epoch": 0.4335913174984432, + "grad_norm": 0.12148705124855042, + "learning_rate": 0.0006124003908952903, + "loss": 2.7413, + "step": 14622 + }, + { + "epoch": 0.4336209708507547, + "grad_norm": 0.11407695710659027, + "learning_rate": 0.0006123545437181865, + "loss": 2.6417, + "step": 14623 + }, + { + "epoch": 0.43365062420306616, + "grad_norm": 0.12141964584589005, + "learning_rate": 0.000612308695546164, + "loss": 2.6765, + "step": 14624 + }, + { + "epoch": 0.43368027755537764, + "grad_norm": 0.14387594163417816, + "learning_rate": 0.0006122628463796288, + "loss": 2.6747, + "step": 14625 + }, + { + "epoch": 0.4337099309076891, + "grad_norm": 0.14126847684383392, + "learning_rate": 0.0006122169962189867, + "loss": 2.6727, + "step": 14626 + }, + { + "epoch": 0.4337395842600006, + "grad_norm": 0.1284407526254654, + "learning_rate": 0.0006121711450646439, + "loss": 2.6767, + "step": 14627 + }, + { + "epoch": 0.43376923761231206, + "grad_norm": 0.1128450259566307, + "learning_rate": 0.0006121252929170063, + "loss": 2.6744, + "step": 14628 + }, + { + "epoch": 0.43379889096462354, + "grad_norm": 0.11984128504991531, + "learning_rate": 0.0006120794397764801, + "loss": 2.7034, + "step": 14629 + }, + { + "epoch": 0.433828544316935, + "grad_norm": 0.13956840336322784, + "learning_rate": 0.0006120335856434711, + "loss": 2.6863, + "step": 14630 + }, + { + "epoch": 0.4338581976692465, + "grad_norm": 0.1376219391822815, + "learning_rate": 0.0006119877305183855, + "loss": 2.6888, + "step": 14631 + }, + { + "epoch": 0.43388785102155797, + "grad_norm": 0.10933679342269897, + "learning_rate": 0.0006119418744016294, + "loss": 2.7177, + "step": 14632 + }, + { + "epoch": 0.43391750437386944, + "grad_norm": 0.12161407619714737, + "learning_rate": 0.0006118960172936087, + "loss": 2.6832, + "step": 14633 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 0.12332234531641006, + "learning_rate": 0.0006118501591947296, + "loss": 2.6323, + "step": 14634 + }, + { + "epoch": 0.4339768110784924, + "grad_norm": 0.11289933323860168, + "learning_rate": 0.0006118043001053981, + "loss": 2.6605, + "step": 14635 + }, + { + "epoch": 0.43400646443080393, + "grad_norm": 0.10632362961769104, + "learning_rate": 0.0006117584400260204, + "loss": 2.6485, + "step": 14636 + }, + { + "epoch": 0.4340361177831154, + "grad_norm": 0.11339615285396576, + "learning_rate": 0.0006117125789570025, + "loss": 2.6893, + "step": 14637 + }, + { + "epoch": 0.4340657711354269, + "grad_norm": 0.11484018713235855, + "learning_rate": 0.0006116667168987505, + "loss": 2.6985, + "step": 14638 + }, + { + "epoch": 0.43409542448773836, + "grad_norm": 0.1179480031132698, + "learning_rate": 0.0006116208538516707, + "loss": 2.6467, + "step": 14639 + }, + { + "epoch": 0.43412507784004983, + "grad_norm": 0.13951295614242554, + "learning_rate": 0.0006115749898161688, + "loss": 2.6695, + "step": 14640 + }, + { + "epoch": 0.4341547311923613, + "grad_norm": 0.1542230248451233, + "learning_rate": 0.0006115291247926515, + "loss": 2.6823, + "step": 14641 + }, + { + "epoch": 0.4341843845446728, + "grad_norm": 0.16153843700885773, + "learning_rate": 0.0006114832587815247, + "loss": 2.6469, + "step": 14642 + }, + { + "epoch": 0.43421403789698426, + "grad_norm": 0.14565564692020416, + "learning_rate": 0.0006114373917831942, + "loss": 2.6939, + "step": 14643 + }, + { + "epoch": 0.43424369124929574, + "grad_norm": 0.13992926478385925, + "learning_rate": 0.0006113915237980666, + "loss": 2.6589, + "step": 14644 + }, + { + "epoch": 0.4342733446016072, + "grad_norm": 0.126104474067688, + "learning_rate": 0.0006113456548265479, + "loss": 2.6804, + "step": 14645 + }, + { + "epoch": 0.4343029979539187, + "grad_norm": 0.13773417472839355, + "learning_rate": 0.0006112997848690444, + "loss": 2.667, + "step": 14646 + }, + { + "epoch": 0.43433265130623017, + "grad_norm": 0.13933725655078888, + "learning_rate": 0.0006112539139259623, + "loss": 2.6798, + "step": 14647 + }, + { + "epoch": 0.43436230465854164, + "grad_norm": 0.14896397292613983, + "learning_rate": 0.0006112080419977075, + "loss": 2.6946, + "step": 14648 + }, + { + "epoch": 0.4343919580108531, + "grad_norm": 0.14540430903434753, + "learning_rate": 0.0006111621690846865, + "loss": 2.6691, + "step": 14649 + }, + { + "epoch": 0.4344216113631646, + "grad_norm": 0.11749150604009628, + "learning_rate": 0.0006111162951873052, + "loss": 2.6904, + "step": 14650 + }, + { + "epoch": 0.43445126471547607, + "grad_norm": 0.10718461126089096, + "learning_rate": 0.0006110704203059703, + "loss": 2.6796, + "step": 14651 + }, + { + "epoch": 0.43448091806778755, + "grad_norm": 0.11646822094917297, + "learning_rate": 0.0006110245444410876, + "loss": 2.6544, + "step": 14652 + }, + { + "epoch": 0.434510571420099, + "grad_norm": 0.12706512212753296, + "learning_rate": 0.0006109786675930636, + "loss": 2.6848, + "step": 14653 + }, + { + "epoch": 0.4345402247724105, + "grad_norm": 0.131498783826828, + "learning_rate": 0.0006109327897623045, + "loss": 2.6882, + "step": 14654 + }, + { + "epoch": 0.434569878124722, + "grad_norm": 0.13107995688915253, + "learning_rate": 0.0006108869109492165, + "loss": 2.6771, + "step": 14655 + }, + { + "epoch": 0.43459953147703345, + "grad_norm": 0.13213743269443512, + "learning_rate": 0.0006108410311542056, + "loss": 2.6426, + "step": 14656 + }, + { + "epoch": 0.434629184829345, + "grad_norm": 0.1361958533525467, + "learning_rate": 0.0006107951503776785, + "loss": 2.7031, + "step": 14657 + }, + { + "epoch": 0.43465883818165646, + "grad_norm": 0.11833316832780838, + "learning_rate": 0.0006107492686200415, + "loss": 2.6984, + "step": 14658 + }, + { + "epoch": 0.43468849153396794, + "grad_norm": 0.12202121317386627, + "learning_rate": 0.0006107033858817006, + "loss": 2.6908, + "step": 14659 + }, + { + "epoch": 0.4347181448862794, + "grad_norm": 0.147317573428154, + "learning_rate": 0.0006106575021630621, + "loss": 2.6739, + "step": 14660 + }, + { + "epoch": 0.4347477982385909, + "grad_norm": 0.12304327636957169, + "learning_rate": 0.0006106116174645327, + "loss": 2.6595, + "step": 14661 + }, + { + "epoch": 0.43477745159090236, + "grad_norm": 0.10741757601499557, + "learning_rate": 0.0006105657317865182, + "loss": 2.6677, + "step": 14662 + }, + { + "epoch": 0.43480710494321384, + "grad_norm": 0.15503737330436707, + "learning_rate": 0.0006105198451294251, + "loss": 2.6972, + "step": 14663 + }, + { + "epoch": 0.4348367582955253, + "grad_norm": 0.13795757293701172, + "learning_rate": 0.0006104739574936599, + "loss": 2.6715, + "step": 14664 + }, + { + "epoch": 0.4348664116478368, + "grad_norm": 0.13234202563762665, + "learning_rate": 0.000610428068879629, + "loss": 2.6673, + "step": 14665 + }, + { + "epoch": 0.43489606500014827, + "grad_norm": 0.16709724068641663, + "learning_rate": 0.0006103821792877384, + "loss": 2.6663, + "step": 14666 + }, + { + "epoch": 0.43492571835245974, + "grad_norm": 0.12582914531230927, + "learning_rate": 0.0006103362887183947, + "loss": 2.6537, + "step": 14667 + }, + { + "epoch": 0.4349553717047712, + "grad_norm": 0.14425280690193176, + "learning_rate": 0.0006102903971720043, + "loss": 2.6583, + "step": 14668 + }, + { + "epoch": 0.4349850250570827, + "grad_norm": 0.14117255806922913, + "learning_rate": 0.0006102445046489736, + "loss": 2.6678, + "step": 14669 + }, + { + "epoch": 0.4350146784093942, + "grad_norm": 0.13202407956123352, + "learning_rate": 0.0006101986111497087, + "loss": 2.645, + "step": 14670 + }, + { + "epoch": 0.43504433176170565, + "grad_norm": 0.1216205507516861, + "learning_rate": 0.0006101527166746161, + "loss": 2.6452, + "step": 14671 + }, + { + "epoch": 0.4350739851140171, + "grad_norm": 0.12599818408489227, + "learning_rate": 0.0006101068212241024, + "loss": 2.6458, + "step": 14672 + }, + { + "epoch": 0.4351036384663286, + "grad_norm": 0.12544259428977966, + "learning_rate": 0.000610060924798574, + "loss": 2.7143, + "step": 14673 + }, + { + "epoch": 0.4351332918186401, + "grad_norm": 0.11965946853160858, + "learning_rate": 0.000610015027398437, + "loss": 2.6838, + "step": 14674 + }, + { + "epoch": 0.43516294517095155, + "grad_norm": 0.12231829017400742, + "learning_rate": 0.0006099691290240984, + "loss": 2.6765, + "step": 14675 + }, + { + "epoch": 0.43519259852326303, + "grad_norm": 0.124374158680439, + "learning_rate": 0.000609923229675964, + "loss": 2.6855, + "step": 14676 + }, + { + "epoch": 0.4352222518755745, + "grad_norm": 0.1256372332572937, + "learning_rate": 0.0006098773293544405, + "loss": 2.6914, + "step": 14677 + }, + { + "epoch": 0.43525190522788604, + "grad_norm": 0.12299498170614243, + "learning_rate": 0.0006098314280599345, + "loss": 2.6899, + "step": 14678 + }, + { + "epoch": 0.4352815585801975, + "grad_norm": 0.11243921518325806, + "learning_rate": 0.0006097855257928522, + "loss": 2.6345, + "step": 14679 + }, + { + "epoch": 0.435311211932509, + "grad_norm": 0.12415780872106552, + "learning_rate": 0.0006097396225536006, + "loss": 2.6996, + "step": 14680 + }, + { + "epoch": 0.43534086528482047, + "grad_norm": 0.12869839370250702, + "learning_rate": 0.0006096937183425856, + "loss": 2.6551, + "step": 14681 + }, + { + "epoch": 0.43537051863713194, + "grad_norm": 0.14244739711284637, + "learning_rate": 0.0006096478131602137, + "loss": 2.653, + "step": 14682 + }, + { + "epoch": 0.4354001719894434, + "grad_norm": 0.14411978423595428, + "learning_rate": 0.0006096019070068918, + "loss": 2.6686, + "step": 14683 + }, + { + "epoch": 0.4354298253417549, + "grad_norm": 0.1404157429933548, + "learning_rate": 0.0006095559998830261, + "loss": 2.6939, + "step": 14684 + }, + { + "epoch": 0.43545947869406637, + "grad_norm": 0.13022609055042267, + "learning_rate": 0.0006095100917890234, + "loss": 2.6829, + "step": 14685 + }, + { + "epoch": 0.43548913204637785, + "grad_norm": 0.13715331256389618, + "learning_rate": 0.00060946418272529, + "loss": 2.6937, + "step": 14686 + }, + { + "epoch": 0.4355187853986893, + "grad_norm": 0.15195401012897491, + "learning_rate": 0.0006094182726922323, + "loss": 2.6335, + "step": 14687 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 0.14513573050498962, + "learning_rate": 0.0006093723616902569, + "loss": 2.647, + "step": 14688 + }, + { + "epoch": 0.4355780921033123, + "grad_norm": 0.11008687317371368, + "learning_rate": 0.0006093264497197707, + "loss": 2.6729, + "step": 14689 + }, + { + "epoch": 0.43560774545562375, + "grad_norm": 0.10786862671375275, + "learning_rate": 0.0006092805367811801, + "loss": 2.6397, + "step": 14690 + }, + { + "epoch": 0.4356373988079352, + "grad_norm": 0.13371488451957703, + "learning_rate": 0.0006092346228748915, + "loss": 2.6445, + "step": 14691 + }, + { + "epoch": 0.4356670521602467, + "grad_norm": 0.12733709812164307, + "learning_rate": 0.0006091887080013115, + "loss": 2.6806, + "step": 14692 + }, + { + "epoch": 0.4356967055125582, + "grad_norm": 0.12501397728919983, + "learning_rate": 0.0006091427921608468, + "loss": 2.6815, + "step": 14693 + }, + { + "epoch": 0.43572635886486966, + "grad_norm": 0.10724930465221405, + "learning_rate": 0.0006090968753539039, + "loss": 2.6759, + "step": 14694 + }, + { + "epoch": 0.43575601221718113, + "grad_norm": 0.12759584188461304, + "learning_rate": 0.0006090509575808893, + "loss": 2.6685, + "step": 14695 + }, + { + "epoch": 0.4357856655694926, + "grad_norm": 0.14444175362586975, + "learning_rate": 0.0006090050388422102, + "loss": 2.6832, + "step": 14696 + }, + { + "epoch": 0.4358153189218041, + "grad_norm": 0.12697583436965942, + "learning_rate": 0.0006089591191382724, + "loss": 2.6941, + "step": 14697 + }, + { + "epoch": 0.4358449722741156, + "grad_norm": 0.11309713125228882, + "learning_rate": 0.000608913198469483, + "loss": 2.6516, + "step": 14698 + }, + { + "epoch": 0.4358746256264271, + "grad_norm": 0.11532127112150192, + "learning_rate": 0.0006088672768362485, + "loss": 2.7009, + "step": 14699 + }, + { + "epoch": 0.43590427897873857, + "grad_norm": 0.12027164548635483, + "learning_rate": 0.0006088213542389756, + "loss": 2.6438, + "step": 14700 + }, + { + "epoch": 0.43593393233105004, + "grad_norm": 0.14172519743442535, + "learning_rate": 0.000608775430678071, + "loss": 2.6904, + "step": 14701 + }, + { + "epoch": 0.4359635856833615, + "grad_norm": 0.1703612357378006, + "learning_rate": 0.0006087295061539412, + "loss": 2.6331, + "step": 14702 + }, + { + "epoch": 0.435993239035673, + "grad_norm": 0.14882925152778625, + "learning_rate": 0.000608683580666993, + "loss": 2.6478, + "step": 14703 + }, + { + "epoch": 0.4360228923879845, + "grad_norm": 0.11158362776041031, + "learning_rate": 0.000608637654217633, + "loss": 2.6821, + "step": 14704 + }, + { + "epoch": 0.43605254574029595, + "grad_norm": 0.12780378758907318, + "learning_rate": 0.0006085917268062679, + "loss": 2.6117, + "step": 14705 + }, + { + "epoch": 0.4360821990926074, + "grad_norm": 0.1425325572490692, + "learning_rate": 0.0006085457984333044, + "loss": 2.6335, + "step": 14706 + }, + { + "epoch": 0.4361118524449189, + "grad_norm": 0.1422838717699051, + "learning_rate": 0.0006084998690991495, + "loss": 2.6559, + "step": 14707 + }, + { + "epoch": 0.4361415057972304, + "grad_norm": 0.11625582724809647, + "learning_rate": 0.0006084539388042092, + "loss": 2.6245, + "step": 14708 + }, + { + "epoch": 0.43617115914954185, + "grad_norm": 0.11629378795623779, + "learning_rate": 0.0006084080075488909, + "loss": 2.7165, + "step": 14709 + }, + { + "epoch": 0.43620081250185333, + "grad_norm": 0.12165063619613647, + "learning_rate": 0.0006083620753336011, + "loss": 2.6953, + "step": 14710 + }, + { + "epoch": 0.4362304658541648, + "grad_norm": 0.10992053896188736, + "learning_rate": 0.0006083161421587464, + "loss": 2.676, + "step": 14711 + }, + { + "epoch": 0.4362601192064763, + "grad_norm": 0.10906989127397537, + "learning_rate": 0.0006082702080247338, + "loss": 2.6735, + "step": 14712 + }, + { + "epoch": 0.43628977255878776, + "grad_norm": 0.1227765679359436, + "learning_rate": 0.00060822427293197, + "loss": 2.7149, + "step": 14713 + }, + { + "epoch": 0.43631942591109923, + "grad_norm": 0.11770614236593246, + "learning_rate": 0.0006081783368808614, + "loss": 2.6944, + "step": 14714 + }, + { + "epoch": 0.4363490792634107, + "grad_norm": 0.10293503105640411, + "learning_rate": 0.0006081323998718152, + "loss": 2.6391, + "step": 14715 + }, + { + "epoch": 0.4363787326157222, + "grad_norm": 0.11168369650840759, + "learning_rate": 0.0006080864619052381, + "loss": 2.7094, + "step": 14716 + }, + { + "epoch": 0.43640838596803366, + "grad_norm": 0.11452987045049667, + "learning_rate": 0.0006080405229815368, + "loss": 2.6892, + "step": 14717 + }, + { + "epoch": 0.43643803932034514, + "grad_norm": 0.128716841340065, + "learning_rate": 0.0006079945831011182, + "loss": 2.7305, + "step": 14718 + }, + { + "epoch": 0.43646769267265667, + "grad_norm": 0.13145054876804352, + "learning_rate": 0.000607948642264389, + "loss": 2.6615, + "step": 14719 + }, + { + "epoch": 0.43649734602496815, + "grad_norm": 0.13683968782424927, + "learning_rate": 0.0006079027004717559, + "loss": 2.6767, + "step": 14720 + }, + { + "epoch": 0.4365269993772796, + "grad_norm": 0.11288748681545258, + "learning_rate": 0.0006078567577236259, + "loss": 2.6984, + "step": 14721 + }, + { + "epoch": 0.4365566527295911, + "grad_norm": 0.11306734383106232, + "learning_rate": 0.0006078108140204058, + "loss": 2.6894, + "step": 14722 + }, + { + "epoch": 0.4365863060819026, + "grad_norm": 0.11238472908735275, + "learning_rate": 0.0006077648693625027, + "loss": 2.6725, + "step": 14723 + }, + { + "epoch": 0.43661595943421405, + "grad_norm": 0.12772725522518158, + "learning_rate": 0.0006077189237503229, + "loss": 2.6777, + "step": 14724 + }, + { + "epoch": 0.4366456127865255, + "grad_norm": 0.12035582214593887, + "learning_rate": 0.0006076729771842736, + "loss": 2.7132, + "step": 14725 + }, + { + "epoch": 0.436675266138837, + "grad_norm": 0.11892496049404144, + "learning_rate": 0.0006076270296647615, + "loss": 2.6749, + "step": 14726 + }, + { + "epoch": 0.4367049194911485, + "grad_norm": 0.1254829466342926, + "learning_rate": 0.0006075810811921936, + "loss": 2.6481, + "step": 14727 + }, + { + "epoch": 0.43673457284345996, + "grad_norm": 0.11902713775634766, + "learning_rate": 0.0006075351317669771, + "loss": 2.6742, + "step": 14728 + }, + { + "epoch": 0.43676422619577143, + "grad_norm": 0.11163376271724701, + "learning_rate": 0.0006074891813895182, + "loss": 2.6565, + "step": 14729 + }, + { + "epoch": 0.4367938795480829, + "grad_norm": 0.14705950021743774, + "learning_rate": 0.0006074432300602243, + "loss": 2.6618, + "step": 14730 + }, + { + "epoch": 0.4368235329003944, + "grad_norm": 0.1513090282678604, + "learning_rate": 0.0006073972777795021, + "loss": 2.6789, + "step": 14731 + }, + { + "epoch": 0.43685318625270586, + "grad_norm": 0.13030371069908142, + "learning_rate": 0.0006073513245477586, + "loss": 2.648, + "step": 14732 + }, + { + "epoch": 0.43688283960501734, + "grad_norm": 0.13930059969425201, + "learning_rate": 0.0006073053703654006, + "loss": 2.6688, + "step": 14733 + }, + { + "epoch": 0.4369124929573288, + "grad_norm": 0.16752390563488007, + "learning_rate": 0.0006072594152328353, + "loss": 2.6849, + "step": 14734 + }, + { + "epoch": 0.4369421463096403, + "grad_norm": 0.15173842012882233, + "learning_rate": 0.0006072134591504692, + "loss": 2.6892, + "step": 14735 + }, + { + "epoch": 0.43697179966195177, + "grad_norm": 0.11923597753047943, + "learning_rate": 0.0006071675021187097, + "loss": 2.6585, + "step": 14736 + }, + { + "epoch": 0.43700145301426324, + "grad_norm": 0.140329971909523, + "learning_rate": 0.0006071215441379636, + "loss": 2.6622, + "step": 14737 + }, + { + "epoch": 0.4370311063665747, + "grad_norm": 0.14408403635025024, + "learning_rate": 0.0006070755852086378, + "loss": 2.6837, + "step": 14738 + }, + { + "epoch": 0.4370607597188862, + "grad_norm": 0.12311847507953644, + "learning_rate": 0.0006070296253311392, + "loss": 2.6916, + "step": 14739 + }, + { + "epoch": 0.4370904130711977, + "grad_norm": 0.12308058887720108, + "learning_rate": 0.0006069836645058751, + "loss": 2.67, + "step": 14740 + }, + { + "epoch": 0.4371200664235092, + "grad_norm": 0.13600265979766846, + "learning_rate": 0.0006069377027332522, + "loss": 2.6597, + "step": 14741 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 0.12687897682189941, + "learning_rate": 0.0006068917400136775, + "loss": 2.7026, + "step": 14742 + }, + { + "epoch": 0.43717937312813215, + "grad_norm": 0.13591742515563965, + "learning_rate": 0.0006068457763475582, + "loss": 2.6621, + "step": 14743 + }, + { + "epoch": 0.43720902648044363, + "grad_norm": 0.11917509883642197, + "learning_rate": 0.0006067998117353011, + "loss": 2.6551, + "step": 14744 + }, + { + "epoch": 0.4372386798327551, + "grad_norm": 0.11279445141553879, + "learning_rate": 0.0006067538461773137, + "loss": 2.6891, + "step": 14745 + }, + { + "epoch": 0.4372683331850666, + "grad_norm": 0.10400646179914474, + "learning_rate": 0.0006067078796740023, + "loss": 2.6605, + "step": 14746 + }, + { + "epoch": 0.43729798653737806, + "grad_norm": 0.10568006336688995, + "learning_rate": 0.0006066619122257743, + "loss": 2.6855, + "step": 14747 + }, + { + "epoch": 0.43732763988968953, + "grad_norm": 0.1295190155506134, + "learning_rate": 0.0006066159438330369, + "loss": 2.6849, + "step": 14748 + }, + { + "epoch": 0.437357293242001, + "grad_norm": 0.13396379351615906, + "learning_rate": 0.000606569974496197, + "loss": 2.6845, + "step": 14749 + }, + { + "epoch": 0.4373869465943125, + "grad_norm": 0.1214798092842102, + "learning_rate": 0.0006065240042156616, + "loss": 2.6046, + "step": 14750 + }, + { + "epoch": 0.43741659994662396, + "grad_norm": 0.13698162138462067, + "learning_rate": 0.000606478032991838, + "loss": 2.6665, + "step": 14751 + }, + { + "epoch": 0.43744625329893544, + "grad_norm": 0.13887837529182434, + "learning_rate": 0.000606432060825133, + "loss": 2.6899, + "step": 14752 + }, + { + "epoch": 0.4374759066512469, + "grad_norm": 0.12318812310695648, + "learning_rate": 0.0006063860877159538, + "loss": 2.6547, + "step": 14753 + }, + { + "epoch": 0.4375055600035584, + "grad_norm": 0.11054157465696335, + "learning_rate": 0.0006063401136647077, + "loss": 2.6773, + "step": 14754 + }, + { + "epoch": 0.43753521335586987, + "grad_norm": 0.11387214064598083, + "learning_rate": 0.0006062941386718015, + "loss": 2.6918, + "step": 14755 + }, + { + "epoch": 0.43756486670818134, + "grad_norm": 0.10501382499933243, + "learning_rate": 0.0006062481627376426, + "loss": 2.6592, + "step": 14756 + }, + { + "epoch": 0.4375945200604928, + "grad_norm": 0.11473589390516281, + "learning_rate": 0.000606202185862638, + "loss": 2.6621, + "step": 14757 + }, + { + "epoch": 0.4376241734128043, + "grad_norm": 0.12563619017601013, + "learning_rate": 0.0006061562080471947, + "loss": 2.6846, + "step": 14758 + }, + { + "epoch": 0.43765382676511577, + "grad_norm": 0.12143637984991074, + "learning_rate": 0.0006061102292917199, + "loss": 2.6832, + "step": 14759 + }, + { + "epoch": 0.43768348011742725, + "grad_norm": 0.10662497580051422, + "learning_rate": 0.0006060642495966207, + "loss": 2.6639, + "step": 14760 + }, + { + "epoch": 0.4377131334697388, + "grad_norm": 0.12207654863595963, + "learning_rate": 0.0006060182689623047, + "loss": 2.6918, + "step": 14761 + }, + { + "epoch": 0.43774278682205026, + "grad_norm": 0.12681356072425842, + "learning_rate": 0.0006059722873891786, + "loss": 2.6612, + "step": 14762 + }, + { + "epoch": 0.43777244017436173, + "grad_norm": 0.14718851447105408, + "learning_rate": 0.0006059263048776496, + "loss": 2.7176, + "step": 14763 + }, + { + "epoch": 0.4378020935266732, + "grad_norm": 0.1528264433145523, + "learning_rate": 0.0006058803214281252, + "loss": 2.6337, + "step": 14764 + }, + { + "epoch": 0.4378317468789847, + "grad_norm": 0.12116778641939163, + "learning_rate": 0.0006058343370410123, + "loss": 2.6953, + "step": 14765 + }, + { + "epoch": 0.43786140023129616, + "grad_norm": 0.12885881960391998, + "learning_rate": 0.0006057883517167179, + "loss": 2.7053, + "step": 14766 + }, + { + "epoch": 0.43789105358360764, + "grad_norm": 0.14674031734466553, + "learning_rate": 0.0006057423654556497, + "loss": 2.6745, + "step": 14767 + }, + { + "epoch": 0.4379207069359191, + "grad_norm": 0.14295898377895355, + "learning_rate": 0.0006056963782582148, + "loss": 2.6654, + "step": 14768 + }, + { + "epoch": 0.4379503602882306, + "grad_norm": 0.1375485211610794, + "learning_rate": 0.0006056503901248203, + "loss": 2.7014, + "step": 14769 + }, + { + "epoch": 0.43798001364054207, + "grad_norm": 0.1407788246870041, + "learning_rate": 0.0006056044010558733, + "loss": 2.6596, + "step": 14770 + }, + { + "epoch": 0.43800966699285354, + "grad_norm": 0.12042514979839325, + "learning_rate": 0.0006055584110517813, + "loss": 2.6531, + "step": 14771 + }, + { + "epoch": 0.438039320345165, + "grad_norm": 0.14213183522224426, + "learning_rate": 0.0006055124201129515, + "loss": 2.6608, + "step": 14772 + }, + { + "epoch": 0.4380689736974765, + "grad_norm": 0.14302097260951996, + "learning_rate": 0.000605466428239791, + "loss": 2.6804, + "step": 14773 + }, + { + "epoch": 0.43809862704978797, + "grad_norm": 0.14804327487945557, + "learning_rate": 0.0006054204354327073, + "loss": 2.6723, + "step": 14774 + }, + { + "epoch": 0.43812828040209945, + "grad_norm": 0.1386466920375824, + "learning_rate": 0.0006053744416921075, + "loss": 2.6703, + "step": 14775 + }, + { + "epoch": 0.4381579337544109, + "grad_norm": 0.1258794218301773, + "learning_rate": 0.0006053284470183989, + "loss": 2.6801, + "step": 14776 + }, + { + "epoch": 0.4381875871067224, + "grad_norm": 0.11730065196752548, + "learning_rate": 0.0006052824514119888, + "loss": 2.6883, + "step": 14777 + }, + { + "epoch": 0.4382172404590339, + "grad_norm": 0.12450375407934189, + "learning_rate": 0.0006052364548732848, + "loss": 2.6857, + "step": 14778 + }, + { + "epoch": 0.43824689381134535, + "grad_norm": 0.12485148012638092, + "learning_rate": 0.0006051904574026935, + "loss": 2.6584, + "step": 14779 + }, + { + "epoch": 0.4382765471636568, + "grad_norm": 0.1112089455127716, + "learning_rate": 0.0006051444590006227, + "loss": 2.6869, + "step": 14780 + }, + { + "epoch": 0.4383062005159683, + "grad_norm": 0.11960314214229584, + "learning_rate": 0.0006050984596674798, + "loss": 2.7061, + "step": 14781 + }, + { + "epoch": 0.43833585386827983, + "grad_norm": 0.11200633645057678, + "learning_rate": 0.0006050524594036721, + "loss": 2.6721, + "step": 14782 + }, + { + "epoch": 0.4383655072205913, + "grad_norm": 0.12199059128761292, + "learning_rate": 0.0006050064582096069, + "loss": 2.7035, + "step": 14783 + }, + { + "epoch": 0.4383951605729028, + "grad_norm": 0.12952075898647308, + "learning_rate": 0.0006049604560856913, + "loss": 2.6554, + "step": 14784 + }, + { + "epoch": 0.43842481392521426, + "grad_norm": 0.12804116308689117, + "learning_rate": 0.0006049144530323327, + "loss": 2.6461, + "step": 14785 + }, + { + "epoch": 0.43845446727752574, + "grad_norm": 0.1366416960954666, + "learning_rate": 0.0006048684490499389, + "loss": 2.6782, + "step": 14786 + }, + { + "epoch": 0.4384841206298372, + "grad_norm": 0.12366834282875061, + "learning_rate": 0.0006048224441389168, + "loss": 2.7015, + "step": 14787 + }, + { + "epoch": 0.4385137739821487, + "grad_norm": 0.1163603663444519, + "learning_rate": 0.0006047764382996741, + "loss": 2.6825, + "step": 14788 + }, + { + "epoch": 0.43854342733446017, + "grad_norm": 0.1112222820520401, + "learning_rate": 0.0006047304315326181, + "loss": 2.6771, + "step": 14789 + }, + { + "epoch": 0.43857308068677164, + "grad_norm": 0.11800863593816757, + "learning_rate": 0.0006046844238381561, + "loss": 2.6552, + "step": 14790 + }, + { + "epoch": 0.4386027340390831, + "grad_norm": 0.12500683963298798, + "learning_rate": 0.0006046384152166953, + "loss": 2.6899, + "step": 14791 + }, + { + "epoch": 0.4386323873913946, + "grad_norm": 0.13072799146175385, + "learning_rate": 0.0006045924056686436, + "loss": 2.6597, + "step": 14792 + }, + { + "epoch": 0.43866204074370607, + "grad_norm": 0.1359710991382599, + "learning_rate": 0.0006045463951944081, + "loss": 2.6932, + "step": 14793 + }, + { + "epoch": 0.43869169409601755, + "grad_norm": 0.13446444272994995, + "learning_rate": 0.0006045003837943965, + "loss": 2.6868, + "step": 14794 + }, + { + "epoch": 0.438721347448329, + "grad_norm": 0.1043272465467453, + "learning_rate": 0.000604454371469016, + "loss": 2.6965, + "step": 14795 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 0.12379205226898193, + "learning_rate": 0.000604408358218674, + "loss": 2.67, + "step": 14796 + }, + { + "epoch": 0.438780654152952, + "grad_norm": 0.14421896636486053, + "learning_rate": 0.0006043623440437781, + "loss": 2.6716, + "step": 14797 + }, + { + "epoch": 0.43881030750526345, + "grad_norm": 0.14490854740142822, + "learning_rate": 0.0006043163289447357, + "loss": 2.6797, + "step": 14798 + }, + { + "epoch": 0.43883996085757493, + "grad_norm": 0.11458110064268112, + "learning_rate": 0.0006042703129219545, + "loss": 2.6468, + "step": 14799 + }, + { + "epoch": 0.4388696142098864, + "grad_norm": 0.10491570085287094, + "learning_rate": 0.0006042242959758416, + "loss": 2.6243, + "step": 14800 + }, + { + "epoch": 0.4388992675621979, + "grad_norm": 0.12554790079593658, + "learning_rate": 0.0006041782781068046, + "loss": 2.6602, + "step": 14801 + }, + { + "epoch": 0.4389289209145094, + "grad_norm": 0.13186012208461761, + "learning_rate": 0.0006041322593152513, + "loss": 2.657, + "step": 14802 + }, + { + "epoch": 0.4389585742668209, + "grad_norm": 0.12886035442352295, + "learning_rate": 0.0006040862396015888, + "loss": 2.6617, + "step": 14803 + }, + { + "epoch": 0.43898822761913237, + "grad_norm": 0.11833800375461578, + "learning_rate": 0.0006040402189662248, + "loss": 2.621, + "step": 14804 + }, + { + "epoch": 0.43901788097144384, + "grad_norm": 0.12341973930597305, + "learning_rate": 0.0006039941974095669, + "loss": 2.6845, + "step": 14805 + }, + { + "epoch": 0.4390475343237553, + "grad_norm": 0.14028491079807281, + "learning_rate": 0.0006039481749320225, + "loss": 2.688, + "step": 14806 + }, + { + "epoch": 0.4390771876760668, + "grad_norm": 0.15371403098106384, + "learning_rate": 0.0006039021515339991, + "loss": 2.6733, + "step": 14807 + }, + { + "epoch": 0.43910684102837827, + "grad_norm": 0.11692547798156738, + "learning_rate": 0.0006038561272159043, + "loss": 2.6654, + "step": 14808 + }, + { + "epoch": 0.43913649438068975, + "grad_norm": 0.1268443763256073, + "learning_rate": 0.0006038101019781457, + "loss": 2.6669, + "step": 14809 + }, + { + "epoch": 0.4391661477330012, + "grad_norm": 0.13528220355510712, + "learning_rate": 0.0006037640758211309, + "loss": 2.6655, + "step": 14810 + }, + { + "epoch": 0.4391958010853127, + "grad_norm": 0.11901291459798813, + "learning_rate": 0.0006037180487452674, + "loss": 2.6651, + "step": 14811 + }, + { + "epoch": 0.4392254544376242, + "grad_norm": 0.11426366120576859, + "learning_rate": 0.0006036720207509628, + "loss": 2.6317, + "step": 14812 + }, + { + "epoch": 0.43925510778993565, + "grad_norm": 0.14066098630428314, + "learning_rate": 0.0006036259918386245, + "loss": 2.6277, + "step": 14813 + }, + { + "epoch": 0.4392847611422471, + "grad_norm": 0.17460604012012482, + "learning_rate": 0.0006035799620086603, + "loss": 2.6663, + "step": 14814 + }, + { + "epoch": 0.4393144144945586, + "grad_norm": 0.16008155047893524, + "learning_rate": 0.0006035339312614778, + "loss": 2.6765, + "step": 14815 + }, + { + "epoch": 0.4393440678468701, + "grad_norm": 0.15345865488052368, + "learning_rate": 0.0006034878995974846, + "loss": 2.6684, + "step": 14816 + }, + { + "epoch": 0.43937372119918155, + "grad_norm": 0.1629784107208252, + "learning_rate": 0.0006034418670170882, + "loss": 2.7016, + "step": 14817 + }, + { + "epoch": 0.43940337455149303, + "grad_norm": 0.15708550810813904, + "learning_rate": 0.0006033958335206963, + "loss": 2.6892, + "step": 14818 + }, + { + "epoch": 0.4394330279038045, + "grad_norm": 0.12672334909439087, + "learning_rate": 0.0006033497991087166, + "loss": 2.6597, + "step": 14819 + }, + { + "epoch": 0.439462681256116, + "grad_norm": 0.14663998782634735, + "learning_rate": 0.0006033037637815567, + "loss": 2.6454, + "step": 14820 + }, + { + "epoch": 0.43949233460842746, + "grad_norm": 0.15880213677883148, + "learning_rate": 0.0006032577275396243, + "loss": 2.6531, + "step": 14821 + }, + { + "epoch": 0.43952198796073894, + "grad_norm": 0.14432236552238464, + "learning_rate": 0.0006032116903833269, + "loss": 2.6646, + "step": 14822 + }, + { + "epoch": 0.43955164131305047, + "grad_norm": 0.15077169239521027, + "learning_rate": 0.0006031656523130724, + "loss": 2.6849, + "step": 14823 + }, + { + "epoch": 0.43958129466536194, + "grad_norm": 0.13120383024215698, + "learning_rate": 0.0006031196133292682, + "loss": 2.709, + "step": 14824 + }, + { + "epoch": 0.4396109480176734, + "grad_norm": 0.129729762673378, + "learning_rate": 0.000603073573432322, + "loss": 2.6585, + "step": 14825 + }, + { + "epoch": 0.4396406013699849, + "grad_norm": 0.14002592861652374, + "learning_rate": 0.0006030275326226417, + "loss": 2.6856, + "step": 14826 + }, + { + "epoch": 0.43967025472229637, + "grad_norm": 0.1306668221950531, + "learning_rate": 0.0006029814909006353, + "loss": 2.6891, + "step": 14827 + }, + { + "epoch": 0.43969990807460785, + "grad_norm": 0.12984642386436462, + "learning_rate": 0.0006029354482667097, + "loss": 2.6562, + "step": 14828 + }, + { + "epoch": 0.4397295614269193, + "grad_norm": 0.11915930360555649, + "learning_rate": 0.0006028894047212732, + "loss": 2.6455, + "step": 14829 + }, + { + "epoch": 0.4397592147792308, + "grad_norm": 0.1342676728963852, + "learning_rate": 0.0006028433602647333, + "loss": 2.669, + "step": 14830 + }, + { + "epoch": 0.4397888681315423, + "grad_norm": 0.10739152878522873, + "learning_rate": 0.0006027973148974977, + "loss": 2.6676, + "step": 14831 + }, + { + "epoch": 0.43981852148385375, + "grad_norm": 0.1210368275642395, + "learning_rate": 0.0006027512686199745, + "loss": 2.6811, + "step": 14832 + }, + { + "epoch": 0.43984817483616523, + "grad_norm": 0.12210401147603989, + "learning_rate": 0.0006027052214325709, + "loss": 2.6472, + "step": 14833 + }, + { + "epoch": 0.4398778281884767, + "grad_norm": 0.13675282895565033, + "learning_rate": 0.000602659173335695, + "loss": 2.6476, + "step": 14834 + }, + { + "epoch": 0.4399074815407882, + "grad_norm": 0.11341341584920883, + "learning_rate": 0.0006026131243297546, + "loss": 2.6552, + "step": 14835 + }, + { + "epoch": 0.43993713489309966, + "grad_norm": 0.11554137617349625, + "learning_rate": 0.0006025670744151573, + "loss": 2.6819, + "step": 14836 + }, + { + "epoch": 0.43996678824541113, + "grad_norm": 0.12587842345237732, + "learning_rate": 0.000602521023592311, + "loss": 2.7022, + "step": 14837 + }, + { + "epoch": 0.4399964415977226, + "grad_norm": 0.14083488285541534, + "learning_rate": 0.0006024749718616234, + "loss": 2.678, + "step": 14838 + }, + { + "epoch": 0.4400260949500341, + "grad_norm": 0.1347571611404419, + "learning_rate": 0.0006024289192235023, + "loss": 2.6492, + "step": 14839 + }, + { + "epoch": 0.44005574830234556, + "grad_norm": 0.11405115574598312, + "learning_rate": 0.0006023828656783555, + "loss": 2.6619, + "step": 14840 + }, + { + "epoch": 0.44008540165465704, + "grad_norm": 0.11462441086769104, + "learning_rate": 0.000602336811226591, + "loss": 2.646, + "step": 14841 + }, + { + "epoch": 0.4401150550069685, + "grad_norm": 0.11266620457172394, + "learning_rate": 0.0006022907558686164, + "loss": 2.6633, + "step": 14842 + }, + { + "epoch": 0.44014470835928, + "grad_norm": 0.09754151850938797, + "learning_rate": 0.0006022446996048396, + "loss": 2.6467, + "step": 14843 + }, + { + "epoch": 0.4401743617115915, + "grad_norm": 0.1176542118191719, + "learning_rate": 0.0006021986424356684, + "loss": 2.685, + "step": 14844 + }, + { + "epoch": 0.440204015063903, + "grad_norm": 0.1347695291042328, + "learning_rate": 0.0006021525843615108, + "loss": 2.6547, + "step": 14845 + }, + { + "epoch": 0.4402336684162145, + "grad_norm": 0.13552989065647125, + "learning_rate": 0.0006021065253827744, + "loss": 2.6316, + "step": 14846 + }, + { + "epoch": 0.44026332176852595, + "grad_norm": 0.13105568289756775, + "learning_rate": 0.0006020604654998671, + "loss": 2.6638, + "step": 14847 + }, + { + "epoch": 0.4402929751208374, + "grad_norm": 0.11859539896249771, + "learning_rate": 0.0006020144047131971, + "loss": 2.6796, + "step": 14848 + }, + { + "epoch": 0.4403226284731489, + "grad_norm": 0.12108075618743896, + "learning_rate": 0.0006019683430231721, + "loss": 2.6404, + "step": 14849 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 0.11180426180362701, + "learning_rate": 0.0006019222804301996, + "loss": 2.6714, + "step": 14850 + }, + { + "epoch": 0.44038193517777185, + "grad_norm": 0.12522178888320923, + "learning_rate": 0.000601876216934688, + "loss": 2.6865, + "step": 14851 + }, + { + "epoch": 0.44041158853008333, + "grad_norm": 0.12080766260623932, + "learning_rate": 0.0006018301525370449, + "loss": 2.6457, + "step": 14852 + }, + { + "epoch": 0.4404412418823948, + "grad_norm": 0.11383786052465439, + "learning_rate": 0.0006017840872376784, + "loss": 2.672, + "step": 14853 + }, + { + "epoch": 0.4404708952347063, + "grad_norm": 0.1436101645231247, + "learning_rate": 0.0006017380210369965, + "loss": 2.6575, + "step": 14854 + }, + { + "epoch": 0.44050054858701776, + "grad_norm": 0.1679469794034958, + "learning_rate": 0.0006016919539354068, + "loss": 2.6931, + "step": 14855 + }, + { + "epoch": 0.44053020193932924, + "grad_norm": 0.18724088370800018, + "learning_rate": 0.0006016458859333173, + "loss": 2.6592, + "step": 14856 + }, + { + "epoch": 0.4405598552916407, + "grad_norm": 0.14844924211502075, + "learning_rate": 0.000601599817031136, + "loss": 2.668, + "step": 14857 + }, + { + "epoch": 0.4405895086439522, + "grad_norm": 0.1196429431438446, + "learning_rate": 0.000601553747229271, + "loss": 2.6577, + "step": 14858 + }, + { + "epoch": 0.44061916199626366, + "grad_norm": 0.12543708086013794, + "learning_rate": 0.0006015076765281301, + "loss": 2.6696, + "step": 14859 + }, + { + "epoch": 0.44064881534857514, + "grad_norm": 0.12276840209960938, + "learning_rate": 0.0006014616049281216, + "loss": 2.6758, + "step": 14860 + }, + { + "epoch": 0.4406784687008866, + "grad_norm": 0.11591161787509918, + "learning_rate": 0.0006014155324296528, + "loss": 2.6846, + "step": 14861 + }, + { + "epoch": 0.4407081220531981, + "grad_norm": 0.1170259490609169, + "learning_rate": 0.0006013694590331321, + "loss": 2.6526, + "step": 14862 + }, + { + "epoch": 0.44073777540550957, + "grad_norm": 0.12306389957666397, + "learning_rate": 0.0006013233847389674, + "loss": 2.655, + "step": 14863 + }, + { + "epoch": 0.44076742875782104, + "grad_norm": 0.1211492046713829, + "learning_rate": 0.0006012773095475668, + "loss": 2.6534, + "step": 14864 + }, + { + "epoch": 0.4407970821101326, + "grad_norm": 0.12074923515319824, + "learning_rate": 0.0006012312334593385, + "loss": 2.6412, + "step": 14865 + }, + { + "epoch": 0.44082673546244405, + "grad_norm": 0.11282263696193695, + "learning_rate": 0.0006011851564746899, + "loss": 2.6335, + "step": 14866 + }, + { + "epoch": 0.44085638881475553, + "grad_norm": 0.13205406069755554, + "learning_rate": 0.0006011390785940296, + "loss": 2.687, + "step": 14867 + }, + { + "epoch": 0.440886042167067, + "grad_norm": 0.1361812800168991, + "learning_rate": 0.0006010929998177653, + "loss": 2.6641, + "step": 14868 + }, + { + "epoch": 0.4409156955193785, + "grad_norm": 0.12243865430355072, + "learning_rate": 0.000601046920146305, + "loss": 2.6822, + "step": 14869 + }, + { + "epoch": 0.44094534887168996, + "grad_norm": 0.11629017442464828, + "learning_rate": 0.0006010008395800571, + "loss": 2.6793, + "step": 14870 + }, + { + "epoch": 0.44097500222400143, + "grad_norm": 0.13880275189876556, + "learning_rate": 0.0006009547581194293, + "loss": 2.6861, + "step": 14871 + }, + { + "epoch": 0.4410046555763129, + "grad_norm": 0.14070892333984375, + "learning_rate": 0.0006009086757648299, + "loss": 2.7229, + "step": 14872 + }, + { + "epoch": 0.4410343089286244, + "grad_norm": 0.15840427577495575, + "learning_rate": 0.0006008625925166668, + "loss": 2.6702, + "step": 14873 + }, + { + "epoch": 0.44106396228093586, + "grad_norm": 0.1528496891260147, + "learning_rate": 0.0006008165083753481, + "loss": 2.6838, + "step": 14874 + }, + { + "epoch": 0.44109361563324734, + "grad_norm": 0.13099870085716248, + "learning_rate": 0.0006007704233412819, + "loss": 2.6643, + "step": 14875 + }, + { + "epoch": 0.4411232689855588, + "grad_norm": 0.11585932224988937, + "learning_rate": 0.0006007243374148763, + "loss": 2.6488, + "step": 14876 + }, + { + "epoch": 0.4411529223378703, + "grad_norm": 0.11290480941534042, + "learning_rate": 0.0006006782505965395, + "loss": 2.6976, + "step": 14877 + }, + { + "epoch": 0.44118257569018177, + "grad_norm": 0.11103805154561996, + "learning_rate": 0.0006006321628866794, + "loss": 2.6658, + "step": 14878 + }, + { + "epoch": 0.44121222904249324, + "grad_norm": 0.11480417102575302, + "learning_rate": 0.0006005860742857042, + "loss": 2.6883, + "step": 14879 + }, + { + "epoch": 0.4412418823948047, + "grad_norm": 0.09392435848712921, + "learning_rate": 0.0006005399847940221, + "loss": 2.6828, + "step": 14880 + }, + { + "epoch": 0.4412715357471162, + "grad_norm": 0.1161143109202385, + "learning_rate": 0.0006004938944120413, + "loss": 2.6613, + "step": 14881 + }, + { + "epoch": 0.44130118909942767, + "grad_norm": 0.12988336384296417, + "learning_rate": 0.0006004478031401697, + "loss": 2.7181, + "step": 14882 + }, + { + "epoch": 0.44133084245173915, + "grad_norm": 0.10648013651371002, + "learning_rate": 0.0006004017109788156, + "loss": 2.6447, + "step": 14883 + }, + { + "epoch": 0.4413604958040506, + "grad_norm": 0.11908376216888428, + "learning_rate": 0.000600355617928387, + "loss": 2.6863, + "step": 14884 + }, + { + "epoch": 0.4413901491563621, + "grad_norm": 0.13252253830432892, + "learning_rate": 0.0006003095239892923, + "loss": 2.6237, + "step": 14885 + }, + { + "epoch": 0.44141980250867363, + "grad_norm": 0.12623852491378784, + "learning_rate": 0.0006002634291619396, + "loss": 2.6959, + "step": 14886 + }, + { + "epoch": 0.4414494558609851, + "grad_norm": 0.14023272693157196, + "learning_rate": 0.0006002173334467369, + "loss": 2.6712, + "step": 14887 + }, + { + "epoch": 0.4414791092132966, + "grad_norm": 0.1434805989265442, + "learning_rate": 0.0006001712368440926, + "loss": 2.6502, + "step": 14888 + }, + { + "epoch": 0.44150876256560806, + "grad_norm": 0.13764892518520355, + "learning_rate": 0.0006001251393544146, + "loss": 2.7055, + "step": 14889 + }, + { + "epoch": 0.44153841591791954, + "grad_norm": 0.12371297925710678, + "learning_rate": 0.0006000790409781115, + "loss": 2.6644, + "step": 14890 + }, + { + "epoch": 0.441568069270231, + "grad_norm": 0.10890551656484604, + "learning_rate": 0.0006000329417155912, + "loss": 2.6795, + "step": 14891 + }, + { + "epoch": 0.4415977226225425, + "grad_norm": 0.12854745984077454, + "learning_rate": 0.0005999868415672622, + "loss": 2.674, + "step": 14892 + }, + { + "epoch": 0.44162737597485396, + "grad_norm": 0.1403116136789322, + "learning_rate": 0.0005999407405335325, + "loss": 2.6487, + "step": 14893 + }, + { + "epoch": 0.44165702932716544, + "grad_norm": 0.13142292201519012, + "learning_rate": 0.0005998946386148104, + "loss": 2.6673, + "step": 14894 + }, + { + "epoch": 0.4416866826794769, + "grad_norm": 0.145527645945549, + "learning_rate": 0.0005998485358115039, + "loss": 2.6834, + "step": 14895 + }, + { + "epoch": 0.4417163360317884, + "grad_norm": 0.1440296173095703, + "learning_rate": 0.0005998024321240217, + "loss": 2.6479, + "step": 14896 + }, + { + "epoch": 0.44174598938409987, + "grad_norm": 0.13844089210033417, + "learning_rate": 0.0005997563275527717, + "loss": 2.6861, + "step": 14897 + }, + { + "epoch": 0.44177564273641134, + "grad_norm": 0.13038817048072815, + "learning_rate": 0.0005997102220981625, + "loss": 2.6543, + "step": 14898 + }, + { + "epoch": 0.4418052960887228, + "grad_norm": 0.15838216245174408, + "learning_rate": 0.000599664115760602, + "loss": 2.6056, + "step": 14899 + }, + { + "epoch": 0.4418349494410343, + "grad_norm": 0.13060107827186584, + "learning_rate": 0.0005996180085404987, + "loss": 2.6498, + "step": 14900 + }, + { + "epoch": 0.4418646027933458, + "grad_norm": 0.12937641143798828, + "learning_rate": 0.0005995719004382609, + "loss": 2.6516, + "step": 14901 + }, + { + "epoch": 0.44189425614565725, + "grad_norm": 0.12854212522506714, + "learning_rate": 0.0005995257914542966, + "loss": 2.6367, + "step": 14902 + }, + { + "epoch": 0.4419239094979687, + "grad_norm": 0.11769257485866547, + "learning_rate": 0.0005994796815890146, + "loss": 2.6586, + "step": 14903 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 0.12956136465072632, + "learning_rate": 0.0005994335708428228, + "loss": 2.6722, + "step": 14904 + }, + { + "epoch": 0.4419832162025917, + "grad_norm": 0.12196774780750275, + "learning_rate": 0.0005993874592161297, + "loss": 2.6828, + "step": 14905 + }, + { + "epoch": 0.4420128695549032, + "grad_norm": 0.13017067313194275, + "learning_rate": 0.0005993413467093436, + "loss": 2.6506, + "step": 14906 + }, + { + "epoch": 0.4420425229072147, + "grad_norm": 0.12443571537733078, + "learning_rate": 0.0005992952333228728, + "loss": 2.6391, + "step": 14907 + }, + { + "epoch": 0.44207217625952616, + "grad_norm": 0.11636976152658463, + "learning_rate": 0.0005992491190571256, + "loss": 2.6854, + "step": 14908 + }, + { + "epoch": 0.44210182961183764, + "grad_norm": 0.1256076842546463, + "learning_rate": 0.0005992030039125105, + "loss": 2.6022, + "step": 14909 + }, + { + "epoch": 0.4421314829641491, + "grad_norm": 0.13807477056980133, + "learning_rate": 0.0005991568878894358, + "loss": 2.6532, + "step": 14910 + }, + { + "epoch": 0.4421611363164606, + "grad_norm": 0.1383587270975113, + "learning_rate": 0.0005991107709883098, + "loss": 2.6918, + "step": 14911 + }, + { + "epoch": 0.44219078966877207, + "grad_norm": 0.11924606561660767, + "learning_rate": 0.000599064653209541, + "loss": 2.6874, + "step": 14912 + }, + { + "epoch": 0.44222044302108354, + "grad_norm": 0.12014724314212799, + "learning_rate": 0.0005990185345535375, + "loss": 2.6666, + "step": 14913 + }, + { + "epoch": 0.442250096373395, + "grad_norm": 0.13307015597820282, + "learning_rate": 0.000598972415020708, + "loss": 2.6316, + "step": 14914 + }, + { + "epoch": 0.4422797497257065, + "grad_norm": 0.1223750188946724, + "learning_rate": 0.0005989262946114607, + "loss": 2.6834, + "step": 14915 + }, + { + "epoch": 0.44230940307801797, + "grad_norm": 0.11866269260644913, + "learning_rate": 0.0005988801733262042, + "loss": 2.6608, + "step": 14916 + }, + { + "epoch": 0.44233905643032945, + "grad_norm": 0.1291816383600235, + "learning_rate": 0.0005988340511653467, + "loss": 2.6829, + "step": 14917 + }, + { + "epoch": 0.4423687097826409, + "grad_norm": 0.1311298906803131, + "learning_rate": 0.0005987879281292968, + "loss": 2.6525, + "step": 14918 + }, + { + "epoch": 0.4423983631349524, + "grad_norm": 0.12577152252197266, + "learning_rate": 0.0005987418042184627, + "loss": 2.6331, + "step": 14919 + }, + { + "epoch": 0.4424280164872639, + "grad_norm": 0.11380001902580261, + "learning_rate": 0.0005986956794332533, + "loss": 2.6499, + "step": 14920 + }, + { + "epoch": 0.44245766983957535, + "grad_norm": 0.12142225354909897, + "learning_rate": 0.0005986495537740762, + "loss": 2.6974, + "step": 14921 + }, + { + "epoch": 0.4424873231918868, + "grad_norm": 0.13074852526187897, + "learning_rate": 0.0005986034272413407, + "loss": 2.6815, + "step": 14922 + }, + { + "epoch": 0.4425169765441983, + "grad_norm": 0.11191504448652267, + "learning_rate": 0.0005985572998354549, + "loss": 2.6849, + "step": 14923 + }, + { + "epoch": 0.4425466298965098, + "grad_norm": 0.10656233876943588, + "learning_rate": 0.0005985111715568273, + "loss": 2.6897, + "step": 14924 + }, + { + "epoch": 0.44257628324882126, + "grad_norm": 0.1343679577112198, + "learning_rate": 0.0005984650424058664, + "loss": 2.6595, + "step": 14925 + }, + { + "epoch": 0.44260593660113273, + "grad_norm": 0.11398535966873169, + "learning_rate": 0.0005984189123829806, + "loss": 2.6659, + "step": 14926 + }, + { + "epoch": 0.44263558995344426, + "grad_norm": 0.11913616955280304, + "learning_rate": 0.0005983727814885783, + "loss": 2.6741, + "step": 14927 + }, + { + "epoch": 0.44266524330575574, + "grad_norm": 0.10934728384017944, + "learning_rate": 0.0005983266497230681, + "loss": 2.6667, + "step": 14928 + }, + { + "epoch": 0.4426948966580672, + "grad_norm": 0.10020803660154343, + "learning_rate": 0.0005982805170868589, + "loss": 2.665, + "step": 14929 + }, + { + "epoch": 0.4427245500103787, + "grad_norm": 0.11703869700431824, + "learning_rate": 0.0005982343835803587, + "loss": 2.6755, + "step": 14930 + }, + { + "epoch": 0.44275420336269017, + "grad_norm": 0.1328389197587967, + "learning_rate": 0.0005981882492039761, + "loss": 2.6673, + "step": 14931 + }, + { + "epoch": 0.44278385671500164, + "grad_norm": 0.13221217691898346, + "learning_rate": 0.0005981421139581197, + "loss": 2.6494, + "step": 14932 + }, + { + "epoch": 0.4428135100673131, + "grad_norm": 0.15395699441432953, + "learning_rate": 0.000598095977843198, + "loss": 2.6621, + "step": 14933 + }, + { + "epoch": 0.4428431634196246, + "grad_norm": 0.14876842498779297, + "learning_rate": 0.0005980498408596196, + "loss": 2.6725, + "step": 14934 + }, + { + "epoch": 0.4428728167719361, + "grad_norm": 0.12348805367946625, + "learning_rate": 0.000598003703007793, + "loss": 2.6447, + "step": 14935 + }, + { + "epoch": 0.44290247012424755, + "grad_norm": 0.11672460287809372, + "learning_rate": 0.0005979575642881268, + "loss": 2.6893, + "step": 14936 + }, + { + "epoch": 0.442932123476559, + "grad_norm": 0.12488818168640137, + "learning_rate": 0.0005979114247010297, + "loss": 2.6394, + "step": 14937 + }, + { + "epoch": 0.4429617768288705, + "grad_norm": 0.12594741582870483, + "learning_rate": 0.00059786528424691, + "loss": 2.6706, + "step": 14938 + }, + { + "epoch": 0.442991430181182, + "grad_norm": 0.12267185747623444, + "learning_rate": 0.0005978191429261764, + "loss": 2.6585, + "step": 14939 + }, + { + "epoch": 0.44302108353349345, + "grad_norm": 0.10280804336071014, + "learning_rate": 0.0005977730007392376, + "loss": 2.6466, + "step": 14940 + }, + { + "epoch": 0.44305073688580493, + "grad_norm": 0.11647649854421616, + "learning_rate": 0.000597726857686502, + "loss": 2.6902, + "step": 14941 + }, + { + "epoch": 0.4430803902381164, + "grad_norm": 0.13040848076343536, + "learning_rate": 0.0005976807137683783, + "loss": 2.6926, + "step": 14942 + }, + { + "epoch": 0.4431100435904279, + "grad_norm": 0.12424790114164352, + "learning_rate": 0.0005976345689852751, + "loss": 2.6748, + "step": 14943 + }, + { + "epoch": 0.44313969694273936, + "grad_norm": 0.12267334014177322, + "learning_rate": 0.0005975884233376011, + "loss": 2.6776, + "step": 14944 + }, + { + "epoch": 0.44316935029505083, + "grad_norm": 0.12072582542896271, + "learning_rate": 0.0005975422768257648, + "loss": 2.6852, + "step": 14945 + }, + { + "epoch": 0.4431990036473623, + "grad_norm": 0.1305164098739624, + "learning_rate": 0.0005974961294501751, + "loss": 2.6683, + "step": 14946 + }, + { + "epoch": 0.4432286569996738, + "grad_norm": 0.14796841144561768, + "learning_rate": 0.0005974499812112402, + "loss": 2.6903, + "step": 14947 + }, + { + "epoch": 0.4432583103519853, + "grad_norm": 0.1423717588186264, + "learning_rate": 0.000597403832109369, + "loss": 2.6617, + "step": 14948 + }, + { + "epoch": 0.4432879637042968, + "grad_norm": 0.1299213021993637, + "learning_rate": 0.0005973576821449703, + "loss": 2.6773, + "step": 14949 + }, + { + "epoch": 0.44331761705660827, + "grad_norm": 0.12418555468320847, + "learning_rate": 0.0005973115313184525, + "loss": 2.6748, + "step": 14950 + }, + { + "epoch": 0.44334727040891975, + "grad_norm": 0.11369583755731583, + "learning_rate": 0.0005972653796302243, + "loss": 2.6592, + "step": 14951 + }, + { + "epoch": 0.4433769237612312, + "grad_norm": 0.11506553739309311, + "learning_rate": 0.0005972192270806946, + "loss": 2.6746, + "step": 14952 + }, + { + "epoch": 0.4434065771135427, + "grad_norm": 0.13482388854026794, + "learning_rate": 0.000597173073670272, + "loss": 2.6893, + "step": 14953 + }, + { + "epoch": 0.4434362304658542, + "grad_norm": 0.1550893485546112, + "learning_rate": 0.000597126919399365, + "loss": 2.6979, + "step": 14954 + }, + { + "epoch": 0.44346588381816565, + "grad_norm": 0.14308121800422668, + "learning_rate": 0.0005970807642683827, + "loss": 2.6992, + "step": 14955 + }, + { + "epoch": 0.4434955371704771, + "grad_norm": 0.14717550575733185, + "learning_rate": 0.0005970346082777333, + "loss": 2.6466, + "step": 14956 + }, + { + "epoch": 0.4435251905227886, + "grad_norm": 0.14893822371959686, + "learning_rate": 0.0005969884514278259, + "loss": 2.6557, + "step": 14957 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 0.13259746134281158, + "learning_rate": 0.0005969422937190692, + "loss": 2.693, + "step": 14958 + }, + { + "epoch": 0.44358449722741156, + "grad_norm": 0.12585707008838654, + "learning_rate": 0.0005968961351518718, + "loss": 2.6396, + "step": 14959 + }, + { + "epoch": 0.44361415057972303, + "grad_norm": 0.12861788272857666, + "learning_rate": 0.0005968499757266424, + "loss": 2.632, + "step": 14960 + }, + { + "epoch": 0.4436438039320345, + "grad_norm": 0.13257771730422974, + "learning_rate": 0.0005968038154437898, + "loss": 2.6599, + "step": 14961 + }, + { + "epoch": 0.443673457284346, + "grad_norm": 0.12190728634595871, + "learning_rate": 0.0005967576543037229, + "loss": 2.6712, + "step": 14962 + }, + { + "epoch": 0.44370311063665746, + "grad_norm": 0.1275995671749115, + "learning_rate": 0.0005967114923068505, + "loss": 2.6795, + "step": 14963 + }, + { + "epoch": 0.44373276398896894, + "grad_norm": 0.12221089750528336, + "learning_rate": 0.0005966653294535811, + "loss": 2.6386, + "step": 14964 + }, + { + "epoch": 0.4437624173412804, + "grad_norm": 0.10678473114967346, + "learning_rate": 0.0005966191657443236, + "loss": 2.677, + "step": 14965 + }, + { + "epoch": 0.4437920706935919, + "grad_norm": 0.11874549090862274, + "learning_rate": 0.0005965730011794866, + "loss": 2.691, + "step": 14966 + }, + { + "epoch": 0.44382172404590337, + "grad_norm": 0.13342024385929108, + "learning_rate": 0.0005965268357594794, + "loss": 2.6807, + "step": 14967 + }, + { + "epoch": 0.44385137739821484, + "grad_norm": 0.12029965966939926, + "learning_rate": 0.0005964806694847104, + "loss": 2.6907, + "step": 14968 + }, + { + "epoch": 0.4438810307505264, + "grad_norm": 0.1239466741681099, + "learning_rate": 0.0005964345023555886, + "loss": 2.6574, + "step": 14969 + }, + { + "epoch": 0.44391068410283785, + "grad_norm": 0.13883109390735626, + "learning_rate": 0.0005963883343725226, + "loss": 2.7057, + "step": 14970 + }, + { + "epoch": 0.4439403374551493, + "grad_norm": 0.1400938183069229, + "learning_rate": 0.0005963421655359215, + "loss": 2.6298, + "step": 14971 + }, + { + "epoch": 0.4439699908074608, + "grad_norm": 0.13545365631580353, + "learning_rate": 0.0005962959958461939, + "loss": 2.6478, + "step": 14972 + }, + { + "epoch": 0.4439996441597723, + "grad_norm": 0.1391890048980713, + "learning_rate": 0.0005962498253037485, + "loss": 2.6764, + "step": 14973 + }, + { + "epoch": 0.44402929751208375, + "grad_norm": 0.12452143430709839, + "learning_rate": 0.0005962036539089948, + "loss": 2.6733, + "step": 14974 + }, + { + "epoch": 0.44405895086439523, + "grad_norm": 0.11152797192335129, + "learning_rate": 0.0005961574816623409, + "loss": 2.6666, + "step": 14975 + }, + { + "epoch": 0.4440886042167067, + "grad_norm": 0.10515764355659485, + "learning_rate": 0.0005961113085641962, + "loss": 2.6732, + "step": 14976 + }, + { + "epoch": 0.4441182575690182, + "grad_norm": 0.1057453379034996, + "learning_rate": 0.0005960651346149692, + "loss": 2.6802, + "step": 14977 + }, + { + "epoch": 0.44414791092132966, + "grad_norm": 0.10829457640647888, + "learning_rate": 0.000596018959815069, + "loss": 2.6439, + "step": 14978 + }, + { + "epoch": 0.44417756427364113, + "grad_norm": 0.11417637765407562, + "learning_rate": 0.0005959727841649045, + "loss": 2.7175, + "step": 14979 + }, + { + "epoch": 0.4442072176259526, + "grad_norm": 0.10503184050321579, + "learning_rate": 0.0005959266076648845, + "loss": 2.6243, + "step": 14980 + }, + { + "epoch": 0.4442368709782641, + "grad_norm": 0.12976132333278656, + "learning_rate": 0.000595880430315418, + "loss": 2.6832, + "step": 14981 + }, + { + "epoch": 0.44426652433057556, + "grad_norm": 0.13824933767318726, + "learning_rate": 0.0005958342521169137, + "loss": 2.6968, + "step": 14982 + }, + { + "epoch": 0.44429617768288704, + "grad_norm": 0.13382230699062347, + "learning_rate": 0.0005957880730697807, + "loss": 2.6641, + "step": 14983 + }, + { + "epoch": 0.4443258310351985, + "grad_norm": 0.11985892802476883, + "learning_rate": 0.0005957418931744279, + "loss": 2.6506, + "step": 14984 + }, + { + "epoch": 0.44435548438751, + "grad_norm": 0.12293677031993866, + "learning_rate": 0.0005956957124312642, + "loss": 2.6518, + "step": 14985 + }, + { + "epoch": 0.44438513773982147, + "grad_norm": 0.11303167045116425, + "learning_rate": 0.0005956495308406984, + "loss": 2.6636, + "step": 14986 + }, + { + "epoch": 0.44441479109213294, + "grad_norm": 0.11826709657907486, + "learning_rate": 0.0005956033484031396, + "loss": 2.6553, + "step": 14987 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.1216888576745987, + "learning_rate": 0.0005955571651189968, + "loss": 2.6903, + "step": 14988 + }, + { + "epoch": 0.4444740977967559, + "grad_norm": 0.14226458966732025, + "learning_rate": 0.0005955109809886789, + "loss": 2.6562, + "step": 14989 + }, + { + "epoch": 0.4445037511490674, + "grad_norm": 0.15558414161205292, + "learning_rate": 0.000595464796012595, + "loss": 2.6847, + "step": 14990 + }, + { + "epoch": 0.4445334045013789, + "grad_norm": 0.13714446127414703, + "learning_rate": 0.0005954186101911538, + "loss": 2.6786, + "step": 14991 + }, + { + "epoch": 0.4445630578536904, + "grad_norm": 0.11908582597970963, + "learning_rate": 0.0005953724235247645, + "loss": 2.648, + "step": 14992 + }, + { + "epoch": 0.44459271120600186, + "grad_norm": 0.14229902625083923, + "learning_rate": 0.0005953262360138359, + "loss": 2.6744, + "step": 14993 + }, + { + "epoch": 0.44462236455831333, + "grad_norm": 0.15987545251846313, + "learning_rate": 0.0005952800476587772, + "loss": 2.6583, + "step": 14994 + }, + { + "epoch": 0.4446520179106248, + "grad_norm": 0.13727794587612152, + "learning_rate": 0.0005952338584599973, + "loss": 2.6564, + "step": 14995 + }, + { + "epoch": 0.4446816712629363, + "grad_norm": 0.1341984122991562, + "learning_rate": 0.0005951876684179054, + "loss": 2.6897, + "step": 14996 + }, + { + "epoch": 0.44471132461524776, + "grad_norm": 0.13759870827198029, + "learning_rate": 0.0005951414775329102, + "loss": 2.643, + "step": 14997 + }, + { + "epoch": 0.44474097796755924, + "grad_norm": 0.13557279109954834, + "learning_rate": 0.0005950952858054209, + "loss": 2.6435, + "step": 14998 + }, + { + "epoch": 0.4447706313198707, + "grad_norm": 0.12225338816642761, + "learning_rate": 0.0005950490932358464, + "loss": 2.6598, + "step": 14999 + }, + { + "epoch": 0.4448002846721822, + "grad_norm": 0.1289537250995636, + "learning_rate": 0.0005950028998245961, + "loss": 2.6507, + "step": 15000 + }, + { + "epoch": 0.44482993802449367, + "grad_norm": 0.11749377846717834, + "learning_rate": 0.0005949567055720788, + "loss": 2.697, + "step": 15001 + }, + { + "epoch": 0.44485959137680514, + "grad_norm": 0.1247691884636879, + "learning_rate": 0.0005949105104787035, + "loss": 2.6338, + "step": 15002 + }, + { + "epoch": 0.4448892447291166, + "grad_norm": 0.11096743494272232, + "learning_rate": 0.0005948643145448794, + "loss": 2.6739, + "step": 15003 + }, + { + "epoch": 0.4449188980814281, + "grad_norm": 0.12475810945034027, + "learning_rate": 0.0005948181177710154, + "loss": 2.6766, + "step": 15004 + }, + { + "epoch": 0.44494855143373957, + "grad_norm": 0.13319146633148193, + "learning_rate": 0.0005947719201575206, + "loss": 2.6551, + "step": 15005 + }, + { + "epoch": 0.44497820478605105, + "grad_norm": 0.12208931148052216, + "learning_rate": 0.0005947257217048044, + "loss": 2.6778, + "step": 15006 + }, + { + "epoch": 0.4450078581383625, + "grad_norm": 0.12620419263839722, + "learning_rate": 0.0005946795224132755, + "loss": 2.6671, + "step": 15007 + }, + { + "epoch": 0.445037511490674, + "grad_norm": 0.1189192533493042, + "learning_rate": 0.0005946333222833433, + "loss": 2.6482, + "step": 15008 + }, + { + "epoch": 0.4450671648429855, + "grad_norm": 0.13004519045352936, + "learning_rate": 0.0005945871213154167, + "loss": 2.6914, + "step": 15009 + }, + { + "epoch": 0.445096818195297, + "grad_norm": 0.1485036462545395, + "learning_rate": 0.0005945409195099049, + "loss": 2.6873, + "step": 15010 + }, + { + "epoch": 0.4451264715476085, + "grad_norm": 0.13899531960487366, + "learning_rate": 0.0005944947168672172, + "loss": 2.6545, + "step": 15011 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 0.13031135499477386, + "learning_rate": 0.0005944485133877624, + "loss": 2.6989, + "step": 15012 + }, + { + "epoch": 0.44518577825223143, + "grad_norm": 0.11902043223381042, + "learning_rate": 0.0005944023090719498, + "loss": 2.7131, + "step": 15013 + }, + { + "epoch": 0.4452154316045429, + "grad_norm": 0.1254773586988449, + "learning_rate": 0.0005943561039201885, + "loss": 2.6991, + "step": 15014 + }, + { + "epoch": 0.4452450849568544, + "grad_norm": 0.1226542741060257, + "learning_rate": 0.0005943098979328878, + "loss": 2.6699, + "step": 15015 + }, + { + "epoch": 0.44527473830916586, + "grad_norm": 0.12489484250545502, + "learning_rate": 0.0005942636911104569, + "loss": 2.689, + "step": 15016 + }, + { + "epoch": 0.44530439166147734, + "grad_norm": 0.14455777406692505, + "learning_rate": 0.0005942174834533046, + "loss": 2.6856, + "step": 15017 + }, + { + "epoch": 0.4453340450137888, + "grad_norm": 0.16086073219776154, + "learning_rate": 0.0005941712749618404, + "loss": 2.6678, + "step": 15018 + }, + { + "epoch": 0.4453636983661003, + "grad_norm": 0.14563150703907013, + "learning_rate": 0.0005941250656364734, + "loss": 2.6601, + "step": 15019 + }, + { + "epoch": 0.44539335171841177, + "grad_norm": 0.14899665117263794, + "learning_rate": 0.0005940788554776128, + "loss": 2.6701, + "step": 15020 + }, + { + "epoch": 0.44542300507072324, + "grad_norm": 0.12938657402992249, + "learning_rate": 0.0005940326444856677, + "loss": 2.6789, + "step": 15021 + }, + { + "epoch": 0.4454526584230347, + "grad_norm": 0.13790076971054077, + "learning_rate": 0.0005939864326610475, + "loss": 2.6829, + "step": 15022 + }, + { + "epoch": 0.4454823117753462, + "grad_norm": 0.12157125025987625, + "learning_rate": 0.0005939402200041614, + "loss": 2.6631, + "step": 15023 + }, + { + "epoch": 0.44551196512765767, + "grad_norm": 0.10488878935575485, + "learning_rate": 0.0005938940065154185, + "loss": 2.6363, + "step": 15024 + }, + { + "epoch": 0.44554161847996915, + "grad_norm": 0.1294141262769699, + "learning_rate": 0.0005938477921952278, + "loss": 2.6568, + "step": 15025 + }, + { + "epoch": 0.4455712718322806, + "grad_norm": 0.1261868178844452, + "learning_rate": 0.000593801577043999, + "loss": 2.6677, + "step": 15026 + }, + { + "epoch": 0.4456009251845921, + "grad_norm": 0.11097350716590881, + "learning_rate": 0.000593755361062141, + "loss": 2.6736, + "step": 15027 + }, + { + "epoch": 0.4456305785369036, + "grad_norm": 0.10486616939306259, + "learning_rate": 0.0005937091442500633, + "loss": 2.6829, + "step": 15028 + }, + { + "epoch": 0.44566023188921505, + "grad_norm": 0.11527606844902039, + "learning_rate": 0.0005936629266081751, + "loss": 2.6413, + "step": 15029 + }, + { + "epoch": 0.44568988524152653, + "grad_norm": 0.12270848453044891, + "learning_rate": 0.0005936167081368855, + "loss": 2.6566, + "step": 15030 + }, + { + "epoch": 0.44571953859383806, + "grad_norm": 0.11546114832162857, + "learning_rate": 0.0005935704888366038, + "loss": 2.6462, + "step": 15031 + }, + { + "epoch": 0.44574919194614954, + "grad_norm": 0.1306978166103363, + "learning_rate": 0.0005935242687077394, + "loss": 2.6441, + "step": 15032 + }, + { + "epoch": 0.445778845298461, + "grad_norm": 0.12223246693611145, + "learning_rate": 0.0005934780477507017, + "loss": 2.6653, + "step": 15033 + }, + { + "epoch": 0.4458084986507725, + "grad_norm": 0.1223960816860199, + "learning_rate": 0.0005934318259658998, + "loss": 2.6973, + "step": 15034 + }, + { + "epoch": 0.44583815200308397, + "grad_norm": 0.13746289908885956, + "learning_rate": 0.000593385603353743, + "loss": 2.6583, + "step": 15035 + }, + { + "epoch": 0.44586780535539544, + "grad_norm": 0.1288088709115982, + "learning_rate": 0.0005933393799146407, + "loss": 2.6764, + "step": 15036 + }, + { + "epoch": 0.4458974587077069, + "grad_norm": 0.12932820618152618, + "learning_rate": 0.0005932931556490021, + "loss": 2.6716, + "step": 15037 + }, + { + "epoch": 0.4459271120600184, + "grad_norm": 0.1309012919664383, + "learning_rate": 0.0005932469305572365, + "loss": 2.6658, + "step": 15038 + }, + { + "epoch": 0.44595676541232987, + "grad_norm": 0.12828555703163147, + "learning_rate": 0.0005932007046397536, + "loss": 2.6869, + "step": 15039 + }, + { + "epoch": 0.44598641876464135, + "grad_norm": 0.11597258597612381, + "learning_rate": 0.0005931544778969622, + "loss": 2.6734, + "step": 15040 + }, + { + "epoch": 0.4460160721169528, + "grad_norm": 0.10947301238775253, + "learning_rate": 0.0005931082503292719, + "loss": 2.6178, + "step": 15041 + }, + { + "epoch": 0.4460457254692643, + "grad_norm": 0.1200244277715683, + "learning_rate": 0.0005930620219370922, + "loss": 2.6397, + "step": 15042 + }, + { + "epoch": 0.4460753788215758, + "grad_norm": 0.12056247889995575, + "learning_rate": 0.0005930157927208323, + "loss": 2.6899, + "step": 15043 + }, + { + "epoch": 0.44610503217388725, + "grad_norm": 0.14521270990371704, + "learning_rate": 0.0005929695626809016, + "loss": 2.6557, + "step": 15044 + }, + { + "epoch": 0.4461346855261987, + "grad_norm": 0.14607742428779602, + "learning_rate": 0.0005929233318177095, + "loss": 2.6974, + "step": 15045 + }, + { + "epoch": 0.4461643388785102, + "grad_norm": 0.1350240558385849, + "learning_rate": 0.0005928771001316653, + "loss": 2.677, + "step": 15046 + }, + { + "epoch": 0.4461939922308217, + "grad_norm": 0.11351759731769562, + "learning_rate": 0.0005928308676231784, + "loss": 2.6559, + "step": 15047 + }, + { + "epoch": 0.44622364558313315, + "grad_norm": 0.1198376938700676, + "learning_rate": 0.0005927846342926582, + "loss": 2.6931, + "step": 15048 + }, + { + "epoch": 0.44625329893544463, + "grad_norm": 0.1437164843082428, + "learning_rate": 0.0005927384001405141, + "loss": 2.6644, + "step": 15049 + }, + { + "epoch": 0.4462829522877561, + "grad_norm": 0.15690988302230835, + "learning_rate": 0.0005926921651671557, + "loss": 2.6736, + "step": 15050 + }, + { + "epoch": 0.4463126056400676, + "grad_norm": 0.12884803116321564, + "learning_rate": 0.0005926459293729922, + "loss": 2.6867, + "step": 15051 + }, + { + "epoch": 0.4463422589923791, + "grad_norm": 0.1231573224067688, + "learning_rate": 0.0005925996927584332, + "loss": 2.6668, + "step": 15052 + }, + { + "epoch": 0.4463719123446906, + "grad_norm": 0.13496504724025726, + "learning_rate": 0.000592553455323888, + "loss": 2.6479, + "step": 15053 + }, + { + "epoch": 0.44640156569700207, + "grad_norm": 0.1336270272731781, + "learning_rate": 0.0005925072170697658, + "loss": 2.6828, + "step": 15054 + }, + { + "epoch": 0.44643121904931354, + "grad_norm": 0.12023858726024628, + "learning_rate": 0.0005924609779964766, + "loss": 2.6724, + "step": 15055 + }, + { + "epoch": 0.446460872401625, + "grad_norm": 0.12122442573308945, + "learning_rate": 0.0005924147381044296, + "loss": 2.6657, + "step": 15056 + }, + { + "epoch": 0.4464905257539365, + "grad_norm": 0.13533559441566467, + "learning_rate": 0.000592368497394034, + "loss": 2.6947, + "step": 15057 + }, + { + "epoch": 0.44652017910624797, + "grad_norm": 0.1405678242444992, + "learning_rate": 0.0005923222558656996, + "loss": 2.6338, + "step": 15058 + }, + { + "epoch": 0.44654983245855945, + "grad_norm": 0.12990078330039978, + "learning_rate": 0.0005922760135198357, + "loss": 2.6782, + "step": 15059 + }, + { + "epoch": 0.4465794858108709, + "grad_norm": 0.1327168196439743, + "learning_rate": 0.000592229770356852, + "loss": 2.6829, + "step": 15060 + }, + { + "epoch": 0.4466091391631824, + "grad_norm": 0.13137371838092804, + "learning_rate": 0.0005921835263771578, + "loss": 2.6757, + "step": 15061 + }, + { + "epoch": 0.4466387925154939, + "grad_norm": 0.13691245019435883, + "learning_rate": 0.0005921372815811628, + "loss": 2.6754, + "step": 15062 + }, + { + "epoch": 0.44666844586780535, + "grad_norm": 0.12369835376739502, + "learning_rate": 0.000592091035969276, + "loss": 2.6608, + "step": 15063 + }, + { + "epoch": 0.44669809922011683, + "grad_norm": 0.10614247620105743, + "learning_rate": 0.0005920447895419076, + "loss": 2.6247, + "step": 15064 + }, + { + "epoch": 0.4467277525724283, + "grad_norm": 0.11697495728731155, + "learning_rate": 0.0005919985422994666, + "loss": 2.684, + "step": 15065 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 0.11979031562805176, + "learning_rate": 0.0005919522942423628, + "loss": 2.6847, + "step": 15066 + }, + { + "epoch": 0.44678705927705126, + "grad_norm": 0.11506122350692749, + "learning_rate": 0.0005919060453710057, + "loss": 2.6653, + "step": 15067 + }, + { + "epoch": 0.44681671262936273, + "grad_norm": 0.10827381163835526, + "learning_rate": 0.0005918597956858047, + "loss": 2.6878, + "step": 15068 + }, + { + "epoch": 0.4468463659816742, + "grad_norm": 0.11406557261943817, + "learning_rate": 0.0005918135451871696, + "loss": 2.6825, + "step": 15069 + }, + { + "epoch": 0.4468760193339857, + "grad_norm": 0.11836466938257217, + "learning_rate": 0.0005917672938755094, + "loss": 2.6681, + "step": 15070 + }, + { + "epoch": 0.44690567268629716, + "grad_norm": 0.12369491159915924, + "learning_rate": 0.0005917210417512344, + "loss": 2.677, + "step": 15071 + }, + { + "epoch": 0.44693532603860864, + "grad_norm": 0.11246901750564575, + "learning_rate": 0.0005916747888147539, + "loss": 2.6723, + "step": 15072 + }, + { + "epoch": 0.44696497939092017, + "grad_norm": 0.10911106318235397, + "learning_rate": 0.0005916285350664772, + "loss": 2.6782, + "step": 15073 + }, + { + "epoch": 0.44699463274323165, + "grad_norm": 0.1376197338104248, + "learning_rate": 0.0005915822805068142, + "loss": 2.7132, + "step": 15074 + }, + { + "epoch": 0.4470242860955431, + "grad_norm": 0.1200866624712944, + "learning_rate": 0.0005915360251361743, + "loss": 2.6654, + "step": 15075 + }, + { + "epoch": 0.4470539394478546, + "grad_norm": 0.11702053248882294, + "learning_rate": 0.0005914897689549672, + "loss": 2.6343, + "step": 15076 + }, + { + "epoch": 0.4470835928001661, + "grad_norm": 0.11477065831422806, + "learning_rate": 0.0005914435119636026, + "loss": 2.6571, + "step": 15077 + }, + { + "epoch": 0.44711324615247755, + "grad_norm": 0.11188861727714539, + "learning_rate": 0.0005913972541624899, + "loss": 2.6778, + "step": 15078 + }, + { + "epoch": 0.447142899504789, + "grad_norm": 0.11092177778482437, + "learning_rate": 0.0005913509955520388, + "loss": 2.6524, + "step": 15079 + }, + { + "epoch": 0.4471725528571005, + "grad_norm": 0.11090864986181259, + "learning_rate": 0.0005913047361326591, + "loss": 2.6483, + "step": 15080 + }, + { + "epoch": 0.447202206209412, + "grad_norm": 0.12044515460729599, + "learning_rate": 0.0005912584759047603, + "loss": 2.6394, + "step": 15081 + }, + { + "epoch": 0.44723185956172345, + "grad_norm": 0.1253913938999176, + "learning_rate": 0.0005912122148687518, + "loss": 2.6711, + "step": 15082 + }, + { + "epoch": 0.44726151291403493, + "grad_norm": 0.14575262367725372, + "learning_rate": 0.0005911659530250436, + "loss": 2.6933, + "step": 15083 + }, + { + "epoch": 0.4472911662663464, + "grad_norm": 0.16734077036380768, + "learning_rate": 0.0005911196903740453, + "loss": 2.6564, + "step": 15084 + }, + { + "epoch": 0.4473208196186579, + "grad_norm": 0.16030940413475037, + "learning_rate": 0.0005910734269161664, + "loss": 2.6522, + "step": 15085 + }, + { + "epoch": 0.44735047297096936, + "grad_norm": 0.12440312653779984, + "learning_rate": 0.0005910271626518168, + "loss": 2.6433, + "step": 15086 + }, + { + "epoch": 0.44738012632328084, + "grad_norm": 0.13901589810848236, + "learning_rate": 0.000590980897581406, + "loss": 2.6991, + "step": 15087 + }, + { + "epoch": 0.4474097796755923, + "grad_norm": 0.17468445003032684, + "learning_rate": 0.0005909346317053436, + "loss": 2.6845, + "step": 15088 + }, + { + "epoch": 0.4474394330279038, + "grad_norm": 0.13576196134090424, + "learning_rate": 0.0005908883650240396, + "loss": 2.6823, + "step": 15089 + }, + { + "epoch": 0.44746908638021526, + "grad_norm": 0.12653489410877228, + "learning_rate": 0.0005908420975379034, + "loss": 2.655, + "step": 15090 + }, + { + "epoch": 0.44749873973252674, + "grad_norm": 0.1394321769475937, + "learning_rate": 0.000590795829247345, + "loss": 2.6925, + "step": 15091 + }, + { + "epoch": 0.4475283930848382, + "grad_norm": 0.14472785592079163, + "learning_rate": 0.0005907495601527738, + "loss": 2.6622, + "step": 15092 + }, + { + "epoch": 0.4475580464371497, + "grad_norm": 0.15331265330314636, + "learning_rate": 0.0005907032902545997, + "loss": 2.6543, + "step": 15093 + }, + { + "epoch": 0.4475876997894612, + "grad_norm": 0.1584489345550537, + "learning_rate": 0.0005906570195532325, + "loss": 2.6321, + "step": 15094 + }, + { + "epoch": 0.4476173531417727, + "grad_norm": 0.14808732271194458, + "learning_rate": 0.0005906107480490818, + "loss": 2.6979, + "step": 15095 + }, + { + "epoch": 0.4476470064940842, + "grad_norm": 0.1550130546092987, + "learning_rate": 0.0005905644757425571, + "loss": 2.6998, + "step": 15096 + }, + { + "epoch": 0.44767665984639565, + "grad_norm": 0.14748676121234894, + "learning_rate": 0.0005905182026340688, + "loss": 2.661, + "step": 15097 + }, + { + "epoch": 0.44770631319870713, + "grad_norm": 0.15233255922794342, + "learning_rate": 0.000590471928724026, + "loss": 2.6925, + "step": 15098 + }, + { + "epoch": 0.4477359665510186, + "grad_norm": 0.16505631804466248, + "learning_rate": 0.0005904256540128389, + "loss": 2.6743, + "step": 15099 + }, + { + "epoch": 0.4477656199033301, + "grad_norm": 0.13942620158195496, + "learning_rate": 0.0005903793785009172, + "loss": 2.6615, + "step": 15100 + }, + { + "epoch": 0.44779527325564156, + "grad_norm": 0.1511731594800949, + "learning_rate": 0.0005903331021886705, + "loss": 2.6865, + "step": 15101 + }, + { + "epoch": 0.44782492660795303, + "grad_norm": 0.1456831693649292, + "learning_rate": 0.0005902868250765084, + "loss": 2.6746, + "step": 15102 + }, + { + "epoch": 0.4478545799602645, + "grad_norm": 0.11789561063051224, + "learning_rate": 0.0005902405471648412, + "loss": 2.6487, + "step": 15103 + }, + { + "epoch": 0.447884233312576, + "grad_norm": 0.13095338642597198, + "learning_rate": 0.0005901942684540785, + "loss": 2.6596, + "step": 15104 + }, + { + "epoch": 0.44791388666488746, + "grad_norm": 0.14910322427749634, + "learning_rate": 0.0005901479889446301, + "loss": 2.6589, + "step": 15105 + }, + { + "epoch": 0.44794354001719894, + "grad_norm": 0.12870877981185913, + "learning_rate": 0.0005901017086369057, + "loss": 2.7117, + "step": 15106 + }, + { + "epoch": 0.4479731933695104, + "grad_norm": 0.11877895891666412, + "learning_rate": 0.0005900554275313153, + "loss": 2.662, + "step": 15107 + }, + { + "epoch": 0.4480028467218219, + "grad_norm": 0.1030176505446434, + "learning_rate": 0.0005900091456282685, + "loss": 2.6804, + "step": 15108 + }, + { + "epoch": 0.44803250007413337, + "grad_norm": 0.12676729261875153, + "learning_rate": 0.0005899628629281753, + "loss": 2.6923, + "step": 15109 + }, + { + "epoch": 0.44806215342644484, + "grad_norm": 0.14796562492847443, + "learning_rate": 0.0005899165794314456, + "loss": 2.663, + "step": 15110 + }, + { + "epoch": 0.4480918067787563, + "grad_norm": 0.1303630769252777, + "learning_rate": 0.000589870295138489, + "loss": 2.6872, + "step": 15111 + }, + { + "epoch": 0.4481214601310678, + "grad_norm": 0.11258652061223984, + "learning_rate": 0.0005898240100497157, + "loss": 2.6737, + "step": 15112 + }, + { + "epoch": 0.44815111348337927, + "grad_norm": 0.1347031146287918, + "learning_rate": 0.0005897777241655353, + "loss": 2.6662, + "step": 15113 + }, + { + "epoch": 0.4481807668356908, + "grad_norm": 0.1275196373462677, + "learning_rate": 0.0005897314374863576, + "loss": 2.6551, + "step": 15114 + }, + { + "epoch": 0.4482104201880023, + "grad_norm": 0.11468292772769928, + "learning_rate": 0.0005896851500125927, + "loss": 2.6588, + "step": 15115 + }, + { + "epoch": 0.44824007354031375, + "grad_norm": 0.11295624077320099, + "learning_rate": 0.0005896388617446504, + "loss": 2.6544, + "step": 15116 + }, + { + "epoch": 0.44826972689262523, + "grad_norm": 0.10977679491043091, + "learning_rate": 0.0005895925726829407, + "loss": 2.7119, + "step": 15117 + }, + { + "epoch": 0.4482993802449367, + "grad_norm": 0.12335607409477234, + "learning_rate": 0.0005895462828278732, + "loss": 2.677, + "step": 15118 + }, + { + "epoch": 0.4483290335972482, + "grad_norm": 0.1235462874174118, + "learning_rate": 0.0005894999921798582, + "loss": 2.667, + "step": 15119 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 0.13101759552955627, + "learning_rate": 0.0005894537007393052, + "loss": 2.6638, + "step": 15120 + }, + { + "epoch": 0.44838834030187114, + "grad_norm": 0.12315088510513306, + "learning_rate": 0.0005894074085066246, + "loss": 2.6569, + "step": 15121 + }, + { + "epoch": 0.4484179936541826, + "grad_norm": 0.11844909191131592, + "learning_rate": 0.0005893611154822258, + "loss": 2.6418, + "step": 15122 + }, + { + "epoch": 0.4484476470064941, + "grad_norm": 0.11370069533586502, + "learning_rate": 0.0005893148216665191, + "loss": 2.6582, + "step": 15123 + }, + { + "epoch": 0.44847730035880556, + "grad_norm": 0.12841400504112244, + "learning_rate": 0.0005892685270599143, + "loss": 2.658, + "step": 15124 + }, + { + "epoch": 0.44850695371111704, + "grad_norm": 0.1290620118379593, + "learning_rate": 0.0005892222316628214, + "loss": 2.6782, + "step": 15125 + }, + { + "epoch": 0.4485366070634285, + "grad_norm": 0.1154896467924118, + "learning_rate": 0.0005891759354756503, + "loss": 2.6856, + "step": 15126 + }, + { + "epoch": 0.44856626041574, + "grad_norm": 0.11281872540712357, + "learning_rate": 0.0005891296384988111, + "loss": 2.6842, + "step": 15127 + }, + { + "epoch": 0.44859591376805147, + "grad_norm": 0.12061749398708344, + "learning_rate": 0.0005890833407327134, + "loss": 2.6359, + "step": 15128 + }, + { + "epoch": 0.44862556712036294, + "grad_norm": 0.13637852668762207, + "learning_rate": 0.0005890370421777675, + "loss": 2.6721, + "step": 15129 + }, + { + "epoch": 0.4486552204726744, + "grad_norm": 0.12465671449899673, + "learning_rate": 0.0005889907428343834, + "loss": 2.6971, + "step": 15130 + }, + { + "epoch": 0.4486848738249859, + "grad_norm": 0.11274828016757965, + "learning_rate": 0.000588944442702971, + "loss": 2.6416, + "step": 15131 + }, + { + "epoch": 0.4487145271772974, + "grad_norm": 0.11104826629161835, + "learning_rate": 0.0005888981417839403, + "loss": 2.6458, + "step": 15132 + }, + { + "epoch": 0.44874418052960885, + "grad_norm": 0.12329072505235672, + "learning_rate": 0.0005888518400777012, + "loss": 2.6694, + "step": 15133 + }, + { + "epoch": 0.4487738338819203, + "grad_norm": 0.1284220963716507, + "learning_rate": 0.0005888055375846639, + "loss": 2.6814, + "step": 15134 + }, + { + "epoch": 0.44880348723423186, + "grad_norm": 0.12522205710411072, + "learning_rate": 0.0005887592343052382, + "loss": 2.635, + "step": 15135 + }, + { + "epoch": 0.44883314058654333, + "grad_norm": 0.11982559412717819, + "learning_rate": 0.0005887129302398343, + "loss": 2.6861, + "step": 15136 + }, + { + "epoch": 0.4488627939388548, + "grad_norm": 0.10918652266263962, + "learning_rate": 0.0005886666253888622, + "loss": 2.6876, + "step": 15137 + }, + { + "epoch": 0.4488924472911663, + "grad_norm": 0.11348672956228256, + "learning_rate": 0.000588620319752732, + "loss": 2.6912, + "step": 15138 + }, + { + "epoch": 0.44892210064347776, + "grad_norm": 0.13005588948726654, + "learning_rate": 0.0005885740133318535, + "loss": 2.654, + "step": 15139 + }, + { + "epoch": 0.44895175399578924, + "grad_norm": 0.12037120014429092, + "learning_rate": 0.000588527706126637, + "loss": 2.6681, + "step": 15140 + }, + { + "epoch": 0.4489814073481007, + "grad_norm": 0.12516571581363678, + "learning_rate": 0.0005884813981374923, + "loss": 2.6696, + "step": 15141 + }, + { + "epoch": 0.4490110607004122, + "grad_norm": 0.12271692603826523, + "learning_rate": 0.0005884350893648298, + "loss": 2.6554, + "step": 15142 + }, + { + "epoch": 0.44904071405272367, + "grad_norm": 0.12286267429590225, + "learning_rate": 0.0005883887798090595, + "loss": 2.6478, + "step": 15143 + }, + { + "epoch": 0.44907036740503514, + "grad_norm": 0.1270977407693863, + "learning_rate": 0.0005883424694705913, + "loss": 2.6739, + "step": 15144 + }, + { + "epoch": 0.4491000207573466, + "grad_norm": 0.12614306807518005, + "learning_rate": 0.0005882961583498353, + "loss": 2.6865, + "step": 15145 + }, + { + "epoch": 0.4491296741096581, + "grad_norm": 0.13471414148807526, + "learning_rate": 0.0005882498464472017, + "loss": 2.6691, + "step": 15146 + }, + { + "epoch": 0.44915932746196957, + "grad_norm": 0.1580284982919693, + "learning_rate": 0.0005882035337631004, + "loss": 2.6525, + "step": 15147 + }, + { + "epoch": 0.44918898081428105, + "grad_norm": 0.12815989553928375, + "learning_rate": 0.000588157220297942, + "loss": 2.6621, + "step": 15148 + }, + { + "epoch": 0.4492186341665925, + "grad_norm": 0.10664919763803482, + "learning_rate": 0.0005881109060521362, + "loss": 2.6594, + "step": 15149 + }, + { + "epoch": 0.449248287518904, + "grad_norm": 0.14668044447898865, + "learning_rate": 0.000588064591026093, + "loss": 2.68, + "step": 15150 + }, + { + "epoch": 0.4492779408712155, + "grad_norm": 0.1403261423110962, + "learning_rate": 0.0005880182752202227, + "loss": 2.6781, + "step": 15151 + }, + { + "epoch": 0.44930759422352695, + "grad_norm": 0.1618805080652237, + "learning_rate": 0.0005879719586349357, + "loss": 2.7027, + "step": 15152 + }, + { + "epoch": 0.4493372475758384, + "grad_norm": 0.15668737888336182, + "learning_rate": 0.0005879256412706418, + "loss": 2.6549, + "step": 15153 + }, + { + "epoch": 0.4493669009281499, + "grad_norm": 0.14218053221702576, + "learning_rate": 0.0005878793231277513, + "loss": 2.6823, + "step": 15154 + }, + { + "epoch": 0.4493965542804614, + "grad_norm": 0.14309024810791016, + "learning_rate": 0.0005878330042066742, + "loss": 2.663, + "step": 15155 + }, + { + "epoch": 0.4494262076327729, + "grad_norm": 0.13482502102851868, + "learning_rate": 0.000587786684507821, + "loss": 2.6741, + "step": 15156 + }, + { + "epoch": 0.4494558609850844, + "grad_norm": 0.1376219391822815, + "learning_rate": 0.0005877403640316013, + "loss": 2.6713, + "step": 15157 + }, + { + "epoch": 0.44948551433739586, + "grad_norm": 0.1496315598487854, + "learning_rate": 0.0005876940427784259, + "loss": 2.6761, + "step": 15158 + }, + { + "epoch": 0.44951516768970734, + "grad_norm": 0.14185482263565063, + "learning_rate": 0.0005876477207487045, + "loss": 2.6593, + "step": 15159 + }, + { + "epoch": 0.4495448210420188, + "grad_norm": 0.12469490617513657, + "learning_rate": 0.0005876013979428476, + "loss": 2.6641, + "step": 15160 + }, + { + "epoch": 0.4495744743943303, + "grad_norm": 0.11596210300922394, + "learning_rate": 0.0005875550743612653, + "loss": 2.6891, + "step": 15161 + }, + { + "epoch": 0.44960412774664177, + "grad_norm": 0.14370709657669067, + "learning_rate": 0.0005875087500043678, + "loss": 2.6768, + "step": 15162 + }, + { + "epoch": 0.44963378109895324, + "grad_norm": 0.1393098086118698, + "learning_rate": 0.0005874624248725653, + "loss": 2.6398, + "step": 15163 + }, + { + "epoch": 0.4496634344512647, + "grad_norm": 0.10785448551177979, + "learning_rate": 0.0005874160989662679, + "loss": 2.6776, + "step": 15164 + }, + { + "epoch": 0.4496930878035762, + "grad_norm": 0.15659146010875702, + "learning_rate": 0.0005873697722858862, + "loss": 2.6942, + "step": 15165 + }, + { + "epoch": 0.4497227411558877, + "grad_norm": 0.15132461488246918, + "learning_rate": 0.0005873234448318298, + "loss": 2.6732, + "step": 15166 + }, + { + "epoch": 0.44975239450819915, + "grad_norm": 0.1293964684009552, + "learning_rate": 0.0005872771166045094, + "loss": 2.6872, + "step": 15167 + }, + { + "epoch": 0.4497820478605106, + "grad_norm": 0.1615089327096939, + "learning_rate": 0.0005872307876043354, + "loss": 2.6883, + "step": 15168 + }, + { + "epoch": 0.4498117012128221, + "grad_norm": 0.17611686885356903, + "learning_rate": 0.0005871844578317176, + "loss": 2.6609, + "step": 15169 + }, + { + "epoch": 0.4498413545651336, + "grad_norm": 0.15474551916122437, + "learning_rate": 0.0005871381272870666, + "loss": 2.6674, + "step": 15170 + }, + { + "epoch": 0.44987100791744505, + "grad_norm": 0.11384592205286026, + "learning_rate": 0.0005870917959707924, + "loss": 2.6432, + "step": 15171 + }, + { + "epoch": 0.44990066126975653, + "grad_norm": 0.1361897736787796, + "learning_rate": 0.0005870454638833054, + "loss": 2.6642, + "step": 15172 + }, + { + "epoch": 0.449930314622068, + "grad_norm": 0.13328050076961517, + "learning_rate": 0.0005869991310250158, + "loss": 2.6543, + "step": 15173 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 0.12033941596746445, + "learning_rate": 0.0005869527973963341, + "loss": 2.6594, + "step": 15174 + }, + { + "epoch": 0.44998962132669096, + "grad_norm": 0.10825397819280624, + "learning_rate": 0.0005869064629976704, + "loss": 2.6601, + "step": 15175 + }, + { + "epoch": 0.45001927467900243, + "grad_norm": 0.1190931499004364, + "learning_rate": 0.0005868601278294352, + "loss": 2.667, + "step": 15176 + }, + { + "epoch": 0.45004892803131397, + "grad_norm": 0.12995760142803192, + "learning_rate": 0.0005868137918920384, + "loss": 2.6558, + "step": 15177 + }, + { + "epoch": 0.45007858138362544, + "grad_norm": 0.11717028170824051, + "learning_rate": 0.0005867674551858908, + "loss": 2.6642, + "step": 15178 + }, + { + "epoch": 0.4501082347359369, + "grad_norm": 0.11410915106534958, + "learning_rate": 0.0005867211177114024, + "loss": 2.6703, + "step": 15179 + }, + { + "epoch": 0.4501378880882484, + "grad_norm": 0.11128576844930649, + "learning_rate": 0.0005866747794689835, + "loss": 2.6662, + "step": 15180 + }, + { + "epoch": 0.45016754144055987, + "grad_norm": 0.10925541073083878, + "learning_rate": 0.0005866284404590447, + "loss": 2.6956, + "step": 15181 + }, + { + "epoch": 0.45019719479287135, + "grad_norm": 0.10950488597154617, + "learning_rate": 0.0005865821006819963, + "loss": 2.7249, + "step": 15182 + }, + { + "epoch": 0.4502268481451828, + "grad_norm": 0.11103349179029465, + "learning_rate": 0.0005865357601382483, + "loss": 2.6497, + "step": 15183 + }, + { + "epoch": 0.4502565014974943, + "grad_norm": 0.12193522602319717, + "learning_rate": 0.0005864894188282114, + "loss": 2.6718, + "step": 15184 + }, + { + "epoch": 0.4502861548498058, + "grad_norm": 0.10957255214452744, + "learning_rate": 0.0005864430767522958, + "loss": 2.6719, + "step": 15185 + }, + { + "epoch": 0.45031580820211725, + "grad_norm": 0.10893306136131287, + "learning_rate": 0.000586396733910912, + "loss": 2.6573, + "step": 15186 + }, + { + "epoch": 0.4503454615544287, + "grad_norm": 0.11207780987024307, + "learning_rate": 0.0005863503903044701, + "loss": 2.6592, + "step": 15187 + }, + { + "epoch": 0.4503751149067402, + "grad_norm": 0.11393758654594421, + "learning_rate": 0.0005863040459333809, + "loss": 2.6351, + "step": 15188 + }, + { + "epoch": 0.4504047682590517, + "grad_norm": 0.10858369618654251, + "learning_rate": 0.0005862577007980544, + "loss": 2.66, + "step": 15189 + }, + { + "epoch": 0.45043442161136316, + "grad_norm": 0.11033352464437485, + "learning_rate": 0.0005862113548989012, + "loss": 2.6708, + "step": 15190 + }, + { + "epoch": 0.45046407496367463, + "grad_norm": 0.11786410212516785, + "learning_rate": 0.0005861650082363317, + "loss": 2.6652, + "step": 15191 + }, + { + "epoch": 0.4504937283159861, + "grad_norm": 0.1426234096288681, + "learning_rate": 0.0005861186608107562, + "loss": 2.705, + "step": 15192 + }, + { + "epoch": 0.4505233816682976, + "grad_norm": 0.14616921544075012, + "learning_rate": 0.0005860723126225854, + "loss": 2.6641, + "step": 15193 + }, + { + "epoch": 0.45055303502060906, + "grad_norm": 0.12680035829544067, + "learning_rate": 0.0005860259636722291, + "loss": 2.6454, + "step": 15194 + }, + { + "epoch": 0.45058268837292054, + "grad_norm": 0.11720654368400574, + "learning_rate": 0.0005859796139600984, + "loss": 2.6586, + "step": 15195 + }, + { + "epoch": 0.450612341725232, + "grad_norm": 0.1157606691122055, + "learning_rate": 0.0005859332634866034, + "loss": 2.6758, + "step": 15196 + }, + { + "epoch": 0.4506419950775435, + "grad_norm": 0.14718997478485107, + "learning_rate": 0.0005858869122521547, + "loss": 2.6295, + "step": 15197 + }, + { + "epoch": 0.450671648429855, + "grad_norm": 0.14906872808933258, + "learning_rate": 0.0005858405602571626, + "loss": 2.6613, + "step": 15198 + }, + { + "epoch": 0.4507013017821665, + "grad_norm": 0.1540510356426239, + "learning_rate": 0.0005857942075020374, + "loss": 2.6556, + "step": 15199 + }, + { + "epoch": 0.450730955134478, + "grad_norm": 0.14301414787769318, + "learning_rate": 0.0005857478539871898, + "loss": 2.6795, + "step": 15200 + }, + { + "epoch": 0.45076060848678945, + "grad_norm": 0.13061659038066864, + "learning_rate": 0.0005857014997130304, + "loss": 2.6833, + "step": 15201 + }, + { + "epoch": 0.4507902618391009, + "grad_norm": 0.15002715587615967, + "learning_rate": 0.0005856551446799695, + "loss": 2.6872, + "step": 15202 + }, + { + "epoch": 0.4508199151914124, + "grad_norm": 0.1719282567501068, + "learning_rate": 0.0005856087888884177, + "loss": 2.6527, + "step": 15203 + }, + { + "epoch": 0.4508495685437239, + "grad_norm": 0.1473095715045929, + "learning_rate": 0.0005855624323387853, + "loss": 2.6445, + "step": 15204 + }, + { + "epoch": 0.45087922189603535, + "grad_norm": 0.13245849311351776, + "learning_rate": 0.0005855160750314828, + "loss": 2.67, + "step": 15205 + }, + { + "epoch": 0.45090887524834683, + "grad_norm": 0.12326698750257492, + "learning_rate": 0.0005854697169669206, + "loss": 2.6321, + "step": 15206 + }, + { + "epoch": 0.4509385286006583, + "grad_norm": 0.10983886569738388, + "learning_rate": 0.0005854233581455096, + "loss": 2.6569, + "step": 15207 + }, + { + "epoch": 0.4509681819529698, + "grad_norm": 0.1338098794221878, + "learning_rate": 0.0005853769985676602, + "loss": 2.7099, + "step": 15208 + }, + { + "epoch": 0.45099783530528126, + "grad_norm": 0.14064481854438782, + "learning_rate": 0.0005853306382337827, + "loss": 2.6433, + "step": 15209 + }, + { + "epoch": 0.45102748865759273, + "grad_norm": 0.1063893735408783, + "learning_rate": 0.0005852842771442877, + "loss": 2.6572, + "step": 15210 + }, + { + "epoch": 0.4510571420099042, + "grad_norm": 0.12813971936702728, + "learning_rate": 0.0005852379152995859, + "loss": 2.673, + "step": 15211 + }, + { + "epoch": 0.4510867953622157, + "grad_norm": 0.12282063812017441, + "learning_rate": 0.0005851915527000875, + "loss": 2.6847, + "step": 15212 + }, + { + "epoch": 0.45111644871452716, + "grad_norm": 0.11380382627248764, + "learning_rate": 0.0005851451893462035, + "loss": 2.6826, + "step": 15213 + }, + { + "epoch": 0.45114610206683864, + "grad_norm": 0.11485323309898376, + "learning_rate": 0.0005850988252383443, + "loss": 2.6416, + "step": 15214 + }, + { + "epoch": 0.4511757554191501, + "grad_norm": 0.12384570389986038, + "learning_rate": 0.0005850524603769201, + "loss": 2.69, + "step": 15215 + }, + { + "epoch": 0.4512054087714616, + "grad_norm": 0.13225343823432922, + "learning_rate": 0.000585006094762342, + "loss": 2.6502, + "step": 15216 + }, + { + "epoch": 0.45123506212377307, + "grad_norm": 0.10716395825147629, + "learning_rate": 0.0005849597283950203, + "loss": 2.6743, + "step": 15217 + }, + { + "epoch": 0.4512647154760846, + "grad_norm": 0.12192731350660324, + "learning_rate": 0.0005849133612753656, + "loss": 2.6615, + "step": 15218 + }, + { + "epoch": 0.4512943688283961, + "grad_norm": 0.14555929601192474, + "learning_rate": 0.0005848669934037884, + "loss": 2.6944, + "step": 15219 + }, + { + "epoch": 0.45132402218070755, + "grad_norm": 0.14241856336593628, + "learning_rate": 0.0005848206247806996, + "loss": 2.666, + "step": 15220 + }, + { + "epoch": 0.451353675533019, + "grad_norm": 0.1206917092204094, + "learning_rate": 0.0005847742554065096, + "loss": 2.6458, + "step": 15221 + }, + { + "epoch": 0.4513833288853305, + "grad_norm": 0.11575821787118912, + "learning_rate": 0.0005847278852816289, + "loss": 2.6591, + "step": 15222 + }, + { + "epoch": 0.451412982237642, + "grad_norm": 0.11857898533344269, + "learning_rate": 0.0005846815144064682, + "loss": 2.6658, + "step": 15223 + }, + { + "epoch": 0.45144263558995346, + "grad_norm": 0.12813514471054077, + "learning_rate": 0.0005846351427814383, + "loss": 2.6665, + "step": 15224 + }, + { + "epoch": 0.45147228894226493, + "grad_norm": 0.1465074121952057, + "learning_rate": 0.0005845887704069495, + "loss": 2.6812, + "step": 15225 + }, + { + "epoch": 0.4515019422945764, + "grad_norm": 0.1757122278213501, + "learning_rate": 0.0005845423972834127, + "loss": 2.6871, + "step": 15226 + }, + { + "epoch": 0.4515315956468879, + "grad_norm": 0.15109999477863312, + "learning_rate": 0.0005844960234112385, + "loss": 2.6603, + "step": 15227 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 0.12554672360420227, + "learning_rate": 0.0005844496487908375, + "loss": 2.6588, + "step": 15228 + }, + { + "epoch": 0.45159090235151084, + "grad_norm": 0.20273014903068542, + "learning_rate": 0.0005844032734226204, + "loss": 2.6726, + "step": 15229 + }, + { + "epoch": 0.4516205557038223, + "grad_norm": 0.1226239949464798, + "learning_rate": 0.0005843568973069978, + "loss": 2.6482, + "step": 15230 + }, + { + "epoch": 0.4516502090561338, + "grad_norm": 0.123665951192379, + "learning_rate": 0.0005843105204443805, + "loss": 2.6498, + "step": 15231 + }, + { + "epoch": 0.45167986240844527, + "grad_norm": 0.12034833431243896, + "learning_rate": 0.0005842641428351788, + "loss": 2.6657, + "step": 15232 + }, + { + "epoch": 0.45170951576075674, + "grad_norm": 0.12268249690532684, + "learning_rate": 0.0005842177644798038, + "loss": 2.6935, + "step": 15233 + }, + { + "epoch": 0.4517391691130682, + "grad_norm": 0.12412207573652267, + "learning_rate": 0.0005841713853786661, + "loss": 2.684, + "step": 15234 + }, + { + "epoch": 0.4517688224653797, + "grad_norm": 0.11922436952590942, + "learning_rate": 0.0005841250055321763, + "loss": 2.7124, + "step": 15235 + }, + { + "epoch": 0.45179847581769117, + "grad_norm": 0.12430645525455475, + "learning_rate": 0.0005840786249407451, + "loss": 2.6582, + "step": 15236 + }, + { + "epoch": 0.45182812917000265, + "grad_norm": 0.10363488644361496, + "learning_rate": 0.0005840322436047833, + "loss": 2.6501, + "step": 15237 + }, + { + "epoch": 0.4518577825223141, + "grad_norm": 0.11323830485343933, + "learning_rate": 0.0005839858615247015, + "loss": 2.6703, + "step": 15238 + }, + { + "epoch": 0.45188743587462565, + "grad_norm": 0.12302371114492416, + "learning_rate": 0.0005839394787009105, + "loss": 2.7087, + "step": 15239 + }, + { + "epoch": 0.45191708922693713, + "grad_norm": 0.11876451224088669, + "learning_rate": 0.0005838930951338209, + "loss": 2.644, + "step": 15240 + }, + { + "epoch": 0.4519467425792486, + "grad_norm": 0.12487340718507767, + "learning_rate": 0.0005838467108238437, + "loss": 2.6275, + "step": 15241 + }, + { + "epoch": 0.4519763959315601, + "grad_norm": 0.12467575818300247, + "learning_rate": 0.0005838003257713896, + "loss": 2.6543, + "step": 15242 + }, + { + "epoch": 0.45200604928387156, + "grad_norm": 0.12549951672554016, + "learning_rate": 0.000583753939976869, + "loss": 2.6808, + "step": 15243 + }, + { + "epoch": 0.45203570263618303, + "grad_norm": 0.10759653896093369, + "learning_rate": 0.0005837075534406928, + "loss": 2.6426, + "step": 15244 + }, + { + "epoch": 0.4520653559884945, + "grad_norm": 0.11497601866722107, + "learning_rate": 0.000583661166163272, + "loss": 2.6635, + "step": 15245 + }, + { + "epoch": 0.452095009340806, + "grad_norm": 0.1338912844657898, + "learning_rate": 0.0005836147781450173, + "loss": 2.713, + "step": 15246 + }, + { + "epoch": 0.45212466269311746, + "grad_norm": 0.11230776458978653, + "learning_rate": 0.0005835683893863393, + "loss": 2.6494, + "step": 15247 + }, + { + "epoch": 0.45215431604542894, + "grad_norm": 0.11095771938562393, + "learning_rate": 0.0005835219998876488, + "loss": 2.6622, + "step": 15248 + }, + { + "epoch": 0.4521839693977404, + "grad_norm": 0.11571202427148819, + "learning_rate": 0.0005834756096493568, + "loss": 2.6816, + "step": 15249 + }, + { + "epoch": 0.4522136227500519, + "grad_norm": 0.11528514325618744, + "learning_rate": 0.0005834292186718738, + "loss": 2.7066, + "step": 15250 + }, + { + "epoch": 0.45224327610236337, + "grad_norm": 0.12249740213155746, + "learning_rate": 0.0005833828269556108, + "loss": 2.6214, + "step": 15251 + }, + { + "epoch": 0.45227292945467484, + "grad_norm": 0.1452745646238327, + "learning_rate": 0.0005833364345009787, + "loss": 2.6575, + "step": 15252 + }, + { + "epoch": 0.4523025828069863, + "grad_norm": 0.14133359491825104, + "learning_rate": 0.0005832900413083879, + "loss": 2.7001, + "step": 15253 + }, + { + "epoch": 0.4523322361592978, + "grad_norm": 0.14120075106620789, + "learning_rate": 0.0005832436473782496, + "loss": 2.6504, + "step": 15254 + }, + { + "epoch": 0.45236188951160927, + "grad_norm": 0.14418116211891174, + "learning_rate": 0.0005831972527109746, + "loss": 2.6518, + "step": 15255 + }, + { + "epoch": 0.45239154286392075, + "grad_norm": 0.14101941883563995, + "learning_rate": 0.0005831508573069736, + "loss": 2.6599, + "step": 15256 + }, + { + "epoch": 0.4524211962162322, + "grad_norm": 0.1413506716489792, + "learning_rate": 0.0005831044611666575, + "loss": 2.6752, + "step": 15257 + }, + { + "epoch": 0.4524508495685437, + "grad_norm": 0.14615662395954132, + "learning_rate": 0.000583058064290437, + "loss": 2.6688, + "step": 15258 + }, + { + "epoch": 0.4524805029208552, + "grad_norm": 0.13848845660686493, + "learning_rate": 0.0005830116666787233, + "loss": 2.6272, + "step": 15259 + }, + { + "epoch": 0.4525101562731667, + "grad_norm": 0.10297141224145889, + "learning_rate": 0.0005829652683319268, + "loss": 2.6426, + "step": 15260 + }, + { + "epoch": 0.4525398096254782, + "grad_norm": 0.13998833298683167, + "learning_rate": 0.0005829188692504588, + "loss": 2.6799, + "step": 15261 + }, + { + "epoch": 0.45256946297778966, + "grad_norm": 0.11597628146409988, + "learning_rate": 0.0005828724694347299, + "loss": 2.6663, + "step": 15262 + }, + { + "epoch": 0.45259911633010114, + "grad_norm": 0.10903765261173248, + "learning_rate": 0.0005828260688851511, + "loss": 2.6963, + "step": 15263 + }, + { + "epoch": 0.4526287696824126, + "grad_norm": 0.1264883279800415, + "learning_rate": 0.0005827796676021331, + "loss": 2.6714, + "step": 15264 + }, + { + "epoch": 0.4526584230347241, + "grad_norm": 0.1267767697572708, + "learning_rate": 0.0005827332655860872, + "loss": 2.6564, + "step": 15265 + }, + { + "epoch": 0.45268807638703557, + "grad_norm": 0.10835981369018555, + "learning_rate": 0.0005826868628374239, + "loss": 2.6782, + "step": 15266 + }, + { + "epoch": 0.45271772973934704, + "grad_norm": 0.09980890899896622, + "learning_rate": 0.0005826404593565541, + "loss": 2.6642, + "step": 15267 + }, + { + "epoch": 0.4527473830916585, + "grad_norm": 0.11445604264736176, + "learning_rate": 0.0005825940551438891, + "loss": 2.677, + "step": 15268 + }, + { + "epoch": 0.45277703644397, + "grad_norm": 0.10409504175186157, + "learning_rate": 0.0005825476501998395, + "loss": 2.6689, + "step": 15269 + }, + { + "epoch": 0.45280668979628147, + "grad_norm": 0.11023322492837906, + "learning_rate": 0.0005825012445248161, + "loss": 2.6891, + "step": 15270 + }, + { + "epoch": 0.45283634314859295, + "grad_norm": 0.1110253632068634, + "learning_rate": 0.0005824548381192302, + "loss": 2.6943, + "step": 15271 + }, + { + "epoch": 0.4528659965009044, + "grad_norm": 0.13337571918964386, + "learning_rate": 0.0005824084309834924, + "loss": 2.6843, + "step": 15272 + }, + { + "epoch": 0.4528956498532159, + "grad_norm": 0.15462136268615723, + "learning_rate": 0.0005823620231180139, + "loss": 2.6393, + "step": 15273 + }, + { + "epoch": 0.4529253032055274, + "grad_norm": 0.14622555673122406, + "learning_rate": 0.0005823156145232057, + "loss": 2.6703, + "step": 15274 + }, + { + "epoch": 0.45295495655783885, + "grad_norm": 0.10313308984041214, + "learning_rate": 0.0005822692051994785, + "loss": 2.679, + "step": 15275 + }, + { + "epoch": 0.4529846099101503, + "grad_norm": 0.11415224522352219, + "learning_rate": 0.0005822227951472432, + "loss": 2.6512, + "step": 15276 + }, + { + "epoch": 0.4530142632624618, + "grad_norm": 0.13269680738449097, + "learning_rate": 0.000582176384366911, + "loss": 2.6573, + "step": 15277 + }, + { + "epoch": 0.4530439166147733, + "grad_norm": 0.13257905840873718, + "learning_rate": 0.0005821299728588928, + "loss": 2.6693, + "step": 15278 + }, + { + "epoch": 0.45307356996708475, + "grad_norm": 0.1287640780210495, + "learning_rate": 0.0005820835606235998, + "loss": 2.6633, + "step": 15279 + }, + { + "epoch": 0.45310322331939623, + "grad_norm": 0.13193616271018982, + "learning_rate": 0.0005820371476614425, + "loss": 2.6761, + "step": 15280 + }, + { + "epoch": 0.45313287667170776, + "grad_norm": 0.1206507608294487, + "learning_rate": 0.0005819907339728324, + "loss": 2.6586, + "step": 15281 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 0.11974325031042099, + "learning_rate": 0.0005819443195581802, + "loss": 2.6508, + "step": 15282 + }, + { + "epoch": 0.4531921833763307, + "grad_norm": 0.120740607380867, + "learning_rate": 0.000581897904417897, + "loss": 2.6636, + "step": 15283 + }, + { + "epoch": 0.4532218367286422, + "grad_norm": 0.11617391556501389, + "learning_rate": 0.0005818514885523938, + "loss": 2.6767, + "step": 15284 + }, + { + "epoch": 0.45325149008095367, + "grad_norm": 0.1072949767112732, + "learning_rate": 0.0005818050719620815, + "loss": 2.6438, + "step": 15285 + }, + { + "epoch": 0.45328114343326514, + "grad_norm": 0.11743098497390747, + "learning_rate": 0.0005817586546473713, + "loss": 2.6802, + "step": 15286 + }, + { + "epoch": 0.4533107967855766, + "grad_norm": 0.11851909756660461, + "learning_rate": 0.0005817122366086742, + "loss": 2.7098, + "step": 15287 + }, + { + "epoch": 0.4533404501378881, + "grad_norm": 0.11423048377037048, + "learning_rate": 0.0005816658178464013, + "loss": 2.6611, + "step": 15288 + }, + { + "epoch": 0.45337010349019957, + "grad_norm": 0.12674148380756378, + "learning_rate": 0.0005816193983609636, + "loss": 2.7229, + "step": 15289 + }, + { + "epoch": 0.45339975684251105, + "grad_norm": 0.10605982691049576, + "learning_rate": 0.0005815729781527719, + "loss": 2.6904, + "step": 15290 + }, + { + "epoch": 0.4534294101948225, + "grad_norm": 0.1286131888628006, + "learning_rate": 0.0005815265572222376, + "loss": 2.6546, + "step": 15291 + }, + { + "epoch": 0.453459063547134, + "grad_norm": 0.13509832322597504, + "learning_rate": 0.0005814801355697717, + "loss": 2.6544, + "step": 15292 + }, + { + "epoch": 0.4534887168994455, + "grad_norm": 0.12219853699207306, + "learning_rate": 0.0005814337131957851, + "loss": 2.6513, + "step": 15293 + }, + { + "epoch": 0.45351837025175695, + "grad_norm": 0.11933402717113495, + "learning_rate": 0.0005813872901006891, + "loss": 2.6542, + "step": 15294 + }, + { + "epoch": 0.45354802360406843, + "grad_norm": 0.11489318311214447, + "learning_rate": 0.0005813408662848946, + "loss": 2.665, + "step": 15295 + }, + { + "epoch": 0.4535776769563799, + "grad_norm": 0.12268628925085068, + "learning_rate": 0.0005812944417488128, + "loss": 2.6701, + "step": 15296 + }, + { + "epoch": 0.4536073303086914, + "grad_norm": 0.1348114162683487, + "learning_rate": 0.0005812480164928546, + "loss": 2.6826, + "step": 15297 + }, + { + "epoch": 0.45363698366100286, + "grad_norm": 0.13379517197608948, + "learning_rate": 0.0005812015905174314, + "loss": 2.6519, + "step": 15298 + }, + { + "epoch": 0.45366663701331433, + "grad_norm": 0.13115815818309784, + "learning_rate": 0.0005811551638229543, + "loss": 2.6427, + "step": 15299 + }, + { + "epoch": 0.4536962903656258, + "grad_norm": 0.14234118163585663, + "learning_rate": 0.0005811087364098341, + "loss": 2.6332, + "step": 15300 + }, + { + "epoch": 0.4537259437179373, + "grad_norm": 0.13151438534259796, + "learning_rate": 0.0005810623082784823, + "loss": 2.6671, + "step": 15301 + }, + { + "epoch": 0.4537555970702488, + "grad_norm": 0.11065230518579483, + "learning_rate": 0.0005810158794293099, + "loss": 2.6676, + "step": 15302 + }, + { + "epoch": 0.4537852504225603, + "grad_norm": 0.10909074544906616, + "learning_rate": 0.0005809694498627277, + "loss": 2.6309, + "step": 15303 + }, + { + "epoch": 0.45381490377487177, + "grad_norm": 0.11627339571714401, + "learning_rate": 0.0005809230195791471, + "loss": 2.6803, + "step": 15304 + }, + { + "epoch": 0.45384455712718325, + "grad_norm": 0.11677654832601547, + "learning_rate": 0.0005808765885789795, + "loss": 2.6891, + "step": 15305 + }, + { + "epoch": 0.4538742104794947, + "grad_norm": 0.1219317615032196, + "learning_rate": 0.0005808301568626358, + "loss": 2.6813, + "step": 15306 + }, + { + "epoch": 0.4539038638318062, + "grad_norm": 0.13292661309242249, + "learning_rate": 0.0005807837244305271, + "loss": 2.6606, + "step": 15307 + }, + { + "epoch": 0.4539335171841177, + "grad_norm": 0.13729947805404663, + "learning_rate": 0.0005807372912830648, + "loss": 2.6538, + "step": 15308 + }, + { + "epoch": 0.45396317053642915, + "grad_norm": 0.1536470651626587, + "learning_rate": 0.0005806908574206598, + "loss": 2.6644, + "step": 15309 + }, + { + "epoch": 0.4539928238887406, + "grad_norm": 0.1433107703924179, + "learning_rate": 0.0005806444228437233, + "loss": 2.6565, + "step": 15310 + }, + { + "epoch": 0.4540224772410521, + "grad_norm": 0.11809580028057098, + "learning_rate": 0.0005805979875526668, + "loss": 2.6533, + "step": 15311 + }, + { + "epoch": 0.4540521305933636, + "grad_norm": 0.12727873027324677, + "learning_rate": 0.0005805515515479013, + "loss": 2.653, + "step": 15312 + }, + { + "epoch": 0.45408178394567505, + "grad_norm": 0.12227047234773636, + "learning_rate": 0.000580505114829838, + "loss": 2.6872, + "step": 15313 + }, + { + "epoch": 0.45411143729798653, + "grad_norm": 0.13006970286369324, + "learning_rate": 0.0005804586773988879, + "loss": 2.6403, + "step": 15314 + }, + { + "epoch": 0.454141090650298, + "grad_norm": 0.12192521244287491, + "learning_rate": 0.0005804122392554625, + "loss": 2.663, + "step": 15315 + }, + { + "epoch": 0.4541707440026095, + "grad_norm": 0.111321359872818, + "learning_rate": 0.0005803658003999728, + "loss": 2.6818, + "step": 15316 + }, + { + "epoch": 0.45420039735492096, + "grad_norm": 0.11347337812185287, + "learning_rate": 0.0005803193608328303, + "loss": 2.6629, + "step": 15317 + }, + { + "epoch": 0.45423005070723244, + "grad_norm": 0.10274971276521683, + "learning_rate": 0.000580272920554446, + "loss": 2.6928, + "step": 15318 + }, + { + "epoch": 0.4542597040595439, + "grad_norm": 0.10575820505619049, + "learning_rate": 0.0005802264795652313, + "loss": 2.6452, + "step": 15319 + }, + { + "epoch": 0.4542893574118554, + "grad_norm": 0.1291005164384842, + "learning_rate": 0.0005801800378655973, + "loss": 2.7141, + "step": 15320 + }, + { + "epoch": 0.45431901076416686, + "grad_norm": 0.14604797959327698, + "learning_rate": 0.0005801335954559552, + "loss": 2.6566, + "step": 15321 + }, + { + "epoch": 0.45434866411647834, + "grad_norm": 0.13999418914318085, + "learning_rate": 0.0005800871523367163, + "loss": 2.6112, + "step": 15322 + }, + { + "epoch": 0.45437831746878987, + "grad_norm": 0.1324402391910553, + "learning_rate": 0.0005800407085082922, + "loss": 2.6614, + "step": 15323 + }, + { + "epoch": 0.45440797082110135, + "grad_norm": 0.1366736739873886, + "learning_rate": 0.0005799942639710938, + "loss": 2.7096, + "step": 15324 + }, + { + "epoch": 0.4544376241734128, + "grad_norm": 0.15171170234680176, + "learning_rate": 0.0005799478187255324, + "loss": 2.6785, + "step": 15325 + }, + { + "epoch": 0.4544672775257243, + "grad_norm": 0.14391256868839264, + "learning_rate": 0.0005799013727720193, + "loss": 2.6791, + "step": 15326 + }, + { + "epoch": 0.4544969308780358, + "grad_norm": 0.1398620754480362, + "learning_rate": 0.0005798549261109659, + "loss": 2.6841, + "step": 15327 + }, + { + "epoch": 0.45452658423034725, + "grad_norm": 0.12954555451869965, + "learning_rate": 0.0005798084787427834, + "loss": 2.655, + "step": 15328 + }, + { + "epoch": 0.45455623758265873, + "grad_norm": 0.13434405624866486, + "learning_rate": 0.0005797620306678831, + "loss": 2.6896, + "step": 15329 + }, + { + "epoch": 0.4545858909349702, + "grad_norm": 0.11424344778060913, + "learning_rate": 0.0005797155818866764, + "loss": 2.6855, + "step": 15330 + }, + { + "epoch": 0.4546155442872817, + "grad_norm": 0.1327010840177536, + "learning_rate": 0.0005796691323995744, + "loss": 2.6703, + "step": 15331 + }, + { + "epoch": 0.45464519763959316, + "grad_norm": 0.14333608746528625, + "learning_rate": 0.0005796226822069886, + "loss": 2.6585, + "step": 15332 + }, + { + "epoch": 0.45467485099190463, + "grad_norm": 0.13091032207012177, + "learning_rate": 0.0005795762313093305, + "loss": 2.6621, + "step": 15333 + }, + { + "epoch": 0.4547045043442161, + "grad_norm": 0.12272290885448456, + "learning_rate": 0.000579529779707011, + "loss": 2.7013, + "step": 15334 + }, + { + "epoch": 0.4547341576965276, + "grad_norm": 0.14278925955295563, + "learning_rate": 0.0005794833274004416, + "loss": 2.6591, + "step": 15335 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 0.1330815702676773, + "learning_rate": 0.0005794368743900338, + "loss": 2.6829, + "step": 15336 + }, + { + "epoch": 0.45479346440115054, + "grad_norm": 0.11239952594041824, + "learning_rate": 0.0005793904206761989, + "loss": 2.6565, + "step": 15337 + }, + { + "epoch": 0.454823117753462, + "grad_norm": 0.13434255123138428, + "learning_rate": 0.000579343966259348, + "loss": 2.6828, + "step": 15338 + }, + { + "epoch": 0.4548527711057735, + "grad_norm": 0.13669106364250183, + "learning_rate": 0.0005792975111398928, + "loss": 2.69, + "step": 15339 + }, + { + "epoch": 0.45488242445808497, + "grad_norm": 0.13457277417182922, + "learning_rate": 0.0005792510553182446, + "loss": 2.7011, + "step": 15340 + }, + { + "epoch": 0.45491207781039644, + "grad_norm": 0.13792043924331665, + "learning_rate": 0.0005792045987948146, + "loss": 2.6745, + "step": 15341 + }, + { + "epoch": 0.4549417311627079, + "grad_norm": 0.13461430370807648, + "learning_rate": 0.0005791581415700143, + "loss": 2.6321, + "step": 15342 + }, + { + "epoch": 0.45497138451501945, + "grad_norm": 0.1324385404586792, + "learning_rate": 0.000579111683644255, + "loss": 2.6629, + "step": 15343 + }, + { + "epoch": 0.4550010378673309, + "grad_norm": 0.1347103714942932, + "learning_rate": 0.0005790652250179482, + "loss": 2.6889, + "step": 15344 + }, + { + "epoch": 0.4550306912196424, + "grad_norm": 0.12614049017429352, + "learning_rate": 0.0005790187656915055, + "loss": 2.636, + "step": 15345 + }, + { + "epoch": 0.4550603445719539, + "grad_norm": 0.11900593340396881, + "learning_rate": 0.0005789723056653377, + "loss": 2.6687, + "step": 15346 + }, + { + "epoch": 0.45508999792426535, + "grad_norm": 0.10365133732557297, + "learning_rate": 0.0005789258449398569, + "loss": 2.689, + "step": 15347 + }, + { + "epoch": 0.45511965127657683, + "grad_norm": 0.10940472781658173, + "learning_rate": 0.0005788793835154739, + "loss": 2.6878, + "step": 15348 + }, + { + "epoch": 0.4551493046288883, + "grad_norm": 0.09392771124839783, + "learning_rate": 0.0005788329213926005, + "loss": 2.6471, + "step": 15349 + }, + { + "epoch": 0.4551789579811998, + "grad_norm": 0.10356662422418594, + "learning_rate": 0.0005787864585716483, + "loss": 2.6639, + "step": 15350 + }, + { + "epoch": 0.45520861133351126, + "grad_norm": 0.10607662796974182, + "learning_rate": 0.0005787399950530282, + "loss": 2.6382, + "step": 15351 + }, + { + "epoch": 0.45523826468582274, + "grad_norm": 0.11713799089193344, + "learning_rate": 0.000578693530837152, + "loss": 2.6705, + "step": 15352 + }, + { + "epoch": 0.4552679180381342, + "grad_norm": 0.09996238350868225, + "learning_rate": 0.000578647065924431, + "loss": 2.6816, + "step": 15353 + }, + { + "epoch": 0.4552975713904457, + "grad_norm": 0.10968507826328278, + "learning_rate": 0.0005786006003152768, + "loss": 2.6708, + "step": 15354 + }, + { + "epoch": 0.45532722474275716, + "grad_norm": 0.12548843026161194, + "learning_rate": 0.000578554134010101, + "loss": 2.6499, + "step": 15355 + }, + { + "epoch": 0.45535687809506864, + "grad_norm": 0.13247370719909668, + "learning_rate": 0.0005785076670093146, + "loss": 2.6448, + "step": 15356 + }, + { + "epoch": 0.4553865314473801, + "grad_norm": 0.12721668183803558, + "learning_rate": 0.0005784611993133295, + "loss": 2.66, + "step": 15357 + }, + { + "epoch": 0.4554161847996916, + "grad_norm": 0.10951842367649078, + "learning_rate": 0.0005784147309225568, + "loss": 2.6701, + "step": 15358 + }, + { + "epoch": 0.45544583815200307, + "grad_norm": 0.13116933405399323, + "learning_rate": 0.0005783682618374083, + "loss": 2.6531, + "step": 15359 + }, + { + "epoch": 0.45547549150431454, + "grad_norm": 0.15093035995960236, + "learning_rate": 0.0005783217920582954, + "loss": 2.6988, + "step": 15360 + }, + { + "epoch": 0.455505144856626, + "grad_norm": 0.14892049133777618, + "learning_rate": 0.0005782753215856296, + "loss": 2.7129, + "step": 15361 + }, + { + "epoch": 0.4555347982089375, + "grad_norm": 0.13063064217567444, + "learning_rate": 0.0005782288504198224, + "loss": 2.6656, + "step": 15362 + }, + { + "epoch": 0.455564451561249, + "grad_norm": 0.12380510568618774, + "learning_rate": 0.0005781823785612853, + "loss": 2.6744, + "step": 15363 + }, + { + "epoch": 0.4555941049135605, + "grad_norm": 0.11326456815004349, + "learning_rate": 0.0005781359060104298, + "loss": 2.6633, + "step": 15364 + }, + { + "epoch": 0.455623758265872, + "grad_norm": 0.10840237885713577, + "learning_rate": 0.0005780894327676675, + "loss": 2.6419, + "step": 15365 + }, + { + "epoch": 0.45565341161818346, + "grad_norm": 0.10394537448883057, + "learning_rate": 0.0005780429588334098, + "loss": 2.6614, + "step": 15366 + }, + { + "epoch": 0.45568306497049493, + "grad_norm": 0.11716528981924057, + "learning_rate": 0.0005779964842080683, + "loss": 2.6512, + "step": 15367 + }, + { + "epoch": 0.4557127183228064, + "grad_norm": 0.13481523096561432, + "learning_rate": 0.0005779500088920546, + "loss": 2.6807, + "step": 15368 + }, + { + "epoch": 0.4557423716751179, + "grad_norm": 0.12492886185646057, + "learning_rate": 0.00057790353288578, + "loss": 2.6515, + "step": 15369 + }, + { + "epoch": 0.45577202502742936, + "grad_norm": 0.12223494052886963, + "learning_rate": 0.0005778570561896564, + "loss": 2.6351, + "step": 15370 + }, + { + "epoch": 0.45580167837974084, + "grad_norm": 0.15502849221229553, + "learning_rate": 0.0005778105788040953, + "loss": 2.6504, + "step": 15371 + }, + { + "epoch": 0.4558313317320523, + "grad_norm": 0.17128527164459229, + "learning_rate": 0.0005777641007295081, + "loss": 2.6695, + "step": 15372 + }, + { + "epoch": 0.4558609850843638, + "grad_norm": 0.1567632555961609, + "learning_rate": 0.0005777176219663065, + "loss": 2.6814, + "step": 15373 + }, + { + "epoch": 0.45589063843667527, + "grad_norm": 0.11920523643493652, + "learning_rate": 0.0005776711425149018, + "loss": 2.6655, + "step": 15374 + }, + { + "epoch": 0.45592029178898674, + "grad_norm": 0.13212613761425018, + "learning_rate": 0.0005776246623757059, + "loss": 2.6496, + "step": 15375 + }, + { + "epoch": 0.4559499451412982, + "grad_norm": 0.16780683398246765, + "learning_rate": 0.0005775781815491304, + "loss": 2.69, + "step": 15376 + }, + { + "epoch": 0.4559795984936097, + "grad_norm": 0.17921440303325653, + "learning_rate": 0.0005775317000355866, + "loss": 2.6454, + "step": 15377 + }, + { + "epoch": 0.45600925184592117, + "grad_norm": 0.13038311898708344, + "learning_rate": 0.0005774852178354865, + "loss": 2.6624, + "step": 15378 + }, + { + "epoch": 0.45603890519823265, + "grad_norm": 0.13484205305576324, + "learning_rate": 0.0005774387349492413, + "loss": 2.6442, + "step": 15379 + }, + { + "epoch": 0.4560685585505441, + "grad_norm": 0.14091382920742035, + "learning_rate": 0.0005773922513772629, + "loss": 2.6648, + "step": 15380 + }, + { + "epoch": 0.4560982119028556, + "grad_norm": 0.12416940927505493, + "learning_rate": 0.0005773457671199628, + "loss": 2.6671, + "step": 15381 + }, + { + "epoch": 0.4561278652551671, + "grad_norm": 0.11102059483528137, + "learning_rate": 0.0005772992821777527, + "loss": 2.6585, + "step": 15382 + }, + { + "epoch": 0.45615751860747855, + "grad_norm": 0.10111413896083832, + "learning_rate": 0.0005772527965510442, + "loss": 2.65, + "step": 15383 + }, + { + "epoch": 0.45618717195979, + "grad_norm": 0.11620514839887619, + "learning_rate": 0.000577206310240249, + "loss": 2.6933, + "step": 15384 + }, + { + "epoch": 0.45621682531210156, + "grad_norm": 0.1114116683602333, + "learning_rate": 0.0005771598232457786, + "loss": 2.6501, + "step": 15385 + }, + { + "epoch": 0.45624647866441304, + "grad_norm": 0.10493902862071991, + "learning_rate": 0.0005771133355680447, + "loss": 2.6678, + "step": 15386 + }, + { + "epoch": 0.4562761320167245, + "grad_norm": 0.09496348351240158, + "learning_rate": 0.0005770668472074587, + "loss": 2.6571, + "step": 15387 + }, + { + "epoch": 0.456305785369036, + "grad_norm": 0.1056382954120636, + "learning_rate": 0.000577020358164433, + "loss": 2.687, + "step": 15388 + }, + { + "epoch": 0.45633543872134746, + "grad_norm": 0.12442025542259216, + "learning_rate": 0.0005769738684393786, + "loss": 2.6844, + "step": 15389 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 0.11590232700109482, + "learning_rate": 0.0005769273780327074, + "loss": 2.6722, + "step": 15390 + }, + { + "epoch": 0.4563947454259704, + "grad_norm": 0.10594689846038818, + "learning_rate": 0.000576880886944831, + "loss": 2.6788, + "step": 15391 + }, + { + "epoch": 0.4564243987782819, + "grad_norm": 0.12217999994754791, + "learning_rate": 0.0005768343951761614, + "loss": 2.7005, + "step": 15392 + }, + { + "epoch": 0.45645405213059337, + "grad_norm": 0.13092367351055145, + "learning_rate": 0.0005767879027271097, + "loss": 2.6607, + "step": 15393 + }, + { + "epoch": 0.45648370548290484, + "grad_norm": 0.12058958411216736, + "learning_rate": 0.0005767414095980881, + "loss": 2.6647, + "step": 15394 + }, + { + "epoch": 0.4565133588352163, + "grad_norm": 0.12236468493938446, + "learning_rate": 0.0005766949157895081, + "loss": 2.665, + "step": 15395 + }, + { + "epoch": 0.4565430121875278, + "grad_norm": 0.11977478861808777, + "learning_rate": 0.0005766484213017816, + "loss": 2.6317, + "step": 15396 + }, + { + "epoch": 0.4565726655398393, + "grad_norm": 0.10791757702827454, + "learning_rate": 0.0005766019261353201, + "loss": 2.6681, + "step": 15397 + }, + { + "epoch": 0.45660231889215075, + "grad_norm": 0.12063751369714737, + "learning_rate": 0.0005765554302905353, + "loss": 2.6726, + "step": 15398 + }, + { + "epoch": 0.4566319722444622, + "grad_norm": 0.14485469460487366, + "learning_rate": 0.0005765089337678391, + "loss": 2.6547, + "step": 15399 + }, + { + "epoch": 0.4566616255967737, + "grad_norm": 0.17100755870342255, + "learning_rate": 0.0005764624365676431, + "loss": 2.6657, + "step": 15400 + }, + { + "epoch": 0.4566912789490852, + "grad_norm": 0.14420971274375916, + "learning_rate": 0.0005764159386903591, + "loss": 2.6664, + "step": 15401 + }, + { + "epoch": 0.45672093230139665, + "grad_norm": 0.1301341950893402, + "learning_rate": 0.0005763694401363989, + "loss": 2.6561, + "step": 15402 + }, + { + "epoch": 0.45675058565370813, + "grad_norm": 0.13774172961711884, + "learning_rate": 0.0005763229409061743, + "loss": 2.6634, + "step": 15403 + }, + { + "epoch": 0.4567802390060196, + "grad_norm": 0.12736625969409943, + "learning_rate": 0.0005762764410000968, + "loss": 2.6651, + "step": 15404 + }, + { + "epoch": 0.4568098923583311, + "grad_norm": 0.10922221094369888, + "learning_rate": 0.0005762299404185784, + "loss": 2.6621, + "step": 15405 + }, + { + "epoch": 0.4568395457106426, + "grad_norm": 0.12855690717697144, + "learning_rate": 0.0005761834391620307, + "loss": 2.6613, + "step": 15406 + }, + { + "epoch": 0.4568691990629541, + "grad_norm": 0.1195949912071228, + "learning_rate": 0.0005761369372308657, + "loss": 2.6637, + "step": 15407 + }, + { + "epoch": 0.45689885241526557, + "grad_norm": 0.11614684760570526, + "learning_rate": 0.0005760904346254949, + "loss": 2.6954, + "step": 15408 + }, + { + "epoch": 0.45692850576757704, + "grad_norm": 0.13040921092033386, + "learning_rate": 0.0005760439313463304, + "loss": 2.6938, + "step": 15409 + }, + { + "epoch": 0.4569581591198885, + "grad_norm": 0.10316117852926254, + "learning_rate": 0.0005759974273937839, + "loss": 2.6911, + "step": 15410 + }, + { + "epoch": 0.4569878124722, + "grad_norm": 0.10928156226873398, + "learning_rate": 0.0005759509227682668, + "loss": 2.6679, + "step": 15411 + }, + { + "epoch": 0.45701746582451147, + "grad_norm": 0.12203559279441833, + "learning_rate": 0.0005759044174701915, + "loss": 2.6506, + "step": 15412 + }, + { + "epoch": 0.45704711917682295, + "grad_norm": 0.12599998712539673, + "learning_rate": 0.0005758579114999695, + "loss": 2.6452, + "step": 15413 + }, + { + "epoch": 0.4570767725291344, + "grad_norm": 0.1354105919599533, + "learning_rate": 0.0005758114048580126, + "loss": 2.6736, + "step": 15414 + }, + { + "epoch": 0.4571064258814459, + "grad_norm": 0.13375148177146912, + "learning_rate": 0.0005757648975447327, + "loss": 2.6609, + "step": 15415 + }, + { + "epoch": 0.4571360792337574, + "grad_norm": 0.11432168632745743, + "learning_rate": 0.0005757183895605419, + "loss": 2.7028, + "step": 15416 + }, + { + "epoch": 0.45716573258606885, + "grad_norm": 0.10857556760311127, + "learning_rate": 0.0005756718809058516, + "loss": 2.669, + "step": 15417 + }, + { + "epoch": 0.4571953859383803, + "grad_norm": 0.12198494374752045, + "learning_rate": 0.0005756253715810736, + "loss": 2.6517, + "step": 15418 + }, + { + "epoch": 0.4572250392906918, + "grad_norm": 0.1306518316268921, + "learning_rate": 0.0005755788615866201, + "loss": 2.6336, + "step": 15419 + }, + { + "epoch": 0.4572546926430033, + "grad_norm": 0.14292198419570923, + "learning_rate": 0.0005755323509229028, + "loss": 2.647, + "step": 15420 + }, + { + "epoch": 0.45728434599531476, + "grad_norm": 0.14083965122699738, + "learning_rate": 0.0005754858395903337, + "loss": 2.6705, + "step": 15421 + }, + { + "epoch": 0.45731399934762623, + "grad_norm": 0.13279175758361816, + "learning_rate": 0.0005754393275893243, + "loss": 2.6871, + "step": 15422 + }, + { + "epoch": 0.4573436526999377, + "grad_norm": 0.11627395451068878, + "learning_rate": 0.0005753928149202869, + "loss": 2.6593, + "step": 15423 + }, + { + "epoch": 0.4573733060522492, + "grad_norm": 0.1118880957365036, + "learning_rate": 0.0005753463015836331, + "loss": 2.6476, + "step": 15424 + }, + { + "epoch": 0.45740295940456066, + "grad_norm": 0.12138926982879639, + "learning_rate": 0.0005752997875797749, + "loss": 2.6742, + "step": 15425 + }, + { + "epoch": 0.45743261275687214, + "grad_norm": 0.12234925478696823, + "learning_rate": 0.0005752532729091242, + "loss": 2.6356, + "step": 15426 + }, + { + "epoch": 0.45746226610918367, + "grad_norm": 0.12245047092437744, + "learning_rate": 0.0005752067575720927, + "loss": 2.709, + "step": 15427 + }, + { + "epoch": 0.45749191946149514, + "grad_norm": 0.1173478290438652, + "learning_rate": 0.0005751602415690925, + "loss": 2.6708, + "step": 15428 + }, + { + "epoch": 0.4575215728138066, + "grad_norm": 0.1194317638874054, + "learning_rate": 0.0005751137249005356, + "loss": 2.6698, + "step": 15429 + }, + { + "epoch": 0.4575512261661181, + "grad_norm": 0.13415606319904327, + "learning_rate": 0.0005750672075668336, + "loss": 2.6609, + "step": 15430 + }, + { + "epoch": 0.4575808795184296, + "grad_norm": 0.14682674407958984, + "learning_rate": 0.0005750206895683987, + "loss": 2.673, + "step": 15431 + }, + { + "epoch": 0.45761053287074105, + "grad_norm": 0.15326085686683655, + "learning_rate": 0.0005749741709056426, + "loss": 2.6821, + "step": 15432 + }, + { + "epoch": 0.4576401862230525, + "grad_norm": 0.15625135600566864, + "learning_rate": 0.0005749276515789775, + "loss": 2.6265, + "step": 15433 + }, + { + "epoch": 0.457669839575364, + "grad_norm": 0.15204055607318878, + "learning_rate": 0.0005748811315888152, + "loss": 2.6418, + "step": 15434 + }, + { + "epoch": 0.4576994929276755, + "grad_norm": 0.1431799829006195, + "learning_rate": 0.0005748346109355674, + "loss": 2.6452, + "step": 15435 + }, + { + "epoch": 0.45772914627998695, + "grad_norm": 0.11850462853908539, + "learning_rate": 0.0005747880896196465, + "loss": 2.6706, + "step": 15436 + }, + { + "epoch": 0.45775879963229843, + "grad_norm": 0.13982725143432617, + "learning_rate": 0.0005747415676414641, + "loss": 2.6995, + "step": 15437 + }, + { + "epoch": 0.4577884529846099, + "grad_norm": 0.13408485054969788, + "learning_rate": 0.0005746950450014323, + "loss": 2.6664, + "step": 15438 + }, + { + "epoch": 0.4578181063369214, + "grad_norm": 0.11835320293903351, + "learning_rate": 0.000574648521699963, + "loss": 2.6595, + "step": 15439 + }, + { + "epoch": 0.45784775968923286, + "grad_norm": 0.11058881878852844, + "learning_rate": 0.0005746019977374684, + "loss": 2.6796, + "step": 15440 + }, + { + "epoch": 0.45787741304154433, + "grad_norm": 0.14144350588321686, + "learning_rate": 0.0005745554731143602, + "loss": 2.6554, + "step": 15441 + }, + { + "epoch": 0.4579070663938558, + "grad_norm": 0.1476958990097046, + "learning_rate": 0.0005745089478310506, + "loss": 2.645, + "step": 15442 + }, + { + "epoch": 0.4579367197461673, + "grad_norm": 0.13431616127490997, + "learning_rate": 0.0005744624218879514, + "loss": 2.6639, + "step": 15443 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 0.1215042993426323, + "learning_rate": 0.0005744158952854747, + "loss": 2.671, + "step": 15444 + }, + { + "epoch": 0.45799602645079024, + "grad_norm": 0.10484043508768082, + "learning_rate": 0.0005743693680240323, + "loss": 2.6671, + "step": 15445 + }, + { + "epoch": 0.4580256798031017, + "grad_norm": 0.11406437307596207, + "learning_rate": 0.0005743228401040364, + "loss": 2.6354, + "step": 15446 + }, + { + "epoch": 0.45805533315541325, + "grad_norm": 0.1165751963853836, + "learning_rate": 0.0005742763115258992, + "loss": 2.649, + "step": 15447 + }, + { + "epoch": 0.4580849865077247, + "grad_norm": 0.10759525746107101, + "learning_rate": 0.0005742297822900326, + "loss": 2.6462, + "step": 15448 + }, + { + "epoch": 0.4581146398600362, + "grad_norm": 0.12130969762802124, + "learning_rate": 0.0005741832523968484, + "loss": 2.6306, + "step": 15449 + }, + { + "epoch": 0.4581442932123477, + "grad_norm": 0.1134147047996521, + "learning_rate": 0.0005741367218467586, + "loss": 2.6312, + "step": 15450 + }, + { + "epoch": 0.45817394656465915, + "grad_norm": 0.11094547063112259, + "learning_rate": 0.0005740901906401754, + "loss": 2.6611, + "step": 15451 + }, + { + "epoch": 0.4582035999169706, + "grad_norm": 0.11538627743721008, + "learning_rate": 0.0005740436587775109, + "loss": 2.6554, + "step": 15452 + }, + { + "epoch": 0.4582332532692821, + "grad_norm": 0.11242377758026123, + "learning_rate": 0.0005739971262591772, + "loss": 2.6262, + "step": 15453 + }, + { + "epoch": 0.4582629066215936, + "grad_norm": 0.12653543055057526, + "learning_rate": 0.0005739505930855864, + "loss": 2.6485, + "step": 15454 + }, + { + "epoch": 0.45829255997390506, + "grad_norm": 0.11738158762454987, + "learning_rate": 0.0005739040592571502, + "loss": 2.6525, + "step": 15455 + }, + { + "epoch": 0.45832221332621653, + "grad_norm": 0.1567540317773819, + "learning_rate": 0.0005738575247742808, + "loss": 2.6974, + "step": 15456 + }, + { + "epoch": 0.458351866678528, + "grad_norm": 0.1624065637588501, + "learning_rate": 0.0005738109896373904, + "loss": 2.6604, + "step": 15457 + }, + { + "epoch": 0.4583815200308395, + "grad_norm": 0.13244131207466125, + "learning_rate": 0.000573764453846891, + "loss": 2.6898, + "step": 15458 + }, + { + "epoch": 0.45841117338315096, + "grad_norm": 0.12566439807415009, + "learning_rate": 0.0005737179174031948, + "loss": 2.6568, + "step": 15459 + }, + { + "epoch": 0.45844082673546244, + "grad_norm": 0.1191372498869896, + "learning_rate": 0.0005736713803067137, + "loss": 2.6481, + "step": 15460 + }, + { + "epoch": 0.4584704800877739, + "grad_norm": 0.14243444800376892, + "learning_rate": 0.00057362484255786, + "loss": 2.6873, + "step": 15461 + }, + { + "epoch": 0.4585001334400854, + "grad_norm": 0.14390186965465546, + "learning_rate": 0.0005735783041570455, + "loss": 2.6538, + "step": 15462 + }, + { + "epoch": 0.45852978679239687, + "grad_norm": 0.12537847459316254, + "learning_rate": 0.0005735317651046827, + "loss": 2.6427, + "step": 15463 + }, + { + "epoch": 0.45855944014470834, + "grad_norm": 0.11421244591474533, + "learning_rate": 0.0005734852254011833, + "loss": 2.691, + "step": 15464 + }, + { + "epoch": 0.4585890934970198, + "grad_norm": 0.13855862617492676, + "learning_rate": 0.0005734386850469596, + "loss": 2.6635, + "step": 15465 + }, + { + "epoch": 0.4586187468493313, + "grad_norm": 0.13984781503677368, + "learning_rate": 0.0005733921440424239, + "loss": 2.6414, + "step": 15466 + }, + { + "epoch": 0.45864840020164277, + "grad_norm": 0.1316283494234085, + "learning_rate": 0.0005733456023879881, + "loss": 2.6721, + "step": 15467 + }, + { + "epoch": 0.4586780535539543, + "grad_norm": 0.13333739340305328, + "learning_rate": 0.0005732990600840644, + "loss": 2.6701, + "step": 15468 + }, + { + "epoch": 0.4587077069062658, + "grad_norm": 0.11304713785648346, + "learning_rate": 0.000573252517131065, + "loss": 2.6317, + "step": 15469 + }, + { + "epoch": 0.45873736025857725, + "grad_norm": 0.12035832554101944, + "learning_rate": 0.0005732059735294019, + "loss": 2.668, + "step": 15470 + }, + { + "epoch": 0.45876701361088873, + "grad_norm": 0.12099315226078033, + "learning_rate": 0.0005731594292794872, + "loss": 2.6647, + "step": 15471 + }, + { + "epoch": 0.4587966669632002, + "grad_norm": 0.13305889070034027, + "learning_rate": 0.0005731128843817335, + "loss": 2.6896, + "step": 15472 + }, + { + "epoch": 0.4588263203155117, + "grad_norm": 0.11578148603439331, + "learning_rate": 0.0005730663388365525, + "loss": 2.6779, + "step": 15473 + }, + { + "epoch": 0.45885597366782316, + "grad_norm": 0.11554548144340515, + "learning_rate": 0.0005730197926443565, + "loss": 2.682, + "step": 15474 + }, + { + "epoch": 0.45888562702013463, + "grad_norm": 0.12671656906604767, + "learning_rate": 0.0005729732458055577, + "loss": 2.6977, + "step": 15475 + }, + { + "epoch": 0.4589152803724461, + "grad_norm": 0.12861520051956177, + "learning_rate": 0.0005729266983205685, + "loss": 2.6329, + "step": 15476 + }, + { + "epoch": 0.4589449337247576, + "grad_norm": 0.1260744333267212, + "learning_rate": 0.0005728801501898006, + "loss": 2.6826, + "step": 15477 + }, + { + "epoch": 0.45897458707706906, + "grad_norm": 0.10806231200695038, + "learning_rate": 0.0005728336014136666, + "loss": 2.6452, + "step": 15478 + }, + { + "epoch": 0.45900424042938054, + "grad_norm": 0.11644171923398972, + "learning_rate": 0.0005727870519925784, + "loss": 2.6849, + "step": 15479 + }, + { + "epoch": 0.459033893781692, + "grad_norm": 0.11372330039739609, + "learning_rate": 0.0005727405019269485, + "loss": 2.6637, + "step": 15480 + }, + { + "epoch": 0.4590635471340035, + "grad_norm": 0.11968286335468292, + "learning_rate": 0.0005726939512171891, + "loss": 2.6389, + "step": 15481 + }, + { + "epoch": 0.45909320048631497, + "grad_norm": 0.10118794441223145, + "learning_rate": 0.000572647399863712, + "loss": 2.6892, + "step": 15482 + }, + { + "epoch": 0.45912285383862644, + "grad_norm": 0.11424201726913452, + "learning_rate": 0.00057260084786693, + "loss": 2.666, + "step": 15483 + }, + { + "epoch": 0.4591525071909379, + "grad_norm": 0.12296919524669647, + "learning_rate": 0.0005725542952272546, + "loss": 2.6596, + "step": 15484 + }, + { + "epoch": 0.4591821605432494, + "grad_norm": 0.1111467108130455, + "learning_rate": 0.0005725077419450988, + "loss": 2.666, + "step": 15485 + }, + { + "epoch": 0.45921181389556087, + "grad_norm": 0.1166350319981575, + "learning_rate": 0.0005724611880208745, + "loss": 2.6634, + "step": 15486 + }, + { + "epoch": 0.45924146724787235, + "grad_norm": 0.1120346188545227, + "learning_rate": 0.000572414633454994, + "loss": 2.653, + "step": 15487 + }, + { + "epoch": 0.4592711206001838, + "grad_norm": 0.11324118077754974, + "learning_rate": 0.0005723680782478693, + "loss": 2.6515, + "step": 15488 + }, + { + "epoch": 0.45930077395249536, + "grad_norm": 0.12105072289705276, + "learning_rate": 0.0005723215223999129, + "loss": 2.6457, + "step": 15489 + }, + { + "epoch": 0.45933042730480683, + "grad_norm": 0.1475285440683365, + "learning_rate": 0.000572274965911537, + "loss": 2.6884, + "step": 15490 + }, + { + "epoch": 0.4593600806571183, + "grad_norm": 0.14413151144981384, + "learning_rate": 0.0005722284087831537, + "loss": 2.6429, + "step": 15491 + }, + { + "epoch": 0.4593897340094298, + "grad_norm": 0.13877171277999878, + "learning_rate": 0.0005721818510151758, + "loss": 2.6653, + "step": 15492 + }, + { + "epoch": 0.45941938736174126, + "grad_norm": 0.16476567089557648, + "learning_rate": 0.0005721352926080152, + "loss": 2.6457, + "step": 15493 + }, + { + "epoch": 0.45944904071405274, + "grad_norm": 0.2002135068178177, + "learning_rate": 0.0005720887335620839, + "loss": 2.6681, + "step": 15494 + }, + { + "epoch": 0.4594786940663642, + "grad_norm": 0.1804182529449463, + "learning_rate": 0.0005720421738777947, + "loss": 2.6395, + "step": 15495 + }, + { + "epoch": 0.4595083474186757, + "grad_norm": 0.11219000816345215, + "learning_rate": 0.0005719956135555595, + "loss": 2.6821, + "step": 15496 + }, + { + "epoch": 0.45953800077098717, + "grad_norm": 0.13107435405254364, + "learning_rate": 0.000571949052595791, + "loss": 2.6671, + "step": 15497 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 0.1409437656402588, + "learning_rate": 0.0005719024909989012, + "loss": 2.6434, + "step": 15498 + }, + { + "epoch": 0.4595973074756101, + "grad_norm": 0.12190935760736465, + "learning_rate": 0.0005718559287653024, + "loss": 2.6851, + "step": 15499 + }, + { + "epoch": 0.4596269608279216, + "grad_norm": 0.11162280291318893, + "learning_rate": 0.0005718093658954072, + "loss": 2.6637, + "step": 15500 + }, + { + "epoch": 0.45965661418023307, + "grad_norm": 0.12591667473316193, + "learning_rate": 0.0005717628023896277, + "loss": 2.6495, + "step": 15501 + }, + { + "epoch": 0.45968626753254455, + "grad_norm": 0.12489667534828186, + "learning_rate": 0.0005717162382483761, + "loss": 2.6727, + "step": 15502 + }, + { + "epoch": 0.459715920884856, + "grad_norm": 0.1085842028260231, + "learning_rate": 0.000571669673472065, + "loss": 2.6623, + "step": 15503 + }, + { + "epoch": 0.4597455742371675, + "grad_norm": 0.1490858644247055, + "learning_rate": 0.0005716231080611068, + "loss": 2.6783, + "step": 15504 + }, + { + "epoch": 0.459775227589479, + "grad_norm": 0.14015890657901764, + "learning_rate": 0.0005715765420159135, + "loss": 2.6375, + "step": 15505 + }, + { + "epoch": 0.45980488094179045, + "grad_norm": 0.12150483578443527, + "learning_rate": 0.0005715299753368977, + "loss": 2.6521, + "step": 15506 + }, + { + "epoch": 0.4598345342941019, + "grad_norm": 0.12561947107315063, + "learning_rate": 0.0005714834080244716, + "loss": 2.6907, + "step": 15507 + }, + { + "epoch": 0.4598641876464134, + "grad_norm": 0.13589684665203094, + "learning_rate": 0.0005714368400790477, + "loss": 2.652, + "step": 15508 + }, + { + "epoch": 0.4598938409987249, + "grad_norm": 0.11689630895853043, + "learning_rate": 0.0005713902715010385, + "loss": 2.6711, + "step": 15509 + }, + { + "epoch": 0.4599234943510364, + "grad_norm": 0.14460298418998718, + "learning_rate": 0.0005713437022908559, + "loss": 2.6508, + "step": 15510 + }, + { + "epoch": 0.4599531477033479, + "grad_norm": 0.1324053853750229, + "learning_rate": 0.0005712971324489126, + "loss": 2.6643, + "step": 15511 + }, + { + "epoch": 0.45998280105565936, + "grad_norm": 0.11079374700784683, + "learning_rate": 0.0005712505619756212, + "loss": 2.6508, + "step": 15512 + }, + { + "epoch": 0.46001245440797084, + "grad_norm": 0.10573101788759232, + "learning_rate": 0.0005712039908713937, + "loss": 2.5969, + "step": 15513 + }, + { + "epoch": 0.4600421077602823, + "grad_norm": 0.10893820971250534, + "learning_rate": 0.0005711574191366427, + "loss": 2.6698, + "step": 15514 + }, + { + "epoch": 0.4600717611125938, + "grad_norm": 0.11681964248418808, + "learning_rate": 0.0005711108467717805, + "loss": 2.6708, + "step": 15515 + }, + { + "epoch": 0.46010141446490527, + "grad_norm": 0.1094631478190422, + "learning_rate": 0.0005710642737772194, + "loss": 2.6553, + "step": 15516 + }, + { + "epoch": 0.46013106781721674, + "grad_norm": 0.11469723284244537, + "learning_rate": 0.0005710177001533721, + "loss": 2.6447, + "step": 15517 + }, + { + "epoch": 0.4601607211695282, + "grad_norm": 0.11340497434139252, + "learning_rate": 0.0005709711259006508, + "loss": 2.6762, + "step": 15518 + }, + { + "epoch": 0.4601903745218397, + "grad_norm": 0.11553195863962173, + "learning_rate": 0.0005709245510194681, + "loss": 2.6823, + "step": 15519 + }, + { + "epoch": 0.46022002787415117, + "grad_norm": 0.11330343782901764, + "learning_rate": 0.0005708779755102363, + "loss": 2.6383, + "step": 15520 + }, + { + "epoch": 0.46024968122646265, + "grad_norm": 0.10699885338544846, + "learning_rate": 0.0005708313993733679, + "loss": 2.6321, + "step": 15521 + }, + { + "epoch": 0.4602793345787741, + "grad_norm": 0.09879885613918304, + "learning_rate": 0.0005707848226092751, + "loss": 2.6549, + "step": 15522 + }, + { + "epoch": 0.4603089879310856, + "grad_norm": 0.11227478832006454, + "learning_rate": 0.0005707382452183707, + "loss": 2.6725, + "step": 15523 + }, + { + "epoch": 0.4603386412833971, + "grad_norm": 0.1333814263343811, + "learning_rate": 0.0005706916672010671, + "loss": 2.643, + "step": 15524 + }, + { + "epoch": 0.46036829463570855, + "grad_norm": 0.1333109587430954, + "learning_rate": 0.0005706450885577765, + "loss": 2.659, + "step": 15525 + }, + { + "epoch": 0.46039794798802003, + "grad_norm": 0.12731418013572693, + "learning_rate": 0.0005705985092889116, + "loss": 2.6383, + "step": 15526 + }, + { + "epoch": 0.4604276013403315, + "grad_norm": 0.12660889327526093, + "learning_rate": 0.0005705519293948846, + "loss": 2.6492, + "step": 15527 + }, + { + "epoch": 0.460457254692643, + "grad_norm": 0.13095735013484955, + "learning_rate": 0.0005705053488761084, + "loss": 2.6567, + "step": 15528 + }, + { + "epoch": 0.46048690804495446, + "grad_norm": 0.126811683177948, + "learning_rate": 0.0005704587677329949, + "loss": 2.6925, + "step": 15529 + }, + { + "epoch": 0.46051656139726593, + "grad_norm": 0.10395630449056625, + "learning_rate": 0.0005704121859659573, + "loss": 2.6674, + "step": 15530 + }, + { + "epoch": 0.46054621474957746, + "grad_norm": 0.1279558688402176, + "learning_rate": 0.0005703656035754075, + "loss": 2.6514, + "step": 15531 + }, + { + "epoch": 0.46057586810188894, + "grad_norm": 0.14933530986309052, + "learning_rate": 0.0005703190205617584, + "loss": 2.6785, + "step": 15532 + }, + { + "epoch": 0.4606055214542004, + "grad_norm": 0.14450006186962128, + "learning_rate": 0.0005702724369254221, + "loss": 2.657, + "step": 15533 + }, + { + "epoch": 0.4606351748065119, + "grad_norm": 0.14450475573539734, + "learning_rate": 0.0005702258526668113, + "loss": 2.642, + "step": 15534 + }, + { + "epoch": 0.46066482815882337, + "grad_norm": 0.12974813580513, + "learning_rate": 0.0005701792677863387, + "loss": 2.6626, + "step": 15535 + }, + { + "epoch": 0.46069448151113485, + "grad_norm": 0.1539388746023178, + "learning_rate": 0.0005701326822844164, + "loss": 2.6709, + "step": 15536 + }, + { + "epoch": 0.4607241348634463, + "grad_norm": 0.13962166011333466, + "learning_rate": 0.0005700860961614573, + "loss": 2.6397, + "step": 15537 + }, + { + "epoch": 0.4607537882157578, + "grad_norm": 0.14543959498405457, + "learning_rate": 0.0005700395094178738, + "loss": 2.6544, + "step": 15538 + }, + { + "epoch": 0.4607834415680693, + "grad_norm": 0.1386915147304535, + "learning_rate": 0.0005699929220540783, + "loss": 2.6751, + "step": 15539 + }, + { + "epoch": 0.46081309492038075, + "grad_norm": 0.09758548438549042, + "learning_rate": 0.0005699463340704837, + "loss": 2.6538, + "step": 15540 + }, + { + "epoch": 0.4608427482726922, + "grad_norm": 0.11778869479894638, + "learning_rate": 0.0005698997454675021, + "loss": 2.6092, + "step": 15541 + }, + { + "epoch": 0.4608724016250037, + "grad_norm": 0.1235789880156517, + "learning_rate": 0.0005698531562455464, + "loss": 2.6688, + "step": 15542 + }, + { + "epoch": 0.4609020549773152, + "grad_norm": 0.11835145205259323, + "learning_rate": 0.0005698065664050288, + "loss": 2.6515, + "step": 15543 + }, + { + "epoch": 0.46093170832962665, + "grad_norm": 0.11249272525310516, + "learning_rate": 0.0005697599759463622, + "loss": 2.62, + "step": 15544 + }, + { + "epoch": 0.46096136168193813, + "grad_norm": 0.11808189004659653, + "learning_rate": 0.000569713384869959, + "loss": 2.6397, + "step": 15545 + }, + { + "epoch": 0.4609910150342496, + "grad_norm": 0.09917550534009933, + "learning_rate": 0.000569666793176232, + "loss": 2.6271, + "step": 15546 + }, + { + "epoch": 0.4610206683865611, + "grad_norm": 0.12775808572769165, + "learning_rate": 0.0005696202008655934, + "loss": 2.6931, + "step": 15547 + }, + { + "epoch": 0.46105032173887256, + "grad_norm": 0.12439750134944916, + "learning_rate": 0.000569573607938456, + "loss": 2.6678, + "step": 15548 + }, + { + "epoch": 0.46107997509118404, + "grad_norm": 0.10333575308322906, + "learning_rate": 0.0005695270143952322, + "loss": 2.6389, + "step": 15549 + }, + { + "epoch": 0.4611096284434955, + "grad_norm": 0.1283854842185974, + "learning_rate": 0.000569480420236335, + "loss": 2.709, + "step": 15550 + }, + { + "epoch": 0.46113928179580704, + "grad_norm": 0.1392243504524231, + "learning_rate": 0.0005694338254621767, + "loss": 2.6837, + "step": 15551 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 0.15508101880550385, + "learning_rate": 0.00056938723007317, + "loss": 2.6895, + "step": 15552 + }, + { + "epoch": 0.46119858850043, + "grad_norm": 0.16661323606967926, + "learning_rate": 0.0005693406340697274, + "loss": 2.7245, + "step": 15553 + }, + { + "epoch": 0.46122824185274147, + "grad_norm": 0.15194334089756012, + "learning_rate": 0.0005692940374522616, + "loss": 2.6952, + "step": 15554 + }, + { + "epoch": 0.46125789520505295, + "grad_norm": 0.12654300034046173, + "learning_rate": 0.0005692474402211851, + "loss": 2.6836, + "step": 15555 + }, + { + "epoch": 0.4612875485573644, + "grad_norm": 0.13589926064014435, + "learning_rate": 0.0005692008423769107, + "loss": 2.6537, + "step": 15556 + }, + { + "epoch": 0.4613172019096759, + "grad_norm": 0.15159402787685394, + "learning_rate": 0.000569154243919851, + "loss": 2.6579, + "step": 15557 + }, + { + "epoch": 0.4613468552619874, + "grad_norm": 0.12953341007232666, + "learning_rate": 0.0005691076448504186, + "loss": 2.6651, + "step": 15558 + }, + { + "epoch": 0.46137650861429885, + "grad_norm": 0.12122637778520584, + "learning_rate": 0.0005690610451690261, + "loss": 2.6611, + "step": 15559 + }, + { + "epoch": 0.46140616196661033, + "grad_norm": 0.11676159501075745, + "learning_rate": 0.0005690144448760862, + "loss": 2.6746, + "step": 15560 + }, + { + "epoch": 0.4614358153189218, + "grad_norm": 0.12500159442424774, + "learning_rate": 0.0005689678439720115, + "loss": 2.6597, + "step": 15561 + }, + { + "epoch": 0.4614654686712333, + "grad_norm": 0.13127601146697998, + "learning_rate": 0.0005689212424572149, + "loss": 2.657, + "step": 15562 + }, + { + "epoch": 0.46149512202354476, + "grad_norm": 0.13006088137626648, + "learning_rate": 0.0005688746403321087, + "loss": 2.6962, + "step": 15563 + }, + { + "epoch": 0.46152477537585623, + "grad_norm": 0.12019509077072144, + "learning_rate": 0.0005688280375971057, + "loss": 2.6235, + "step": 15564 + }, + { + "epoch": 0.4615544287281677, + "grad_norm": 0.13547462224960327, + "learning_rate": 0.0005687814342526188, + "loss": 2.621, + "step": 15565 + }, + { + "epoch": 0.4615840820804792, + "grad_norm": 0.12274585664272308, + "learning_rate": 0.0005687348302990603, + "loss": 2.6568, + "step": 15566 + }, + { + "epoch": 0.46161373543279066, + "grad_norm": 0.13541686534881592, + "learning_rate": 0.0005686882257368431, + "loss": 2.713, + "step": 15567 + }, + { + "epoch": 0.46164338878510214, + "grad_norm": 0.14610400795936584, + "learning_rate": 0.00056864162056638, + "loss": 2.6296, + "step": 15568 + }, + { + "epoch": 0.4616730421374136, + "grad_norm": 0.12148890644311905, + "learning_rate": 0.0005685950147880834, + "loss": 2.6593, + "step": 15569 + }, + { + "epoch": 0.4617026954897251, + "grad_norm": 0.1344018131494522, + "learning_rate": 0.0005685484084023663, + "loss": 2.6969, + "step": 15570 + }, + { + "epoch": 0.46173234884203657, + "grad_norm": 0.12195117771625519, + "learning_rate": 0.0005685018014096412, + "loss": 2.6495, + "step": 15571 + }, + { + "epoch": 0.4617620021943481, + "grad_norm": 0.11237315088510513, + "learning_rate": 0.0005684551938103209, + "loss": 2.6461, + "step": 15572 + }, + { + "epoch": 0.4617916555466596, + "grad_norm": 0.12655475735664368, + "learning_rate": 0.0005684085856048183, + "loss": 2.6837, + "step": 15573 + }, + { + "epoch": 0.46182130889897105, + "grad_norm": 0.1199922189116478, + "learning_rate": 0.0005683619767935457, + "loss": 2.7068, + "step": 15574 + }, + { + "epoch": 0.4618509622512825, + "grad_norm": 0.13674119114875793, + "learning_rate": 0.0005683153673769161, + "loss": 2.6785, + "step": 15575 + }, + { + "epoch": 0.461880615603594, + "grad_norm": 0.16052424907684326, + "learning_rate": 0.0005682687573553422, + "loss": 2.6625, + "step": 15576 + }, + { + "epoch": 0.4619102689559055, + "grad_norm": 0.15249598026275635, + "learning_rate": 0.0005682221467292368, + "loss": 2.6865, + "step": 15577 + }, + { + "epoch": 0.46193992230821695, + "grad_norm": 0.14428207278251648, + "learning_rate": 0.0005681755354990125, + "loss": 2.6408, + "step": 15578 + }, + { + "epoch": 0.46196957566052843, + "grad_norm": 0.12651611864566803, + "learning_rate": 0.0005681289236650821, + "loss": 2.6578, + "step": 15579 + }, + { + "epoch": 0.4619992290128399, + "grad_norm": 0.12066060304641724, + "learning_rate": 0.0005680823112278586, + "loss": 2.6439, + "step": 15580 + }, + { + "epoch": 0.4620288823651514, + "grad_norm": 0.12772004306316376, + "learning_rate": 0.0005680356981877544, + "loss": 2.6239, + "step": 15581 + }, + { + "epoch": 0.46205853571746286, + "grad_norm": 0.11747609823942184, + "learning_rate": 0.0005679890845451824, + "loss": 2.6413, + "step": 15582 + }, + { + "epoch": 0.46208818906977434, + "grad_norm": 0.11684516817331314, + "learning_rate": 0.0005679424703005553, + "loss": 2.6561, + "step": 15583 + }, + { + "epoch": 0.4621178424220858, + "grad_norm": 0.10951590538024902, + "learning_rate": 0.0005678958554542861, + "loss": 2.6861, + "step": 15584 + }, + { + "epoch": 0.4621474957743973, + "grad_norm": 0.11791567504405975, + "learning_rate": 0.0005678492400067875, + "loss": 2.6549, + "step": 15585 + }, + { + "epoch": 0.46217714912670876, + "grad_norm": 0.10382585972547531, + "learning_rate": 0.0005678026239584722, + "loss": 2.6408, + "step": 15586 + }, + { + "epoch": 0.46220680247902024, + "grad_norm": 0.11242060363292694, + "learning_rate": 0.0005677560073097528, + "loss": 2.6666, + "step": 15587 + }, + { + "epoch": 0.4622364558313317, + "grad_norm": 0.11553655564785004, + "learning_rate": 0.0005677093900610426, + "loss": 2.6815, + "step": 15588 + }, + { + "epoch": 0.4622661091836432, + "grad_norm": 0.10525952279567719, + "learning_rate": 0.000567662772212754, + "loss": 2.6468, + "step": 15589 + }, + { + "epoch": 0.46229576253595467, + "grad_norm": 0.09794380515813828, + "learning_rate": 0.0005676161537653, + "loss": 2.6727, + "step": 15590 + }, + { + "epoch": 0.46232541588826614, + "grad_norm": 0.11588895320892334, + "learning_rate": 0.0005675695347190933, + "loss": 2.6837, + "step": 15591 + }, + { + "epoch": 0.4623550692405776, + "grad_norm": 0.14767175912857056, + "learning_rate": 0.0005675229150745469, + "loss": 2.6589, + "step": 15592 + }, + { + "epoch": 0.46238472259288915, + "grad_norm": 0.17860953509807587, + "learning_rate": 0.0005674762948320733, + "loss": 2.6878, + "step": 15593 + }, + { + "epoch": 0.46241437594520063, + "grad_norm": 0.18481317162513733, + "learning_rate": 0.0005674296739920856, + "loss": 2.6845, + "step": 15594 + }, + { + "epoch": 0.4624440292975121, + "grad_norm": 0.1321263611316681, + "learning_rate": 0.0005673830525549967, + "loss": 2.6622, + "step": 15595 + }, + { + "epoch": 0.4624736826498236, + "grad_norm": 0.11343945562839508, + "learning_rate": 0.000567336430521219, + "loss": 2.6565, + "step": 15596 + }, + { + "epoch": 0.46250333600213506, + "grad_norm": 0.1429806351661682, + "learning_rate": 0.0005672898078911659, + "loss": 2.6491, + "step": 15597 + }, + { + "epoch": 0.46253298935444653, + "grad_norm": 0.13390867412090302, + "learning_rate": 0.0005672431846652499, + "loss": 2.6465, + "step": 15598 + }, + { + "epoch": 0.462562642706758, + "grad_norm": 0.11592025309801102, + "learning_rate": 0.0005671965608438841, + "loss": 2.655, + "step": 15599 + }, + { + "epoch": 0.4625922960590695, + "grad_norm": 0.10865845531225204, + "learning_rate": 0.000567149936427481, + "loss": 2.6698, + "step": 15600 + }, + { + "epoch": 0.46262194941138096, + "grad_norm": 0.12297538667917252, + "learning_rate": 0.0005671033114164538, + "loss": 2.6289, + "step": 15601 + }, + { + "epoch": 0.46265160276369244, + "grad_norm": 0.112326480448246, + "learning_rate": 0.0005670566858112152, + "loss": 2.6645, + "step": 15602 + }, + { + "epoch": 0.4626812561160039, + "grad_norm": 0.10084402561187744, + "learning_rate": 0.0005670100596121783, + "loss": 2.6868, + "step": 15603 + }, + { + "epoch": 0.4627109094683154, + "grad_norm": 0.1227693110704422, + "learning_rate": 0.0005669634328197557, + "loss": 2.6508, + "step": 15604 + }, + { + "epoch": 0.46274056282062687, + "grad_norm": 0.11408164352178574, + "learning_rate": 0.0005669168054343604, + "loss": 2.6688, + "step": 15605 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 0.10479623079299927, + "learning_rate": 0.0005668701774564054, + "loss": 2.6821, + "step": 15606 + }, + { + "epoch": 0.4627998695252498, + "grad_norm": 0.10677848756313324, + "learning_rate": 0.0005668235488863034, + "loss": 2.6487, + "step": 15607 + }, + { + "epoch": 0.4628295228775613, + "grad_norm": 0.10914601385593414, + "learning_rate": 0.0005667769197244674, + "loss": 2.6736, + "step": 15608 + }, + { + "epoch": 0.46285917622987277, + "grad_norm": 0.11615871638059616, + "learning_rate": 0.0005667302899713104, + "loss": 2.6713, + "step": 15609 + }, + { + "epoch": 0.46288882958218425, + "grad_norm": 0.12143249064683914, + "learning_rate": 0.0005666836596272451, + "loss": 2.6633, + "step": 15610 + }, + { + "epoch": 0.4629184829344957, + "grad_norm": 0.12501661479473114, + "learning_rate": 0.0005666370286926847, + "loss": 2.6188, + "step": 15611 + }, + { + "epoch": 0.4629481362868072, + "grad_norm": 0.12687677145004272, + "learning_rate": 0.0005665903971680419, + "loss": 2.6573, + "step": 15612 + }, + { + "epoch": 0.4629777896391187, + "grad_norm": 0.10670735687017441, + "learning_rate": 0.0005665437650537297, + "loss": 2.6761, + "step": 15613 + }, + { + "epoch": 0.4630074429914302, + "grad_norm": 0.12498285621404648, + "learning_rate": 0.000566497132350161, + "loss": 2.639, + "step": 15614 + }, + { + "epoch": 0.4630370963437417, + "grad_norm": 0.1149948462843895, + "learning_rate": 0.0005664504990577487, + "loss": 2.6682, + "step": 15615 + }, + { + "epoch": 0.46306674969605316, + "grad_norm": 0.12723080813884735, + "learning_rate": 0.000566403865176906, + "loss": 2.6938, + "step": 15616 + }, + { + "epoch": 0.46309640304836464, + "grad_norm": 0.11316484957933426, + "learning_rate": 0.0005663572307080455, + "loss": 2.6625, + "step": 15617 + }, + { + "epoch": 0.4631260564006761, + "grad_norm": 0.1081644669175148, + "learning_rate": 0.0005663105956515806, + "loss": 2.6741, + "step": 15618 + }, + { + "epoch": 0.4631557097529876, + "grad_norm": 0.12330999970436096, + "learning_rate": 0.0005662639600079238, + "loss": 2.6203, + "step": 15619 + }, + { + "epoch": 0.46318536310529906, + "grad_norm": 0.1243562325835228, + "learning_rate": 0.0005662173237774883, + "loss": 2.6581, + "step": 15620 + }, + { + "epoch": 0.46321501645761054, + "grad_norm": 0.11809532344341278, + "learning_rate": 0.000566170686960687, + "loss": 2.696, + "step": 15621 + }, + { + "epoch": 0.463244669809922, + "grad_norm": 0.11684241145849228, + "learning_rate": 0.0005661240495579329, + "loss": 2.668, + "step": 15622 + }, + { + "epoch": 0.4632743231622335, + "grad_norm": 0.10492979735136032, + "learning_rate": 0.0005660774115696392, + "loss": 2.6566, + "step": 15623 + }, + { + "epoch": 0.46330397651454497, + "grad_norm": 0.1205330342054367, + "learning_rate": 0.0005660307729962184, + "loss": 2.6644, + "step": 15624 + }, + { + "epoch": 0.46333362986685644, + "grad_norm": 0.13288795948028564, + "learning_rate": 0.0005659841338380839, + "loss": 2.6624, + "step": 15625 + }, + { + "epoch": 0.4633632832191679, + "grad_norm": 0.12420503050088882, + "learning_rate": 0.0005659374940956485, + "loss": 2.6338, + "step": 15626 + }, + { + "epoch": 0.4633929365714794, + "grad_norm": 0.12654149532318115, + "learning_rate": 0.0005658908537693253, + "loss": 2.6789, + "step": 15627 + }, + { + "epoch": 0.4634225899237909, + "grad_norm": 0.1282787322998047, + "learning_rate": 0.0005658442128595273, + "loss": 2.6282, + "step": 15628 + }, + { + "epoch": 0.46345224327610235, + "grad_norm": 0.1364089697599411, + "learning_rate": 0.0005657975713666676, + "loss": 2.6926, + "step": 15629 + }, + { + "epoch": 0.4634818966284138, + "grad_norm": 0.16091054677963257, + "learning_rate": 0.0005657509292911591, + "loss": 2.6748, + "step": 15630 + }, + { + "epoch": 0.4635115499807253, + "grad_norm": 0.13642871379852295, + "learning_rate": 0.0005657042866334147, + "loss": 2.6709, + "step": 15631 + }, + { + "epoch": 0.4635412033330368, + "grad_norm": 0.12119483947753906, + "learning_rate": 0.0005656576433938474, + "loss": 2.6519, + "step": 15632 + }, + { + "epoch": 0.46357085668534825, + "grad_norm": 0.14206524193286896, + "learning_rate": 0.0005656109995728708, + "loss": 2.6563, + "step": 15633 + }, + { + "epoch": 0.46360051003765973, + "grad_norm": 0.14510169625282288, + "learning_rate": 0.0005655643551708972, + "loss": 2.6633, + "step": 15634 + }, + { + "epoch": 0.46363016338997126, + "grad_norm": 0.13061262667179108, + "learning_rate": 0.0005655177101883402, + "loss": 2.6594, + "step": 15635 + }, + { + "epoch": 0.46365981674228274, + "grad_norm": 0.11563011258840561, + "learning_rate": 0.0005654710646256125, + "loss": 2.6527, + "step": 15636 + }, + { + "epoch": 0.4636894700945942, + "grad_norm": 0.14754118025302887, + "learning_rate": 0.0005654244184831273, + "loss": 2.6108, + "step": 15637 + }, + { + "epoch": 0.4637191234469057, + "grad_norm": 0.13336625695228577, + "learning_rate": 0.0005653777717612977, + "loss": 2.6575, + "step": 15638 + }, + { + "epoch": 0.46374877679921717, + "grad_norm": 0.1302773356437683, + "learning_rate": 0.0005653311244605367, + "loss": 2.6392, + "step": 15639 + }, + { + "epoch": 0.46377843015152864, + "grad_norm": 0.14294898509979248, + "learning_rate": 0.0005652844765812574, + "loss": 2.635, + "step": 15640 + }, + { + "epoch": 0.4638080835038401, + "grad_norm": 0.12554170191287994, + "learning_rate": 0.0005652378281238728, + "loss": 2.698, + "step": 15641 + }, + { + "epoch": 0.4638377368561516, + "grad_norm": 0.1322275698184967, + "learning_rate": 0.000565191179088796, + "loss": 2.6068, + "step": 15642 + }, + { + "epoch": 0.46386739020846307, + "grad_norm": 0.13402144610881805, + "learning_rate": 0.0005651445294764402, + "loss": 2.6537, + "step": 15643 + }, + { + "epoch": 0.46389704356077455, + "grad_norm": 0.11938533186912537, + "learning_rate": 0.0005650978792872183, + "loss": 2.6396, + "step": 15644 + }, + { + "epoch": 0.463926696913086, + "grad_norm": 0.14507344365119934, + "learning_rate": 0.0005650512285215437, + "loss": 2.674, + "step": 15645 + }, + { + "epoch": 0.4639563502653975, + "grad_norm": 0.14375486969947815, + "learning_rate": 0.0005650045771798291, + "loss": 2.623, + "step": 15646 + }, + { + "epoch": 0.463986003617709, + "grad_norm": 0.14920805394649506, + "learning_rate": 0.0005649579252624879, + "loss": 2.6692, + "step": 15647 + }, + { + "epoch": 0.46401565697002045, + "grad_norm": 0.15115343034267426, + "learning_rate": 0.0005649112727699331, + "loss": 2.6816, + "step": 15648 + }, + { + "epoch": 0.4640453103223319, + "grad_norm": 0.11734415590763092, + "learning_rate": 0.000564864619702578, + "loss": 2.6387, + "step": 15649 + }, + { + "epoch": 0.4640749636746434, + "grad_norm": 0.10901408642530441, + "learning_rate": 0.0005648179660608355, + "loss": 2.6714, + "step": 15650 + }, + { + "epoch": 0.4641046170269549, + "grad_norm": 0.11688099056482315, + "learning_rate": 0.0005647713118451187, + "loss": 2.6852, + "step": 15651 + }, + { + "epoch": 0.46413427037926636, + "grad_norm": 0.11466581374406815, + "learning_rate": 0.0005647246570558407, + "loss": 2.6585, + "step": 15652 + }, + { + "epoch": 0.46416392373157783, + "grad_norm": 0.09873302280902863, + "learning_rate": 0.0005646780016934149, + "loss": 2.6773, + "step": 15653 + }, + { + "epoch": 0.4641935770838893, + "grad_norm": 0.11473038047552109, + "learning_rate": 0.0005646313457582543, + "loss": 2.6425, + "step": 15654 + }, + { + "epoch": 0.46422323043620084, + "grad_norm": 0.11662419885396957, + "learning_rate": 0.0005645846892507719, + "loss": 2.6547, + "step": 15655 + }, + { + "epoch": 0.4642528837885123, + "grad_norm": 0.11619782447814941, + "learning_rate": 0.0005645380321713812, + "loss": 2.7069, + "step": 15656 + }, + { + "epoch": 0.4642825371408238, + "grad_norm": 0.10192587226629257, + "learning_rate": 0.000564491374520495, + "loss": 2.6518, + "step": 15657 + }, + { + "epoch": 0.46431219049313527, + "grad_norm": 0.12869973480701447, + "learning_rate": 0.0005644447162985267, + "loss": 2.6497, + "step": 15658 + }, + { + "epoch": 0.46434184384544674, + "grad_norm": 0.1316012740135193, + "learning_rate": 0.0005643980575058893, + "loss": 2.6212, + "step": 15659 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 0.14179453253746033, + "learning_rate": 0.000564351398142996, + "loss": 2.6379, + "step": 15660 + }, + { + "epoch": 0.4644011505500697, + "grad_norm": 0.15124134719371796, + "learning_rate": 0.0005643047382102601, + "loss": 2.6601, + "step": 15661 + }, + { + "epoch": 0.4644308039023812, + "grad_norm": 0.15087389945983887, + "learning_rate": 0.0005642580777080948, + "loss": 2.6326, + "step": 15662 + }, + { + "epoch": 0.46446045725469265, + "grad_norm": 0.11692237108945847, + "learning_rate": 0.0005642114166369131, + "loss": 2.6204, + "step": 15663 + }, + { + "epoch": 0.4644901106070041, + "grad_norm": 0.12536314129829407, + "learning_rate": 0.0005641647549971283, + "loss": 2.6956, + "step": 15664 + }, + { + "epoch": 0.4645197639593156, + "grad_norm": 0.14451001584529877, + "learning_rate": 0.0005641180927891535, + "loss": 2.6481, + "step": 15665 + }, + { + "epoch": 0.4645494173116271, + "grad_norm": 0.1470697671175003, + "learning_rate": 0.0005640714300134021, + "loss": 2.6292, + "step": 15666 + }, + { + "epoch": 0.46457907066393855, + "grad_norm": 0.12573327124118805, + "learning_rate": 0.0005640247666702871, + "loss": 2.647, + "step": 15667 + }, + { + "epoch": 0.46460872401625003, + "grad_norm": 0.11733317375183105, + "learning_rate": 0.0005639781027602219, + "loss": 2.6587, + "step": 15668 + }, + { + "epoch": 0.4646383773685615, + "grad_norm": 0.12332029640674591, + "learning_rate": 0.0005639314382836196, + "loss": 2.6348, + "step": 15669 + }, + { + "epoch": 0.464668030720873, + "grad_norm": 0.11758534610271454, + "learning_rate": 0.0005638847732408934, + "loss": 2.6671, + "step": 15670 + }, + { + "epoch": 0.46469768407318446, + "grad_norm": 0.10798261314630508, + "learning_rate": 0.0005638381076324564, + "loss": 2.6868, + "step": 15671 + }, + { + "epoch": 0.46472733742549593, + "grad_norm": 0.12679921090602875, + "learning_rate": 0.0005637914414587222, + "loss": 2.6604, + "step": 15672 + }, + { + "epoch": 0.4647569907778074, + "grad_norm": 0.12944239377975464, + "learning_rate": 0.0005637447747201039, + "loss": 2.6446, + "step": 15673 + }, + { + "epoch": 0.4647866441301189, + "grad_norm": 0.11866395175457001, + "learning_rate": 0.0005636981074170146, + "loss": 2.6726, + "step": 15674 + }, + { + "epoch": 0.46481629748243036, + "grad_norm": 0.13548611104488373, + "learning_rate": 0.0005636514395498675, + "loss": 2.6951, + "step": 15675 + }, + { + "epoch": 0.4648459508347419, + "grad_norm": 0.14203238487243652, + "learning_rate": 0.000563604771119076, + "loss": 2.6422, + "step": 15676 + }, + { + "epoch": 0.46487560418705337, + "grad_norm": 0.13043999671936035, + "learning_rate": 0.0005635581021250536, + "loss": 2.6941, + "step": 15677 + }, + { + "epoch": 0.46490525753936485, + "grad_norm": 0.14522221684455872, + "learning_rate": 0.0005635114325682131, + "loss": 2.6671, + "step": 15678 + }, + { + "epoch": 0.4649349108916763, + "grad_norm": 0.13786618411540985, + "learning_rate": 0.0005634647624489679, + "loss": 2.6581, + "step": 15679 + }, + { + "epoch": 0.4649645642439878, + "grad_norm": 0.1493426263332367, + "learning_rate": 0.0005634180917677314, + "loss": 2.6901, + "step": 15680 + }, + { + "epoch": 0.4649942175962993, + "grad_norm": 0.15177969634532928, + "learning_rate": 0.0005633714205249168, + "loss": 2.6524, + "step": 15681 + }, + { + "epoch": 0.46502387094861075, + "grad_norm": 0.12562055885791779, + "learning_rate": 0.0005633247487209374, + "loss": 2.6905, + "step": 15682 + }, + { + "epoch": 0.4650535243009222, + "grad_norm": 0.13633427023887634, + "learning_rate": 0.0005632780763562067, + "loss": 2.6271, + "step": 15683 + }, + { + "epoch": 0.4650831776532337, + "grad_norm": 0.13278649747371674, + "learning_rate": 0.0005632314034311373, + "loss": 2.6614, + "step": 15684 + }, + { + "epoch": 0.4651128310055452, + "grad_norm": 0.1609233319759369, + "learning_rate": 0.0005631847299461432, + "loss": 2.6698, + "step": 15685 + }, + { + "epoch": 0.46514248435785666, + "grad_norm": 0.16692328453063965, + "learning_rate": 0.0005631380559016376, + "loss": 2.656, + "step": 15686 + }, + { + "epoch": 0.46517213771016813, + "grad_norm": 0.1267002820968628, + "learning_rate": 0.0005630913812980336, + "loss": 2.6736, + "step": 15687 + }, + { + "epoch": 0.4652017910624796, + "grad_norm": 0.17707981169223785, + "learning_rate": 0.0005630447061357447, + "loss": 2.6645, + "step": 15688 + }, + { + "epoch": 0.4652314444147911, + "grad_norm": 0.16821159422397614, + "learning_rate": 0.0005629980304151839, + "loss": 2.6864, + "step": 15689 + }, + { + "epoch": 0.46526109776710256, + "grad_norm": 0.14604398608207703, + "learning_rate": 0.0005629513541367648, + "loss": 2.632, + "step": 15690 + }, + { + "epoch": 0.46529075111941404, + "grad_norm": 0.1394079625606537, + "learning_rate": 0.0005629046773009005, + "loss": 2.669, + "step": 15691 + }, + { + "epoch": 0.4653204044717255, + "grad_norm": 0.1306985318660736, + "learning_rate": 0.0005628579999080046, + "loss": 2.6938, + "step": 15692 + }, + { + "epoch": 0.465350057824037, + "grad_norm": 0.13120608031749725, + "learning_rate": 0.0005628113219584906, + "loss": 2.693, + "step": 15693 + }, + { + "epoch": 0.46537971117634847, + "grad_norm": 0.14761300384998322, + "learning_rate": 0.0005627646434527713, + "loss": 2.6759, + "step": 15694 + }, + { + "epoch": 0.46540936452865994, + "grad_norm": 0.13628600537776947, + "learning_rate": 0.0005627179643912603, + "loss": 2.6561, + "step": 15695 + }, + { + "epoch": 0.4654390178809714, + "grad_norm": 0.11539433151483536, + "learning_rate": 0.000562671284774371, + "loss": 2.639, + "step": 15696 + }, + { + "epoch": 0.46546867123328295, + "grad_norm": 0.11561641097068787, + "learning_rate": 0.0005626246046025167, + "loss": 2.664, + "step": 15697 + }, + { + "epoch": 0.4654983245855944, + "grad_norm": 0.1025240421295166, + "learning_rate": 0.0005625779238761107, + "loss": 2.6488, + "step": 15698 + }, + { + "epoch": 0.4655279779379059, + "grad_norm": 0.12440315634012222, + "learning_rate": 0.0005625312425955667, + "loss": 2.671, + "step": 15699 + }, + { + "epoch": 0.4655576312902174, + "grad_norm": 0.12246021628379822, + "learning_rate": 0.0005624845607612976, + "loss": 2.625, + "step": 15700 + }, + { + "epoch": 0.46558728464252885, + "grad_norm": 0.11903190612792969, + "learning_rate": 0.000562437878373717, + "loss": 2.6626, + "step": 15701 + }, + { + "epoch": 0.46561693799484033, + "grad_norm": 0.09895980358123779, + "learning_rate": 0.0005623911954332384, + "loss": 2.6387, + "step": 15702 + }, + { + "epoch": 0.4656465913471518, + "grad_norm": 0.12583978474140167, + "learning_rate": 0.0005623445119402748, + "loss": 2.6372, + "step": 15703 + }, + { + "epoch": 0.4656762446994633, + "grad_norm": 0.12347112596035004, + "learning_rate": 0.0005622978278952401, + "loss": 2.6505, + "step": 15704 + }, + { + "epoch": 0.46570589805177476, + "grad_norm": 0.12690088152885437, + "learning_rate": 0.0005622511432985473, + "loss": 2.6494, + "step": 15705 + }, + { + "epoch": 0.46573555140408623, + "grad_norm": 0.1054423600435257, + "learning_rate": 0.0005622044581506101, + "loss": 2.643, + "step": 15706 + }, + { + "epoch": 0.4657652047563977, + "grad_norm": 0.10808449983596802, + "learning_rate": 0.0005621577724518415, + "loss": 2.6575, + "step": 15707 + }, + { + "epoch": 0.4657948581087092, + "grad_norm": 0.11035964637994766, + "learning_rate": 0.0005621110862026553, + "loss": 2.6226, + "step": 15708 + }, + { + "epoch": 0.46582451146102066, + "grad_norm": 0.10538388043642044, + "learning_rate": 0.0005620643994034648, + "loss": 2.6406, + "step": 15709 + }, + { + "epoch": 0.46585416481333214, + "grad_norm": 0.12511004507541656, + "learning_rate": 0.0005620177120546833, + "loss": 2.6531, + "step": 15710 + }, + { + "epoch": 0.4658838181656436, + "grad_norm": 0.11201151460409164, + "learning_rate": 0.0005619710241567244, + "loss": 2.6984, + "step": 15711 + }, + { + "epoch": 0.4659134715179551, + "grad_norm": 0.1117027997970581, + "learning_rate": 0.0005619243357100014, + "loss": 2.6449, + "step": 15712 + }, + { + "epoch": 0.46594312487026657, + "grad_norm": 0.125638946890831, + "learning_rate": 0.0005618776467149278, + "loss": 2.678, + "step": 15713 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 0.11489114910364151, + "learning_rate": 0.0005618309571719171, + "loss": 2.6659, + "step": 15714 + }, + { + "epoch": 0.4660024315748895, + "grad_norm": 0.10887646675109863, + "learning_rate": 0.0005617842670813825, + "loss": 2.6592, + "step": 15715 + }, + { + "epoch": 0.466032084927201, + "grad_norm": 0.09989004582166672, + "learning_rate": 0.0005617375764437376, + "loss": 2.6809, + "step": 15716 + }, + { + "epoch": 0.46606173827951247, + "grad_norm": 0.09597529470920563, + "learning_rate": 0.000561690885259396, + "loss": 2.6552, + "step": 15717 + }, + { + "epoch": 0.466091391631824, + "grad_norm": 0.10993985086679459, + "learning_rate": 0.0005616441935287708, + "loss": 2.6421, + "step": 15718 + }, + { + "epoch": 0.4661210449841355, + "grad_norm": 0.11796755343675613, + "learning_rate": 0.000561597501252276, + "loss": 2.6494, + "step": 15719 + }, + { + "epoch": 0.46615069833644696, + "grad_norm": 0.11854343861341476, + "learning_rate": 0.0005615508084303245, + "loss": 2.6769, + "step": 15720 + }, + { + "epoch": 0.46618035168875843, + "grad_norm": 0.10785375535488129, + "learning_rate": 0.0005615041150633302, + "loss": 2.6452, + "step": 15721 + }, + { + "epoch": 0.4662100050410699, + "grad_norm": 0.09740132093429565, + "learning_rate": 0.0005614574211517064, + "loss": 2.6494, + "step": 15722 + }, + { + "epoch": 0.4662396583933814, + "grad_norm": 0.11412598192691803, + "learning_rate": 0.0005614107266958664, + "loss": 2.6461, + "step": 15723 + }, + { + "epoch": 0.46626931174569286, + "grad_norm": 0.14706365764141083, + "learning_rate": 0.000561364031696224, + "loss": 2.6828, + "step": 15724 + }, + { + "epoch": 0.46629896509800434, + "grad_norm": 0.17101991176605225, + "learning_rate": 0.0005613173361531925, + "loss": 2.6772, + "step": 15725 + }, + { + "epoch": 0.4663286184503158, + "grad_norm": 0.14819757640361786, + "learning_rate": 0.0005612706400671857, + "loss": 2.6354, + "step": 15726 + }, + { + "epoch": 0.4663582718026273, + "grad_norm": 0.1242995411157608, + "learning_rate": 0.0005612239434386166, + "loss": 2.6376, + "step": 15727 + }, + { + "epoch": 0.46638792515493877, + "grad_norm": 0.11984116584062576, + "learning_rate": 0.000561177246267899, + "loss": 2.6423, + "step": 15728 + }, + { + "epoch": 0.46641757850725024, + "grad_norm": 0.11760173738002777, + "learning_rate": 0.0005611305485554465, + "loss": 2.6354, + "step": 15729 + }, + { + "epoch": 0.4664472318595617, + "grad_norm": 0.11689332872629166, + "learning_rate": 0.0005610838503016723, + "loss": 2.666, + "step": 15730 + }, + { + "epoch": 0.4664768852118732, + "grad_norm": 0.1283612698316574, + "learning_rate": 0.0005610371515069903, + "loss": 2.6484, + "step": 15731 + }, + { + "epoch": 0.46650653856418467, + "grad_norm": 0.1128663420677185, + "learning_rate": 0.0005609904521718139, + "loss": 2.652, + "step": 15732 + }, + { + "epoch": 0.46653619191649615, + "grad_norm": 0.11934233456850052, + "learning_rate": 0.0005609437522965565, + "loss": 2.6793, + "step": 15733 + }, + { + "epoch": 0.4665658452688076, + "grad_norm": 0.11191168427467346, + "learning_rate": 0.0005608970518816317, + "loss": 2.6761, + "step": 15734 + }, + { + "epoch": 0.4665954986211191, + "grad_norm": 0.11977911740541458, + "learning_rate": 0.000560850350927453, + "loss": 2.6388, + "step": 15735 + }, + { + "epoch": 0.4666251519734306, + "grad_norm": 0.11834642291069031, + "learning_rate": 0.000560803649434434, + "loss": 2.6789, + "step": 15736 + }, + { + "epoch": 0.46665480532574205, + "grad_norm": 0.11209683865308762, + "learning_rate": 0.0005607569474029883, + "loss": 2.6641, + "step": 15737 + }, + { + "epoch": 0.4666844586780535, + "grad_norm": 0.12107659876346588, + "learning_rate": 0.0005607102448335294, + "loss": 2.6669, + "step": 15738 + }, + { + "epoch": 0.46671411203036506, + "grad_norm": 0.12608616054058075, + "learning_rate": 0.000560663541726471, + "loss": 2.6702, + "step": 15739 + }, + { + "epoch": 0.46674376538267653, + "grad_norm": 0.13026927411556244, + "learning_rate": 0.0005606168380822263, + "loss": 2.6628, + "step": 15740 + }, + { + "epoch": 0.466773418734988, + "grad_norm": 0.1358058899641037, + "learning_rate": 0.0005605701339012092, + "loss": 2.656, + "step": 15741 + }, + { + "epoch": 0.4668030720872995, + "grad_norm": 0.12678968906402588, + "learning_rate": 0.0005605234291838331, + "loss": 2.6569, + "step": 15742 + }, + { + "epoch": 0.46683272543961096, + "grad_norm": 0.11362873017787933, + "learning_rate": 0.0005604767239305116, + "loss": 2.6144, + "step": 15743 + }, + { + "epoch": 0.46686237879192244, + "grad_norm": 0.1104416474699974, + "learning_rate": 0.0005604300181416585, + "loss": 2.6983, + "step": 15744 + }, + { + "epoch": 0.4668920321442339, + "grad_norm": 0.11185162514448166, + "learning_rate": 0.0005603833118176871, + "loss": 2.7028, + "step": 15745 + }, + { + "epoch": 0.4669216854965454, + "grad_norm": 0.1314685344696045, + "learning_rate": 0.0005603366049590112, + "loss": 2.6428, + "step": 15746 + }, + { + "epoch": 0.46695133884885687, + "grad_norm": 0.1467479169368744, + "learning_rate": 0.0005602898975660442, + "loss": 2.6569, + "step": 15747 + }, + { + "epoch": 0.46698099220116834, + "grad_norm": 0.13132333755493164, + "learning_rate": 0.0005602431896391998, + "loss": 2.6476, + "step": 15748 + }, + { + "epoch": 0.4670106455534798, + "grad_norm": 0.121537946164608, + "learning_rate": 0.0005601964811788918, + "loss": 2.678, + "step": 15749 + }, + { + "epoch": 0.4670402989057913, + "grad_norm": 0.15567706525325775, + "learning_rate": 0.0005601497721855334, + "loss": 2.6532, + "step": 15750 + }, + { + "epoch": 0.46706995225810277, + "grad_norm": 0.15679962933063507, + "learning_rate": 0.0005601030626595386, + "loss": 2.6134, + "step": 15751 + }, + { + "epoch": 0.46709960561041425, + "grad_norm": 0.1403515338897705, + "learning_rate": 0.0005600563526013207, + "loss": 2.6653, + "step": 15752 + }, + { + "epoch": 0.4671292589627257, + "grad_norm": 0.12183454632759094, + "learning_rate": 0.0005600096420112937, + "loss": 2.674, + "step": 15753 + }, + { + "epoch": 0.4671589123150372, + "grad_norm": 0.1077132374048233, + "learning_rate": 0.0005599629308898709, + "loss": 2.6746, + "step": 15754 + }, + { + "epoch": 0.4671885656673487, + "grad_norm": 0.13441485166549683, + "learning_rate": 0.000559916219237466, + "loss": 2.6792, + "step": 15755 + }, + { + "epoch": 0.46721821901966015, + "grad_norm": 0.12668940424919128, + "learning_rate": 0.0005598695070544929, + "loss": 2.6434, + "step": 15756 + }, + { + "epoch": 0.46724787237197163, + "grad_norm": 0.12372421473264694, + "learning_rate": 0.0005598227943413648, + "loss": 2.6948, + "step": 15757 + }, + { + "epoch": 0.4672775257242831, + "grad_norm": 0.11835546046495438, + "learning_rate": 0.0005597760810984957, + "loss": 2.6519, + "step": 15758 + }, + { + "epoch": 0.46730717907659464, + "grad_norm": 0.1184382364153862, + "learning_rate": 0.0005597293673262992, + "loss": 2.6483, + "step": 15759 + }, + { + "epoch": 0.4673368324289061, + "grad_norm": 0.11591972410678864, + "learning_rate": 0.0005596826530251889, + "loss": 2.6314, + "step": 15760 + }, + { + "epoch": 0.4673664857812176, + "grad_norm": 0.14421477913856506, + "learning_rate": 0.0005596359381955784, + "loss": 2.677, + "step": 15761 + }, + { + "epoch": 0.46739613913352906, + "grad_norm": 0.13406135141849518, + "learning_rate": 0.0005595892228378813, + "loss": 2.6787, + "step": 15762 + }, + { + "epoch": 0.46742579248584054, + "grad_norm": 0.12637297809123993, + "learning_rate": 0.0005595425069525115, + "loss": 2.6766, + "step": 15763 + }, + { + "epoch": 0.467455445838152, + "grad_norm": 0.11564651131629944, + "learning_rate": 0.0005594957905398826, + "loss": 2.6916, + "step": 15764 + }, + { + "epoch": 0.4674850991904635, + "grad_norm": 0.0999600738286972, + "learning_rate": 0.0005594490736004083, + "loss": 2.6494, + "step": 15765 + }, + { + "epoch": 0.46751475254277497, + "grad_norm": 0.10143683850765228, + "learning_rate": 0.0005594023561345023, + "loss": 2.6768, + "step": 15766 + }, + { + "epoch": 0.46754440589508645, + "grad_norm": 0.1051763966679573, + "learning_rate": 0.0005593556381425782, + "loss": 2.6865, + "step": 15767 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 0.1110944077372551, + "learning_rate": 0.0005593089196250495, + "loss": 2.6774, + "step": 15768 + }, + { + "epoch": 0.4676037125997094, + "grad_norm": 0.1121826171875, + "learning_rate": 0.0005592622005823303, + "loss": 2.6791, + "step": 15769 + }, + { + "epoch": 0.4676333659520209, + "grad_norm": 0.10682810097932816, + "learning_rate": 0.0005592154810148344, + "loss": 2.6529, + "step": 15770 + }, + { + "epoch": 0.46766301930433235, + "grad_norm": 0.11066453903913498, + "learning_rate": 0.000559168760922975, + "loss": 2.6687, + "step": 15771 + }, + { + "epoch": 0.4676926726566438, + "grad_norm": 0.11010416597127914, + "learning_rate": 0.000559122040307166, + "loss": 2.6445, + "step": 15772 + }, + { + "epoch": 0.4677223260089553, + "grad_norm": 0.10203569382429123, + "learning_rate": 0.0005590753191678213, + "loss": 2.6702, + "step": 15773 + }, + { + "epoch": 0.4677519793612668, + "grad_norm": 0.11490017175674438, + "learning_rate": 0.0005590285975053545, + "loss": 2.6325, + "step": 15774 + }, + { + "epoch": 0.46778163271357825, + "grad_norm": 0.13310962915420532, + "learning_rate": 0.0005589818753201792, + "loss": 2.6947, + "step": 15775 + }, + { + "epoch": 0.46781128606588973, + "grad_norm": 0.151411235332489, + "learning_rate": 0.0005589351526127096, + "loss": 2.6744, + "step": 15776 + }, + { + "epoch": 0.4678409394182012, + "grad_norm": 0.15478475391864777, + "learning_rate": 0.0005588884293833588, + "loss": 2.6688, + "step": 15777 + }, + { + "epoch": 0.4678705927705127, + "grad_norm": 0.1495099663734436, + "learning_rate": 0.000558841705632541, + "loss": 2.6447, + "step": 15778 + }, + { + "epoch": 0.46790024612282416, + "grad_norm": 0.13070090115070343, + "learning_rate": 0.0005587949813606696, + "loss": 2.676, + "step": 15779 + }, + { + "epoch": 0.4679298994751357, + "grad_norm": 0.15069717168807983, + "learning_rate": 0.0005587482565681587, + "loss": 2.6596, + "step": 15780 + }, + { + "epoch": 0.46795955282744717, + "grad_norm": 0.15513913333415985, + "learning_rate": 0.000558701531255422, + "loss": 2.6646, + "step": 15781 + }, + { + "epoch": 0.46798920617975864, + "grad_norm": 0.11568116396665573, + "learning_rate": 0.000558654805422873, + "loss": 2.672, + "step": 15782 + }, + { + "epoch": 0.4680188595320701, + "grad_norm": 0.1323934644460678, + "learning_rate": 0.0005586080790709257, + "loss": 2.6588, + "step": 15783 + }, + { + "epoch": 0.4680485128843816, + "grad_norm": 0.14128583669662476, + "learning_rate": 0.0005585613521999938, + "loss": 2.6819, + "step": 15784 + }, + { + "epoch": 0.46807816623669307, + "grad_norm": 0.1242859736084938, + "learning_rate": 0.0005585146248104911, + "loss": 2.6586, + "step": 15785 + }, + { + "epoch": 0.46810781958900455, + "grad_norm": 0.11887545138597488, + "learning_rate": 0.0005584678969028313, + "loss": 2.6945, + "step": 15786 + }, + { + "epoch": 0.468137472941316, + "grad_norm": 0.11481383442878723, + "learning_rate": 0.0005584211684774283, + "loss": 2.6514, + "step": 15787 + }, + { + "epoch": 0.4681671262936275, + "grad_norm": 0.12193186581134796, + "learning_rate": 0.0005583744395346957, + "loss": 2.682, + "step": 15788 + }, + { + "epoch": 0.468196779645939, + "grad_norm": 0.11873180419206619, + "learning_rate": 0.0005583277100750475, + "loss": 2.6471, + "step": 15789 + }, + { + "epoch": 0.46822643299825045, + "grad_norm": 0.13332486152648926, + "learning_rate": 0.0005582809800988974, + "loss": 2.6675, + "step": 15790 + }, + { + "epoch": 0.46825608635056193, + "grad_norm": 0.1391667276620865, + "learning_rate": 0.0005582342496066592, + "loss": 2.6518, + "step": 15791 + }, + { + "epoch": 0.4682857397028734, + "grad_norm": 0.11375441402196884, + "learning_rate": 0.000558187518598747, + "loss": 2.6728, + "step": 15792 + }, + { + "epoch": 0.4683153930551849, + "grad_norm": 0.12584751844406128, + "learning_rate": 0.000558140787075574, + "loss": 2.6634, + "step": 15793 + }, + { + "epoch": 0.46834504640749636, + "grad_norm": 0.1410481035709381, + "learning_rate": 0.0005580940550375543, + "loss": 2.6702, + "step": 15794 + }, + { + "epoch": 0.46837469975980783, + "grad_norm": 0.14362642168998718, + "learning_rate": 0.0005580473224851019, + "loss": 2.6617, + "step": 15795 + }, + { + "epoch": 0.4684043531121193, + "grad_norm": 0.11913296580314636, + "learning_rate": 0.0005580005894186306, + "loss": 2.6515, + "step": 15796 + }, + { + "epoch": 0.4684340064644308, + "grad_norm": 0.12244006991386414, + "learning_rate": 0.000557953855838554, + "loss": 2.6839, + "step": 15797 + }, + { + "epoch": 0.46846365981674226, + "grad_norm": 0.12319093197584152, + "learning_rate": 0.0005579071217452862, + "loss": 2.6706, + "step": 15798 + }, + { + "epoch": 0.46849331316905374, + "grad_norm": 0.1407845914363861, + "learning_rate": 0.0005578603871392408, + "loss": 2.6282, + "step": 15799 + }, + { + "epoch": 0.4685229665213652, + "grad_norm": 0.13938355445861816, + "learning_rate": 0.0005578136520208315, + "loss": 2.6616, + "step": 15800 + }, + { + "epoch": 0.46855261987367675, + "grad_norm": 0.15021567046642303, + "learning_rate": 0.0005577669163904727, + "loss": 2.6753, + "step": 15801 + }, + { + "epoch": 0.4685822732259882, + "grad_norm": 0.15323956310749054, + "learning_rate": 0.0005577201802485779, + "loss": 2.6716, + "step": 15802 + }, + { + "epoch": 0.4686119265782997, + "grad_norm": 0.14714272320270538, + "learning_rate": 0.0005576734435955611, + "loss": 2.6977, + "step": 15803 + }, + { + "epoch": 0.4686415799306112, + "grad_norm": 0.1350407749414444, + "learning_rate": 0.0005576267064318359, + "loss": 2.6447, + "step": 15804 + }, + { + "epoch": 0.46867123328292265, + "grad_norm": 0.13113367557525635, + "learning_rate": 0.0005575799687578163, + "loss": 2.6819, + "step": 15805 + }, + { + "epoch": 0.4687008866352341, + "grad_norm": 0.15256533026695251, + "learning_rate": 0.0005575332305739162, + "loss": 2.6542, + "step": 15806 + }, + { + "epoch": 0.4687305399875456, + "grad_norm": 0.1392725706100464, + "learning_rate": 0.0005574864918805494, + "loss": 2.6617, + "step": 15807 + }, + { + "epoch": 0.4687601933398571, + "grad_norm": 0.12710626423358917, + "learning_rate": 0.0005574397526781301, + "loss": 2.6532, + "step": 15808 + }, + { + "epoch": 0.46878984669216855, + "grad_norm": 0.1302318423986435, + "learning_rate": 0.0005573930129670716, + "loss": 2.6619, + "step": 15809 + }, + { + "epoch": 0.46881950004448003, + "grad_norm": 0.14026936888694763, + "learning_rate": 0.0005573462727477883, + "loss": 2.6676, + "step": 15810 + }, + { + "epoch": 0.4688491533967915, + "grad_norm": 0.1367465853691101, + "learning_rate": 0.000557299532020694, + "loss": 2.6623, + "step": 15811 + }, + { + "epoch": 0.468878806749103, + "grad_norm": 0.1252744197845459, + "learning_rate": 0.0005572527907862024, + "loss": 2.6318, + "step": 15812 + }, + { + "epoch": 0.46890846010141446, + "grad_norm": 0.1452900469303131, + "learning_rate": 0.0005572060490447275, + "loss": 2.6631, + "step": 15813 + }, + { + "epoch": 0.46893811345372594, + "grad_norm": 0.11079496890306473, + "learning_rate": 0.0005571593067966832, + "loss": 2.6576, + "step": 15814 + }, + { + "epoch": 0.4689677668060374, + "grad_norm": 0.12281375378370285, + "learning_rate": 0.0005571125640424835, + "loss": 2.6347, + "step": 15815 + }, + { + "epoch": 0.4689974201583489, + "grad_norm": 0.1339995265007019, + "learning_rate": 0.0005570658207825423, + "loss": 2.6783, + "step": 15816 + }, + { + "epoch": 0.46902707351066036, + "grad_norm": 0.1336813122034073, + "learning_rate": 0.0005570190770172733, + "loss": 2.6617, + "step": 15817 + }, + { + "epoch": 0.46905672686297184, + "grad_norm": 0.1368875652551651, + "learning_rate": 0.0005569723327470907, + "loss": 2.6361, + "step": 15818 + }, + { + "epoch": 0.4690863802152833, + "grad_norm": 0.11505132168531418, + "learning_rate": 0.0005569255879724082, + "loss": 2.651, + "step": 15819 + }, + { + "epoch": 0.4691160335675948, + "grad_norm": 0.11079025268554688, + "learning_rate": 0.0005568788426936399, + "loss": 2.6237, + "step": 15820 + }, + { + "epoch": 0.46914568691990627, + "grad_norm": 0.12063699215650558, + "learning_rate": 0.0005568320969111997, + "loss": 2.642, + "step": 15821 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 0.1104375571012497, + "learning_rate": 0.0005567853506255016, + "loss": 2.6669, + "step": 15822 + }, + { + "epoch": 0.4692049936245293, + "grad_norm": 0.11257471889257431, + "learning_rate": 0.0005567386038369594, + "loss": 2.6777, + "step": 15823 + }, + { + "epoch": 0.46923464697684075, + "grad_norm": 0.10604137927293777, + "learning_rate": 0.0005566918565459871, + "loss": 2.6569, + "step": 15824 + }, + { + "epoch": 0.46926430032915223, + "grad_norm": 0.11032076925039291, + "learning_rate": 0.0005566451087529988, + "loss": 2.6261, + "step": 15825 + }, + { + "epoch": 0.4692939536814637, + "grad_norm": 0.10608841478824615, + "learning_rate": 0.0005565983604584082, + "loss": 2.6574, + "step": 15826 + }, + { + "epoch": 0.4693236070337752, + "grad_norm": 0.0951586663722992, + "learning_rate": 0.0005565516116626294, + "loss": 2.6665, + "step": 15827 + }, + { + "epoch": 0.46935326038608666, + "grad_norm": 0.11269266903400421, + "learning_rate": 0.0005565048623660764, + "loss": 2.6384, + "step": 15828 + }, + { + "epoch": 0.46938291373839813, + "grad_norm": 0.11615140736103058, + "learning_rate": 0.0005564581125691632, + "loss": 2.6616, + "step": 15829 + }, + { + "epoch": 0.4694125670907096, + "grad_norm": 0.12466095387935638, + "learning_rate": 0.0005564113622723038, + "loss": 2.6813, + "step": 15830 + }, + { + "epoch": 0.4694422204430211, + "grad_norm": 0.11014919728040695, + "learning_rate": 0.000556364611475912, + "loss": 2.6589, + "step": 15831 + }, + { + "epoch": 0.46947187379533256, + "grad_norm": 0.11658075451850891, + "learning_rate": 0.0005563178601804019, + "loss": 2.6554, + "step": 15832 + }, + { + "epoch": 0.46950152714764404, + "grad_norm": 0.11215251684188843, + "learning_rate": 0.0005562711083861873, + "loss": 2.6729, + "step": 15833 + }, + { + "epoch": 0.4695311804999555, + "grad_norm": 0.12072954326868057, + "learning_rate": 0.0005562243560936824, + "loss": 2.6462, + "step": 15834 + }, + { + "epoch": 0.469560833852267, + "grad_norm": 0.11315274238586426, + "learning_rate": 0.0005561776033033016, + "loss": 2.6648, + "step": 15835 + }, + { + "epoch": 0.46959048720457847, + "grad_norm": 0.10161740332841873, + "learning_rate": 0.0005561308500154581, + "loss": 2.6507, + "step": 15836 + }, + { + "epoch": 0.46962014055688994, + "grad_norm": 0.1308566927909851, + "learning_rate": 0.0005560840962305665, + "loss": 2.6564, + "step": 15837 + }, + { + "epoch": 0.4696497939092014, + "grad_norm": 0.13430741429328918, + "learning_rate": 0.0005560373419490405, + "loss": 2.6608, + "step": 15838 + }, + { + "epoch": 0.4696794472615129, + "grad_norm": 0.12487282603979111, + "learning_rate": 0.000555990587171294, + "loss": 2.6711, + "step": 15839 + }, + { + "epoch": 0.46970910061382437, + "grad_norm": 0.11480975896120071, + "learning_rate": 0.0005559438318977416, + "loss": 2.6621, + "step": 15840 + }, + { + "epoch": 0.46973875396613585, + "grad_norm": 0.1126556545495987, + "learning_rate": 0.0005558970761287968, + "loss": 2.6346, + "step": 15841 + }, + { + "epoch": 0.4697684073184473, + "grad_norm": 0.12953157722949982, + "learning_rate": 0.0005558503198648737, + "loss": 2.6205, + "step": 15842 + }, + { + "epoch": 0.46979806067075885, + "grad_norm": 0.12932530045509338, + "learning_rate": 0.0005558035631063866, + "loss": 2.6542, + "step": 15843 + }, + { + "epoch": 0.46982771402307033, + "grad_norm": 0.1165807694196701, + "learning_rate": 0.0005557568058537493, + "loss": 2.6579, + "step": 15844 + }, + { + "epoch": 0.4698573673753818, + "grad_norm": 0.10958949476480484, + "learning_rate": 0.0005557100481073759, + "loss": 2.6521, + "step": 15845 + }, + { + "epoch": 0.4698870207276933, + "grad_norm": 0.14040535688400269, + "learning_rate": 0.0005556632898676806, + "loss": 2.6258, + "step": 15846 + }, + { + "epoch": 0.46991667408000476, + "grad_norm": 0.13304468989372253, + "learning_rate": 0.0005556165311350772, + "loss": 2.6679, + "step": 15847 + }, + { + "epoch": 0.46994632743231624, + "grad_norm": 0.13995181024074554, + "learning_rate": 0.0005555697719099797, + "loss": 2.6549, + "step": 15848 + }, + { + "epoch": 0.4699759807846277, + "grad_norm": 0.1395839899778366, + "learning_rate": 0.0005555230121928026, + "loss": 2.6808, + "step": 15849 + }, + { + "epoch": 0.4700056341369392, + "grad_norm": 0.1401122361421585, + "learning_rate": 0.0005554762519839596, + "loss": 2.6931, + "step": 15850 + }, + { + "epoch": 0.47003528748925066, + "grad_norm": 0.12180068343877792, + "learning_rate": 0.0005554294912838648, + "loss": 2.6344, + "step": 15851 + }, + { + "epoch": 0.47006494084156214, + "grad_norm": 0.11759757250547409, + "learning_rate": 0.0005553827300929324, + "loss": 2.6686, + "step": 15852 + }, + { + "epoch": 0.4700945941938736, + "grad_norm": 0.10717027634382248, + "learning_rate": 0.0005553359684115764, + "loss": 2.6252, + "step": 15853 + }, + { + "epoch": 0.4701242475461851, + "grad_norm": 0.11806291341781616, + "learning_rate": 0.0005552892062402109, + "loss": 2.676, + "step": 15854 + }, + { + "epoch": 0.47015390089849657, + "grad_norm": 0.12921708822250366, + "learning_rate": 0.00055524244357925, + "loss": 2.6552, + "step": 15855 + }, + { + "epoch": 0.47018355425080804, + "grad_norm": 0.12825991213321686, + "learning_rate": 0.0005551956804291079, + "loss": 2.6678, + "step": 15856 + }, + { + "epoch": 0.4702132076031195, + "grad_norm": 0.1189003512263298, + "learning_rate": 0.0005551489167901984, + "loss": 2.6334, + "step": 15857 + }, + { + "epoch": 0.470242860955431, + "grad_norm": 0.11471814662218094, + "learning_rate": 0.0005551021526629359, + "loss": 2.6337, + "step": 15858 + }, + { + "epoch": 0.4702725143077425, + "grad_norm": 0.12483272701501846, + "learning_rate": 0.0005550553880477343, + "loss": 2.658, + "step": 15859 + }, + { + "epoch": 0.47030216766005395, + "grad_norm": 0.11573314666748047, + "learning_rate": 0.0005550086229450077, + "loss": 2.6537, + "step": 15860 + }, + { + "epoch": 0.4703318210123654, + "grad_norm": 0.11267251521348953, + "learning_rate": 0.0005549618573551704, + "loss": 2.7068, + "step": 15861 + }, + { + "epoch": 0.4703614743646769, + "grad_norm": 0.12013466656208038, + "learning_rate": 0.0005549150912786365, + "loss": 2.6785, + "step": 15862 + }, + { + "epoch": 0.47039112771698843, + "grad_norm": 0.1474199742078781, + "learning_rate": 0.00055486832471582, + "loss": 2.6288, + "step": 15863 + }, + { + "epoch": 0.4704207810692999, + "grad_norm": 0.15265613794326782, + "learning_rate": 0.0005548215576671352, + "loss": 2.6242, + "step": 15864 + }, + { + "epoch": 0.4704504344216114, + "grad_norm": 0.16048699617385864, + "learning_rate": 0.0005547747901329957, + "loss": 2.6367, + "step": 15865 + }, + { + "epoch": 0.47048008777392286, + "grad_norm": 0.1440204530954361, + "learning_rate": 0.0005547280221138162, + "loss": 2.6831, + "step": 15866 + }, + { + "epoch": 0.47050974112623434, + "grad_norm": 0.11318585276603699, + "learning_rate": 0.0005546812536100107, + "loss": 2.651, + "step": 15867 + }, + { + "epoch": 0.4705393944785458, + "grad_norm": 0.13420948386192322, + "learning_rate": 0.0005546344846219936, + "loss": 2.6513, + "step": 15868 + }, + { + "epoch": 0.4705690478308573, + "grad_norm": 0.15690630674362183, + "learning_rate": 0.0005545877151501785, + "loss": 2.6569, + "step": 15869 + }, + { + "epoch": 0.47059870118316877, + "grad_norm": 0.1430591344833374, + "learning_rate": 0.0005545409451949798, + "loss": 2.6639, + "step": 15870 + }, + { + "epoch": 0.47062835453548024, + "grad_norm": 0.10623843222856522, + "learning_rate": 0.0005544941747568118, + "loss": 2.641, + "step": 15871 + }, + { + "epoch": 0.4706580078877917, + "grad_norm": 0.10458455234766006, + "learning_rate": 0.0005544474038360882, + "loss": 2.6472, + "step": 15872 + }, + { + "epoch": 0.4706876612401032, + "grad_norm": 0.11802656203508377, + "learning_rate": 0.0005544006324332238, + "loss": 2.6656, + "step": 15873 + }, + { + "epoch": 0.47071731459241467, + "grad_norm": 0.10349331796169281, + "learning_rate": 0.0005543538605486323, + "loss": 2.6534, + "step": 15874 + }, + { + "epoch": 0.47074696794472615, + "grad_norm": 0.11514319479465485, + "learning_rate": 0.0005543070881827282, + "loss": 2.696, + "step": 15875 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 0.12057633697986603, + "learning_rate": 0.0005542603153359253, + "loss": 2.6315, + "step": 15876 + }, + { + "epoch": 0.4708062746493491, + "grad_norm": 0.12016650289297104, + "learning_rate": 0.0005542135420086383, + "loss": 2.6582, + "step": 15877 + }, + { + "epoch": 0.4708359280016606, + "grad_norm": 0.11433380097150803, + "learning_rate": 0.0005541667682012807, + "loss": 2.6446, + "step": 15878 + }, + { + "epoch": 0.47086558135397205, + "grad_norm": 0.1138373389840126, + "learning_rate": 0.0005541199939142673, + "loss": 2.6661, + "step": 15879 + }, + { + "epoch": 0.4708952347062835, + "grad_norm": 0.11230604350566864, + "learning_rate": 0.0005540732191480121, + "loss": 2.6735, + "step": 15880 + }, + { + "epoch": 0.470924888058595, + "grad_norm": 0.11460154503583908, + "learning_rate": 0.0005540264439029292, + "loss": 2.6784, + "step": 15881 + }, + { + "epoch": 0.4709545414109065, + "grad_norm": 0.10549158602952957, + "learning_rate": 0.0005539796681794329, + "loss": 2.635, + "step": 15882 + }, + { + "epoch": 0.47098419476321796, + "grad_norm": 0.12755966186523438, + "learning_rate": 0.0005539328919779373, + "loss": 2.6698, + "step": 15883 + }, + { + "epoch": 0.4710138481155295, + "grad_norm": 0.1315014362335205, + "learning_rate": 0.0005538861152988567, + "loss": 2.6422, + "step": 15884 + }, + { + "epoch": 0.47104350146784096, + "grad_norm": 0.12138865888118744, + "learning_rate": 0.0005538393381426054, + "loss": 2.6445, + "step": 15885 + }, + { + "epoch": 0.47107315482015244, + "grad_norm": 0.11157087236642838, + "learning_rate": 0.0005537925605095974, + "loss": 2.6719, + "step": 15886 + }, + { + "epoch": 0.4711028081724639, + "grad_norm": 0.11006397008895874, + "learning_rate": 0.0005537457824002469, + "loss": 2.6629, + "step": 15887 + }, + { + "epoch": 0.4711324615247754, + "grad_norm": 0.11650450527667999, + "learning_rate": 0.0005536990038149685, + "loss": 2.6612, + "step": 15888 + }, + { + "epoch": 0.47116211487708687, + "grad_norm": 0.11560797691345215, + "learning_rate": 0.0005536522247541762, + "loss": 2.6721, + "step": 15889 + }, + { + "epoch": 0.47119176822939834, + "grad_norm": 0.23245975375175476, + "learning_rate": 0.0005536054452182844, + "loss": 2.6916, + "step": 15890 + }, + { + "epoch": 0.4712214215817098, + "grad_norm": 0.10393266379833221, + "learning_rate": 0.0005535586652077068, + "loss": 2.6747, + "step": 15891 + }, + { + "epoch": 0.4712510749340213, + "grad_norm": 0.10955336689949036, + "learning_rate": 0.0005535118847228581, + "loss": 2.6781, + "step": 15892 + }, + { + "epoch": 0.4712807282863328, + "grad_norm": 0.11408597975969315, + "learning_rate": 0.0005534651037641527, + "loss": 2.6295, + "step": 15893 + }, + { + "epoch": 0.47131038163864425, + "grad_norm": 0.1183466836810112, + "learning_rate": 0.0005534183223320045, + "loss": 2.7023, + "step": 15894 + }, + { + "epoch": 0.4713400349909557, + "grad_norm": 0.11691351979970932, + "learning_rate": 0.0005533715404268278, + "loss": 2.6328, + "step": 15895 + }, + { + "epoch": 0.4713696883432672, + "grad_norm": 0.11681949347257614, + "learning_rate": 0.0005533247580490373, + "loss": 2.6513, + "step": 15896 + }, + { + "epoch": 0.4713993416955787, + "grad_norm": 0.10251875966787338, + "learning_rate": 0.0005532779751990464, + "loss": 2.6323, + "step": 15897 + }, + { + "epoch": 0.47142899504789015, + "grad_norm": 0.1157757043838501, + "learning_rate": 0.0005532311918772702, + "loss": 2.6613, + "step": 15898 + }, + { + "epoch": 0.47145864840020163, + "grad_norm": 0.10811182856559753, + "learning_rate": 0.0005531844080841227, + "loss": 2.6266, + "step": 15899 + }, + { + "epoch": 0.4714883017525131, + "grad_norm": 0.1014430895447731, + "learning_rate": 0.0005531376238200179, + "loss": 2.6714, + "step": 15900 + }, + { + "epoch": 0.4715179551048246, + "grad_norm": 0.12470205873250961, + "learning_rate": 0.0005530908390853706, + "loss": 2.7023, + "step": 15901 + }, + { + "epoch": 0.47154760845713606, + "grad_norm": 0.13401061296463013, + "learning_rate": 0.0005530440538805947, + "loss": 2.6447, + "step": 15902 + }, + { + "epoch": 0.47157726180944753, + "grad_norm": 0.13071979582309723, + "learning_rate": 0.0005529972682061045, + "loss": 2.6701, + "step": 15903 + }, + { + "epoch": 0.471606915161759, + "grad_norm": 0.1145915538072586, + "learning_rate": 0.0005529504820623144, + "loss": 2.6537, + "step": 15904 + }, + { + "epoch": 0.47163656851407054, + "grad_norm": 0.1194426491856575, + "learning_rate": 0.0005529036954496387, + "loss": 2.6609, + "step": 15905 + }, + { + "epoch": 0.471666221866382, + "grad_norm": 0.10510849952697754, + "learning_rate": 0.0005528569083684918, + "loss": 2.6334, + "step": 15906 + }, + { + "epoch": 0.4716958752186935, + "grad_norm": 0.13394130766391754, + "learning_rate": 0.000552810120819288, + "loss": 2.6526, + "step": 15907 + }, + { + "epoch": 0.47172552857100497, + "grad_norm": 0.14554251730442047, + "learning_rate": 0.0005527633328024413, + "loss": 2.6376, + "step": 15908 + }, + { + "epoch": 0.47175518192331645, + "grad_norm": 0.15671315789222717, + "learning_rate": 0.0005527165443183663, + "loss": 2.7102, + "step": 15909 + }, + { + "epoch": 0.4717848352756279, + "grad_norm": 0.1328945755958557, + "learning_rate": 0.0005526697553674771, + "loss": 2.6533, + "step": 15910 + }, + { + "epoch": 0.4718144886279394, + "grad_norm": 0.11791824549436569, + "learning_rate": 0.0005526229659501883, + "loss": 2.6398, + "step": 15911 + }, + { + "epoch": 0.4718441419802509, + "grad_norm": 0.1301795244216919, + "learning_rate": 0.0005525761760669142, + "loss": 2.6998, + "step": 15912 + }, + { + "epoch": 0.47187379533256235, + "grad_norm": 0.1304623931646347, + "learning_rate": 0.000552529385718069, + "loss": 2.6572, + "step": 15913 + }, + { + "epoch": 0.4719034486848738, + "grad_norm": 0.12926746904850006, + "learning_rate": 0.0005524825949040671, + "loss": 2.6541, + "step": 15914 + }, + { + "epoch": 0.4719331020371853, + "grad_norm": 0.146681010723114, + "learning_rate": 0.0005524358036253226, + "loss": 2.6524, + "step": 15915 + }, + { + "epoch": 0.4719627553894968, + "grad_norm": 0.11717099696397781, + "learning_rate": 0.0005523890118822502, + "loss": 2.6924, + "step": 15916 + }, + { + "epoch": 0.47199240874180826, + "grad_norm": 0.11990607529878616, + "learning_rate": 0.0005523422196752642, + "loss": 2.6486, + "step": 15917 + }, + { + "epoch": 0.47202206209411973, + "grad_norm": 0.12234856933355331, + "learning_rate": 0.0005522954270047787, + "loss": 2.6833, + "step": 15918 + }, + { + "epoch": 0.4720517154464312, + "grad_norm": 0.12424144148826599, + "learning_rate": 0.0005522486338712083, + "loss": 2.666, + "step": 15919 + }, + { + "epoch": 0.4720813687987427, + "grad_norm": 0.1108027845621109, + "learning_rate": 0.0005522018402749673, + "loss": 2.6487, + "step": 15920 + }, + { + "epoch": 0.47211102215105416, + "grad_norm": 0.10863223671913147, + "learning_rate": 0.00055215504621647, + "loss": 2.6507, + "step": 15921 + }, + { + "epoch": 0.47214067550336564, + "grad_norm": 0.11480016261339188, + "learning_rate": 0.0005521082516961309, + "loss": 2.677, + "step": 15922 + }, + { + "epoch": 0.4721703288556771, + "grad_norm": 0.10234847664833069, + "learning_rate": 0.0005520614567143641, + "loss": 2.6464, + "step": 15923 + }, + { + "epoch": 0.4721999822079886, + "grad_norm": 0.10360078513622284, + "learning_rate": 0.0005520146612715842, + "loss": 2.6606, + "step": 15924 + }, + { + "epoch": 0.47222963556030007, + "grad_norm": 0.11242727935314178, + "learning_rate": 0.0005519678653682058, + "loss": 2.6636, + "step": 15925 + }, + { + "epoch": 0.4722592889126116, + "grad_norm": 0.10592052340507507, + "learning_rate": 0.0005519210690046427, + "loss": 2.6624, + "step": 15926 + }, + { + "epoch": 0.4722889422649231, + "grad_norm": 0.10240482538938522, + "learning_rate": 0.0005518742721813099, + "loss": 2.6717, + "step": 15927 + }, + { + "epoch": 0.47231859561723455, + "grad_norm": 0.11362753808498383, + "learning_rate": 0.0005518274748986215, + "loss": 2.6572, + "step": 15928 + }, + { + "epoch": 0.472348248969546, + "grad_norm": 0.12775877118110657, + "learning_rate": 0.0005517806771569918, + "loss": 2.6857, + "step": 15929 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 0.10765089094638824, + "learning_rate": 0.0005517338789568353, + "loss": 2.6404, + "step": 15930 + }, + { + "epoch": 0.472407555674169, + "grad_norm": 0.10364675521850586, + "learning_rate": 0.0005516870802985663, + "loss": 2.6516, + "step": 15931 + }, + { + "epoch": 0.47243720902648045, + "grad_norm": 0.13834057748317719, + "learning_rate": 0.0005516402811825996, + "loss": 2.6804, + "step": 15932 + }, + { + "epoch": 0.47246686237879193, + "grad_norm": 0.18947207927703857, + "learning_rate": 0.000551593481609349, + "loss": 2.7223, + "step": 15933 + }, + { + "epoch": 0.4724965157311034, + "grad_norm": 0.2167920470237732, + "learning_rate": 0.0005515466815792297, + "loss": 2.6212, + "step": 15934 + }, + { + "epoch": 0.4725261690834149, + "grad_norm": 0.16370731592178345, + "learning_rate": 0.0005514998810926554, + "loss": 2.6961, + "step": 15935 + }, + { + "epoch": 0.47255582243572636, + "grad_norm": 0.14918993413448334, + "learning_rate": 0.0005514530801500409, + "loss": 2.6418, + "step": 15936 + }, + { + "epoch": 0.47258547578803783, + "grad_norm": 0.19021324813365936, + "learning_rate": 0.0005514062787518004, + "loss": 2.6713, + "step": 15937 + }, + { + "epoch": 0.4726151291403493, + "grad_norm": 0.15267692506313324, + "learning_rate": 0.0005513594768983488, + "loss": 2.6474, + "step": 15938 + }, + { + "epoch": 0.4726447824926608, + "grad_norm": 0.12145460397005081, + "learning_rate": 0.0005513126745901, + "loss": 2.6341, + "step": 15939 + }, + { + "epoch": 0.47267443584497226, + "grad_norm": 0.16246846318244934, + "learning_rate": 0.0005512658718274688, + "loss": 2.6204, + "step": 15940 + }, + { + "epoch": 0.47270408919728374, + "grad_norm": 0.12089138478040695, + "learning_rate": 0.0005512190686108693, + "loss": 2.6284, + "step": 15941 + }, + { + "epoch": 0.4727337425495952, + "grad_norm": 0.1442006379365921, + "learning_rate": 0.0005511722649407162, + "loss": 2.6701, + "step": 15942 + }, + { + "epoch": 0.4727633959019067, + "grad_norm": 0.14200319349765778, + "learning_rate": 0.0005511254608174239, + "loss": 2.6788, + "step": 15943 + }, + { + "epoch": 0.47279304925421817, + "grad_norm": 0.10416208207607269, + "learning_rate": 0.0005510786562414071, + "loss": 2.6732, + "step": 15944 + }, + { + "epoch": 0.47282270260652964, + "grad_norm": 0.1337742805480957, + "learning_rate": 0.0005510318512130797, + "loss": 2.6403, + "step": 15945 + }, + { + "epoch": 0.4728523559588411, + "grad_norm": 0.13999216258525848, + "learning_rate": 0.0005509850457328568, + "loss": 2.6212, + "step": 15946 + }, + { + "epoch": 0.47288200931115265, + "grad_norm": 0.12325437366962433, + "learning_rate": 0.0005509382398011523, + "loss": 2.6467, + "step": 15947 + }, + { + "epoch": 0.4729116626634641, + "grad_norm": 0.11222871392965317, + "learning_rate": 0.0005508914334183811, + "loss": 2.6416, + "step": 15948 + }, + { + "epoch": 0.4729413160157756, + "grad_norm": 0.10851552337408066, + "learning_rate": 0.0005508446265849575, + "loss": 2.6698, + "step": 15949 + }, + { + "epoch": 0.4729709693680871, + "grad_norm": 0.10534072667360306, + "learning_rate": 0.000550797819301296, + "loss": 2.6549, + "step": 15950 + }, + { + "epoch": 0.47300062272039856, + "grad_norm": 0.10463469475507736, + "learning_rate": 0.0005507510115678111, + "loss": 2.6291, + "step": 15951 + }, + { + "epoch": 0.47303027607271003, + "grad_norm": 0.10389384627342224, + "learning_rate": 0.0005507042033849173, + "loss": 2.6758, + "step": 15952 + }, + { + "epoch": 0.4730599294250215, + "grad_norm": 0.11123393476009369, + "learning_rate": 0.000550657394753029, + "loss": 2.6477, + "step": 15953 + }, + { + "epoch": 0.473089582777333, + "grad_norm": 0.10276804119348526, + "learning_rate": 0.0005506105856725608, + "loss": 2.6452, + "step": 15954 + }, + { + "epoch": 0.47311923612964446, + "grad_norm": 0.11866732686758041, + "learning_rate": 0.0005505637761439271, + "loss": 2.6307, + "step": 15955 + }, + { + "epoch": 0.47314888948195594, + "grad_norm": 0.1172383651137352, + "learning_rate": 0.0005505169661675427, + "loss": 2.6825, + "step": 15956 + }, + { + "epoch": 0.4731785428342674, + "grad_norm": 0.10703294724225998, + "learning_rate": 0.0005504701557438218, + "loss": 2.6503, + "step": 15957 + }, + { + "epoch": 0.4732081961865789, + "grad_norm": 0.11685331910848618, + "learning_rate": 0.0005504233448731789, + "loss": 2.6819, + "step": 15958 + }, + { + "epoch": 0.47323784953889036, + "grad_norm": 0.11238575726747513, + "learning_rate": 0.0005503765335560287, + "loss": 2.6686, + "step": 15959 + }, + { + "epoch": 0.47326750289120184, + "grad_norm": 0.13090743124485016, + "learning_rate": 0.0005503297217927856, + "loss": 2.6637, + "step": 15960 + }, + { + "epoch": 0.4732971562435133, + "grad_norm": 0.11209756880998611, + "learning_rate": 0.0005502829095838644, + "loss": 2.647, + "step": 15961 + }, + { + "epoch": 0.4733268095958248, + "grad_norm": 0.12708672881126404, + "learning_rate": 0.0005502360969296791, + "loss": 2.6613, + "step": 15962 + }, + { + "epoch": 0.47335646294813627, + "grad_norm": 0.12096463143825531, + "learning_rate": 0.0005501892838306446, + "loss": 2.6924, + "step": 15963 + }, + { + "epoch": 0.47338611630044775, + "grad_norm": 0.11754949390888214, + "learning_rate": 0.0005501424702871754, + "loss": 2.6719, + "step": 15964 + }, + { + "epoch": 0.4734157696527592, + "grad_norm": 0.11915077269077301, + "learning_rate": 0.0005500956562996862, + "loss": 2.6682, + "step": 15965 + }, + { + "epoch": 0.4734454230050707, + "grad_norm": 0.1208028644323349, + "learning_rate": 0.0005500488418685913, + "loss": 2.6829, + "step": 15966 + }, + { + "epoch": 0.47347507635738223, + "grad_norm": 0.11802325397729874, + "learning_rate": 0.0005500020269943052, + "loss": 2.6352, + "step": 15967 + }, + { + "epoch": 0.4735047297096937, + "grad_norm": 0.11873950809240341, + "learning_rate": 0.0005499552116772427, + "loss": 2.6827, + "step": 15968 + }, + { + "epoch": 0.4735343830620052, + "grad_norm": 0.12879997491836548, + "learning_rate": 0.000549908395917818, + "loss": 2.6565, + "step": 15969 + }, + { + "epoch": 0.47356403641431666, + "grad_norm": 0.12486280500888824, + "learning_rate": 0.000549861579716446, + "loss": 2.6409, + "step": 15970 + }, + { + "epoch": 0.47359368976662813, + "grad_norm": 0.1214350089430809, + "learning_rate": 0.0005498147630735411, + "loss": 2.6839, + "step": 15971 + }, + { + "epoch": 0.4736233431189396, + "grad_norm": 0.13242462277412415, + "learning_rate": 0.0005497679459895182, + "loss": 2.7, + "step": 15972 + }, + { + "epoch": 0.4736529964712511, + "grad_norm": 0.1359657347202301, + "learning_rate": 0.0005497211284647914, + "loss": 2.6447, + "step": 15973 + }, + { + "epoch": 0.47368264982356256, + "grad_norm": 0.12557853758335114, + "learning_rate": 0.0005496743104997754, + "loss": 2.6725, + "step": 15974 + }, + { + "epoch": 0.47371230317587404, + "grad_norm": 0.12297026813030243, + "learning_rate": 0.0005496274920948848, + "loss": 2.6511, + "step": 15975 + }, + { + "epoch": 0.4737419565281855, + "grad_norm": 0.11326326429843903, + "learning_rate": 0.0005495806732505343, + "loss": 2.6704, + "step": 15976 + }, + { + "epoch": 0.473771609880497, + "grad_norm": 0.10717950016260147, + "learning_rate": 0.0005495338539671387, + "loss": 2.6682, + "step": 15977 + }, + { + "epoch": 0.47380126323280847, + "grad_norm": 0.12528494000434875, + "learning_rate": 0.0005494870342451122, + "loss": 2.6528, + "step": 15978 + }, + { + "epoch": 0.47383091658511994, + "grad_norm": 0.13374540209770203, + "learning_rate": 0.0005494402140848693, + "loss": 2.655, + "step": 15979 + }, + { + "epoch": 0.4738605699374314, + "grad_norm": 0.13244900107383728, + "learning_rate": 0.000549393393486825, + "loss": 2.6644, + "step": 15980 + }, + { + "epoch": 0.4738902232897429, + "grad_norm": 0.12138756364583969, + "learning_rate": 0.0005493465724513935, + "loss": 2.6582, + "step": 15981 + }, + { + "epoch": 0.47391987664205437, + "grad_norm": 0.12620534002780914, + "learning_rate": 0.0005492997509789899, + "loss": 2.7104, + "step": 15982 + }, + { + "epoch": 0.47394952999436585, + "grad_norm": 0.13442376255989075, + "learning_rate": 0.0005492529290700284, + "loss": 2.6625, + "step": 15983 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 0.12127437442541122, + "learning_rate": 0.0005492061067249239, + "loss": 2.6483, + "step": 15984 + }, + { + "epoch": 0.4740088366989888, + "grad_norm": 0.1275758296251297, + "learning_rate": 0.0005491592839440908, + "loss": 2.6357, + "step": 15985 + }, + { + "epoch": 0.4740384900513003, + "grad_norm": 0.1330222338438034, + "learning_rate": 0.0005491124607279438, + "loss": 2.6675, + "step": 15986 + }, + { + "epoch": 0.47406814340361175, + "grad_norm": 0.12410580366849899, + "learning_rate": 0.0005490656370768974, + "loss": 2.6666, + "step": 15987 + }, + { + "epoch": 0.4740977967559233, + "grad_norm": 0.11365814507007599, + "learning_rate": 0.0005490188129913665, + "loss": 2.6802, + "step": 15988 + }, + { + "epoch": 0.47412745010823476, + "grad_norm": 0.13241390883922577, + "learning_rate": 0.0005489719884717656, + "loss": 2.6792, + "step": 15989 + }, + { + "epoch": 0.47415710346054624, + "grad_norm": 0.14878255128860474, + "learning_rate": 0.0005489251635185093, + "loss": 2.6409, + "step": 15990 + }, + { + "epoch": 0.4741867568128577, + "grad_norm": 0.13663409650325775, + "learning_rate": 0.0005488783381320122, + "loss": 2.6565, + "step": 15991 + }, + { + "epoch": 0.4742164101651692, + "grad_norm": 0.12385772168636322, + "learning_rate": 0.0005488315123126892, + "loss": 2.6287, + "step": 15992 + }, + { + "epoch": 0.47424606351748066, + "grad_norm": 0.12718845903873444, + "learning_rate": 0.0005487846860609547, + "loss": 2.6382, + "step": 15993 + }, + { + "epoch": 0.47427571686979214, + "grad_norm": 0.16359341144561768, + "learning_rate": 0.0005487378593772233, + "loss": 2.6757, + "step": 15994 + }, + { + "epoch": 0.4743053702221036, + "grad_norm": 0.16247139871120453, + "learning_rate": 0.00054869103226191, + "loss": 2.6829, + "step": 15995 + }, + { + "epoch": 0.4743350235744151, + "grad_norm": 0.13964508473873138, + "learning_rate": 0.0005486442047154291, + "loss": 2.6562, + "step": 15996 + }, + { + "epoch": 0.47436467692672657, + "grad_norm": 0.12494932115077972, + "learning_rate": 0.0005485973767381955, + "loss": 2.6047, + "step": 15997 + }, + { + "epoch": 0.47439433027903805, + "grad_norm": 0.12552925944328308, + "learning_rate": 0.0005485505483306238, + "loss": 2.6737, + "step": 15998 + }, + { + "epoch": 0.4744239836313495, + "grad_norm": 0.1455806940793991, + "learning_rate": 0.0005485037194931287, + "loss": 2.681, + "step": 15999 + }, + { + "epoch": 0.474453636983661, + "grad_norm": 0.14312049746513367, + "learning_rate": 0.0005484568902261248, + "loss": 2.6731, + "step": 16000 + }, + { + "epoch": 0.4744832903359725, + "grad_norm": 0.12760581076145172, + "learning_rate": 0.0005484100605300267, + "loss": 2.6585, + "step": 16001 + }, + { + "epoch": 0.47451294368828395, + "grad_norm": 0.12304390966892242, + "learning_rate": 0.0005483632304052493, + "loss": 2.6465, + "step": 16002 + }, + { + "epoch": 0.4745425970405954, + "grad_norm": 0.15091103315353394, + "learning_rate": 0.0005483163998522071, + "loss": 2.6322, + "step": 16003 + }, + { + "epoch": 0.4745722503929069, + "grad_norm": 0.1281903237104416, + "learning_rate": 0.0005482695688713151, + "loss": 2.674, + "step": 16004 + }, + { + "epoch": 0.4746019037452184, + "grad_norm": 0.11323931813240051, + "learning_rate": 0.0005482227374629877, + "loss": 2.6798, + "step": 16005 + }, + { + "epoch": 0.47463155709752985, + "grad_norm": 0.16490548849105835, + "learning_rate": 0.0005481759056276395, + "loss": 2.6439, + "step": 16006 + }, + { + "epoch": 0.47466121044984133, + "grad_norm": 0.1456504464149475, + "learning_rate": 0.0005481290733656854, + "loss": 2.6458, + "step": 16007 + }, + { + "epoch": 0.4746908638021528, + "grad_norm": 0.1311664879322052, + "learning_rate": 0.0005480822406775403, + "loss": 2.6278, + "step": 16008 + }, + { + "epoch": 0.47472051715446434, + "grad_norm": 0.1354358047246933, + "learning_rate": 0.0005480354075636186, + "loss": 2.6178, + "step": 16009 + }, + { + "epoch": 0.4747501705067758, + "grad_norm": 0.1370290070772171, + "learning_rate": 0.0005479885740243352, + "loss": 2.6669, + "step": 16010 + }, + { + "epoch": 0.4747798238590873, + "grad_norm": 0.12734845280647278, + "learning_rate": 0.0005479417400601047, + "loss": 2.6987, + "step": 16011 + }, + { + "epoch": 0.47480947721139877, + "grad_norm": 0.13548865914344788, + "learning_rate": 0.0005478949056713418, + "loss": 2.6822, + "step": 16012 + }, + { + "epoch": 0.47483913056371024, + "grad_norm": 0.1429576277732849, + "learning_rate": 0.0005478480708584613, + "loss": 2.67, + "step": 16013 + }, + { + "epoch": 0.4748687839160217, + "grad_norm": 0.12769381701946259, + "learning_rate": 0.0005478012356218779, + "loss": 2.6717, + "step": 16014 + }, + { + "epoch": 0.4748984372683332, + "grad_norm": 0.12726430594921112, + "learning_rate": 0.0005477543999620065, + "loss": 2.687, + "step": 16015 + }, + { + "epoch": 0.47492809062064467, + "grad_norm": 0.12344709038734436, + "learning_rate": 0.0005477075638792617, + "loss": 2.6532, + "step": 16016 + }, + { + "epoch": 0.47495774397295615, + "grad_norm": 0.11244117468595505, + "learning_rate": 0.0005476607273740581, + "loss": 2.6632, + "step": 16017 + }, + { + "epoch": 0.4749873973252676, + "grad_norm": 0.10648120194673538, + "learning_rate": 0.0005476138904468105, + "loss": 2.6408, + "step": 16018 + }, + { + "epoch": 0.4750170506775791, + "grad_norm": 0.11907227337360382, + "learning_rate": 0.0005475670530979338, + "loss": 2.645, + "step": 16019 + }, + { + "epoch": 0.4750467040298906, + "grad_norm": 0.12307784706354141, + "learning_rate": 0.0005475202153278428, + "loss": 2.6691, + "step": 16020 + }, + { + "epoch": 0.47507635738220205, + "grad_norm": 0.13343718647956848, + "learning_rate": 0.000547473377136952, + "loss": 2.6625, + "step": 16021 + }, + { + "epoch": 0.47510601073451353, + "grad_norm": 0.1156848818063736, + "learning_rate": 0.0005474265385256764, + "loss": 2.6687, + "step": 16022 + }, + { + "epoch": 0.475135664086825, + "grad_norm": 0.13415805995464325, + "learning_rate": 0.0005473796994944306, + "loss": 2.6312, + "step": 16023 + }, + { + "epoch": 0.4751653174391365, + "grad_norm": 0.11052132397890091, + "learning_rate": 0.0005473328600436295, + "loss": 2.684, + "step": 16024 + }, + { + "epoch": 0.47519497079144796, + "grad_norm": 0.11426485329866409, + "learning_rate": 0.0005472860201736877, + "loss": 2.6353, + "step": 16025 + }, + { + "epoch": 0.47522462414375943, + "grad_norm": 0.11711655557155609, + "learning_rate": 0.00054723917988502, + "loss": 2.6722, + "step": 16026 + }, + { + "epoch": 0.4752542774960709, + "grad_norm": 0.11847063899040222, + "learning_rate": 0.0005471923391780415, + "loss": 2.6651, + "step": 16027 + }, + { + "epoch": 0.4752839308483824, + "grad_norm": 0.1245686411857605, + "learning_rate": 0.0005471454980531665, + "loss": 2.62, + "step": 16028 + }, + { + "epoch": 0.47531358420069386, + "grad_norm": 0.11683019995689392, + "learning_rate": 0.0005470986565108101, + "loss": 2.6721, + "step": 16029 + }, + { + "epoch": 0.4753432375530054, + "grad_norm": 0.12261059880256653, + "learning_rate": 0.0005470518145513871, + "loss": 2.6501, + "step": 16030 + }, + { + "epoch": 0.47537289090531687, + "grad_norm": 0.1299835592508316, + "learning_rate": 0.0005470049721753121, + "loss": 2.6806, + "step": 16031 + }, + { + "epoch": 0.47540254425762835, + "grad_norm": 0.12717977166175842, + "learning_rate": 0.0005469581293830001, + "loss": 2.6406, + "step": 16032 + }, + { + "epoch": 0.4754321976099398, + "grad_norm": 0.12837041914463043, + "learning_rate": 0.0005469112861748658, + "loss": 2.655, + "step": 16033 + }, + { + "epoch": 0.4754618509622513, + "grad_norm": 0.11314348876476288, + "learning_rate": 0.0005468644425513238, + "loss": 2.6581, + "step": 16034 + }, + { + "epoch": 0.4754915043145628, + "grad_norm": 0.11454810947179794, + "learning_rate": 0.0005468175985127894, + "loss": 2.6608, + "step": 16035 + }, + { + "epoch": 0.47552115766687425, + "grad_norm": 0.12263771891593933, + "learning_rate": 0.0005467707540596769, + "loss": 2.6554, + "step": 16036 + }, + { + "epoch": 0.4755508110191857, + "grad_norm": 0.12245673686265945, + "learning_rate": 0.0005467239091924015, + "loss": 2.6253, + "step": 16037 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 0.13197289407253265, + "learning_rate": 0.0005466770639113779, + "loss": 2.6516, + "step": 16038 + }, + { + "epoch": 0.4756101177238087, + "grad_norm": 0.11933182179927826, + "learning_rate": 0.0005466302182170208, + "loss": 2.6388, + "step": 16039 + }, + { + "epoch": 0.47563977107612015, + "grad_norm": 0.11893273144960403, + "learning_rate": 0.000546583372109745, + "loss": 2.6821, + "step": 16040 + }, + { + "epoch": 0.47566942442843163, + "grad_norm": 0.12017399072647095, + "learning_rate": 0.0005465365255899656, + "loss": 2.6522, + "step": 16041 + }, + { + "epoch": 0.4756990777807431, + "grad_norm": 0.12453875690698624, + "learning_rate": 0.0005464896786580973, + "loss": 2.6236, + "step": 16042 + }, + { + "epoch": 0.4757287311330546, + "grad_norm": 0.12399520725011826, + "learning_rate": 0.0005464428313145548, + "loss": 2.6612, + "step": 16043 + }, + { + "epoch": 0.47575838448536606, + "grad_norm": 0.1289566308259964, + "learning_rate": 0.0005463959835597531, + "loss": 2.6586, + "step": 16044 + }, + { + "epoch": 0.47578803783767754, + "grad_norm": 0.14179755747318268, + "learning_rate": 0.0005463491353941071, + "loss": 2.6716, + "step": 16045 + }, + { + "epoch": 0.475817691189989, + "grad_norm": 0.16498476266860962, + "learning_rate": 0.0005463022868180313, + "loss": 2.6799, + "step": 16046 + }, + { + "epoch": 0.4758473445423005, + "grad_norm": 0.16604214906692505, + "learning_rate": 0.000546255437831941, + "loss": 2.6386, + "step": 16047 + }, + { + "epoch": 0.47587699789461196, + "grad_norm": 0.1403868943452835, + "learning_rate": 0.0005462085884362509, + "loss": 2.674, + "step": 16048 + }, + { + "epoch": 0.47590665124692344, + "grad_norm": 0.1292685866355896, + "learning_rate": 0.0005461617386313758, + "loss": 2.6581, + "step": 16049 + }, + { + "epoch": 0.4759363045992349, + "grad_norm": 0.14482958614826202, + "learning_rate": 0.0005461148884177305, + "loss": 2.6968, + "step": 16050 + }, + { + "epoch": 0.47596595795154645, + "grad_norm": 0.14207662642002106, + "learning_rate": 0.0005460680377957299, + "loss": 2.6882, + "step": 16051 + }, + { + "epoch": 0.4759956113038579, + "grad_norm": 0.11595431715250015, + "learning_rate": 0.0005460211867657891, + "loss": 2.6319, + "step": 16052 + }, + { + "epoch": 0.4760252646561694, + "grad_norm": 0.11407046765089035, + "learning_rate": 0.0005459743353283226, + "loss": 2.6864, + "step": 16053 + }, + { + "epoch": 0.4760549180084809, + "grad_norm": 0.13245712220668793, + "learning_rate": 0.0005459274834837455, + "loss": 2.6848, + "step": 16054 + }, + { + "epoch": 0.47608457136079235, + "grad_norm": 0.11226332187652588, + "learning_rate": 0.0005458806312324726, + "loss": 2.6326, + "step": 16055 + }, + { + "epoch": 0.47611422471310383, + "grad_norm": 0.12935636937618256, + "learning_rate": 0.000545833778574919, + "loss": 2.6689, + "step": 16056 + }, + { + "epoch": 0.4761438780654153, + "grad_norm": 0.12630322575569153, + "learning_rate": 0.0005457869255114994, + "loss": 2.6605, + "step": 16057 + }, + { + "epoch": 0.4761735314177268, + "grad_norm": 0.11855483055114746, + "learning_rate": 0.0005457400720426287, + "loss": 2.6526, + "step": 16058 + }, + { + "epoch": 0.47620318477003826, + "grad_norm": 0.1441514790058136, + "learning_rate": 0.0005456932181687218, + "loss": 2.6755, + "step": 16059 + }, + { + "epoch": 0.47623283812234973, + "grad_norm": 0.12767943739891052, + "learning_rate": 0.0005456463638901935, + "loss": 2.6627, + "step": 16060 + }, + { + "epoch": 0.4762624914746612, + "grad_norm": 0.11010630428791046, + "learning_rate": 0.000545599509207459, + "loss": 2.6405, + "step": 16061 + }, + { + "epoch": 0.4762921448269727, + "grad_norm": 0.11305156350135803, + "learning_rate": 0.0005455526541209328, + "loss": 2.6728, + "step": 16062 + }, + { + "epoch": 0.47632179817928416, + "grad_norm": 0.11327789723873138, + "learning_rate": 0.0005455057986310302, + "loss": 2.6563, + "step": 16063 + }, + { + "epoch": 0.47635145153159564, + "grad_norm": 0.11741021275520325, + "learning_rate": 0.0005454589427381659, + "loss": 2.6517, + "step": 16064 + }, + { + "epoch": 0.4763811048839071, + "grad_norm": 0.1057324931025505, + "learning_rate": 0.0005454120864427549, + "loss": 2.6169, + "step": 16065 + }, + { + "epoch": 0.4764107582362186, + "grad_norm": 0.13133098185062408, + "learning_rate": 0.0005453652297452119, + "loss": 2.6558, + "step": 16066 + }, + { + "epoch": 0.47644041158853007, + "grad_norm": 0.14258508384227753, + "learning_rate": 0.0005453183726459522, + "loss": 2.6398, + "step": 16067 + }, + { + "epoch": 0.47647006494084154, + "grad_norm": 0.13596540689468384, + "learning_rate": 0.0005452715151453904, + "loss": 2.653, + "step": 16068 + }, + { + "epoch": 0.476499718293153, + "grad_norm": 0.12132113426923752, + "learning_rate": 0.0005452246572439416, + "loss": 2.6495, + "step": 16069 + }, + { + "epoch": 0.4765293716454645, + "grad_norm": 0.12287244945764542, + "learning_rate": 0.0005451777989420209, + "loss": 2.6639, + "step": 16070 + }, + { + "epoch": 0.476559024997776, + "grad_norm": 0.13018003106117249, + "learning_rate": 0.0005451309402400428, + "loss": 2.647, + "step": 16071 + }, + { + "epoch": 0.4765886783500875, + "grad_norm": 0.12001430988311768, + "learning_rate": 0.0005450840811384225, + "loss": 2.6546, + "step": 16072 + }, + { + "epoch": 0.476618331702399, + "grad_norm": 0.10333918780088425, + "learning_rate": 0.000545037221637575, + "loss": 2.6564, + "step": 16073 + }, + { + "epoch": 0.47664798505471045, + "grad_norm": 0.10577095299959183, + "learning_rate": 0.0005449903617379151, + "loss": 2.6411, + "step": 16074 + }, + { + "epoch": 0.47667763840702193, + "grad_norm": 0.11123267561197281, + "learning_rate": 0.0005449435014398579, + "loss": 2.6456, + "step": 16075 + }, + { + "epoch": 0.4767072917593334, + "grad_norm": 0.13065309822559357, + "learning_rate": 0.0005448966407438183, + "loss": 2.6462, + "step": 16076 + }, + { + "epoch": 0.4767369451116449, + "grad_norm": 0.13381512463092804, + "learning_rate": 0.0005448497796502112, + "loss": 2.6377, + "step": 16077 + }, + { + "epoch": 0.47676659846395636, + "grad_norm": 0.11558939516544342, + "learning_rate": 0.0005448029181594515, + "loss": 2.6494, + "step": 16078 + }, + { + "epoch": 0.47679625181626784, + "grad_norm": 0.11166825890541077, + "learning_rate": 0.0005447560562719543, + "loss": 2.6484, + "step": 16079 + }, + { + "epoch": 0.4768259051685793, + "grad_norm": 0.1143902912735939, + "learning_rate": 0.0005447091939881346, + "loss": 2.6644, + "step": 16080 + }, + { + "epoch": 0.4768555585208908, + "grad_norm": 0.1336478590965271, + "learning_rate": 0.0005446623313084074, + "loss": 2.6598, + "step": 16081 + }, + { + "epoch": 0.47688521187320226, + "grad_norm": 0.12004943192005157, + "learning_rate": 0.0005446154682331875, + "loss": 2.6608, + "step": 16082 + }, + { + "epoch": 0.47691486522551374, + "grad_norm": 0.11030944436788559, + "learning_rate": 0.00054456860476289, + "loss": 2.6664, + "step": 16083 + }, + { + "epoch": 0.4769445185778252, + "grad_norm": 0.12967686355113983, + "learning_rate": 0.0005445217408979299, + "loss": 2.6403, + "step": 16084 + }, + { + "epoch": 0.4769741719301367, + "grad_norm": 0.129527747631073, + "learning_rate": 0.0005444748766387219, + "loss": 2.6825, + "step": 16085 + }, + { + "epoch": 0.47700382528244817, + "grad_norm": 0.1154840812087059, + "learning_rate": 0.0005444280119856814, + "loss": 2.6781, + "step": 16086 + }, + { + "epoch": 0.47703347863475964, + "grad_norm": 0.12374632805585861, + "learning_rate": 0.0005443811469392233, + "loss": 2.6161, + "step": 16087 + }, + { + "epoch": 0.4770631319870711, + "grad_norm": 0.13393066823482513, + "learning_rate": 0.0005443342814997624, + "loss": 2.6651, + "step": 16088 + }, + { + "epoch": 0.4770927853393826, + "grad_norm": 0.10349546372890472, + "learning_rate": 0.0005442874156677139, + "loss": 2.6594, + "step": 16089 + }, + { + "epoch": 0.4771224386916941, + "grad_norm": 0.13931415975093842, + "learning_rate": 0.0005442405494434926, + "loss": 2.7092, + "step": 16090 + }, + { + "epoch": 0.47715209204400555, + "grad_norm": 0.12348970025777817, + "learning_rate": 0.0005441936828275138, + "loss": 2.6435, + "step": 16091 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 0.11198873072862625, + "learning_rate": 0.0005441468158201923, + "loss": 2.6095, + "step": 16092 + }, + { + "epoch": 0.47721139874862856, + "grad_norm": 0.1309709995985031, + "learning_rate": 0.0005440999484219432, + "loss": 2.665, + "step": 16093 + }, + { + "epoch": 0.47724105210094003, + "grad_norm": 0.14365795254707336, + "learning_rate": 0.0005440530806331815, + "loss": 2.6626, + "step": 16094 + }, + { + "epoch": 0.4772707054532515, + "grad_norm": 0.1303260177373886, + "learning_rate": 0.0005440062124543221, + "loss": 2.6526, + "step": 16095 + }, + { + "epoch": 0.477300358805563, + "grad_norm": 0.12809628248214722, + "learning_rate": 0.0005439593438857803, + "loss": 2.6762, + "step": 16096 + }, + { + "epoch": 0.47733001215787446, + "grad_norm": 0.1252477914094925, + "learning_rate": 0.0005439124749279708, + "loss": 2.648, + "step": 16097 + }, + { + "epoch": 0.47735966551018594, + "grad_norm": 0.13984371721744537, + "learning_rate": 0.0005438656055813088, + "loss": 2.6394, + "step": 16098 + }, + { + "epoch": 0.4773893188624974, + "grad_norm": 0.14143897593021393, + "learning_rate": 0.0005438187358462094, + "loss": 2.6326, + "step": 16099 + }, + { + "epoch": 0.4774189722148089, + "grad_norm": 0.1284407526254654, + "learning_rate": 0.0005437718657230876, + "loss": 2.6342, + "step": 16100 + }, + { + "epoch": 0.47744862556712037, + "grad_norm": 0.11070097982883453, + "learning_rate": 0.0005437249952123584, + "loss": 2.6435, + "step": 16101 + }, + { + "epoch": 0.47747827891943184, + "grad_norm": 0.12125226855278015, + "learning_rate": 0.0005436781243144369, + "loss": 2.6619, + "step": 16102 + }, + { + "epoch": 0.4775079322717433, + "grad_norm": 0.12631799280643463, + "learning_rate": 0.0005436312530297382, + "loss": 2.6598, + "step": 16103 + }, + { + "epoch": 0.4775375856240548, + "grad_norm": 0.11938761919736862, + "learning_rate": 0.0005435843813586769, + "loss": 2.6064, + "step": 16104 + }, + { + "epoch": 0.47756723897636627, + "grad_norm": 0.12927812337875366, + "learning_rate": 0.0005435375093016686, + "loss": 2.6301, + "step": 16105 + }, + { + "epoch": 0.47759689232867775, + "grad_norm": 0.13809184730052948, + "learning_rate": 0.0005434906368591282, + "loss": 2.6564, + "step": 16106 + }, + { + "epoch": 0.4776265456809892, + "grad_norm": 0.13029591739177704, + "learning_rate": 0.0005434437640314709, + "loss": 2.6341, + "step": 16107 + }, + { + "epoch": 0.4776561990333007, + "grad_norm": 0.11978953331708908, + "learning_rate": 0.0005433968908191115, + "loss": 2.6554, + "step": 16108 + }, + { + "epoch": 0.4776858523856122, + "grad_norm": 0.10546243190765381, + "learning_rate": 0.0005433500172224653, + "loss": 2.639, + "step": 16109 + }, + { + "epoch": 0.47771550573792365, + "grad_norm": 0.12122826278209686, + "learning_rate": 0.000543303143241947, + "loss": 2.657, + "step": 16110 + }, + { + "epoch": 0.4777451590902351, + "grad_norm": 0.1319662630558014, + "learning_rate": 0.000543256268877972, + "loss": 2.6574, + "step": 16111 + }, + { + "epoch": 0.4777748124425466, + "grad_norm": 0.13231034576892853, + "learning_rate": 0.0005432093941309554, + "loss": 2.6462, + "step": 16112 + }, + { + "epoch": 0.47780446579485814, + "grad_norm": 0.11266763508319855, + "learning_rate": 0.0005431625190013123, + "loss": 2.6365, + "step": 16113 + }, + { + "epoch": 0.4778341191471696, + "grad_norm": 0.11817102879285812, + "learning_rate": 0.0005431156434894575, + "loss": 2.6413, + "step": 16114 + }, + { + "epoch": 0.4778637724994811, + "grad_norm": 0.1144891083240509, + "learning_rate": 0.0005430687675958063, + "loss": 2.6526, + "step": 16115 + }, + { + "epoch": 0.47789342585179256, + "grad_norm": 0.11946732550859451, + "learning_rate": 0.0005430218913207739, + "loss": 2.6276, + "step": 16116 + }, + { + "epoch": 0.47792307920410404, + "grad_norm": 0.12865403294563293, + "learning_rate": 0.0005429750146647749, + "loss": 2.6504, + "step": 16117 + }, + { + "epoch": 0.4779527325564155, + "grad_norm": 0.14336872100830078, + "learning_rate": 0.0005429281376282251, + "loss": 2.6063, + "step": 16118 + }, + { + "epoch": 0.477982385908727, + "grad_norm": 0.12734052538871765, + "learning_rate": 0.0005428812602115394, + "loss": 2.6368, + "step": 16119 + }, + { + "epoch": 0.47801203926103847, + "grad_norm": 0.13312870264053345, + "learning_rate": 0.0005428343824151325, + "loss": 2.71, + "step": 16120 + }, + { + "epoch": 0.47804169261334994, + "grad_norm": 0.16190099716186523, + "learning_rate": 0.0005427875042394199, + "loss": 2.6655, + "step": 16121 + }, + { + "epoch": 0.4780713459656614, + "grad_norm": 0.15966930985450745, + "learning_rate": 0.0005427406256848167, + "loss": 2.6972, + "step": 16122 + }, + { + "epoch": 0.4781009993179729, + "grad_norm": 0.13863350450992584, + "learning_rate": 0.0005426937467517377, + "loss": 2.6803, + "step": 16123 + }, + { + "epoch": 0.4781306526702844, + "grad_norm": 0.10972259938716888, + "learning_rate": 0.0005426468674405984, + "loss": 2.6565, + "step": 16124 + }, + { + "epoch": 0.47816030602259585, + "grad_norm": 0.12929180264472961, + "learning_rate": 0.0005425999877518138, + "loss": 2.6444, + "step": 16125 + }, + { + "epoch": 0.4781899593749073, + "grad_norm": 0.10454277694225311, + "learning_rate": 0.0005425531076857988, + "loss": 2.6504, + "step": 16126 + }, + { + "epoch": 0.4782196127272188, + "grad_norm": 0.09979096055030823, + "learning_rate": 0.0005425062272429688, + "loss": 2.6734, + "step": 16127 + }, + { + "epoch": 0.4782492660795303, + "grad_norm": 0.10344865918159485, + "learning_rate": 0.000542459346423739, + "loss": 2.6528, + "step": 16128 + }, + { + "epoch": 0.47827891943184175, + "grad_norm": 0.10337952524423599, + "learning_rate": 0.0005424124652285243, + "loss": 2.6639, + "step": 16129 + }, + { + "epoch": 0.47830857278415323, + "grad_norm": 0.1248270720243454, + "learning_rate": 0.0005423655836577399, + "loss": 2.6474, + "step": 16130 + }, + { + "epoch": 0.4783382261364647, + "grad_norm": 0.1388784497976303, + "learning_rate": 0.000542318701711801, + "loss": 2.6775, + "step": 16131 + }, + { + "epoch": 0.4783678794887762, + "grad_norm": 0.131632462143898, + "learning_rate": 0.0005422718193911228, + "loss": 2.6515, + "step": 16132 + }, + { + "epoch": 0.47839753284108766, + "grad_norm": 0.12766395509243011, + "learning_rate": 0.0005422249366961204, + "loss": 2.6405, + "step": 16133 + }, + { + "epoch": 0.4784271861933992, + "grad_norm": 0.14189454913139343, + "learning_rate": 0.0005421780536272088, + "loss": 2.6458, + "step": 16134 + }, + { + "epoch": 0.47845683954571067, + "grad_norm": 0.12196145951747894, + "learning_rate": 0.0005421311701848035, + "loss": 2.6391, + "step": 16135 + }, + { + "epoch": 0.47848649289802214, + "grad_norm": 0.11705504357814789, + "learning_rate": 0.0005420842863693194, + "loss": 2.6208, + "step": 16136 + }, + { + "epoch": 0.4785161462503336, + "grad_norm": 0.13233527541160583, + "learning_rate": 0.0005420374021811716, + "loss": 2.675, + "step": 16137 + }, + { + "epoch": 0.4785457996026451, + "grad_norm": 0.13160085678100586, + "learning_rate": 0.0005419905176207755, + "loss": 2.5988, + "step": 16138 + }, + { + "epoch": 0.47857545295495657, + "grad_norm": 0.11708460748195648, + "learning_rate": 0.0005419436326885461, + "loss": 2.6802, + "step": 16139 + }, + { + "epoch": 0.47860510630726805, + "grad_norm": 0.11165446788072586, + "learning_rate": 0.0005418967473848986, + "loss": 2.6387, + "step": 16140 + }, + { + "epoch": 0.4786347596595795, + "grad_norm": 0.13811266422271729, + "learning_rate": 0.0005418498617102483, + "loss": 2.6774, + "step": 16141 + }, + { + "epoch": 0.478664413011891, + "grad_norm": 0.11489742994308472, + "learning_rate": 0.0005418029756650102, + "loss": 2.6414, + "step": 16142 + }, + { + "epoch": 0.4786940663642025, + "grad_norm": 0.11344917118549347, + "learning_rate": 0.0005417560892495996, + "loss": 2.6865, + "step": 16143 + }, + { + "epoch": 0.47872371971651395, + "grad_norm": 0.10852305591106415, + "learning_rate": 0.0005417092024644316, + "loss": 2.6758, + "step": 16144 + }, + { + "epoch": 0.4787533730688254, + "grad_norm": 0.11211717128753662, + "learning_rate": 0.0005416623153099216, + "loss": 2.619, + "step": 16145 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 0.12655547261238098, + "learning_rate": 0.0005416154277864847, + "loss": 2.7058, + "step": 16146 + }, + { + "epoch": 0.4788126797734484, + "grad_norm": 0.1058138832449913, + "learning_rate": 0.000541568539894536, + "loss": 2.6616, + "step": 16147 + }, + { + "epoch": 0.47884233312575986, + "grad_norm": 0.12103123217821121, + "learning_rate": 0.0005415216516344905, + "loss": 2.6934, + "step": 16148 + }, + { + "epoch": 0.47887198647807133, + "grad_norm": 0.13891996443271637, + "learning_rate": 0.0005414747630067639, + "loss": 2.6433, + "step": 16149 + }, + { + "epoch": 0.4789016398303828, + "grad_norm": 0.11145231872797012, + "learning_rate": 0.000541427874011771, + "loss": 2.6571, + "step": 16150 + }, + { + "epoch": 0.4789312931826943, + "grad_norm": 0.11564706265926361, + "learning_rate": 0.0005413809846499273, + "loss": 2.7049, + "step": 16151 + }, + { + "epoch": 0.47896094653500576, + "grad_norm": 0.11335679143667221, + "learning_rate": 0.0005413340949216478, + "loss": 2.6413, + "step": 16152 + }, + { + "epoch": 0.47899059988731724, + "grad_norm": 0.1279119998216629, + "learning_rate": 0.0005412872048273478, + "loss": 2.6426, + "step": 16153 + }, + { + "epoch": 0.4790202532396287, + "grad_norm": 0.1294853240251541, + "learning_rate": 0.0005412403143674425, + "loss": 2.6782, + "step": 16154 + }, + { + "epoch": 0.47904990659194024, + "grad_norm": 0.11499234288930893, + "learning_rate": 0.0005411934235423472, + "loss": 2.6599, + "step": 16155 + }, + { + "epoch": 0.4790795599442517, + "grad_norm": 0.11135923862457275, + "learning_rate": 0.0005411465323524769, + "loss": 2.6638, + "step": 16156 + }, + { + "epoch": 0.4791092132965632, + "grad_norm": 0.11241408437490463, + "learning_rate": 0.0005410996407982472, + "loss": 2.6766, + "step": 16157 + }, + { + "epoch": 0.4791388666488747, + "grad_norm": 0.10427700728178024, + "learning_rate": 0.0005410527488800731, + "loss": 2.6957, + "step": 16158 + }, + { + "epoch": 0.47916852000118615, + "grad_norm": 0.11183334141969681, + "learning_rate": 0.0005410058565983697, + "loss": 2.6633, + "step": 16159 + }, + { + "epoch": 0.4791981733534976, + "grad_norm": 0.12457433342933655, + "learning_rate": 0.0005409589639535526, + "loss": 2.613, + "step": 16160 + }, + { + "epoch": 0.4792278267058091, + "grad_norm": 0.14354310929775238, + "learning_rate": 0.0005409120709460366, + "loss": 2.6577, + "step": 16161 + }, + { + "epoch": 0.4792574800581206, + "grad_norm": 0.16214771568775177, + "learning_rate": 0.0005408651775762374, + "loss": 2.6925, + "step": 16162 + }, + { + "epoch": 0.47928713341043205, + "grad_norm": 0.1346573531627655, + "learning_rate": 0.00054081828384457, + "loss": 2.6306, + "step": 16163 + }, + { + "epoch": 0.47931678676274353, + "grad_norm": 0.12933361530303955, + "learning_rate": 0.0005407713897514497, + "loss": 2.6772, + "step": 16164 + }, + { + "epoch": 0.479346440115055, + "grad_norm": 0.12574170529842377, + "learning_rate": 0.0005407244952972917, + "loss": 2.6901, + "step": 16165 + }, + { + "epoch": 0.4793760934673665, + "grad_norm": 0.13249418139457703, + "learning_rate": 0.0005406776004825112, + "loss": 2.6588, + "step": 16166 + }, + { + "epoch": 0.47940574681967796, + "grad_norm": 0.12415305525064468, + "learning_rate": 0.0005406307053075238, + "loss": 2.6542, + "step": 16167 + }, + { + "epoch": 0.47943540017198943, + "grad_norm": 0.11061771214008331, + "learning_rate": 0.0005405838097727445, + "loss": 2.6477, + "step": 16168 + }, + { + "epoch": 0.4794650535243009, + "grad_norm": 0.1247541755437851, + "learning_rate": 0.0005405369138785884, + "loss": 2.6315, + "step": 16169 + }, + { + "epoch": 0.4794947068766124, + "grad_norm": 0.12996605038642883, + "learning_rate": 0.0005404900176254711, + "loss": 2.6483, + "step": 16170 + }, + { + "epoch": 0.47952436022892386, + "grad_norm": 0.1284177303314209, + "learning_rate": 0.0005404431210138076, + "loss": 2.6765, + "step": 16171 + }, + { + "epoch": 0.47955401358123534, + "grad_norm": 0.1282442957162857, + "learning_rate": 0.0005403962240440135, + "loss": 2.6469, + "step": 16172 + }, + { + "epoch": 0.4795836669335468, + "grad_norm": 0.1196533665060997, + "learning_rate": 0.0005403493267165039, + "loss": 2.6337, + "step": 16173 + }, + { + "epoch": 0.4796133202858583, + "grad_norm": 0.12377548217773438, + "learning_rate": 0.0005403024290316942, + "loss": 2.6513, + "step": 16174 + }, + { + "epoch": 0.4796429736381698, + "grad_norm": 0.12734369933605194, + "learning_rate": 0.0005402555309899993, + "loss": 2.6742, + "step": 16175 + }, + { + "epoch": 0.4796726269904813, + "grad_norm": 0.12112918496131897, + "learning_rate": 0.0005402086325918348, + "loss": 2.6573, + "step": 16176 + }, + { + "epoch": 0.4797022803427928, + "grad_norm": 0.11309146136045456, + "learning_rate": 0.000540161733837616, + "loss": 2.6572, + "step": 16177 + }, + { + "epoch": 0.47973193369510425, + "grad_norm": 0.11500285565853119, + "learning_rate": 0.0005401148347277582, + "loss": 2.638, + "step": 16178 + }, + { + "epoch": 0.4797615870474157, + "grad_norm": 0.10380689799785614, + "learning_rate": 0.0005400679352626767, + "loss": 2.6434, + "step": 16179 + }, + { + "epoch": 0.4797912403997272, + "grad_norm": 0.12671270966529846, + "learning_rate": 0.0005400210354427867, + "loss": 2.6505, + "step": 16180 + }, + { + "epoch": 0.4798208937520387, + "grad_norm": 0.1249203234910965, + "learning_rate": 0.0005399741352685036, + "loss": 2.6302, + "step": 16181 + }, + { + "epoch": 0.47985054710435016, + "grad_norm": 0.11472116410732269, + "learning_rate": 0.0005399272347402425, + "loss": 2.6497, + "step": 16182 + }, + { + "epoch": 0.47988020045666163, + "grad_norm": 0.11339611560106277, + "learning_rate": 0.000539880333858419, + "loss": 2.6414, + "step": 16183 + }, + { + "epoch": 0.4799098538089731, + "grad_norm": 0.1245897114276886, + "learning_rate": 0.0005398334326234484, + "loss": 2.6631, + "step": 16184 + }, + { + "epoch": 0.4799395071612846, + "grad_norm": 0.11570627987384796, + "learning_rate": 0.0005397865310357459, + "loss": 2.6947, + "step": 16185 + }, + { + "epoch": 0.47996916051359606, + "grad_norm": 0.12329693138599396, + "learning_rate": 0.0005397396290957266, + "loss": 2.6577, + "step": 16186 + }, + { + "epoch": 0.47999881386590754, + "grad_norm": 0.1195799857378006, + "learning_rate": 0.0005396927268038063, + "loss": 2.6563, + "step": 16187 + }, + { + "epoch": 0.480028467218219, + "grad_norm": 0.11122440546751022, + "learning_rate": 0.0005396458241603999, + "loss": 2.6464, + "step": 16188 + }, + { + "epoch": 0.4800581205705305, + "grad_norm": 0.11728827655315399, + "learning_rate": 0.0005395989211659231, + "loss": 2.6508, + "step": 16189 + }, + { + "epoch": 0.48008777392284196, + "grad_norm": 0.12835362553596497, + "learning_rate": 0.000539552017820791, + "loss": 2.6497, + "step": 16190 + }, + { + "epoch": 0.48011742727515344, + "grad_norm": 0.11642956733703613, + "learning_rate": 0.000539505114125419, + "loss": 2.654, + "step": 16191 + }, + { + "epoch": 0.4801470806274649, + "grad_norm": 0.10986274480819702, + "learning_rate": 0.0005394582100802225, + "loss": 2.6701, + "step": 16192 + }, + { + "epoch": 0.4801767339797764, + "grad_norm": 0.1041637659072876, + "learning_rate": 0.0005394113056856166, + "loss": 2.6555, + "step": 16193 + }, + { + "epoch": 0.48020638733208787, + "grad_norm": 0.11364824324846268, + "learning_rate": 0.0005393644009420169, + "loss": 2.6647, + "step": 16194 + }, + { + "epoch": 0.48023604068439935, + "grad_norm": 0.13350704312324524, + "learning_rate": 0.0005393174958498387, + "loss": 2.6839, + "step": 16195 + }, + { + "epoch": 0.4802656940367109, + "grad_norm": 0.14640627801418304, + "learning_rate": 0.0005392705904094973, + "loss": 2.6515, + "step": 16196 + }, + { + "epoch": 0.48029534738902235, + "grad_norm": 0.15638558566570282, + "learning_rate": 0.0005392236846214081, + "loss": 2.6591, + "step": 16197 + }, + { + "epoch": 0.48032500074133383, + "grad_norm": 0.13022896647453308, + "learning_rate": 0.0005391767784859864, + "loss": 2.6711, + "step": 16198 + }, + { + "epoch": 0.4803546540936453, + "grad_norm": 0.13161815702915192, + "learning_rate": 0.0005391298720036477, + "loss": 2.6868, + "step": 16199 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 0.147512748837471, + "learning_rate": 0.0005390829651748071, + "loss": 2.6746, + "step": 16200 + }, + { + "epoch": 0.48041396079826826, + "grad_norm": 0.1337665617465973, + "learning_rate": 0.0005390360579998803, + "loss": 2.6624, + "step": 16201 + }, + { + "epoch": 0.48044361415057973, + "grad_norm": 0.11504694074392319, + "learning_rate": 0.0005389891504792824, + "loss": 2.648, + "step": 16202 + }, + { + "epoch": 0.4804732675028912, + "grad_norm": 0.1132650151848793, + "learning_rate": 0.000538942242613429, + "loss": 2.6666, + "step": 16203 + }, + { + "epoch": 0.4805029208552027, + "grad_norm": 0.11589200794696808, + "learning_rate": 0.0005388953344027353, + "loss": 2.6451, + "step": 16204 + }, + { + "epoch": 0.48053257420751416, + "grad_norm": 0.12634943425655365, + "learning_rate": 0.0005388484258476167, + "loss": 2.6784, + "step": 16205 + }, + { + "epoch": 0.48056222755982564, + "grad_norm": 0.13178275525569916, + "learning_rate": 0.0005388015169484888, + "loss": 2.67, + "step": 16206 + }, + { + "epoch": 0.4805918809121371, + "grad_norm": 0.13708946108818054, + "learning_rate": 0.0005387546077057666, + "loss": 2.665, + "step": 16207 + }, + { + "epoch": 0.4806215342644486, + "grad_norm": 0.13298028707504272, + "learning_rate": 0.0005387076981198657, + "loss": 2.68, + "step": 16208 + }, + { + "epoch": 0.48065118761676007, + "grad_norm": 0.13741272687911987, + "learning_rate": 0.0005386607881912015, + "loss": 2.6783, + "step": 16209 + }, + { + "epoch": 0.48068084096907154, + "grad_norm": 0.12003655731678009, + "learning_rate": 0.0005386138779201893, + "loss": 2.6735, + "step": 16210 + }, + { + "epoch": 0.480710494321383, + "grad_norm": 0.11748401820659637, + "learning_rate": 0.0005385669673072447, + "loss": 2.6688, + "step": 16211 + }, + { + "epoch": 0.4807401476736945, + "grad_norm": 0.10508552938699722, + "learning_rate": 0.0005385200563527831, + "loss": 2.6851, + "step": 16212 + }, + { + "epoch": 0.48076980102600597, + "grad_norm": 0.10824795812368393, + "learning_rate": 0.0005384731450572196, + "loss": 2.6439, + "step": 16213 + }, + { + "epoch": 0.48079945437831745, + "grad_norm": 0.11799131333827972, + "learning_rate": 0.0005384262334209699, + "loss": 2.6571, + "step": 16214 + }, + { + "epoch": 0.4808291077306289, + "grad_norm": 0.12146859616041183, + "learning_rate": 0.0005383793214444493, + "loss": 2.661, + "step": 16215 + }, + { + "epoch": 0.4808587610829404, + "grad_norm": 0.12313803285360336, + "learning_rate": 0.0005383324091280731, + "loss": 2.6295, + "step": 16216 + }, + { + "epoch": 0.48088841443525193, + "grad_norm": 0.12911677360534668, + "learning_rate": 0.0005382854964722571, + "loss": 2.6441, + "step": 16217 + }, + { + "epoch": 0.4809180677875634, + "grad_norm": 0.12846839427947998, + "learning_rate": 0.0005382385834774163, + "loss": 2.6279, + "step": 16218 + }, + { + "epoch": 0.4809477211398749, + "grad_norm": 0.11658346652984619, + "learning_rate": 0.0005381916701439663, + "loss": 2.6488, + "step": 16219 + }, + { + "epoch": 0.48097737449218636, + "grad_norm": 0.10117319226264954, + "learning_rate": 0.0005381447564723224, + "loss": 2.6903, + "step": 16220 + }, + { + "epoch": 0.48100702784449784, + "grad_norm": 0.1292138695716858, + "learning_rate": 0.0005380978424629002, + "loss": 2.6282, + "step": 16221 + }, + { + "epoch": 0.4810366811968093, + "grad_norm": 0.13876663148403168, + "learning_rate": 0.0005380509281161151, + "loss": 2.6692, + "step": 16222 + }, + { + "epoch": 0.4810663345491208, + "grad_norm": 0.137106791138649, + "learning_rate": 0.0005380040134323825, + "loss": 2.6697, + "step": 16223 + }, + { + "epoch": 0.48109598790143226, + "grad_norm": 0.14711228013038635, + "learning_rate": 0.0005379570984121178, + "loss": 2.6603, + "step": 16224 + }, + { + "epoch": 0.48112564125374374, + "grad_norm": 0.12164401262998581, + "learning_rate": 0.0005379101830557364, + "loss": 2.6605, + "step": 16225 + }, + { + "epoch": 0.4811552946060552, + "grad_norm": 0.12229687720537186, + "learning_rate": 0.000537863267363654, + "loss": 2.6451, + "step": 16226 + }, + { + "epoch": 0.4811849479583667, + "grad_norm": 0.12419568747282028, + "learning_rate": 0.0005378163513362858, + "loss": 2.6134, + "step": 16227 + }, + { + "epoch": 0.48121460131067817, + "grad_norm": 0.10746420174837112, + "learning_rate": 0.0005377694349740472, + "loss": 2.6741, + "step": 16228 + }, + { + "epoch": 0.48124425466298965, + "grad_norm": 0.11201915889978409, + "learning_rate": 0.000537722518277354, + "loss": 2.6594, + "step": 16229 + }, + { + "epoch": 0.4812739080153011, + "grad_norm": 0.1279802918434143, + "learning_rate": 0.0005376756012466213, + "loss": 2.6824, + "step": 16230 + }, + { + "epoch": 0.4813035613676126, + "grad_norm": 0.11741775274276733, + "learning_rate": 0.0005376286838822647, + "loss": 2.6653, + "step": 16231 + }, + { + "epoch": 0.4813332147199241, + "grad_norm": 0.11561306565999985, + "learning_rate": 0.0005375817661846995, + "loss": 2.6558, + "step": 16232 + }, + { + "epoch": 0.48136286807223555, + "grad_norm": 0.10359309613704681, + "learning_rate": 0.0005375348481543415, + "loss": 2.6333, + "step": 16233 + }, + { + "epoch": 0.481392521424547, + "grad_norm": 0.09655595570802689, + "learning_rate": 0.000537487929791606, + "loss": 2.6468, + "step": 16234 + }, + { + "epoch": 0.4814221747768585, + "grad_norm": 0.09703168272972107, + "learning_rate": 0.0005374410110969084, + "loss": 2.6257, + "step": 16235 + }, + { + "epoch": 0.48145182812917, + "grad_norm": 0.09766444563865662, + "learning_rate": 0.0005373940920706641, + "loss": 2.638, + "step": 16236 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.09505915641784668, + "learning_rate": 0.0005373471727132889, + "loss": 2.6238, + "step": 16237 + }, + { + "epoch": 0.481511134833793, + "grad_norm": 0.09348843991756439, + "learning_rate": 0.0005373002530251978, + "loss": 2.6199, + "step": 16238 + }, + { + "epoch": 0.48154078818610446, + "grad_norm": 0.09424552321434021, + "learning_rate": 0.0005372533330068068, + "loss": 2.6576, + "step": 16239 + }, + { + "epoch": 0.48157044153841594, + "grad_norm": 0.1017138734459877, + "learning_rate": 0.000537206412658531, + "loss": 2.6234, + "step": 16240 + }, + { + "epoch": 0.4816000948907274, + "grad_norm": 0.11159953474998474, + "learning_rate": 0.000537159491980786, + "loss": 2.6578, + "step": 16241 + }, + { + "epoch": 0.4816297482430389, + "grad_norm": 0.12247452884912491, + "learning_rate": 0.0005371125709739875, + "loss": 2.6569, + "step": 16242 + }, + { + "epoch": 0.48165940159535037, + "grad_norm": 0.14749151468276978, + "learning_rate": 0.0005370656496385506, + "loss": 2.6375, + "step": 16243 + }, + { + "epoch": 0.48168905494766184, + "grad_norm": 0.14178462326526642, + "learning_rate": 0.0005370187279748913, + "loss": 2.6457, + "step": 16244 + }, + { + "epoch": 0.4817187082999733, + "grad_norm": 0.16267238557338715, + "learning_rate": 0.0005369718059834247, + "loss": 2.6357, + "step": 16245 + }, + { + "epoch": 0.4817483616522848, + "grad_norm": 0.1405816376209259, + "learning_rate": 0.0005369248836645663, + "loss": 2.675, + "step": 16246 + }, + { + "epoch": 0.48177801500459627, + "grad_norm": 0.13782033324241638, + "learning_rate": 0.0005368779610187317, + "loss": 2.663, + "step": 16247 + }, + { + "epoch": 0.48180766835690775, + "grad_norm": 0.1433916687965393, + "learning_rate": 0.0005368310380463364, + "loss": 2.6413, + "step": 16248 + }, + { + "epoch": 0.4818373217092192, + "grad_norm": 0.12018409371376038, + "learning_rate": 0.0005367841147477961, + "loss": 2.6557, + "step": 16249 + }, + { + "epoch": 0.4818669750615307, + "grad_norm": 0.12327545136213303, + "learning_rate": 0.0005367371911235261, + "loss": 2.6848, + "step": 16250 + }, + { + "epoch": 0.4818966284138422, + "grad_norm": 0.12222456932067871, + "learning_rate": 0.000536690267173942, + "loss": 2.6169, + "step": 16251 + }, + { + "epoch": 0.48192628176615365, + "grad_norm": 0.10337665677070618, + "learning_rate": 0.0005366433428994591, + "loss": 2.6504, + "step": 16252 + }, + { + "epoch": 0.48195593511846513, + "grad_norm": 0.10945123434066772, + "learning_rate": 0.0005365964183004932, + "loss": 2.6473, + "step": 16253 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 0.11244472116231918, + "learning_rate": 0.0005365494933774599, + "loss": 2.6522, + "step": 16254 + }, + { + "epoch": 0.4820152418230881, + "grad_norm": 0.1212957501411438, + "learning_rate": 0.0005365025681307744, + "loss": 2.6569, + "step": 16255 + }, + { + "epoch": 0.48204489517539956, + "grad_norm": 0.11035221070051193, + "learning_rate": 0.0005364556425608524, + "loss": 2.6838, + "step": 16256 + }, + { + "epoch": 0.48207454852771103, + "grad_norm": 0.12413524836301804, + "learning_rate": 0.0005364087166681096, + "loss": 2.6483, + "step": 16257 + }, + { + "epoch": 0.4821042018800225, + "grad_norm": 0.13140571117401123, + "learning_rate": 0.0005363617904529612, + "loss": 2.5981, + "step": 16258 + }, + { + "epoch": 0.48213385523233404, + "grad_norm": 0.10889989137649536, + "learning_rate": 0.0005363148639158228, + "loss": 2.6532, + "step": 16259 + }, + { + "epoch": 0.4821635085846455, + "grad_norm": 0.11591313779354095, + "learning_rate": 0.0005362679370571102, + "loss": 2.6649, + "step": 16260 + }, + { + "epoch": 0.482193161936957, + "grad_norm": 0.1078805923461914, + "learning_rate": 0.0005362210098772388, + "loss": 2.649, + "step": 16261 + }, + { + "epoch": 0.48222281528926847, + "grad_norm": 0.1125861182808876, + "learning_rate": 0.0005361740823766241, + "loss": 2.651, + "step": 16262 + }, + { + "epoch": 0.48225246864157995, + "grad_norm": 0.11168526113033295, + "learning_rate": 0.0005361271545556817, + "loss": 2.6479, + "step": 16263 + }, + { + "epoch": 0.4822821219938914, + "grad_norm": 0.11275436729192734, + "learning_rate": 0.0005360802264148271, + "loss": 2.6363, + "step": 16264 + }, + { + "epoch": 0.4823117753462029, + "grad_norm": 0.11511638015508652, + "learning_rate": 0.000536033297954476, + "loss": 2.6528, + "step": 16265 + }, + { + "epoch": 0.4823414286985144, + "grad_norm": 0.10157465934753418, + "learning_rate": 0.0005359863691750437, + "loss": 2.6581, + "step": 16266 + }, + { + "epoch": 0.48237108205082585, + "grad_norm": 0.09737180173397064, + "learning_rate": 0.0005359394400769461, + "loss": 2.6641, + "step": 16267 + }, + { + "epoch": 0.4824007354031373, + "grad_norm": 0.10606455057859421, + "learning_rate": 0.0005358925106605985, + "loss": 2.6466, + "step": 16268 + }, + { + "epoch": 0.4824303887554488, + "grad_norm": 0.10975144803524017, + "learning_rate": 0.0005358455809264165, + "loss": 2.6559, + "step": 16269 + }, + { + "epoch": 0.4824600421077603, + "grad_norm": 0.1225607767701149, + "learning_rate": 0.0005357986508748158, + "loss": 2.6075, + "step": 16270 + }, + { + "epoch": 0.48248969546007175, + "grad_norm": 0.11578598618507385, + "learning_rate": 0.000535751720506212, + "loss": 2.6647, + "step": 16271 + }, + { + "epoch": 0.48251934881238323, + "grad_norm": 0.1081441268324852, + "learning_rate": 0.0005357047898210205, + "loss": 2.6653, + "step": 16272 + }, + { + "epoch": 0.4825490021646947, + "grad_norm": 0.14627695083618164, + "learning_rate": 0.0005356578588196569, + "loss": 2.6645, + "step": 16273 + }, + { + "epoch": 0.4825786555170062, + "grad_norm": 0.1206464022397995, + "learning_rate": 0.000535610927502537, + "loss": 2.6309, + "step": 16274 + }, + { + "epoch": 0.48260830886931766, + "grad_norm": 0.11114910244941711, + "learning_rate": 0.0005355639958700759, + "loss": 2.6335, + "step": 16275 + }, + { + "epoch": 0.48263796222162914, + "grad_norm": 0.13016287982463837, + "learning_rate": 0.0005355170639226898, + "loss": 2.6619, + "step": 16276 + }, + { + "epoch": 0.4826676155739406, + "grad_norm": 0.13722030818462372, + "learning_rate": 0.000535470131660794, + "loss": 2.6574, + "step": 16277 + }, + { + "epoch": 0.4826972689262521, + "grad_norm": 0.12312261015176773, + "learning_rate": 0.0005354231990848041, + "loss": 2.6344, + "step": 16278 + }, + { + "epoch": 0.4827269222785636, + "grad_norm": 0.10085349529981613, + "learning_rate": 0.0005353762661951354, + "loss": 2.6521, + "step": 16279 + }, + { + "epoch": 0.4827565756308751, + "grad_norm": 0.137095645070076, + "learning_rate": 0.000535329332992204, + "loss": 2.6661, + "step": 16280 + }, + { + "epoch": 0.48278622898318657, + "grad_norm": 0.15329423546791077, + "learning_rate": 0.0005352823994764253, + "loss": 2.6606, + "step": 16281 + }, + { + "epoch": 0.48281588233549805, + "grad_norm": 0.1236036941409111, + "learning_rate": 0.000535235465648215, + "loss": 2.6483, + "step": 16282 + }, + { + "epoch": 0.4828455356878095, + "grad_norm": 0.12367726862430573, + "learning_rate": 0.0005351885315079885, + "loss": 2.6968, + "step": 16283 + }, + { + "epoch": 0.482875189040121, + "grad_norm": 0.12338818609714508, + "learning_rate": 0.0005351415970561615, + "loss": 2.6377, + "step": 16284 + }, + { + "epoch": 0.4829048423924325, + "grad_norm": 0.11628659814596176, + "learning_rate": 0.0005350946622931495, + "loss": 2.6465, + "step": 16285 + }, + { + "epoch": 0.48293449574474395, + "grad_norm": 0.12598441541194916, + "learning_rate": 0.0005350477272193684, + "loss": 2.657, + "step": 16286 + }, + { + "epoch": 0.48296414909705543, + "grad_norm": 0.1416654735803604, + "learning_rate": 0.0005350007918352336, + "loss": 2.6613, + "step": 16287 + }, + { + "epoch": 0.4829938024493669, + "grad_norm": 0.11744807660579681, + "learning_rate": 0.0005349538561411609, + "loss": 2.616, + "step": 16288 + }, + { + "epoch": 0.4830234558016784, + "grad_norm": 0.1144152581691742, + "learning_rate": 0.0005349069201375657, + "loss": 2.655, + "step": 16289 + }, + { + "epoch": 0.48305310915398986, + "grad_norm": 0.11956743150949478, + "learning_rate": 0.0005348599838248637, + "loss": 2.6747, + "step": 16290 + }, + { + "epoch": 0.48308276250630133, + "grad_norm": 0.11211837828159332, + "learning_rate": 0.0005348130472034707, + "loss": 2.6888, + "step": 16291 + }, + { + "epoch": 0.4831124158586128, + "grad_norm": 0.11850051581859589, + "learning_rate": 0.0005347661102738019, + "loss": 2.664, + "step": 16292 + }, + { + "epoch": 0.4831420692109243, + "grad_norm": 0.1340261995792389, + "learning_rate": 0.0005347191730362736, + "loss": 2.6691, + "step": 16293 + }, + { + "epoch": 0.48317172256323576, + "grad_norm": 0.12954404950141907, + "learning_rate": 0.0005346722354913009, + "loss": 2.6663, + "step": 16294 + }, + { + "epoch": 0.48320137591554724, + "grad_norm": 0.1365434229373932, + "learning_rate": 0.0005346252976392995, + "loss": 2.6635, + "step": 16295 + }, + { + "epoch": 0.4832310292678587, + "grad_norm": 0.13423077762126923, + "learning_rate": 0.0005345783594806852, + "loss": 2.6242, + "step": 16296 + }, + { + "epoch": 0.4832606826201702, + "grad_norm": 0.1042693480849266, + "learning_rate": 0.0005345314210158737, + "loss": 2.6425, + "step": 16297 + }, + { + "epoch": 0.48329033597248167, + "grad_norm": 0.1254681646823883, + "learning_rate": 0.0005344844822452805, + "loss": 2.657, + "step": 16298 + }, + { + "epoch": 0.48331998932479314, + "grad_norm": 0.10625249147415161, + "learning_rate": 0.0005344375431693213, + "loss": 2.633, + "step": 16299 + }, + { + "epoch": 0.4833496426771047, + "grad_norm": 0.11954861134290695, + "learning_rate": 0.0005343906037884117, + "loss": 2.6875, + "step": 16300 + }, + { + "epoch": 0.48337929602941615, + "grad_norm": 0.12309711426496506, + "learning_rate": 0.0005343436641029673, + "loss": 2.6397, + "step": 16301 + }, + { + "epoch": 0.4834089493817276, + "grad_norm": 0.12649701535701752, + "learning_rate": 0.000534296724113404, + "loss": 2.6562, + "step": 16302 + }, + { + "epoch": 0.4834386027340391, + "grad_norm": 0.11878644675016403, + "learning_rate": 0.0005342497838201373, + "loss": 2.6751, + "step": 16303 + }, + { + "epoch": 0.4834682560863506, + "grad_norm": 0.11594345420598984, + "learning_rate": 0.0005342028432235828, + "loss": 2.6509, + "step": 16304 + }, + { + "epoch": 0.48349790943866205, + "grad_norm": 0.10577788203954697, + "learning_rate": 0.0005341559023241564, + "loss": 2.69, + "step": 16305 + }, + { + "epoch": 0.48352756279097353, + "grad_norm": 0.12604016065597534, + "learning_rate": 0.0005341089611222735, + "loss": 2.6753, + "step": 16306 + }, + { + "epoch": 0.483557216143285, + "grad_norm": 0.1498228907585144, + "learning_rate": 0.0005340620196183499, + "loss": 2.6852, + "step": 16307 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 0.15189167857170105, + "learning_rate": 0.0005340150778128013, + "loss": 2.6829, + "step": 16308 + }, + { + "epoch": 0.48361652284790796, + "grad_norm": 0.1328558325767517, + "learning_rate": 0.0005339681357060433, + "loss": 2.6453, + "step": 16309 + }, + { + "epoch": 0.48364617620021944, + "grad_norm": 0.12679021060466766, + "learning_rate": 0.0005339211932984918, + "loss": 2.7007, + "step": 16310 + }, + { + "epoch": 0.4836758295525309, + "grad_norm": 0.13006959855556488, + "learning_rate": 0.0005338742505905621, + "loss": 2.61, + "step": 16311 + }, + { + "epoch": 0.4837054829048424, + "grad_norm": 0.13371945917606354, + "learning_rate": 0.0005338273075826702, + "loss": 2.6626, + "step": 16312 + }, + { + "epoch": 0.48373513625715386, + "grad_norm": 0.11942228674888611, + "learning_rate": 0.0005337803642752317, + "loss": 2.6293, + "step": 16313 + }, + { + "epoch": 0.48376478960946534, + "grad_norm": 0.12234961241483688, + "learning_rate": 0.0005337334206686622, + "loss": 2.6734, + "step": 16314 + }, + { + "epoch": 0.4837944429617768, + "grad_norm": 0.11710643023252487, + "learning_rate": 0.0005336864767633777, + "loss": 2.6763, + "step": 16315 + }, + { + "epoch": 0.4838240963140883, + "grad_norm": 0.11071798205375671, + "learning_rate": 0.0005336395325597935, + "loss": 2.6409, + "step": 16316 + }, + { + "epoch": 0.48385374966639977, + "grad_norm": 0.12609073519706726, + "learning_rate": 0.0005335925880583253, + "loss": 2.6215, + "step": 16317 + }, + { + "epoch": 0.48388340301871124, + "grad_norm": 0.13885201513767242, + "learning_rate": 0.0005335456432593891, + "loss": 2.6061, + "step": 16318 + }, + { + "epoch": 0.4839130563710227, + "grad_norm": 0.12091066688299179, + "learning_rate": 0.0005334986981634004, + "loss": 2.6649, + "step": 16319 + }, + { + "epoch": 0.4839427097233342, + "grad_norm": 0.11616144329309464, + "learning_rate": 0.0005334517527707752, + "loss": 2.6237, + "step": 16320 + }, + { + "epoch": 0.48397236307564573, + "grad_norm": 0.11278074979782104, + "learning_rate": 0.0005334048070819289, + "loss": 2.6661, + "step": 16321 + }, + { + "epoch": 0.4840020164279572, + "grad_norm": 0.11312730610370636, + "learning_rate": 0.0005333578610972773, + "loss": 2.6338, + "step": 16322 + }, + { + "epoch": 0.4840316697802687, + "grad_norm": 0.11990751326084137, + "learning_rate": 0.000533310914817236, + "loss": 2.6487, + "step": 16323 + }, + { + "epoch": 0.48406132313258016, + "grad_norm": 0.13590310513973236, + "learning_rate": 0.0005332639682422207, + "loss": 2.649, + "step": 16324 + }, + { + "epoch": 0.48409097648489163, + "grad_norm": 0.15529730916023254, + "learning_rate": 0.0005332170213726475, + "loss": 2.6546, + "step": 16325 + }, + { + "epoch": 0.4841206298372031, + "grad_norm": 0.12887780368328094, + "learning_rate": 0.0005331700742089319, + "loss": 2.6706, + "step": 16326 + }, + { + "epoch": 0.4841502831895146, + "grad_norm": 0.11260472238063812, + "learning_rate": 0.0005331231267514896, + "loss": 2.6788, + "step": 16327 + }, + { + "epoch": 0.48417993654182606, + "grad_norm": 0.12808968126773834, + "learning_rate": 0.0005330761790007363, + "loss": 2.6481, + "step": 16328 + }, + { + "epoch": 0.48420958989413754, + "grad_norm": 0.15047776699066162, + "learning_rate": 0.0005330292309570876, + "loss": 2.6468, + "step": 16329 + }, + { + "epoch": 0.484239243246449, + "grad_norm": 0.1309632807970047, + "learning_rate": 0.0005329822826209595, + "loss": 2.6338, + "step": 16330 + }, + { + "epoch": 0.4842688965987605, + "grad_norm": 0.10893052071332932, + "learning_rate": 0.0005329353339927676, + "loss": 2.6361, + "step": 16331 + }, + { + "epoch": 0.48429854995107197, + "grad_norm": 0.1336885392665863, + "learning_rate": 0.0005328883850729277, + "loss": 2.683, + "step": 16332 + }, + { + "epoch": 0.48432820330338344, + "grad_norm": 0.15353991091251373, + "learning_rate": 0.0005328414358618555, + "loss": 2.6506, + "step": 16333 + }, + { + "epoch": 0.4843578566556949, + "grad_norm": 0.13252367079257965, + "learning_rate": 0.0005327944863599668, + "loss": 2.6665, + "step": 16334 + }, + { + "epoch": 0.4843875100080064, + "grad_norm": 0.11127161234617233, + "learning_rate": 0.0005327475365676772, + "loss": 2.597, + "step": 16335 + }, + { + "epoch": 0.48441716336031787, + "grad_norm": 0.11601609736680984, + "learning_rate": 0.0005327005864854026, + "loss": 2.6579, + "step": 16336 + }, + { + "epoch": 0.48444681671262935, + "grad_norm": 0.12961164116859436, + "learning_rate": 0.0005326536361135586, + "loss": 2.6386, + "step": 16337 + }, + { + "epoch": 0.4844764700649408, + "grad_norm": 0.11686054617166519, + "learning_rate": 0.0005326066854525612, + "loss": 2.6523, + "step": 16338 + }, + { + "epoch": 0.4845061234172523, + "grad_norm": 0.10776098817586899, + "learning_rate": 0.0005325597345028259, + "loss": 2.668, + "step": 16339 + }, + { + "epoch": 0.4845357767695638, + "grad_norm": 0.15054890513420105, + "learning_rate": 0.0005325127832647687, + "loss": 2.6714, + "step": 16340 + }, + { + "epoch": 0.48456543012187525, + "grad_norm": 0.12738841772079468, + "learning_rate": 0.0005324658317388049, + "loss": 2.6534, + "step": 16341 + }, + { + "epoch": 0.4845950834741868, + "grad_norm": 0.10628295689821243, + "learning_rate": 0.0005324188799253509, + "loss": 2.6428, + "step": 16342 + }, + { + "epoch": 0.48462473682649826, + "grad_norm": 0.11199741810560226, + "learning_rate": 0.000532371927824822, + "loss": 2.6476, + "step": 16343 + }, + { + "epoch": 0.48465439017880974, + "grad_norm": 0.14670109748840332, + "learning_rate": 0.0005323249754376341, + "loss": 2.6551, + "step": 16344 + }, + { + "epoch": 0.4846840435311212, + "grad_norm": 0.14119595289230347, + "learning_rate": 0.0005322780227642031, + "loss": 2.6512, + "step": 16345 + }, + { + "epoch": 0.4847136968834327, + "grad_norm": 0.11155374348163605, + "learning_rate": 0.0005322310698049446, + "loss": 2.6771, + "step": 16346 + }, + { + "epoch": 0.48474335023574416, + "grad_norm": 0.12204521894454956, + "learning_rate": 0.0005321841165602746, + "loss": 2.6301, + "step": 16347 + }, + { + "epoch": 0.48477300358805564, + "grad_norm": 0.1378782093524933, + "learning_rate": 0.0005321371630306087, + "loss": 2.6544, + "step": 16348 + }, + { + "epoch": 0.4848026569403671, + "grad_norm": 0.13094612956047058, + "learning_rate": 0.0005320902092163625, + "loss": 2.6642, + "step": 16349 + }, + { + "epoch": 0.4848323102926786, + "grad_norm": 0.09913370013237, + "learning_rate": 0.000532043255117952, + "loss": 2.6471, + "step": 16350 + }, + { + "epoch": 0.48486196364499007, + "grad_norm": 0.14368242025375366, + "learning_rate": 0.0005319963007357931, + "loss": 2.6816, + "step": 16351 + }, + { + "epoch": 0.48489161699730154, + "grad_norm": 0.16173648834228516, + "learning_rate": 0.0005319493460703014, + "loss": 2.6802, + "step": 16352 + }, + { + "epoch": 0.484921270349613, + "grad_norm": 0.16765402257442474, + "learning_rate": 0.000531902391121893, + "loss": 2.6446, + "step": 16353 + }, + { + "epoch": 0.4849509237019245, + "grad_norm": 0.12797291576862335, + "learning_rate": 0.0005318554358909832, + "loss": 2.6621, + "step": 16354 + }, + { + "epoch": 0.484980577054236, + "grad_norm": 0.1309882551431656, + "learning_rate": 0.000531808480377988, + "loss": 2.6359, + "step": 16355 + }, + { + "epoch": 0.48501023040654745, + "grad_norm": 0.15071845054626465, + "learning_rate": 0.0005317615245833232, + "loss": 2.6648, + "step": 16356 + }, + { + "epoch": 0.4850398837588589, + "grad_norm": 0.11484530568122864, + "learning_rate": 0.0005317145685074049, + "loss": 2.6765, + "step": 16357 + }, + { + "epoch": 0.4850695371111704, + "grad_norm": 0.11116950213909149, + "learning_rate": 0.0005316676121506485, + "loss": 2.6418, + "step": 16358 + }, + { + "epoch": 0.4850991904634819, + "grad_norm": 0.11388172209262848, + "learning_rate": 0.0005316206555134701, + "loss": 2.6443, + "step": 16359 + }, + { + "epoch": 0.48512884381579335, + "grad_norm": 0.1252434104681015, + "learning_rate": 0.0005315736985962852, + "loss": 2.6416, + "step": 16360 + }, + { + "epoch": 0.48515849716810483, + "grad_norm": 0.11912503093481064, + "learning_rate": 0.00053152674139951, + "loss": 2.6927, + "step": 16361 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 0.10584302246570587, + "learning_rate": 0.0005314797839235599, + "loss": 2.6507, + "step": 16362 + }, + { + "epoch": 0.48521780387272784, + "grad_norm": 0.11648177355527878, + "learning_rate": 0.0005314328261688508, + "loss": 2.6397, + "step": 16363 + }, + { + "epoch": 0.4852474572250393, + "grad_norm": 0.11658894270658493, + "learning_rate": 0.0005313858681357988, + "loss": 2.6077, + "step": 16364 + }, + { + "epoch": 0.4852771105773508, + "grad_norm": 0.11346179991960526, + "learning_rate": 0.0005313389098248196, + "loss": 2.66, + "step": 16365 + }, + { + "epoch": 0.48530676392966227, + "grad_norm": 0.1164432168006897, + "learning_rate": 0.0005312919512363289, + "loss": 2.6943, + "step": 16366 + }, + { + "epoch": 0.48533641728197374, + "grad_norm": 0.11791500449180603, + "learning_rate": 0.0005312449923707425, + "loss": 2.656, + "step": 16367 + }, + { + "epoch": 0.4853660706342852, + "grad_norm": 0.11307903379201889, + "learning_rate": 0.0005311980332284765, + "loss": 2.6299, + "step": 16368 + }, + { + "epoch": 0.4853957239865967, + "grad_norm": 0.12370114028453827, + "learning_rate": 0.0005311510738099465, + "loss": 2.6671, + "step": 16369 + }, + { + "epoch": 0.48542537733890817, + "grad_norm": 0.12235091626644135, + "learning_rate": 0.0005311041141155684, + "loss": 2.6798, + "step": 16370 + }, + { + "epoch": 0.48545503069121965, + "grad_norm": 0.12338915467262268, + "learning_rate": 0.0005310571541457579, + "loss": 2.6745, + "step": 16371 + }, + { + "epoch": 0.4854846840435311, + "grad_norm": 0.12231872975826263, + "learning_rate": 0.0005310101939009312, + "loss": 2.6473, + "step": 16372 + }, + { + "epoch": 0.4855143373958426, + "grad_norm": 0.12816013395786285, + "learning_rate": 0.0005309632333815038, + "loss": 2.6367, + "step": 16373 + }, + { + "epoch": 0.4855439907481541, + "grad_norm": 0.1128997951745987, + "learning_rate": 0.0005309162725878917, + "loss": 2.6822, + "step": 16374 + }, + { + "epoch": 0.48557364410046555, + "grad_norm": 0.11941536515951157, + "learning_rate": 0.0005308693115205106, + "loss": 2.6855, + "step": 16375 + }, + { + "epoch": 0.485603297452777, + "grad_norm": 0.13930775225162506, + "learning_rate": 0.0005308223501797765, + "loss": 2.6655, + "step": 16376 + }, + { + "epoch": 0.4856329508050885, + "grad_norm": 0.15455669164657593, + "learning_rate": 0.0005307753885661052, + "loss": 2.6894, + "step": 16377 + }, + { + "epoch": 0.4856626041574, + "grad_norm": 0.12135829776525497, + "learning_rate": 0.0005307284266799125, + "loss": 2.6547, + "step": 16378 + }, + { + "epoch": 0.48569225750971146, + "grad_norm": 0.11848343908786774, + "learning_rate": 0.0005306814645216144, + "loss": 2.6436, + "step": 16379 + }, + { + "epoch": 0.48572191086202293, + "grad_norm": 0.12533898651599884, + "learning_rate": 0.0005306345020916265, + "loss": 2.6657, + "step": 16380 + }, + { + "epoch": 0.4857515642143344, + "grad_norm": 0.11287776380777359, + "learning_rate": 0.0005305875393903651, + "loss": 2.6535, + "step": 16381 + }, + { + "epoch": 0.4857812175666459, + "grad_norm": 0.1284097135066986, + "learning_rate": 0.0005305405764182455, + "loss": 2.6111, + "step": 16382 + }, + { + "epoch": 0.48581087091895736, + "grad_norm": 0.12910141050815582, + "learning_rate": 0.000530493613175684, + "loss": 2.6861, + "step": 16383 + }, + { + "epoch": 0.4858405242712689, + "grad_norm": 0.13067397475242615, + "learning_rate": 0.0005304466496630963, + "loss": 2.6392, + "step": 16384 + }, + { + "epoch": 0.48587017762358037, + "grad_norm": 0.1114138662815094, + "learning_rate": 0.0005303996858808983, + "loss": 2.6309, + "step": 16385 + }, + { + "epoch": 0.48589983097589184, + "grad_norm": 0.12373001128435135, + "learning_rate": 0.000530352721829506, + "loss": 2.6582, + "step": 16386 + }, + { + "epoch": 0.4859294843282033, + "grad_norm": 0.11376983672380447, + "learning_rate": 0.000530305757509335, + "loss": 2.6712, + "step": 16387 + }, + { + "epoch": 0.4859591376805148, + "grad_norm": 0.1139267310500145, + "learning_rate": 0.0005302587929208012, + "loss": 2.6266, + "step": 16388 + }, + { + "epoch": 0.4859887910328263, + "grad_norm": 0.11862007528543472, + "learning_rate": 0.0005302118280643206, + "loss": 2.6495, + "step": 16389 + }, + { + "epoch": 0.48601844438513775, + "grad_norm": 0.11164918541908264, + "learning_rate": 0.0005301648629403093, + "loss": 2.6271, + "step": 16390 + }, + { + "epoch": 0.4860480977374492, + "grad_norm": 0.10577265173196793, + "learning_rate": 0.000530117897549183, + "loss": 2.6351, + "step": 16391 + }, + { + "epoch": 0.4860777510897607, + "grad_norm": 0.11797158420085907, + "learning_rate": 0.0005300709318913574, + "loss": 2.6978, + "step": 16392 + }, + { + "epoch": 0.4861074044420722, + "grad_norm": 0.11967350542545319, + "learning_rate": 0.0005300239659672485, + "loss": 2.6542, + "step": 16393 + }, + { + "epoch": 0.48613705779438365, + "grad_norm": 0.11083029955625534, + "learning_rate": 0.0005299769997772722, + "loss": 2.7008, + "step": 16394 + }, + { + "epoch": 0.48616671114669513, + "grad_norm": 0.1132173240184784, + "learning_rate": 0.0005299300333218444, + "loss": 2.6477, + "step": 16395 + }, + { + "epoch": 0.4861963644990066, + "grad_norm": 0.1078856959939003, + "learning_rate": 0.0005298830666013811, + "loss": 2.6751, + "step": 16396 + }, + { + "epoch": 0.4862260178513181, + "grad_norm": 0.1159668117761612, + "learning_rate": 0.0005298360996162982, + "loss": 2.6488, + "step": 16397 + }, + { + "epoch": 0.48625567120362956, + "grad_norm": 0.1333291232585907, + "learning_rate": 0.0005297891323670115, + "loss": 2.6696, + "step": 16398 + }, + { + "epoch": 0.48628532455594103, + "grad_norm": 0.15180142223834991, + "learning_rate": 0.0005297421648539367, + "loss": 2.6774, + "step": 16399 + }, + { + "epoch": 0.4863149779082525, + "grad_norm": 0.1626322716474533, + "learning_rate": 0.0005296951970774901, + "loss": 2.6224, + "step": 16400 + }, + { + "epoch": 0.486344631260564, + "grad_norm": 0.13420604169368744, + "learning_rate": 0.0005296482290380874, + "loss": 2.6491, + "step": 16401 + }, + { + "epoch": 0.48637428461287546, + "grad_norm": 0.12013046443462372, + "learning_rate": 0.0005296012607361446, + "loss": 2.6741, + "step": 16402 + }, + { + "epoch": 0.48640393796518694, + "grad_norm": 0.15661700069904327, + "learning_rate": 0.0005295542921720776, + "loss": 2.6351, + "step": 16403 + }, + { + "epoch": 0.48643359131749847, + "grad_norm": 0.15054960548877716, + "learning_rate": 0.000529507323346302, + "loss": 2.6783, + "step": 16404 + }, + { + "epoch": 0.48646324466980995, + "grad_norm": 0.13539262115955353, + "learning_rate": 0.0005294603542592342, + "loss": 2.694, + "step": 16405 + }, + { + "epoch": 0.4864928980221214, + "grad_norm": 0.13446420431137085, + "learning_rate": 0.0005294133849112899, + "loss": 2.6553, + "step": 16406 + }, + { + "epoch": 0.4865225513744329, + "grad_norm": 0.11596526205539703, + "learning_rate": 0.0005293664153028849, + "loss": 2.6549, + "step": 16407 + }, + { + "epoch": 0.4865522047267444, + "grad_norm": 0.12323106080293655, + "learning_rate": 0.0005293194454344354, + "loss": 2.6423, + "step": 16408 + }, + { + "epoch": 0.48658185807905585, + "grad_norm": 0.13208431005477905, + "learning_rate": 0.0005292724753063571, + "loss": 2.6467, + "step": 16409 + }, + { + "epoch": 0.4866115114313673, + "grad_norm": 0.11322976648807526, + "learning_rate": 0.0005292255049190661, + "loss": 2.6266, + "step": 16410 + }, + { + "epoch": 0.4866411647836788, + "grad_norm": 0.12643033266067505, + "learning_rate": 0.0005291785342729781, + "loss": 2.6417, + "step": 16411 + }, + { + "epoch": 0.4866708181359903, + "grad_norm": 0.1158999353647232, + "learning_rate": 0.0005291315633685094, + "loss": 2.6451, + "step": 16412 + }, + { + "epoch": 0.48670047148830176, + "grad_norm": 0.1296369731426239, + "learning_rate": 0.0005290845922060754, + "loss": 2.6154, + "step": 16413 + }, + { + "epoch": 0.48673012484061323, + "grad_norm": 0.12892620265483856, + "learning_rate": 0.0005290376207860927, + "loss": 2.6748, + "step": 16414 + }, + { + "epoch": 0.4867597781929247, + "grad_norm": 0.11908628046512604, + "learning_rate": 0.0005289906491089765, + "loss": 2.6749, + "step": 16415 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 0.1272568702697754, + "learning_rate": 0.0005289436771751434, + "loss": 2.6598, + "step": 16416 + }, + { + "epoch": 0.48681908489754766, + "grad_norm": 0.12749335169792175, + "learning_rate": 0.0005288967049850089, + "loss": 2.6478, + "step": 16417 + }, + { + "epoch": 0.48684873824985914, + "grad_norm": 0.132768914103508, + "learning_rate": 0.0005288497325389892, + "loss": 2.6641, + "step": 16418 + }, + { + "epoch": 0.4868783916021706, + "grad_norm": 0.1014924943447113, + "learning_rate": 0.0005288027598375003, + "loss": 2.6422, + "step": 16419 + }, + { + "epoch": 0.4869080449544821, + "grad_norm": 0.10666588693857193, + "learning_rate": 0.0005287557868809579, + "loss": 2.6825, + "step": 16420 + }, + { + "epoch": 0.48693769830679356, + "grad_norm": 0.12364991009235382, + "learning_rate": 0.0005287088136697778, + "loss": 2.6471, + "step": 16421 + }, + { + "epoch": 0.48696735165910504, + "grad_norm": 0.1367063969373703, + "learning_rate": 0.0005286618402043765, + "loss": 2.6467, + "step": 16422 + }, + { + "epoch": 0.4869970050114165, + "grad_norm": 0.1401624083518982, + "learning_rate": 0.0005286148664851697, + "loss": 2.6761, + "step": 16423 + }, + { + "epoch": 0.487026658363728, + "grad_norm": 0.12331351637840271, + "learning_rate": 0.0005285678925125735, + "loss": 2.6335, + "step": 16424 + }, + { + "epoch": 0.4870563117160395, + "grad_norm": 0.13018940389156342, + "learning_rate": 0.0005285209182870034, + "loss": 2.6721, + "step": 16425 + }, + { + "epoch": 0.487085965068351, + "grad_norm": 0.13181447982788086, + "learning_rate": 0.0005284739438088758, + "loss": 2.6675, + "step": 16426 + }, + { + "epoch": 0.4871156184206625, + "grad_norm": 0.11294158548116684, + "learning_rate": 0.0005284269690786066, + "loss": 2.6594, + "step": 16427 + }, + { + "epoch": 0.48714527177297395, + "grad_norm": 0.12051820009946823, + "learning_rate": 0.0005283799940966114, + "loss": 2.6691, + "step": 16428 + }, + { + "epoch": 0.48717492512528543, + "grad_norm": 0.10985372215509415, + "learning_rate": 0.0005283330188633068, + "loss": 2.6868, + "step": 16429 + }, + { + "epoch": 0.4872045784775969, + "grad_norm": 0.11448997259140015, + "learning_rate": 0.0005282860433791083, + "loss": 2.6864, + "step": 16430 + }, + { + "epoch": 0.4872342318299084, + "grad_norm": 0.12379468977451324, + "learning_rate": 0.0005282390676444322, + "loss": 2.667, + "step": 16431 + }, + { + "epoch": 0.48726388518221986, + "grad_norm": 0.11940142512321472, + "learning_rate": 0.0005281920916596942, + "loss": 2.6617, + "step": 16432 + }, + { + "epoch": 0.48729353853453133, + "grad_norm": 0.12115234136581421, + "learning_rate": 0.0005281451154253104, + "loss": 2.6301, + "step": 16433 + }, + { + "epoch": 0.4873231918868428, + "grad_norm": 0.1149098128080368, + "learning_rate": 0.0005280981389416966, + "loss": 2.6745, + "step": 16434 + }, + { + "epoch": 0.4873528452391543, + "grad_norm": 0.10347721725702286, + "learning_rate": 0.0005280511622092693, + "loss": 2.6297, + "step": 16435 + }, + { + "epoch": 0.48738249859146576, + "grad_norm": 0.11266220360994339, + "learning_rate": 0.0005280041852284439, + "loss": 2.6533, + "step": 16436 + }, + { + "epoch": 0.48741215194377724, + "grad_norm": 0.11388355493545532, + "learning_rate": 0.0005279572079996367, + "loss": 2.6745, + "step": 16437 + }, + { + "epoch": 0.4874418052960887, + "grad_norm": 0.11142325401306152, + "learning_rate": 0.0005279102305232637, + "loss": 2.6625, + "step": 16438 + }, + { + "epoch": 0.4874714586484002, + "grad_norm": 0.12074562907218933, + "learning_rate": 0.0005278632527997407, + "loss": 2.6597, + "step": 16439 + }, + { + "epoch": 0.48750111200071167, + "grad_norm": 0.1097375676035881, + "learning_rate": 0.0005278162748294839, + "loss": 2.6034, + "step": 16440 + }, + { + "epoch": 0.48753076535302314, + "grad_norm": 0.11262618750333786, + "learning_rate": 0.0005277692966129091, + "loss": 2.6285, + "step": 16441 + }, + { + "epoch": 0.4875604187053346, + "grad_norm": 0.10108945518732071, + "learning_rate": 0.0005277223181504324, + "loss": 2.6346, + "step": 16442 + }, + { + "epoch": 0.4875900720576461, + "grad_norm": 0.11553481966257095, + "learning_rate": 0.00052767533944247, + "loss": 2.6429, + "step": 16443 + }, + { + "epoch": 0.48761972540995757, + "grad_norm": 0.10960012674331665, + "learning_rate": 0.0005276283604894376, + "loss": 2.6849, + "step": 16444 + }, + { + "epoch": 0.48764937876226905, + "grad_norm": 0.1081543043255806, + "learning_rate": 0.0005275813812917514, + "loss": 2.6342, + "step": 16445 + }, + { + "epoch": 0.4876790321145806, + "grad_norm": 0.12233809381723404, + "learning_rate": 0.0005275344018498274, + "loss": 2.6686, + "step": 16446 + }, + { + "epoch": 0.48770868546689206, + "grad_norm": 0.12774276733398438, + "learning_rate": 0.0005274874221640813, + "loss": 2.6583, + "step": 16447 + }, + { + "epoch": 0.48773833881920353, + "grad_norm": 0.10561387985944748, + "learning_rate": 0.0005274404422349295, + "loss": 2.6312, + "step": 16448 + }, + { + "epoch": 0.487767992171515, + "grad_norm": 0.10389541834592819, + "learning_rate": 0.0005273934620627878, + "loss": 2.6743, + "step": 16449 + }, + { + "epoch": 0.4877976455238265, + "grad_norm": 0.10825281590223312, + "learning_rate": 0.0005273464816480723, + "loss": 2.6455, + "step": 16450 + }, + { + "epoch": 0.48782729887613796, + "grad_norm": 0.1333082765340805, + "learning_rate": 0.0005272995009911991, + "loss": 2.6849, + "step": 16451 + }, + { + "epoch": 0.48785695222844944, + "grad_norm": 0.11894572526216507, + "learning_rate": 0.0005272525200925842, + "loss": 2.6282, + "step": 16452 + }, + { + "epoch": 0.4878866055807609, + "grad_norm": 0.13483063876628876, + "learning_rate": 0.0005272055389526434, + "loss": 2.6196, + "step": 16453 + }, + { + "epoch": 0.4879162589330724, + "grad_norm": 0.13555431365966797, + "learning_rate": 0.0005271585575717929, + "loss": 2.624, + "step": 16454 + }, + { + "epoch": 0.48794591228538386, + "grad_norm": 0.12355600297451019, + "learning_rate": 0.0005271115759504487, + "loss": 2.6433, + "step": 16455 + }, + { + "epoch": 0.48797556563769534, + "grad_norm": 0.10740988701581955, + "learning_rate": 0.0005270645940890268, + "loss": 2.6395, + "step": 16456 + }, + { + "epoch": 0.4880052189900068, + "grad_norm": 0.11770416796207428, + "learning_rate": 0.0005270176119879435, + "loss": 2.6925, + "step": 16457 + }, + { + "epoch": 0.4880348723423183, + "grad_norm": 0.13160204887390137, + "learning_rate": 0.0005269706296476144, + "loss": 2.6343, + "step": 16458 + }, + { + "epoch": 0.48806452569462977, + "grad_norm": 0.12218109518289566, + "learning_rate": 0.0005269236470684559, + "loss": 2.6741, + "step": 16459 + }, + { + "epoch": 0.48809417904694125, + "grad_norm": 0.12451578676700592, + "learning_rate": 0.0005268766642508837, + "loss": 2.6484, + "step": 16460 + }, + { + "epoch": 0.4881238323992527, + "grad_norm": 0.14504624903202057, + "learning_rate": 0.0005268296811953141, + "loss": 2.6583, + "step": 16461 + }, + { + "epoch": 0.4881534857515642, + "grad_norm": 0.13544265925884247, + "learning_rate": 0.0005267826979021632, + "loss": 2.6408, + "step": 16462 + }, + { + "epoch": 0.4881831391038757, + "grad_norm": 0.11947884410619736, + "learning_rate": 0.0005267357143718468, + "loss": 2.6391, + "step": 16463 + }, + { + "epoch": 0.48821279245618715, + "grad_norm": 0.12685419619083405, + "learning_rate": 0.0005266887306047811, + "loss": 2.6567, + "step": 16464 + }, + { + "epoch": 0.4882424458084986, + "grad_norm": 0.13204605877399445, + "learning_rate": 0.000526641746601382, + "loss": 2.6762, + "step": 16465 + }, + { + "epoch": 0.4882720991608101, + "grad_norm": 0.1297285258769989, + "learning_rate": 0.0005265947623620656, + "loss": 2.692, + "step": 16466 + }, + { + "epoch": 0.48830175251312163, + "grad_norm": 0.1355925053358078, + "learning_rate": 0.0005265477778872483, + "loss": 2.6013, + "step": 16467 + }, + { + "epoch": 0.4883314058654331, + "grad_norm": 0.1233421117067337, + "learning_rate": 0.0005265007931773457, + "loss": 2.6611, + "step": 16468 + }, + { + "epoch": 0.4883610592177446, + "grad_norm": 0.13277941942214966, + "learning_rate": 0.000526453808232774, + "loss": 2.6632, + "step": 16469 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 0.1394268125295639, + "learning_rate": 0.0005264068230539494, + "loss": 2.6603, + "step": 16470 + }, + { + "epoch": 0.48842036592236754, + "grad_norm": 0.12363879382610321, + "learning_rate": 0.0005263598376412877, + "loss": 2.6762, + "step": 16471 + }, + { + "epoch": 0.488450019274679, + "grad_norm": 0.11366458982229233, + "learning_rate": 0.0005263128519952052, + "loss": 2.6396, + "step": 16472 + }, + { + "epoch": 0.4884796726269905, + "grad_norm": 0.13791225850582123, + "learning_rate": 0.0005262658661161178, + "loss": 2.6565, + "step": 16473 + }, + { + "epoch": 0.48850932597930197, + "grad_norm": 0.1420263797044754, + "learning_rate": 0.0005262188800044418, + "loss": 2.6724, + "step": 16474 + }, + { + "epoch": 0.48853897933161344, + "grad_norm": 0.1391598880290985, + "learning_rate": 0.0005261718936605931, + "loss": 2.6758, + "step": 16475 + }, + { + "epoch": 0.4885686326839249, + "grad_norm": 0.11420057713985443, + "learning_rate": 0.0005261249070849876, + "loss": 2.6844, + "step": 16476 + }, + { + "epoch": 0.4885982860362364, + "grad_norm": 0.10449489951133728, + "learning_rate": 0.0005260779202780417, + "loss": 2.6732, + "step": 16477 + }, + { + "epoch": 0.48862793938854787, + "grad_norm": 0.11208783835172653, + "learning_rate": 0.0005260309332401714, + "loss": 2.6709, + "step": 16478 + }, + { + "epoch": 0.48865759274085935, + "grad_norm": 0.1120118573307991, + "learning_rate": 0.0005259839459717926, + "loss": 2.6752, + "step": 16479 + }, + { + "epoch": 0.4886872460931708, + "grad_norm": 0.10334838926792145, + "learning_rate": 0.0005259369584733215, + "loss": 2.6624, + "step": 16480 + }, + { + "epoch": 0.4887168994454823, + "grad_norm": 0.1100066676735878, + "learning_rate": 0.0005258899707451742, + "loss": 2.6516, + "step": 16481 + }, + { + "epoch": 0.4887465527977938, + "grad_norm": 0.10613589733839035, + "learning_rate": 0.0005258429827877667, + "loss": 2.6774, + "step": 16482 + }, + { + "epoch": 0.48877620615010525, + "grad_norm": 0.11713147908449173, + "learning_rate": 0.0005257959946015154, + "loss": 2.6536, + "step": 16483 + }, + { + "epoch": 0.48880585950241673, + "grad_norm": 0.13563773036003113, + "learning_rate": 0.000525749006186836, + "loss": 2.665, + "step": 16484 + }, + { + "epoch": 0.4888355128547282, + "grad_norm": 0.12377791106700897, + "learning_rate": 0.0005257020175441447, + "loss": 2.6361, + "step": 16485 + }, + { + "epoch": 0.4888651662070397, + "grad_norm": 0.12607936561107635, + "learning_rate": 0.0005256550286738575, + "loss": 2.6661, + "step": 16486 + }, + { + "epoch": 0.48889481955935116, + "grad_norm": 0.13358177244663239, + "learning_rate": 0.0005256080395763908, + "loss": 2.6501, + "step": 16487 + }, + { + "epoch": 0.4889244729116627, + "grad_norm": 0.13779684901237488, + "learning_rate": 0.0005255610502521605, + "loss": 2.6994, + "step": 16488 + }, + { + "epoch": 0.48895412626397416, + "grad_norm": 0.13168838620185852, + "learning_rate": 0.0005255140607015826, + "loss": 2.6567, + "step": 16489 + }, + { + "epoch": 0.48898377961628564, + "grad_norm": 0.1155891865491867, + "learning_rate": 0.0005254670709250735, + "loss": 2.6398, + "step": 16490 + }, + { + "epoch": 0.4890134329685971, + "grad_norm": 0.13606083393096924, + "learning_rate": 0.000525420080923049, + "loss": 2.6674, + "step": 16491 + }, + { + "epoch": 0.4890430863209086, + "grad_norm": 0.12493971735239029, + "learning_rate": 0.0005253730906959253, + "loss": 2.6839, + "step": 16492 + }, + { + "epoch": 0.48907273967322007, + "grad_norm": 0.11175957322120667, + "learning_rate": 0.0005253261002441186, + "loss": 2.6737, + "step": 16493 + }, + { + "epoch": 0.48910239302553155, + "grad_norm": 0.1088763102889061, + "learning_rate": 0.0005252791095680448, + "loss": 2.6252, + "step": 16494 + }, + { + "epoch": 0.489132046377843, + "grad_norm": 0.12813426554203033, + "learning_rate": 0.0005252321186681204, + "loss": 2.6317, + "step": 16495 + }, + { + "epoch": 0.4891616997301545, + "grad_norm": 0.12331829220056534, + "learning_rate": 0.000525185127544761, + "loss": 2.6562, + "step": 16496 + }, + { + "epoch": 0.489191353082466, + "grad_norm": 0.10251522064208984, + "learning_rate": 0.0005251381361983831, + "loss": 2.6624, + "step": 16497 + }, + { + "epoch": 0.48922100643477745, + "grad_norm": 0.11357148736715317, + "learning_rate": 0.0005250911446294026, + "loss": 2.6594, + "step": 16498 + }, + { + "epoch": 0.4892506597870889, + "grad_norm": 0.11595022678375244, + "learning_rate": 0.0005250441528382357, + "loss": 2.599, + "step": 16499 + }, + { + "epoch": 0.4892803131394004, + "grad_norm": 0.12006942927837372, + "learning_rate": 0.0005249971608252987, + "loss": 2.66, + "step": 16500 + }, + { + "epoch": 0.4893099664917119, + "grad_norm": 0.1330784261226654, + "learning_rate": 0.0005249501685910074, + "loss": 2.6113, + "step": 16501 + }, + { + "epoch": 0.48933961984402335, + "grad_norm": 0.13730627298355103, + "learning_rate": 0.0005249031761357781, + "loss": 2.6637, + "step": 16502 + }, + { + "epoch": 0.48936927319633483, + "grad_norm": 0.12232384085655212, + "learning_rate": 0.000524856183460027, + "loss": 2.6753, + "step": 16503 + }, + { + "epoch": 0.4893989265486463, + "grad_norm": 0.0971483662724495, + "learning_rate": 0.00052480919056417, + "loss": 2.6538, + "step": 16504 + }, + { + "epoch": 0.4894285799009578, + "grad_norm": 0.12566906213760376, + "learning_rate": 0.0005247621974486233, + "loss": 2.6377, + "step": 16505 + }, + { + "epoch": 0.48945823325326926, + "grad_norm": 0.13282768428325653, + "learning_rate": 0.0005247152041138033, + "loss": 2.6264, + "step": 16506 + }, + { + "epoch": 0.48948788660558074, + "grad_norm": 0.12108078598976135, + "learning_rate": 0.0005246682105601257, + "loss": 2.6464, + "step": 16507 + }, + { + "epoch": 0.48951753995789227, + "grad_norm": 0.13307610154151917, + "learning_rate": 0.000524621216788007, + "loss": 2.6551, + "step": 16508 + }, + { + "epoch": 0.48954719331020374, + "grad_norm": 0.11622413247823715, + "learning_rate": 0.0005245742227978631, + "loss": 2.6044, + "step": 16509 + }, + { + "epoch": 0.4895768466625152, + "grad_norm": 0.12837910652160645, + "learning_rate": 0.0005245272285901104, + "loss": 2.6465, + "step": 16510 + }, + { + "epoch": 0.4896065000148267, + "grad_norm": 0.12002262473106384, + "learning_rate": 0.0005244802341651648, + "loss": 2.63, + "step": 16511 + }, + { + "epoch": 0.48963615336713817, + "grad_norm": 0.1270824819803238, + "learning_rate": 0.0005244332395234426, + "loss": 2.6833, + "step": 16512 + }, + { + "epoch": 0.48966580671944965, + "grad_norm": 0.12933947145938873, + "learning_rate": 0.0005243862446653596, + "loss": 2.6153, + "step": 16513 + }, + { + "epoch": 0.4896954600717611, + "grad_norm": 0.10252153873443604, + "learning_rate": 0.0005243392495913325, + "loss": 2.6516, + "step": 16514 + }, + { + "epoch": 0.4897251134240726, + "grad_norm": 0.12743748724460602, + "learning_rate": 0.0005242922543017771, + "loss": 2.6799, + "step": 16515 + }, + { + "epoch": 0.4897547667763841, + "grad_norm": 0.1399061232805252, + "learning_rate": 0.0005242452587971096, + "loss": 2.6687, + "step": 16516 + }, + { + "epoch": 0.48978442012869555, + "grad_norm": 0.13839516043663025, + "learning_rate": 0.0005241982630777464, + "loss": 2.6343, + "step": 16517 + }, + { + "epoch": 0.48981407348100703, + "grad_norm": 0.09550376236438751, + "learning_rate": 0.000524151267144103, + "loss": 2.6476, + "step": 16518 + }, + { + "epoch": 0.4898437268333185, + "grad_norm": 0.12501299381256104, + "learning_rate": 0.0005241042709965961, + "loss": 2.6671, + "step": 16519 + }, + { + "epoch": 0.48987338018563, + "grad_norm": 0.11424875259399414, + "learning_rate": 0.0005240572746356418, + "loss": 2.6552, + "step": 16520 + }, + { + "epoch": 0.48990303353794146, + "grad_norm": 0.11523524671792984, + "learning_rate": 0.0005240102780616563, + "loss": 2.6516, + "step": 16521 + }, + { + "epoch": 0.48993268689025293, + "grad_norm": 0.11552272737026215, + "learning_rate": 0.0005239632812750556, + "loss": 2.6582, + "step": 16522 + }, + { + "epoch": 0.4899623402425644, + "grad_norm": 0.1268199235200882, + "learning_rate": 0.000523916284276256, + "loss": 2.6611, + "step": 16523 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 0.10995377600193024, + "learning_rate": 0.0005238692870656735, + "loss": 2.6225, + "step": 16524 + }, + { + "epoch": 0.49002164694718736, + "grad_norm": 0.10757226496934891, + "learning_rate": 0.0005238222896437242, + "loss": 2.6338, + "step": 16525 + }, + { + "epoch": 0.49005130029949884, + "grad_norm": 0.12004981189966202, + "learning_rate": 0.0005237752920108248, + "loss": 2.6527, + "step": 16526 + }, + { + "epoch": 0.4900809536518103, + "grad_norm": 0.1269870102405548, + "learning_rate": 0.0005237282941673909, + "loss": 2.6737, + "step": 16527 + }, + { + "epoch": 0.4901106070041218, + "grad_norm": 0.12397722154855728, + "learning_rate": 0.000523681296113839, + "loss": 2.6569, + "step": 16528 + }, + { + "epoch": 0.4901402603564333, + "grad_norm": 0.14958657324314117, + "learning_rate": 0.000523634297850585, + "loss": 2.6353, + "step": 16529 + }, + { + "epoch": 0.4901699137087448, + "grad_norm": 0.13044138252735138, + "learning_rate": 0.0005235872993780453, + "loss": 2.6588, + "step": 16530 + }, + { + "epoch": 0.4901995670610563, + "grad_norm": 0.10395324975252151, + "learning_rate": 0.000523540300696636, + "loss": 2.6712, + "step": 16531 + }, + { + "epoch": 0.49022922041336775, + "grad_norm": 0.13154825568199158, + "learning_rate": 0.0005234933018067732, + "loss": 2.6282, + "step": 16532 + }, + { + "epoch": 0.4902588737656792, + "grad_norm": 0.13238519430160522, + "learning_rate": 0.0005234463027088734, + "loss": 2.6545, + "step": 16533 + }, + { + "epoch": 0.4902885271179907, + "grad_norm": 0.10873524099588394, + "learning_rate": 0.0005233993034033525, + "loss": 2.6874, + "step": 16534 + }, + { + "epoch": 0.4903181804703022, + "grad_norm": 0.11216899007558823, + "learning_rate": 0.0005233523038906267, + "loss": 2.655, + "step": 16535 + }, + { + "epoch": 0.49034783382261365, + "grad_norm": 0.1204681545495987, + "learning_rate": 0.0005233053041711122, + "loss": 2.6543, + "step": 16536 + }, + { + "epoch": 0.49037748717492513, + "grad_norm": 0.1378696858882904, + "learning_rate": 0.0005232583042452252, + "loss": 2.6921, + "step": 16537 + }, + { + "epoch": 0.4904071405272366, + "grad_norm": 0.15583962202072144, + "learning_rate": 0.0005232113041133821, + "loss": 2.6536, + "step": 16538 + }, + { + "epoch": 0.4904367938795481, + "grad_norm": 0.13732536137104034, + "learning_rate": 0.0005231643037759989, + "loss": 2.6618, + "step": 16539 + }, + { + "epoch": 0.49046644723185956, + "grad_norm": 0.12720385193824768, + "learning_rate": 0.0005231173032334917, + "loss": 2.6491, + "step": 16540 + }, + { + "epoch": 0.49049610058417104, + "grad_norm": 0.1295522153377533, + "learning_rate": 0.0005230703024862768, + "loss": 2.6492, + "step": 16541 + }, + { + "epoch": 0.4905257539364825, + "grad_norm": 0.13415329158306122, + "learning_rate": 0.0005230233015347705, + "loss": 2.6346, + "step": 16542 + }, + { + "epoch": 0.490555407288794, + "grad_norm": 0.11993445456027985, + "learning_rate": 0.0005229763003793889, + "loss": 2.6588, + "step": 16543 + }, + { + "epoch": 0.49058506064110546, + "grad_norm": 0.11678264290094376, + "learning_rate": 0.0005229292990205482, + "loss": 2.6948, + "step": 16544 + }, + { + "epoch": 0.49061471399341694, + "grad_norm": 0.1294981688261032, + "learning_rate": 0.0005228822974586647, + "loss": 2.6945, + "step": 16545 + }, + { + "epoch": 0.4906443673457284, + "grad_norm": 0.12216373533010483, + "learning_rate": 0.0005228352956941543, + "loss": 2.6448, + "step": 16546 + }, + { + "epoch": 0.4906740206980399, + "grad_norm": 0.11188913136720657, + "learning_rate": 0.0005227882937274336, + "loss": 2.5967, + "step": 16547 + }, + { + "epoch": 0.49070367405035137, + "grad_norm": 0.12867337465286255, + "learning_rate": 0.0005227412915589187, + "loss": 2.6588, + "step": 16548 + }, + { + "epoch": 0.49073332740266284, + "grad_norm": 0.14074815809726715, + "learning_rate": 0.0005226942891890256, + "loss": 2.6676, + "step": 16549 + }, + { + "epoch": 0.4907629807549744, + "grad_norm": 0.12407074123620987, + "learning_rate": 0.0005226472866181708, + "loss": 2.6206, + "step": 16550 + }, + { + "epoch": 0.49079263410728585, + "grad_norm": 0.11993174999952316, + "learning_rate": 0.0005226002838467704, + "loss": 2.6749, + "step": 16551 + }, + { + "epoch": 0.49082228745959733, + "grad_norm": 0.13503800332546234, + "learning_rate": 0.0005225532808752405, + "loss": 2.6563, + "step": 16552 + }, + { + "epoch": 0.4908519408119088, + "grad_norm": 0.11427493393421173, + "learning_rate": 0.0005225062777039975, + "loss": 2.658, + "step": 16553 + }, + { + "epoch": 0.4908815941642203, + "grad_norm": 0.11323347687721252, + "learning_rate": 0.0005224592743334575, + "loss": 2.6515, + "step": 16554 + }, + { + "epoch": 0.49091124751653176, + "grad_norm": 0.11312036961317062, + "learning_rate": 0.0005224122707640369, + "loss": 2.6646, + "step": 16555 + }, + { + "epoch": 0.49094090086884323, + "grad_norm": 0.12216179817914963, + "learning_rate": 0.0005223652669961518, + "loss": 2.6638, + "step": 16556 + }, + { + "epoch": 0.4909705542211547, + "grad_norm": 0.12616541981697083, + "learning_rate": 0.0005223182630302181, + "loss": 2.6354, + "step": 16557 + }, + { + "epoch": 0.4910002075734662, + "grad_norm": 0.12640857696533203, + "learning_rate": 0.0005222712588666527, + "loss": 2.6661, + "step": 16558 + }, + { + "epoch": 0.49102986092577766, + "grad_norm": 0.12133452296257019, + "learning_rate": 0.0005222242545058713, + "loss": 2.644, + "step": 16559 + }, + { + "epoch": 0.49105951427808914, + "grad_norm": 0.13561907410621643, + "learning_rate": 0.0005221772499482903, + "loss": 2.6427, + "step": 16560 + }, + { + "epoch": 0.4910891676304006, + "grad_norm": 0.1329662799835205, + "learning_rate": 0.0005221302451943262, + "loss": 2.6632, + "step": 16561 + }, + { + "epoch": 0.4911188209827121, + "grad_norm": 0.14410212635993958, + "learning_rate": 0.0005220832402443947, + "loss": 2.6676, + "step": 16562 + }, + { + "epoch": 0.49114847433502357, + "grad_norm": 0.13475194573402405, + "learning_rate": 0.0005220362350989123, + "loss": 2.6311, + "step": 16563 + }, + { + "epoch": 0.49117812768733504, + "grad_norm": 0.14608170092105865, + "learning_rate": 0.0005219892297582954, + "loss": 2.6585, + "step": 16564 + }, + { + "epoch": 0.4912077810396465, + "grad_norm": 0.13790926337242126, + "learning_rate": 0.00052194222422296, + "loss": 2.6346, + "step": 16565 + }, + { + "epoch": 0.491237434391958, + "grad_norm": 0.1264207363128662, + "learning_rate": 0.0005218952184933227, + "loss": 2.661, + "step": 16566 + }, + { + "epoch": 0.49126708774426947, + "grad_norm": 0.13685838878154755, + "learning_rate": 0.0005218482125697992, + "loss": 2.66, + "step": 16567 + }, + { + "epoch": 0.49129674109658095, + "grad_norm": 0.11985025554895401, + "learning_rate": 0.0005218012064528061, + "loss": 2.6281, + "step": 16568 + }, + { + "epoch": 0.4913263944488924, + "grad_norm": 0.11798617243766785, + "learning_rate": 0.0005217542001427596, + "loss": 2.6194, + "step": 16569 + }, + { + "epoch": 0.4913560478012039, + "grad_norm": 0.12203233689069748, + "learning_rate": 0.0005217071936400758, + "loss": 2.6569, + "step": 16570 + }, + { + "epoch": 0.49138570115351543, + "grad_norm": 0.11818314343690872, + "learning_rate": 0.0005216601869451712, + "loss": 2.633, + "step": 16571 + }, + { + "epoch": 0.4914153545058269, + "grad_norm": 0.11805194616317749, + "learning_rate": 0.000521613180058462, + "loss": 2.6515, + "step": 16572 + }, + { + "epoch": 0.4914450078581384, + "grad_norm": 0.14709654450416565, + "learning_rate": 0.0005215661729803642, + "loss": 2.6542, + "step": 16573 + }, + { + "epoch": 0.49147466121044986, + "grad_norm": 0.148987278342247, + "learning_rate": 0.0005215191657112944, + "loss": 2.6892, + "step": 16574 + }, + { + "epoch": 0.49150431456276134, + "grad_norm": 0.14565812051296234, + "learning_rate": 0.0005214721582516686, + "loss": 2.6647, + "step": 16575 + }, + { + "epoch": 0.4915339679150728, + "grad_norm": 0.13221879303455353, + "learning_rate": 0.0005214251506019032, + "loss": 2.6716, + "step": 16576 + }, + { + "epoch": 0.4915636212673843, + "grad_norm": 0.12289386987686157, + "learning_rate": 0.0005213781427624144, + "loss": 2.6529, + "step": 16577 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 0.11380046606063843, + "learning_rate": 0.0005213311347336183, + "loss": 2.6241, + "step": 16578 + }, + { + "epoch": 0.49162292797200724, + "grad_norm": 0.1316141039133072, + "learning_rate": 0.0005212841265159316, + "loss": 2.6743, + "step": 16579 + }, + { + "epoch": 0.4916525813243187, + "grad_norm": 0.11575146019458771, + "learning_rate": 0.0005212371181097701, + "loss": 2.6519, + "step": 16580 + }, + { + "epoch": 0.4916822346766302, + "grad_norm": 0.11025829613208771, + "learning_rate": 0.0005211901095155504, + "loss": 2.6651, + "step": 16581 + }, + { + "epoch": 0.49171188802894167, + "grad_norm": 0.1301266849040985, + "learning_rate": 0.0005211431007336886, + "loss": 2.6533, + "step": 16582 + }, + { + "epoch": 0.49174154138125314, + "grad_norm": 0.1369241178035736, + "learning_rate": 0.0005210960917646012, + "loss": 2.6631, + "step": 16583 + }, + { + "epoch": 0.4917711947335646, + "grad_norm": 0.11581303179264069, + "learning_rate": 0.000521049082608704, + "loss": 2.6519, + "step": 16584 + }, + { + "epoch": 0.4918008480858761, + "grad_norm": 0.11457502841949463, + "learning_rate": 0.0005210020732664137, + "loss": 2.6479, + "step": 16585 + }, + { + "epoch": 0.4918305014381876, + "grad_norm": 0.11626479029655457, + "learning_rate": 0.0005209550637381465, + "loss": 2.6514, + "step": 16586 + }, + { + "epoch": 0.49186015479049905, + "grad_norm": 0.11729494482278824, + "learning_rate": 0.0005209080540243185, + "loss": 2.6529, + "step": 16587 + }, + { + "epoch": 0.4918898081428105, + "grad_norm": 0.10419446974992752, + "learning_rate": 0.0005208610441253461, + "loss": 2.6514, + "step": 16588 + }, + { + "epoch": 0.491919461495122, + "grad_norm": 0.10238155722618103, + "learning_rate": 0.0005208140340416457, + "loss": 2.6686, + "step": 16589 + }, + { + "epoch": 0.4919491148474335, + "grad_norm": 0.1064343973994255, + "learning_rate": 0.0005207670237736332, + "loss": 2.6679, + "step": 16590 + }, + { + "epoch": 0.49197876819974495, + "grad_norm": 0.09499876946210861, + "learning_rate": 0.0005207200133217254, + "loss": 2.6496, + "step": 16591 + }, + { + "epoch": 0.4920084215520565, + "grad_norm": 0.0980197936296463, + "learning_rate": 0.0005206730026863382, + "loss": 2.6433, + "step": 16592 + }, + { + "epoch": 0.49203807490436796, + "grad_norm": 0.13136430084705353, + "learning_rate": 0.0005206259918678881, + "loss": 2.6554, + "step": 16593 + }, + { + "epoch": 0.49206772825667944, + "grad_norm": 0.1440686136484146, + "learning_rate": 0.0005205789808667913, + "loss": 2.6809, + "step": 16594 + }, + { + "epoch": 0.4920973816089909, + "grad_norm": 0.1383051574230194, + "learning_rate": 0.0005205319696834639, + "loss": 2.6318, + "step": 16595 + }, + { + "epoch": 0.4921270349613024, + "grad_norm": 0.12401992082595825, + "learning_rate": 0.0005204849583183225, + "loss": 2.6442, + "step": 16596 + }, + { + "epoch": 0.49215668831361387, + "grad_norm": 0.11228828877210617, + "learning_rate": 0.0005204379467717833, + "loss": 2.6522, + "step": 16597 + }, + { + "epoch": 0.49218634166592534, + "grad_norm": 0.13226260244846344, + "learning_rate": 0.0005203909350442625, + "loss": 2.6367, + "step": 16598 + }, + { + "epoch": 0.4922159950182368, + "grad_norm": 0.12529952824115753, + "learning_rate": 0.0005203439231361766, + "loss": 2.6027, + "step": 16599 + }, + { + "epoch": 0.4922456483705483, + "grad_norm": 0.1174577847123146, + "learning_rate": 0.0005202969110479418, + "loss": 2.6708, + "step": 16600 + }, + { + "epoch": 0.49227530172285977, + "grad_norm": 0.11212334781885147, + "learning_rate": 0.0005202498987799742, + "loss": 2.6638, + "step": 16601 + }, + { + "epoch": 0.49230495507517125, + "grad_norm": 0.10515715181827545, + "learning_rate": 0.0005202028863326902, + "loss": 2.6799, + "step": 16602 + }, + { + "epoch": 0.4923346084274827, + "grad_norm": 0.12481527775526047, + "learning_rate": 0.0005201558737065065, + "loss": 2.6515, + "step": 16603 + }, + { + "epoch": 0.4923642617797942, + "grad_norm": 0.1152682974934578, + "learning_rate": 0.0005201088609018389, + "loss": 2.6495, + "step": 16604 + }, + { + "epoch": 0.4923939151321057, + "grad_norm": 0.12762326002120972, + "learning_rate": 0.0005200618479191038, + "loss": 2.6795, + "step": 16605 + }, + { + "epoch": 0.49242356848441715, + "grad_norm": 0.13034257292747498, + "learning_rate": 0.0005200148347587177, + "loss": 2.6676, + "step": 16606 + }, + { + "epoch": 0.4924532218367286, + "grad_norm": 0.11854133009910583, + "learning_rate": 0.0005199678214210968, + "loss": 2.636, + "step": 16607 + }, + { + "epoch": 0.4924828751890401, + "grad_norm": 0.11320364475250244, + "learning_rate": 0.0005199208079066573, + "loss": 2.6266, + "step": 16608 + }, + { + "epoch": 0.4925125285413516, + "grad_norm": 0.11590708792209625, + "learning_rate": 0.0005198737942158158, + "loss": 2.6388, + "step": 16609 + }, + { + "epoch": 0.49254218189366306, + "grad_norm": 0.11528311669826508, + "learning_rate": 0.0005198267803489884, + "loss": 2.6359, + "step": 16610 + }, + { + "epoch": 0.49257183524597453, + "grad_norm": 0.10928294062614441, + "learning_rate": 0.0005197797663065913, + "loss": 2.6465, + "step": 16611 + }, + { + "epoch": 0.49260148859828606, + "grad_norm": 0.09489717334508896, + "learning_rate": 0.0005197327520890412, + "loss": 2.6609, + "step": 16612 + }, + { + "epoch": 0.49263114195059754, + "grad_norm": 0.11205354332923889, + "learning_rate": 0.0005196857376967539, + "loss": 2.6409, + "step": 16613 + }, + { + "epoch": 0.492660795302909, + "grad_norm": 0.1166083812713623, + "learning_rate": 0.0005196387231301463, + "loss": 2.6825, + "step": 16614 + }, + { + "epoch": 0.4926904486552205, + "grad_norm": 0.1197928786277771, + "learning_rate": 0.0005195917083896343, + "loss": 2.6744, + "step": 16615 + }, + { + "epoch": 0.49272010200753197, + "grad_norm": 0.13404732942581177, + "learning_rate": 0.0005195446934756344, + "loss": 2.6417, + "step": 16616 + }, + { + "epoch": 0.49274975535984344, + "grad_norm": 0.15050667524337769, + "learning_rate": 0.0005194976783885628, + "loss": 2.6213, + "step": 16617 + }, + { + "epoch": 0.4927794087121549, + "grad_norm": 0.13938482105731964, + "learning_rate": 0.000519450663128836, + "loss": 2.6469, + "step": 16618 + }, + { + "epoch": 0.4928090620644664, + "grad_norm": 0.11616713553667068, + "learning_rate": 0.0005194036476968702, + "loss": 2.6652, + "step": 16619 + }, + { + "epoch": 0.4928387154167779, + "grad_norm": 0.11575798690319061, + "learning_rate": 0.0005193566320930818, + "loss": 2.6642, + "step": 16620 + }, + { + "epoch": 0.49286836876908935, + "grad_norm": 0.11706609278917313, + "learning_rate": 0.000519309616317887, + "loss": 2.6833, + "step": 16621 + }, + { + "epoch": 0.4928980221214008, + "grad_norm": 0.1364898830652237, + "learning_rate": 0.0005192626003717025, + "loss": 2.6783, + "step": 16622 + }, + { + "epoch": 0.4929276754737123, + "grad_norm": 0.15379057824611664, + "learning_rate": 0.0005192155842549442, + "loss": 2.669, + "step": 16623 + }, + { + "epoch": 0.4929573288260238, + "grad_norm": 0.1582040637731552, + "learning_rate": 0.0005191685679680286, + "loss": 2.6881, + "step": 16624 + }, + { + "epoch": 0.49298698217833525, + "grad_norm": 0.11149467527866364, + "learning_rate": 0.0005191215515113719, + "loss": 2.6556, + "step": 16625 + }, + { + "epoch": 0.49301663553064673, + "grad_norm": 0.11268848925828934, + "learning_rate": 0.0005190745348853909, + "loss": 2.65, + "step": 16626 + }, + { + "epoch": 0.4930462888829582, + "grad_norm": 0.11927483975887299, + "learning_rate": 0.0005190275180905014, + "loss": 2.6794, + "step": 16627 + }, + { + "epoch": 0.4930759422352697, + "grad_norm": 0.13193809986114502, + "learning_rate": 0.0005189805011271199, + "loss": 2.6781, + "step": 16628 + }, + { + "epoch": 0.49310559558758116, + "grad_norm": 0.11868367344141006, + "learning_rate": 0.0005189334839956628, + "loss": 2.6848, + "step": 16629 + }, + { + "epoch": 0.49313524893989263, + "grad_norm": 0.11612959951162338, + "learning_rate": 0.0005188864666965467, + "loss": 2.6576, + "step": 16630 + }, + { + "epoch": 0.4931649022922041, + "grad_norm": 0.12451793253421783, + "learning_rate": 0.0005188394492301877, + "loss": 2.6928, + "step": 16631 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 0.1369253695011139, + "learning_rate": 0.000518792431597002, + "loss": 2.6633, + "step": 16632 + }, + { + "epoch": 0.4932242089968271, + "grad_norm": 0.12708720564842224, + "learning_rate": 0.000518745413797406, + "loss": 2.6528, + "step": 16633 + }, + { + "epoch": 0.4932538623491386, + "grad_norm": 0.11351626366376877, + "learning_rate": 0.0005186983958318161, + "loss": 2.633, + "step": 16634 + }, + { + "epoch": 0.49328351570145007, + "grad_norm": 0.1260550320148468, + "learning_rate": 0.0005186513777006488, + "loss": 2.6842, + "step": 16635 + }, + { + "epoch": 0.49331316905376155, + "grad_norm": 0.1273672729730606, + "learning_rate": 0.0005186043594043204, + "loss": 2.6057, + "step": 16636 + }, + { + "epoch": 0.493342822406073, + "grad_norm": 0.13871783018112183, + "learning_rate": 0.0005185573409432473, + "loss": 2.6561, + "step": 16637 + }, + { + "epoch": 0.4933724757583845, + "grad_norm": 0.12087865173816681, + "learning_rate": 0.0005185103223178456, + "loss": 2.6768, + "step": 16638 + }, + { + "epoch": 0.493402129110696, + "grad_norm": 0.1497645527124405, + "learning_rate": 0.0005184633035285319, + "loss": 2.6542, + "step": 16639 + }, + { + "epoch": 0.49343178246300745, + "grad_norm": 0.12153157591819763, + "learning_rate": 0.0005184162845757223, + "loss": 2.6358, + "step": 16640 + }, + { + "epoch": 0.4934614358153189, + "grad_norm": 0.11326383799314499, + "learning_rate": 0.0005183692654598334, + "loss": 2.6088, + "step": 16641 + }, + { + "epoch": 0.4934910891676304, + "grad_norm": 0.11635226011276245, + "learning_rate": 0.0005183222461812816, + "loss": 2.6697, + "step": 16642 + }, + { + "epoch": 0.4935207425199419, + "grad_norm": 0.10740300267934799, + "learning_rate": 0.0005182752267404832, + "loss": 2.6418, + "step": 16643 + }, + { + "epoch": 0.49355039587225336, + "grad_norm": 0.1269596517086029, + "learning_rate": 0.0005182282071378544, + "loss": 2.6503, + "step": 16644 + }, + { + "epoch": 0.49358004922456483, + "grad_norm": 0.11726070195436478, + "learning_rate": 0.0005181811873738118, + "loss": 2.6548, + "step": 16645 + }, + { + "epoch": 0.4936097025768763, + "grad_norm": 0.1120627224445343, + "learning_rate": 0.0005181341674487717, + "loss": 2.6444, + "step": 16646 + }, + { + "epoch": 0.4936393559291878, + "grad_norm": 0.13421586155891418, + "learning_rate": 0.0005180871473631503, + "loss": 2.6876, + "step": 16647 + }, + { + "epoch": 0.49366900928149926, + "grad_norm": 0.10575951635837555, + "learning_rate": 0.0005180401271173643, + "loss": 2.6655, + "step": 16648 + }, + { + "epoch": 0.49369866263381074, + "grad_norm": 0.12332295626401901, + "learning_rate": 0.0005179931067118296, + "loss": 2.6948, + "step": 16649 + }, + { + "epoch": 0.4937283159861222, + "grad_norm": 0.10645143687725067, + "learning_rate": 0.0005179460861469631, + "loss": 2.6703, + "step": 16650 + }, + { + "epoch": 0.4937579693384337, + "grad_norm": 0.12797360122203827, + "learning_rate": 0.0005178990654231808, + "loss": 2.6296, + "step": 16651 + }, + { + "epoch": 0.49378762269074516, + "grad_norm": 0.13341934978961945, + "learning_rate": 0.0005178520445408991, + "loss": 2.653, + "step": 16652 + }, + { + "epoch": 0.49381727604305664, + "grad_norm": 0.13212187588214874, + "learning_rate": 0.0005178050235005347, + "loss": 2.6499, + "step": 16653 + }, + { + "epoch": 0.4938469293953682, + "grad_norm": 0.13163979351520538, + "learning_rate": 0.0005177580023025037, + "loss": 2.63, + "step": 16654 + }, + { + "epoch": 0.49387658274767965, + "grad_norm": 0.1270526498556137, + "learning_rate": 0.0005177109809472224, + "loss": 2.6549, + "step": 16655 + }, + { + "epoch": 0.4939062360999911, + "grad_norm": 0.10932212322950363, + "learning_rate": 0.0005176639594351074, + "loss": 2.6411, + "step": 16656 + }, + { + "epoch": 0.4939358894523026, + "grad_norm": 0.12346676737070084, + "learning_rate": 0.0005176169377665752, + "loss": 2.6175, + "step": 16657 + }, + { + "epoch": 0.4939655428046141, + "grad_norm": 0.11873405426740646, + "learning_rate": 0.0005175699159420419, + "loss": 2.6603, + "step": 16658 + }, + { + "epoch": 0.49399519615692555, + "grad_norm": 0.10795191675424576, + "learning_rate": 0.0005175228939619239, + "loss": 2.6773, + "step": 16659 + }, + { + "epoch": 0.49402484950923703, + "grad_norm": 0.12402434647083282, + "learning_rate": 0.0005174758718266376, + "loss": 2.6638, + "step": 16660 + }, + { + "epoch": 0.4940545028615485, + "grad_norm": 0.1147393211722374, + "learning_rate": 0.0005174288495365995, + "loss": 2.6295, + "step": 16661 + }, + { + "epoch": 0.49408415621386, + "grad_norm": 0.10504583269357681, + "learning_rate": 0.000517381827092226, + "loss": 2.6272, + "step": 16662 + }, + { + "epoch": 0.49411380956617146, + "grad_norm": 0.14380735158920288, + "learning_rate": 0.0005173348044939334, + "loss": 2.6542, + "step": 16663 + }, + { + "epoch": 0.49414346291848293, + "grad_norm": 0.14522966742515564, + "learning_rate": 0.0005172877817421382, + "loss": 2.6684, + "step": 16664 + }, + { + "epoch": 0.4941731162707944, + "grad_norm": 0.14275911450386047, + "learning_rate": 0.0005172407588372568, + "loss": 2.6751, + "step": 16665 + }, + { + "epoch": 0.4942027696231059, + "grad_norm": 0.12678055465221405, + "learning_rate": 0.0005171937357797053, + "loss": 2.6613, + "step": 16666 + }, + { + "epoch": 0.49423242297541736, + "grad_norm": 0.107282355427742, + "learning_rate": 0.0005171467125699003, + "loss": 2.6453, + "step": 16667 + }, + { + "epoch": 0.49426207632772884, + "grad_norm": 0.12302394956350327, + "learning_rate": 0.0005170996892082583, + "loss": 2.6624, + "step": 16668 + }, + { + "epoch": 0.4942917296800403, + "grad_norm": 0.12544091045856476, + "learning_rate": 0.0005170526656951958, + "loss": 2.6071, + "step": 16669 + }, + { + "epoch": 0.4943213830323518, + "grad_norm": 0.11445101350545883, + "learning_rate": 0.0005170056420311289, + "loss": 2.6456, + "step": 16670 + }, + { + "epoch": 0.49435103638466327, + "grad_norm": 0.10886669158935547, + "learning_rate": 0.0005169586182164741, + "loss": 2.6279, + "step": 16671 + }, + { + "epoch": 0.49438068973697474, + "grad_norm": 0.11975815147161484, + "learning_rate": 0.0005169115942516478, + "loss": 2.6557, + "step": 16672 + }, + { + "epoch": 0.4944103430892862, + "grad_norm": 0.11310090124607086, + "learning_rate": 0.0005168645701370663, + "loss": 2.6705, + "step": 16673 + }, + { + "epoch": 0.4944399964415977, + "grad_norm": 0.11199463158845901, + "learning_rate": 0.0005168175458731462, + "loss": 2.623, + "step": 16674 + }, + { + "epoch": 0.4944696497939092, + "grad_norm": 0.12528833746910095, + "learning_rate": 0.0005167705214603041, + "loss": 2.6578, + "step": 16675 + }, + { + "epoch": 0.4944993031462207, + "grad_norm": 0.1329909861087799, + "learning_rate": 0.000516723496898956, + "loss": 2.6601, + "step": 16676 + }, + { + "epoch": 0.4945289564985322, + "grad_norm": 0.12952570617198944, + "learning_rate": 0.0005166764721895184, + "loss": 2.6787, + "step": 16677 + }, + { + "epoch": 0.49455860985084366, + "grad_norm": 0.11559386551380157, + "learning_rate": 0.0005166294473324078, + "loss": 2.6355, + "step": 16678 + }, + { + "epoch": 0.49458826320315513, + "grad_norm": 0.1351398527622223, + "learning_rate": 0.0005165824223280406, + "loss": 2.6982, + "step": 16679 + }, + { + "epoch": 0.4946179165554666, + "grad_norm": 0.12932220101356506, + "learning_rate": 0.0005165353971768331, + "loss": 2.6451, + "step": 16680 + }, + { + "epoch": 0.4946475699077781, + "grad_norm": 0.12000270932912827, + "learning_rate": 0.0005164883718792021, + "loss": 2.6653, + "step": 16681 + }, + { + "epoch": 0.49467722326008956, + "grad_norm": 0.10309120267629623, + "learning_rate": 0.0005164413464355635, + "loss": 2.6525, + "step": 16682 + }, + { + "epoch": 0.49470687661240104, + "grad_norm": 0.10845543444156647, + "learning_rate": 0.0005163943208463341, + "loss": 2.6568, + "step": 16683 + }, + { + "epoch": 0.4947365299647125, + "grad_norm": 0.12431426346302032, + "learning_rate": 0.00051634729511193, + "loss": 2.6593, + "step": 16684 + }, + { + "epoch": 0.494766183317024, + "grad_norm": 0.12647351622581482, + "learning_rate": 0.0005163002692327679, + "loss": 2.6528, + "step": 16685 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 0.11847168207168579, + "learning_rate": 0.0005162532432092642, + "loss": 2.6491, + "step": 16686 + }, + { + "epoch": 0.49482549002164694, + "grad_norm": 0.12281970679759979, + "learning_rate": 0.0005162062170418351, + "loss": 2.6494, + "step": 16687 + }, + { + "epoch": 0.4948551433739584, + "grad_norm": 0.1305149346590042, + "learning_rate": 0.0005161591907308972, + "loss": 2.6561, + "step": 16688 + }, + { + "epoch": 0.4948847967262699, + "grad_norm": 0.1289036124944687, + "learning_rate": 0.000516112164276867, + "loss": 2.6801, + "step": 16689 + }, + { + "epoch": 0.49491445007858137, + "grad_norm": 0.11748743057250977, + "learning_rate": 0.0005160651376801606, + "loss": 2.6118, + "step": 16690 + }, + { + "epoch": 0.49494410343089285, + "grad_norm": 0.11621082574129105, + "learning_rate": 0.0005160181109411947, + "loss": 2.6425, + "step": 16691 + }, + { + "epoch": 0.4949737567832043, + "grad_norm": 0.14108562469482422, + "learning_rate": 0.000515971084060386, + "loss": 2.6197, + "step": 16692 + }, + { + "epoch": 0.4950034101355158, + "grad_norm": 0.13437213003635406, + "learning_rate": 0.0005159240570381503, + "loss": 2.611, + "step": 16693 + }, + { + "epoch": 0.4950330634878273, + "grad_norm": 0.10931705683469772, + "learning_rate": 0.0005158770298749044, + "loss": 2.6705, + "step": 16694 + }, + { + "epoch": 0.49506271684013875, + "grad_norm": 0.11907859146595001, + "learning_rate": 0.0005158300025710646, + "loss": 2.6521, + "step": 16695 + }, + { + "epoch": 0.4950923701924503, + "grad_norm": 0.12684471905231476, + "learning_rate": 0.0005157829751270476, + "loss": 2.6654, + "step": 16696 + }, + { + "epoch": 0.49512202354476176, + "grad_norm": 0.12253428995609283, + "learning_rate": 0.0005157359475432696, + "loss": 2.6468, + "step": 16697 + }, + { + "epoch": 0.49515167689707323, + "grad_norm": 0.11513262987136841, + "learning_rate": 0.000515688919820147, + "loss": 2.5956, + "step": 16698 + }, + { + "epoch": 0.4951813302493847, + "grad_norm": 0.11022809147834778, + "learning_rate": 0.0005156418919580962, + "loss": 2.6458, + "step": 16699 + }, + { + "epoch": 0.4952109836016962, + "grad_norm": 0.12782518565654755, + "learning_rate": 0.0005155948639575338, + "loss": 2.655, + "step": 16700 + }, + { + "epoch": 0.49524063695400766, + "grad_norm": 0.1274474859237671, + "learning_rate": 0.0005155478358188764, + "loss": 2.6442, + "step": 16701 + }, + { + "epoch": 0.49527029030631914, + "grad_norm": 0.12210766971111298, + "learning_rate": 0.0005155008075425402, + "loss": 2.6585, + "step": 16702 + }, + { + "epoch": 0.4952999436586306, + "grad_norm": 0.09753429144620895, + "learning_rate": 0.0005154537791289417, + "loss": 2.6233, + "step": 16703 + }, + { + "epoch": 0.4953295970109421, + "grad_norm": 0.10044311732053757, + "learning_rate": 0.0005154067505784973, + "loss": 2.657, + "step": 16704 + }, + { + "epoch": 0.49535925036325357, + "grad_norm": 0.1200653687119484, + "learning_rate": 0.0005153597218916234, + "loss": 2.6966, + "step": 16705 + }, + { + "epoch": 0.49538890371556504, + "grad_norm": 0.12689411640167236, + "learning_rate": 0.0005153126930687365, + "loss": 2.6332, + "step": 16706 + }, + { + "epoch": 0.4954185570678765, + "grad_norm": 0.12680822610855103, + "learning_rate": 0.0005152656641102532, + "loss": 2.6664, + "step": 16707 + }, + { + "epoch": 0.495448210420188, + "grad_norm": 0.12127240747213364, + "learning_rate": 0.0005152186350165898, + "loss": 2.6354, + "step": 16708 + }, + { + "epoch": 0.49547786377249947, + "grad_norm": 0.1251169741153717, + "learning_rate": 0.0005151716057881628, + "loss": 2.663, + "step": 16709 + }, + { + "epoch": 0.49550751712481095, + "grad_norm": 0.14788509905338287, + "learning_rate": 0.0005151245764253886, + "loss": 2.6007, + "step": 16710 + }, + { + "epoch": 0.4955371704771224, + "grad_norm": 0.14964014291763306, + "learning_rate": 0.0005150775469286836, + "loss": 2.6609, + "step": 16711 + }, + { + "epoch": 0.4955668238294339, + "grad_norm": 0.11298781633377075, + "learning_rate": 0.0005150305172984642, + "loss": 2.6254, + "step": 16712 + }, + { + "epoch": 0.4955964771817454, + "grad_norm": 0.13868537545204163, + "learning_rate": 0.0005149834875351475, + "loss": 2.6571, + "step": 16713 + }, + { + "epoch": 0.49562613053405685, + "grad_norm": 0.16218870878219604, + "learning_rate": 0.0005149364576391491, + "loss": 2.6547, + "step": 16714 + }, + { + "epoch": 0.49565578388636833, + "grad_norm": 0.15685917437076569, + "learning_rate": 0.0005148894276108858, + "loss": 2.6358, + "step": 16715 + }, + { + "epoch": 0.49568543723867986, + "grad_norm": 0.12847980856895447, + "learning_rate": 0.0005148423974507741, + "loss": 2.6346, + "step": 16716 + }, + { + "epoch": 0.49571509059099134, + "grad_norm": 0.13916265964508057, + "learning_rate": 0.0005147953671592304, + "loss": 2.6704, + "step": 16717 + }, + { + "epoch": 0.4957447439433028, + "grad_norm": 0.13244397938251495, + "learning_rate": 0.0005147483367366712, + "loss": 2.6569, + "step": 16718 + }, + { + "epoch": 0.4957743972956143, + "grad_norm": 0.1354682594537735, + "learning_rate": 0.000514701306183513, + "loss": 2.6388, + "step": 16719 + }, + { + "epoch": 0.49580405064792576, + "grad_norm": 0.1485004425048828, + "learning_rate": 0.0005146542755001721, + "loss": 2.6466, + "step": 16720 + }, + { + "epoch": 0.49583370400023724, + "grad_norm": 0.1335614174604416, + "learning_rate": 0.0005146072446870651, + "loss": 2.6457, + "step": 16721 + }, + { + "epoch": 0.4958633573525487, + "grad_norm": 0.11974786221981049, + "learning_rate": 0.0005145602137446084, + "loss": 2.6508, + "step": 16722 + }, + { + "epoch": 0.4958930107048602, + "grad_norm": 0.14953777194023132, + "learning_rate": 0.0005145131826732186, + "loss": 2.6594, + "step": 16723 + }, + { + "epoch": 0.49592266405717167, + "grad_norm": 0.14456959068775177, + "learning_rate": 0.0005144661514733122, + "loss": 2.6118, + "step": 16724 + }, + { + "epoch": 0.49595231740948315, + "grad_norm": 0.11443055421113968, + "learning_rate": 0.0005144191201453054, + "loss": 2.6551, + "step": 16725 + }, + { + "epoch": 0.4959819707617946, + "grad_norm": 0.1307218223810196, + "learning_rate": 0.0005143720886896147, + "loss": 2.6648, + "step": 16726 + }, + { + "epoch": 0.4960116241141061, + "grad_norm": 0.13599847257137299, + "learning_rate": 0.0005143250571066569, + "loss": 2.6278, + "step": 16727 + }, + { + "epoch": 0.4960412774664176, + "grad_norm": 0.09705319255590439, + "learning_rate": 0.0005142780253968481, + "loss": 2.6524, + "step": 16728 + }, + { + "epoch": 0.49607093081872905, + "grad_norm": 0.11818470060825348, + "learning_rate": 0.000514230993560605, + "loss": 2.6661, + "step": 16729 + }, + { + "epoch": 0.4961005841710405, + "grad_norm": 0.12807327508926392, + "learning_rate": 0.0005141839615983441, + "loss": 2.6398, + "step": 16730 + }, + { + "epoch": 0.496130237523352, + "grad_norm": 0.12039002031087875, + "learning_rate": 0.0005141369295104816, + "loss": 2.6629, + "step": 16731 + }, + { + "epoch": 0.4961598908756635, + "grad_norm": 0.11297673732042313, + "learning_rate": 0.0005140898972974343, + "loss": 2.6393, + "step": 16732 + }, + { + "epoch": 0.49618954422797495, + "grad_norm": 0.11890433728694916, + "learning_rate": 0.0005140428649596185, + "loss": 2.6205, + "step": 16733 + }, + { + "epoch": 0.49621919758028643, + "grad_norm": 0.11684613674879074, + "learning_rate": 0.0005139958324974507, + "loss": 2.668, + "step": 16734 + }, + { + "epoch": 0.4962488509325979, + "grad_norm": 0.12155960500240326, + "learning_rate": 0.0005139487999113476, + "loss": 2.6287, + "step": 16735 + }, + { + "epoch": 0.4962785042849094, + "grad_norm": 0.11673431098461151, + "learning_rate": 0.0005139017672017253, + "loss": 2.6381, + "step": 16736 + }, + { + "epoch": 0.4963081576372209, + "grad_norm": 0.1207694336771965, + "learning_rate": 0.0005138547343690004, + "loss": 2.6598, + "step": 16737 + }, + { + "epoch": 0.4963378109895324, + "grad_norm": 0.11610677093267441, + "learning_rate": 0.0005138077014135895, + "loss": 2.6381, + "step": 16738 + }, + { + "epoch": 0.49636746434184387, + "grad_norm": 0.11912420392036438, + "learning_rate": 0.0005137606683359089, + "loss": 2.6606, + "step": 16739 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 0.13245299458503723, + "learning_rate": 0.0005137136351363756, + "loss": 2.6413, + "step": 16740 + }, + { + "epoch": 0.4964267710464668, + "grad_norm": 0.12827487289905548, + "learning_rate": 0.0005136666018154054, + "loss": 2.6296, + "step": 16741 + }, + { + "epoch": 0.4964564243987783, + "grad_norm": 0.09537952393293381, + "learning_rate": 0.0005136195683734153, + "loss": 2.6386, + "step": 16742 + }, + { + "epoch": 0.49648607775108977, + "grad_norm": 0.0999763086438179, + "learning_rate": 0.0005135725348108214, + "loss": 2.663, + "step": 16743 + }, + { + "epoch": 0.49651573110340125, + "grad_norm": 0.11401594430208206, + "learning_rate": 0.0005135255011280404, + "loss": 2.6085, + "step": 16744 + }, + { + "epoch": 0.4965453844557127, + "grad_norm": 0.11186465620994568, + "learning_rate": 0.0005134784673254889, + "loss": 2.6336, + "step": 16745 + }, + { + "epoch": 0.4965750378080242, + "grad_norm": 0.13451729714870453, + "learning_rate": 0.0005134314334035832, + "loss": 2.6751, + "step": 16746 + }, + { + "epoch": 0.4966046911603357, + "grad_norm": 0.13714544475078583, + "learning_rate": 0.0005133843993627397, + "loss": 2.6388, + "step": 16747 + }, + { + "epoch": 0.49663434451264715, + "grad_norm": 0.13823416829109192, + "learning_rate": 0.0005133373652033751, + "loss": 2.6471, + "step": 16748 + }, + { + "epoch": 0.49666399786495863, + "grad_norm": 0.13254296779632568, + "learning_rate": 0.0005132903309259059, + "loss": 2.624, + "step": 16749 + }, + { + "epoch": 0.4966936512172701, + "grad_norm": 0.12757359445095062, + "learning_rate": 0.0005132432965307487, + "loss": 2.666, + "step": 16750 + }, + { + "epoch": 0.4967233045695816, + "grad_norm": 0.13658270239830017, + "learning_rate": 0.0005131962620183196, + "loss": 2.6395, + "step": 16751 + }, + { + "epoch": 0.49675295792189306, + "grad_norm": 0.11971856653690338, + "learning_rate": 0.0005131492273890354, + "loss": 2.6803, + "step": 16752 + }, + { + "epoch": 0.49678261127420453, + "grad_norm": 0.11537467688322067, + "learning_rate": 0.0005131021926433125, + "loss": 2.6614, + "step": 16753 + }, + { + "epoch": 0.496812264626516, + "grad_norm": 0.1338287591934204, + "learning_rate": 0.0005130551577815675, + "loss": 2.6609, + "step": 16754 + }, + { + "epoch": 0.4968419179788275, + "grad_norm": 0.12403975427150726, + "learning_rate": 0.0005130081228042168, + "loss": 2.6554, + "step": 16755 + }, + { + "epoch": 0.49687157133113896, + "grad_norm": 0.1267441213130951, + "learning_rate": 0.0005129610877116769, + "loss": 2.6147, + "step": 16756 + }, + { + "epoch": 0.49690122468345044, + "grad_norm": 0.1152476817369461, + "learning_rate": 0.0005129140525043644, + "loss": 2.6356, + "step": 16757 + }, + { + "epoch": 0.49693087803576197, + "grad_norm": 0.13585247099399567, + "learning_rate": 0.0005128670171826958, + "loss": 2.6865, + "step": 16758 + }, + { + "epoch": 0.49696053138807345, + "grad_norm": 0.14454221725463867, + "learning_rate": 0.0005128199817470876, + "loss": 2.6556, + "step": 16759 + }, + { + "epoch": 0.4969901847403849, + "grad_norm": 0.12486425042152405, + "learning_rate": 0.000512772946197956, + "loss": 2.6532, + "step": 16760 + }, + { + "epoch": 0.4970198380926964, + "grad_norm": 0.10341277718544006, + "learning_rate": 0.0005127259105357179, + "loss": 2.6385, + "step": 16761 + }, + { + "epoch": 0.4970494914450079, + "grad_norm": 0.11405341327190399, + "learning_rate": 0.0005126788747607898, + "loss": 2.6308, + "step": 16762 + }, + { + "epoch": 0.49707914479731935, + "grad_norm": 0.12918619811534882, + "learning_rate": 0.0005126318388735882, + "loss": 2.6381, + "step": 16763 + }, + { + "epoch": 0.4971087981496308, + "grad_norm": 0.10886655747890472, + "learning_rate": 0.0005125848028745292, + "loss": 2.6247, + "step": 16764 + }, + { + "epoch": 0.4971384515019423, + "grad_norm": 0.10054195672273636, + "learning_rate": 0.0005125377667640296, + "loss": 2.6416, + "step": 16765 + }, + { + "epoch": 0.4971681048542538, + "grad_norm": 0.12057122588157654, + "learning_rate": 0.000512490730542506, + "loss": 2.657, + "step": 16766 + }, + { + "epoch": 0.49719775820656525, + "grad_norm": 0.12204849720001221, + "learning_rate": 0.0005124436942103749, + "loss": 2.6165, + "step": 16767 + }, + { + "epoch": 0.49722741155887673, + "grad_norm": 0.12150897085666656, + "learning_rate": 0.0005123966577680527, + "loss": 2.659, + "step": 16768 + }, + { + "epoch": 0.4972570649111882, + "grad_norm": 0.13400250673294067, + "learning_rate": 0.0005123496212159561, + "loss": 2.6468, + "step": 16769 + }, + { + "epoch": 0.4972867182634997, + "grad_norm": 0.12280695140361786, + "learning_rate": 0.0005123025845545013, + "loss": 2.6701, + "step": 16770 + }, + { + "epoch": 0.49731637161581116, + "grad_norm": 0.11137713491916656, + "learning_rate": 0.000512255547784105, + "loss": 2.6151, + "step": 16771 + }, + { + "epoch": 0.49734602496812264, + "grad_norm": 0.14151665568351746, + "learning_rate": 0.0005122085109051838, + "loss": 2.6339, + "step": 16772 + }, + { + "epoch": 0.4973756783204341, + "grad_norm": 0.14559434354305267, + "learning_rate": 0.0005121614739181543, + "loss": 2.6356, + "step": 16773 + }, + { + "epoch": 0.4974053316727456, + "grad_norm": 0.12613363564014435, + "learning_rate": 0.0005121144368234326, + "loss": 2.6555, + "step": 16774 + }, + { + "epoch": 0.49743498502505706, + "grad_norm": 0.13308709859848022, + "learning_rate": 0.0005120673996214356, + "loss": 2.638, + "step": 16775 + }, + { + "epoch": 0.49746463837736854, + "grad_norm": 0.11314094811677933, + "learning_rate": 0.0005120203623125796, + "loss": 2.6141, + "step": 16776 + }, + { + "epoch": 0.49749429172968, + "grad_norm": 0.12347319722175598, + "learning_rate": 0.0005119733248972814, + "loss": 2.6681, + "step": 16777 + }, + { + "epoch": 0.4975239450819915, + "grad_norm": 0.12903332710266113, + "learning_rate": 0.0005119262873759572, + "loss": 2.6196, + "step": 16778 + }, + { + "epoch": 0.497553598434303, + "grad_norm": 0.11883565783500671, + "learning_rate": 0.0005118792497490238, + "loss": 2.6212, + "step": 16779 + }, + { + "epoch": 0.4975832517866145, + "grad_norm": 0.11729374527931213, + "learning_rate": 0.0005118322120168976, + "loss": 2.6725, + "step": 16780 + }, + { + "epoch": 0.497612905138926, + "grad_norm": 0.12485356628894806, + "learning_rate": 0.000511785174179995, + "loss": 2.6654, + "step": 16781 + }, + { + "epoch": 0.49764255849123745, + "grad_norm": 0.13029666244983673, + "learning_rate": 0.0005117381362387327, + "loss": 2.6307, + "step": 16782 + }, + { + "epoch": 0.49767221184354893, + "grad_norm": 0.10746868699789047, + "learning_rate": 0.0005116910981935273, + "loss": 2.628, + "step": 16783 + }, + { + "epoch": 0.4977018651958604, + "grad_norm": 0.1223132461309433, + "learning_rate": 0.0005116440600447951, + "loss": 2.6595, + "step": 16784 + }, + { + "epoch": 0.4977315185481719, + "grad_norm": 0.13209636509418488, + "learning_rate": 0.0005115970217929529, + "loss": 2.6801, + "step": 16785 + }, + { + "epoch": 0.49776117190048336, + "grad_norm": 0.10753139108419418, + "learning_rate": 0.0005115499834384169, + "loss": 2.6611, + "step": 16786 + }, + { + "epoch": 0.49779082525279483, + "grad_norm": 0.12095413357019424, + "learning_rate": 0.0005115029449816039, + "loss": 2.6735, + "step": 16787 + }, + { + "epoch": 0.4978204786051063, + "grad_norm": 0.10876050591468811, + "learning_rate": 0.0005114559064229303, + "loss": 2.6596, + "step": 16788 + }, + { + "epoch": 0.4978501319574178, + "grad_norm": 0.11214027553796768, + "learning_rate": 0.0005114088677628126, + "loss": 2.6374, + "step": 16789 + }, + { + "epoch": 0.49787978530972926, + "grad_norm": 0.09895467758178711, + "learning_rate": 0.0005113618290016677, + "loss": 2.5982, + "step": 16790 + }, + { + "epoch": 0.49790943866204074, + "grad_norm": 0.11466093361377716, + "learning_rate": 0.0005113147901399116, + "loss": 2.6455, + "step": 16791 + }, + { + "epoch": 0.4979390920143522, + "grad_norm": 0.11953189224004745, + "learning_rate": 0.0005112677511779612, + "loss": 2.6594, + "step": 16792 + }, + { + "epoch": 0.4979687453666637, + "grad_norm": 0.10599512606859207, + "learning_rate": 0.0005112207121162329, + "loss": 2.6217, + "step": 16793 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 0.12477274984121323, + "learning_rate": 0.0005111736729551433, + "loss": 2.6593, + "step": 16794 + }, + { + "epoch": 0.49802805207128664, + "grad_norm": 0.1308365911245346, + "learning_rate": 0.0005111266336951091, + "loss": 2.6538, + "step": 16795 + }, + { + "epoch": 0.4980577054235981, + "grad_norm": 0.13152749836444855, + "learning_rate": 0.0005110795943365462, + "loss": 2.6422, + "step": 16796 + }, + { + "epoch": 0.4980873587759096, + "grad_norm": 0.12773479521274567, + "learning_rate": 0.0005110325548798719, + "loss": 2.6591, + "step": 16797 + }, + { + "epoch": 0.49811701212822107, + "grad_norm": 0.12591657042503357, + "learning_rate": 0.0005109855153255023, + "loss": 2.6702, + "step": 16798 + }, + { + "epoch": 0.49814666548053255, + "grad_norm": 0.10090389847755432, + "learning_rate": 0.0005109384756738542, + "loss": 2.6394, + "step": 16799 + }, + { + "epoch": 0.4981763188328441, + "grad_norm": 0.10821472853422165, + "learning_rate": 0.000510891435925344, + "loss": 2.6904, + "step": 16800 + }, + { + "epoch": 0.49820597218515555, + "grad_norm": 0.11721204221248627, + "learning_rate": 0.0005108443960803882, + "loss": 2.6509, + "step": 16801 + }, + { + "epoch": 0.49823562553746703, + "grad_norm": 0.11108984798192978, + "learning_rate": 0.0005107973561394034, + "loss": 2.6414, + "step": 16802 + }, + { + "epoch": 0.4982652788897785, + "grad_norm": 0.09860846400260925, + "learning_rate": 0.0005107503161028062, + "loss": 2.6705, + "step": 16803 + }, + { + "epoch": 0.49829493224209, + "grad_norm": 0.11239483952522278, + "learning_rate": 0.0005107032759710131, + "loss": 2.6903, + "step": 16804 + }, + { + "epoch": 0.49832458559440146, + "grad_norm": 0.11685798317193985, + "learning_rate": 0.0005106562357444406, + "loss": 2.6562, + "step": 16805 + }, + { + "epoch": 0.49835423894671294, + "grad_norm": 0.10476480424404144, + "learning_rate": 0.0005106091954235055, + "loss": 2.6463, + "step": 16806 + }, + { + "epoch": 0.4983838922990244, + "grad_norm": 0.12601304054260254, + "learning_rate": 0.000510562155008624, + "loss": 2.6706, + "step": 16807 + }, + { + "epoch": 0.4984135456513359, + "grad_norm": 0.11597459018230438, + "learning_rate": 0.0005105151145002128, + "loss": 2.6468, + "step": 16808 + }, + { + "epoch": 0.49844319900364736, + "grad_norm": 0.14149311184883118, + "learning_rate": 0.0005104680738986883, + "loss": 2.6774, + "step": 16809 + }, + { + "epoch": 0.49847285235595884, + "grad_norm": 0.14827698469161987, + "learning_rate": 0.0005104210332044674, + "loss": 2.6401, + "step": 16810 + }, + { + "epoch": 0.4985025057082703, + "grad_norm": 0.14597399532794952, + "learning_rate": 0.0005103739924179665, + "loss": 2.6725, + "step": 16811 + }, + { + "epoch": 0.4985321590605818, + "grad_norm": 0.14384815096855164, + "learning_rate": 0.0005103269515396021, + "loss": 2.6732, + "step": 16812 + }, + { + "epoch": 0.49856181241289327, + "grad_norm": 0.14069649577140808, + "learning_rate": 0.0005102799105697908, + "loss": 2.6411, + "step": 16813 + }, + { + "epoch": 0.49859146576520474, + "grad_norm": 0.13306337594985962, + "learning_rate": 0.000510232869508949, + "loss": 2.6668, + "step": 16814 + }, + { + "epoch": 0.4986211191175162, + "grad_norm": 0.13660390675067902, + "learning_rate": 0.0005101858283574933, + "loss": 2.6361, + "step": 16815 + }, + { + "epoch": 0.4986507724698277, + "grad_norm": 0.14226087927818298, + "learning_rate": 0.0005101387871158406, + "loss": 2.6591, + "step": 16816 + }, + { + "epoch": 0.4986804258221392, + "grad_norm": 0.11365310102701187, + "learning_rate": 0.0005100917457844071, + "loss": 2.6057, + "step": 16817 + }, + { + "epoch": 0.49871007917445065, + "grad_norm": 0.126953125, + "learning_rate": 0.0005100447043636094, + "loss": 2.6465, + "step": 16818 + }, + { + "epoch": 0.4987397325267621, + "grad_norm": 0.12711669504642487, + "learning_rate": 0.0005099976628538641, + "loss": 2.6518, + "step": 16819 + }, + { + "epoch": 0.49876938587907366, + "grad_norm": 0.11960938572883606, + "learning_rate": 0.0005099506212555879, + "loss": 2.6433, + "step": 16820 + }, + { + "epoch": 0.49879903923138513, + "grad_norm": 0.12754978239536285, + "learning_rate": 0.0005099035795691972, + "loss": 2.641, + "step": 16821 + }, + { + "epoch": 0.4988286925836966, + "grad_norm": 0.15959268808364868, + "learning_rate": 0.0005098565377951085, + "loss": 2.6635, + "step": 16822 + }, + { + "epoch": 0.4988583459360081, + "grad_norm": 0.14523768424987793, + "learning_rate": 0.0005098094959337386, + "loss": 2.652, + "step": 16823 + }, + { + "epoch": 0.49888799928831956, + "grad_norm": 0.11786701530218124, + "learning_rate": 0.0005097624539855039, + "loss": 2.6367, + "step": 16824 + }, + { + "epoch": 0.49891765264063104, + "grad_norm": 0.13240204751491547, + "learning_rate": 0.0005097154119508209, + "loss": 2.6615, + "step": 16825 + }, + { + "epoch": 0.4989473059929425, + "grad_norm": 0.13217681646347046, + "learning_rate": 0.0005096683698301063, + "loss": 2.6389, + "step": 16826 + }, + { + "epoch": 0.498976959345254, + "grad_norm": 0.12725332379341125, + "learning_rate": 0.0005096213276237768, + "loss": 2.6259, + "step": 16827 + }, + { + "epoch": 0.49900661269756547, + "grad_norm": 0.1152581200003624, + "learning_rate": 0.0005095742853322485, + "loss": 2.6556, + "step": 16828 + }, + { + "epoch": 0.49903626604987694, + "grad_norm": 0.11224714666604996, + "learning_rate": 0.0005095272429559384, + "loss": 2.6188, + "step": 16829 + }, + { + "epoch": 0.4990659194021884, + "grad_norm": 0.1153464987874031, + "learning_rate": 0.000509480200495263, + "loss": 2.6594, + "step": 16830 + }, + { + "epoch": 0.4990955727544999, + "grad_norm": 0.12158017605543137, + "learning_rate": 0.0005094331579506387, + "loss": 2.6338, + "step": 16831 + }, + { + "epoch": 0.49912522610681137, + "grad_norm": 0.13394635915756226, + "learning_rate": 0.0005093861153224823, + "loss": 2.6571, + "step": 16832 + }, + { + "epoch": 0.49915487945912285, + "grad_norm": 0.1250055432319641, + "learning_rate": 0.0005093390726112102, + "loss": 2.6539, + "step": 16833 + }, + { + "epoch": 0.4991845328114343, + "grad_norm": 0.12963011860847473, + "learning_rate": 0.000509292029817239, + "loss": 2.6555, + "step": 16834 + }, + { + "epoch": 0.4992141861637458, + "grad_norm": 0.10506226867437363, + "learning_rate": 0.000509244986940985, + "loss": 2.6294, + "step": 16835 + }, + { + "epoch": 0.4992438395160573, + "grad_norm": 0.1282709687948227, + "learning_rate": 0.0005091979439828654, + "loss": 2.6486, + "step": 16836 + }, + { + "epoch": 0.49927349286836875, + "grad_norm": 0.12037761509418488, + "learning_rate": 0.0005091509009432962, + "loss": 2.6521, + "step": 16837 + }, + { + "epoch": 0.4993031462206802, + "grad_norm": 0.11482318490743637, + "learning_rate": 0.0005091038578226943, + "loss": 2.6521, + "step": 16838 + }, + { + "epoch": 0.4993327995729917, + "grad_norm": 0.11481954902410507, + "learning_rate": 0.0005090568146214763, + "loss": 2.6797, + "step": 16839 + }, + { + "epoch": 0.4993624529253032, + "grad_norm": 0.1138983815908432, + "learning_rate": 0.0005090097713400585, + "loss": 2.624, + "step": 16840 + }, + { + "epoch": 0.4993921062776147, + "grad_norm": 0.13488386571407318, + "learning_rate": 0.0005089627279788577, + "loss": 2.6565, + "step": 16841 + }, + { + "epoch": 0.4994217596299262, + "grad_norm": 0.12007325142621994, + "learning_rate": 0.0005089156845382903, + "loss": 2.6674, + "step": 16842 + }, + { + "epoch": 0.49945141298223766, + "grad_norm": 0.13927975296974182, + "learning_rate": 0.0005088686410187731, + "loss": 2.643, + "step": 16843 + }, + { + "epoch": 0.49948106633454914, + "grad_norm": 0.12040326744318008, + "learning_rate": 0.0005088215974207226, + "loss": 2.618, + "step": 16844 + }, + { + "epoch": 0.4995107196868606, + "grad_norm": 0.1160614863038063, + "learning_rate": 0.0005087745537445552, + "loss": 2.6622, + "step": 16845 + }, + { + "epoch": 0.4995403730391721, + "grad_norm": 0.1311805546283722, + "learning_rate": 0.0005087275099906878, + "loss": 2.625, + "step": 16846 + }, + { + "epoch": 0.49957002639148357, + "grad_norm": 0.15600396692752838, + "learning_rate": 0.0005086804661595366, + "loss": 2.6436, + "step": 16847 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 0.14752887189388275, + "learning_rate": 0.0005086334222515183, + "loss": 2.6554, + "step": 16848 + }, + { + "epoch": 0.4996293330961065, + "grad_norm": 0.10913485288619995, + "learning_rate": 0.00050858637826705, + "loss": 2.631, + "step": 16849 + }, + { + "epoch": 0.499658986448418, + "grad_norm": 0.13860401511192322, + "learning_rate": 0.0005085393342065474, + "loss": 2.6542, + "step": 16850 + }, + { + "epoch": 0.4996886398007295, + "grad_norm": 0.1120716854929924, + "learning_rate": 0.0005084922900704278, + "loss": 2.6144, + "step": 16851 + }, + { + "epoch": 0.49971829315304095, + "grad_norm": 0.1077515035867691, + "learning_rate": 0.0005084452458591073, + "loss": 2.6626, + "step": 16852 + }, + { + "epoch": 0.4997479465053524, + "grad_norm": 0.1190141886472702, + "learning_rate": 0.0005083982015730028, + "loss": 2.6538, + "step": 16853 + }, + { + "epoch": 0.4997775998576639, + "grad_norm": 0.13728420436382294, + "learning_rate": 0.0005083511572125308, + "loss": 2.6409, + "step": 16854 + }, + { + "epoch": 0.4998072532099754, + "grad_norm": 0.12845666706562042, + "learning_rate": 0.0005083041127781079, + "loss": 2.6834, + "step": 16855 + }, + { + "epoch": 0.49983690656228685, + "grad_norm": 0.1191016361117363, + "learning_rate": 0.0005082570682701506, + "loss": 2.6509, + "step": 16856 + }, + { + "epoch": 0.49986655991459833, + "grad_norm": 0.12087702006101608, + "learning_rate": 0.0005082100236890757, + "loss": 2.6682, + "step": 16857 + }, + { + "epoch": 0.4998962132669098, + "grad_norm": 0.12132467329502106, + "learning_rate": 0.0005081629790352994, + "loss": 2.6754, + "step": 16858 + }, + { + "epoch": 0.4999258666192213, + "grad_norm": 0.116453155875206, + "learning_rate": 0.0005081159343092387, + "loss": 2.6675, + "step": 16859 + }, + { + "epoch": 0.49995551997153276, + "grad_norm": 0.1085876002907753, + "learning_rate": 0.00050806888951131, + "loss": 2.6628, + "step": 16860 + }, + { + "epoch": 0.49998517332384423, + "grad_norm": 0.11640726774930954, + "learning_rate": 0.0005080218446419296, + "loss": 2.6282, + "step": 16861 + }, + { + "epoch": 0.5000148266761557, + "grad_norm": 0.1329130232334137, + "learning_rate": 0.0005079747997015148, + "loss": 2.6593, + "step": 16862 + }, + { + "epoch": 0.5000444800284672, + "grad_norm": 0.12325196713209152, + "learning_rate": 0.0005079277546904815, + "loss": 2.6608, + "step": 16863 + }, + { + "epoch": 0.5000741333807787, + "grad_norm": 0.13999147713184357, + "learning_rate": 0.0005078807096092466, + "loss": 2.6501, + "step": 16864 + }, + { + "epoch": 0.5001037867330902, + "grad_norm": 0.13594485819339752, + "learning_rate": 0.0005078336644582268, + "loss": 2.6536, + "step": 16865 + }, + { + "epoch": 0.5001334400854016, + "grad_norm": 0.11765959858894348, + "learning_rate": 0.0005077866192378385, + "loss": 2.6341, + "step": 16866 + }, + { + "epoch": 0.5001630934377131, + "grad_norm": 0.10535982996225357, + "learning_rate": 0.0005077395739484982, + "loss": 2.663, + "step": 16867 + }, + { + "epoch": 0.5001927467900246, + "grad_norm": 0.1393285095691681, + "learning_rate": 0.0005076925285906229, + "loss": 2.6687, + "step": 16868 + }, + { + "epoch": 0.5002224001423361, + "grad_norm": 0.11687658727169037, + "learning_rate": 0.0005076454831646288, + "loss": 2.6785, + "step": 16869 + }, + { + "epoch": 0.5002520534946475, + "grad_norm": 0.11509423702955246, + "learning_rate": 0.0005075984376709326, + "loss": 2.6364, + "step": 16870 + }, + { + "epoch": 0.500281706846959, + "grad_norm": 0.1114908829331398, + "learning_rate": 0.0005075513921099511, + "loss": 2.6594, + "step": 16871 + }, + { + "epoch": 0.5003113601992706, + "grad_norm": 0.11500275880098343, + "learning_rate": 0.0005075043464821006, + "loss": 2.6658, + "step": 16872 + }, + { + "epoch": 0.500341013551582, + "grad_norm": 0.12653975188732147, + "learning_rate": 0.0005074573007877979, + "loss": 2.6574, + "step": 16873 + }, + { + "epoch": 0.5003706669038935, + "grad_norm": 0.11880166828632355, + "learning_rate": 0.0005074102550274594, + "loss": 2.6377, + "step": 16874 + }, + { + "epoch": 0.500400320256205, + "grad_norm": 0.10403931885957718, + "learning_rate": 0.0005073632092015018, + "loss": 2.6434, + "step": 16875 + }, + { + "epoch": 0.5004299736085165, + "grad_norm": 0.10663066804409027, + "learning_rate": 0.0005073161633103418, + "loss": 2.642, + "step": 16876 + }, + { + "epoch": 0.5004596269608279, + "grad_norm": 0.11653158813714981, + "learning_rate": 0.0005072691173543959, + "loss": 2.6254, + "step": 16877 + }, + { + "epoch": 0.5004892803131394, + "grad_norm": 0.12240326404571533, + "learning_rate": 0.0005072220713340808, + "loss": 2.6177, + "step": 16878 + }, + { + "epoch": 0.5005189336654509, + "grad_norm": 0.12377519905567169, + "learning_rate": 0.000507175025249813, + "loss": 2.6643, + "step": 16879 + }, + { + "epoch": 0.5005485870177624, + "grad_norm": 0.12380657345056534, + "learning_rate": 0.0005071279791020089, + "loss": 2.6846, + "step": 16880 + }, + { + "epoch": 0.5005782403700738, + "grad_norm": 0.11996407061815262, + "learning_rate": 0.0005070809328910855, + "loss": 2.6224, + "step": 16881 + }, + { + "epoch": 0.5006078937223853, + "grad_norm": 0.10986389219760895, + "learning_rate": 0.0005070338866174593, + "loss": 2.6507, + "step": 16882 + }, + { + "epoch": 0.5006375470746968, + "grad_norm": 0.11881105601787567, + "learning_rate": 0.0005069868402815468, + "loss": 2.6185, + "step": 16883 + }, + { + "epoch": 0.5006672004270083, + "grad_norm": 0.11396054923534393, + "learning_rate": 0.0005069397938837646, + "loss": 2.6325, + "step": 16884 + }, + { + "epoch": 0.5006968537793197, + "grad_norm": 0.10384147614240646, + "learning_rate": 0.0005068927474245292, + "loss": 2.64, + "step": 16885 + }, + { + "epoch": 0.5007265071316312, + "grad_norm": 0.0957789197564125, + "learning_rate": 0.0005068457009042574, + "loss": 2.6432, + "step": 16886 + }, + { + "epoch": 0.5007561604839427, + "grad_norm": 0.10525913536548615, + "learning_rate": 0.0005067986543233658, + "loss": 2.6315, + "step": 16887 + }, + { + "epoch": 0.5007858138362542, + "grad_norm": 0.10263215005397797, + "learning_rate": 0.000506751607682271, + "loss": 2.6599, + "step": 16888 + }, + { + "epoch": 0.5008154671885656, + "grad_norm": 0.10735198855400085, + "learning_rate": 0.0005067045609813895, + "loss": 2.6065, + "step": 16889 + }, + { + "epoch": 0.5008451205408772, + "grad_norm": 0.10596579313278198, + "learning_rate": 0.0005066575142211379, + "loss": 2.6208, + "step": 16890 + }, + { + "epoch": 0.5008747738931886, + "grad_norm": 0.100699283182621, + "learning_rate": 0.0005066104674019329, + "loss": 2.6256, + "step": 16891 + }, + { + "epoch": 0.5009044272455001, + "grad_norm": 0.11578365415334702, + "learning_rate": 0.0005065634205241911, + "loss": 2.6536, + "step": 16892 + }, + { + "epoch": 0.5009340805978116, + "grad_norm": 0.12701401114463806, + "learning_rate": 0.0005065163735883291, + "loss": 2.6389, + "step": 16893 + }, + { + "epoch": 0.5009637339501231, + "grad_norm": 0.13910700380802155, + "learning_rate": 0.0005064693265947636, + "loss": 2.6498, + "step": 16894 + }, + { + "epoch": 0.5009933873024346, + "grad_norm": 0.13592718541622162, + "learning_rate": 0.0005064222795439109, + "loss": 2.665, + "step": 16895 + }, + { + "epoch": 0.501023040654746, + "grad_norm": 0.14333458244800568, + "learning_rate": 0.0005063752324361879, + "loss": 2.6191, + "step": 16896 + }, + { + "epoch": 0.5010526940070575, + "grad_norm": 0.12787190079689026, + "learning_rate": 0.0005063281852720111, + "loss": 2.6528, + "step": 16897 + }, + { + "epoch": 0.501082347359369, + "grad_norm": 0.11159324645996094, + "learning_rate": 0.0005062811380517971, + "loss": 2.6533, + "step": 16898 + }, + { + "epoch": 0.5011120007116805, + "grad_norm": 0.12892740964889526, + "learning_rate": 0.0005062340907759626, + "loss": 2.6597, + "step": 16899 + }, + { + "epoch": 0.5011416540639919, + "grad_norm": 0.15765021741390228, + "learning_rate": 0.000506187043444924, + "loss": 2.6792, + "step": 16900 + }, + { + "epoch": 0.5011713074163034, + "grad_norm": 0.12563279271125793, + "learning_rate": 0.0005061399960590983, + "loss": 2.6628, + "step": 16901 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 0.11957060545682907, + "learning_rate": 0.0005060929486189017, + "loss": 2.6139, + "step": 16902 + }, + { + "epoch": 0.5012306141209264, + "grad_norm": 0.11904066801071167, + "learning_rate": 0.0005060459011247512, + "loss": 2.6497, + "step": 16903 + }, + { + "epoch": 0.5012602674732378, + "grad_norm": 0.14209102094173431, + "learning_rate": 0.0005059988535770632, + "loss": 2.6711, + "step": 16904 + }, + { + "epoch": 0.5012899208255494, + "grad_norm": 0.1274794638156891, + "learning_rate": 0.0005059518059762542, + "loss": 2.6691, + "step": 16905 + }, + { + "epoch": 0.5013195741778608, + "grad_norm": 0.11396551132202148, + "learning_rate": 0.0005059047583227408, + "loss": 2.6787, + "step": 16906 + }, + { + "epoch": 0.5013492275301723, + "grad_norm": 0.136253222823143, + "learning_rate": 0.0005058577106169399, + "loss": 2.6057, + "step": 16907 + }, + { + "epoch": 0.5013788808824837, + "grad_norm": 0.10860008001327515, + "learning_rate": 0.000505810662859268, + "loss": 2.6489, + "step": 16908 + }, + { + "epoch": 0.5014085342347953, + "grad_norm": 0.12688453495502472, + "learning_rate": 0.0005057636150501418, + "loss": 2.6243, + "step": 16909 + }, + { + "epoch": 0.5014381875871067, + "grad_norm": 0.10970801115036011, + "learning_rate": 0.0005057165671899776, + "loss": 2.6479, + "step": 16910 + }, + { + "epoch": 0.5014678409394182, + "grad_norm": 0.12456317245960236, + "learning_rate": 0.0005056695192791924, + "loss": 2.6798, + "step": 16911 + }, + { + "epoch": 0.5014974942917296, + "grad_norm": 0.12308531999588013, + "learning_rate": 0.0005056224713182023, + "loss": 2.6361, + "step": 16912 + }, + { + "epoch": 0.5015271476440412, + "grad_norm": 0.12093250453472137, + "learning_rate": 0.0005055754233074245, + "loss": 2.6444, + "step": 16913 + }, + { + "epoch": 0.5015568009963527, + "grad_norm": 0.11518470197916031, + "learning_rate": 0.0005055283752472753, + "loss": 2.6458, + "step": 16914 + }, + { + "epoch": 0.5015864543486641, + "grad_norm": 0.09883486479520798, + "learning_rate": 0.0005054813271381715, + "loss": 2.6511, + "step": 16915 + }, + { + "epoch": 0.5016161077009756, + "grad_norm": 0.10379130393266678, + "learning_rate": 0.0005054342789805296, + "loss": 2.6911, + "step": 16916 + }, + { + "epoch": 0.5016457610532871, + "grad_norm": 0.11149107664823532, + "learning_rate": 0.0005053872307747661, + "loss": 2.6687, + "step": 16917 + }, + { + "epoch": 0.5016754144055986, + "grad_norm": 0.10917819291353226, + "learning_rate": 0.0005053401825212977, + "loss": 2.66, + "step": 16918 + }, + { + "epoch": 0.50170506775791, + "grad_norm": 0.11044733226299286, + "learning_rate": 0.0005052931342205411, + "loss": 2.6514, + "step": 16919 + }, + { + "epoch": 0.5017347211102215, + "grad_norm": 0.10955383628606796, + "learning_rate": 0.000505246085872913, + "loss": 2.6374, + "step": 16920 + }, + { + "epoch": 0.501764374462533, + "grad_norm": 0.11041735857725143, + "learning_rate": 0.0005051990374788301, + "loss": 2.605, + "step": 16921 + }, + { + "epoch": 0.5017940278148445, + "grad_norm": 0.11982633918523788, + "learning_rate": 0.0005051519890387084, + "loss": 2.6587, + "step": 16922 + }, + { + "epoch": 0.5018236811671559, + "grad_norm": 0.11474984139204025, + "learning_rate": 0.0005051049405529652, + "loss": 2.665, + "step": 16923 + }, + { + "epoch": 0.5018533345194675, + "grad_norm": 0.10650314390659332, + "learning_rate": 0.0005050578920220167, + "loss": 2.6148, + "step": 16924 + }, + { + "epoch": 0.5018829878717789, + "grad_norm": 0.11898516118526459, + "learning_rate": 0.0005050108434462799, + "loss": 2.6518, + "step": 16925 + }, + { + "epoch": 0.5019126412240904, + "grad_norm": 0.12075206637382507, + "learning_rate": 0.0005049637948261711, + "loss": 2.6726, + "step": 16926 + }, + { + "epoch": 0.5019422945764018, + "grad_norm": 0.11009617894887924, + "learning_rate": 0.0005049167461621071, + "loss": 2.6719, + "step": 16927 + }, + { + "epoch": 0.5019719479287134, + "grad_norm": 0.11801552027463913, + "learning_rate": 0.0005048696974545045, + "loss": 2.6256, + "step": 16928 + }, + { + "epoch": 0.5020016012810248, + "grad_norm": 0.13397495448589325, + "learning_rate": 0.0005048226487037799, + "loss": 2.6319, + "step": 16929 + }, + { + "epoch": 0.5020312546333363, + "grad_norm": 0.12337132543325424, + "learning_rate": 0.0005047755999103499, + "loss": 2.6623, + "step": 16930 + }, + { + "epoch": 0.5020609079856477, + "grad_norm": 0.11799673736095428, + "learning_rate": 0.000504728551074631, + "loss": 2.64, + "step": 16931 + }, + { + "epoch": 0.5020905613379593, + "grad_norm": 0.1153755933046341, + "learning_rate": 0.00050468150219704, + "loss": 2.6436, + "step": 16932 + }, + { + "epoch": 0.5021202146902707, + "grad_norm": 0.11100828647613525, + "learning_rate": 0.0005046344532779936, + "loss": 2.6811, + "step": 16933 + }, + { + "epoch": 0.5021498680425822, + "grad_norm": 0.11052288115024567, + "learning_rate": 0.0005045874043179083, + "loss": 2.6468, + "step": 16934 + }, + { + "epoch": 0.5021795213948937, + "grad_norm": 0.12137481570243835, + "learning_rate": 0.0005045403553172007, + "loss": 2.6283, + "step": 16935 + }, + { + "epoch": 0.5022091747472052, + "grad_norm": 0.12084270268678665, + "learning_rate": 0.0005044933062762875, + "loss": 2.6491, + "step": 16936 + }, + { + "epoch": 0.5022388280995167, + "grad_norm": 0.13878796994686127, + "learning_rate": 0.0005044462571955854, + "loss": 2.6479, + "step": 16937 + }, + { + "epoch": 0.5022684814518281, + "grad_norm": 0.15123015642166138, + "learning_rate": 0.0005043992080755108, + "loss": 2.6522, + "step": 16938 + }, + { + "epoch": 0.5022981348041397, + "grad_norm": 0.14078903198242188, + "learning_rate": 0.0005043521589164805, + "loss": 2.6634, + "step": 16939 + }, + { + "epoch": 0.5023277881564511, + "grad_norm": 0.12776678800582886, + "learning_rate": 0.0005043051097189111, + "loss": 2.6581, + "step": 16940 + }, + { + "epoch": 0.5023574415087626, + "grad_norm": 0.11938611418008804, + "learning_rate": 0.0005042580604832192, + "loss": 2.6753, + "step": 16941 + }, + { + "epoch": 0.502387094861074, + "grad_norm": 0.12209094315767288, + "learning_rate": 0.0005042110112098214, + "loss": 2.666, + "step": 16942 + }, + { + "epoch": 0.5024167482133856, + "grad_norm": 0.12651559710502625, + "learning_rate": 0.0005041639618991345, + "loss": 2.6548, + "step": 16943 + }, + { + "epoch": 0.502446401565697, + "grad_norm": 0.11508721858263016, + "learning_rate": 0.000504116912551575, + "loss": 2.6456, + "step": 16944 + }, + { + "epoch": 0.5024760549180085, + "grad_norm": 0.12344995886087418, + "learning_rate": 0.0005040698631675593, + "loss": 2.6144, + "step": 16945 + }, + { + "epoch": 0.5025057082703199, + "grad_norm": 0.11718367040157318, + "learning_rate": 0.0005040228137475044, + "loss": 2.6369, + "step": 16946 + }, + { + "epoch": 0.5025353616226315, + "grad_norm": 0.0979020968079567, + "learning_rate": 0.0005039757642918269, + "loss": 2.6458, + "step": 16947 + }, + { + "epoch": 0.5025650149749429, + "grad_norm": 0.1058560460805893, + "learning_rate": 0.0005039287148009433, + "loss": 2.6245, + "step": 16948 + }, + { + "epoch": 0.5025946683272544, + "grad_norm": 0.10267788171768188, + "learning_rate": 0.0005038816652752702, + "loss": 2.6545, + "step": 16949 + }, + { + "epoch": 0.5026243216795658, + "grad_norm": 0.11328034847974777, + "learning_rate": 0.0005038346157152242, + "loss": 2.635, + "step": 16950 + }, + { + "epoch": 0.5026539750318774, + "grad_norm": 0.13000521063804626, + "learning_rate": 0.000503787566121222, + "loss": 2.6448, + "step": 16951 + }, + { + "epoch": 0.5026836283841888, + "grad_norm": 0.11653272807598114, + "learning_rate": 0.0005037405164936803, + "loss": 2.6527, + "step": 16952 + }, + { + "epoch": 0.5027132817365003, + "grad_norm": 0.10867041349411011, + "learning_rate": 0.0005036934668330159, + "loss": 2.6749, + "step": 16953 + }, + { + "epoch": 0.5027429350888117, + "grad_norm": 0.0921834409236908, + "learning_rate": 0.0005036464171396449, + "loss": 2.656, + "step": 16954 + }, + { + "epoch": 0.5027725884411233, + "grad_norm": 0.09603279083967209, + "learning_rate": 0.0005035993674139845, + "loss": 2.641, + "step": 16955 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 0.1045510470867157, + "learning_rate": 0.000503552317656451, + "loss": 2.6363, + "step": 16956 + }, + { + "epoch": 0.5028318951457462, + "grad_norm": 0.09099146723747253, + "learning_rate": 0.0005035052678674611, + "loss": 2.6383, + "step": 16957 + }, + { + "epoch": 0.5028615484980578, + "grad_norm": 0.09829682111740112, + "learning_rate": 0.0005034582180474314, + "loss": 2.6238, + "step": 16958 + }, + { + "epoch": 0.5028912018503692, + "grad_norm": 0.10353543609380722, + "learning_rate": 0.0005034111681967787, + "loss": 2.6157, + "step": 16959 + }, + { + "epoch": 0.5029208552026807, + "grad_norm": 0.11629071086645126, + "learning_rate": 0.0005033641183159194, + "loss": 2.6655, + "step": 16960 + }, + { + "epoch": 0.5029505085549921, + "grad_norm": 0.1247774288058281, + "learning_rate": 0.0005033170684052704, + "loss": 2.6568, + "step": 16961 + }, + { + "epoch": 0.5029801619073037, + "grad_norm": 0.11186469346284866, + "learning_rate": 0.0005032700184652481, + "loss": 2.6249, + "step": 16962 + }, + { + "epoch": 0.5030098152596151, + "grad_norm": 0.10505027323961258, + "learning_rate": 0.0005032229684962692, + "loss": 2.6646, + "step": 16963 + }, + { + "epoch": 0.5030394686119266, + "grad_norm": 0.11055217683315277, + "learning_rate": 0.0005031759184987503, + "loss": 2.6597, + "step": 16964 + }, + { + "epoch": 0.503069121964238, + "grad_norm": 0.12800493836402893, + "learning_rate": 0.0005031288684731082, + "loss": 2.6487, + "step": 16965 + }, + { + "epoch": 0.5030987753165496, + "grad_norm": 0.1193658635020256, + "learning_rate": 0.0005030818184197595, + "loss": 2.6222, + "step": 16966 + }, + { + "epoch": 0.503128428668861, + "grad_norm": 0.10862761735916138, + "learning_rate": 0.0005030347683391207, + "loss": 2.6336, + "step": 16967 + }, + { + "epoch": 0.5031580820211725, + "grad_norm": 0.11093349754810333, + "learning_rate": 0.0005029877182316085, + "loss": 2.6595, + "step": 16968 + }, + { + "epoch": 0.5031877353734839, + "grad_norm": 0.12170465290546417, + "learning_rate": 0.0005029406680976395, + "loss": 2.6561, + "step": 16969 + }, + { + "epoch": 0.5032173887257955, + "grad_norm": 0.13204704225063324, + "learning_rate": 0.0005028936179376306, + "loss": 2.6384, + "step": 16970 + }, + { + "epoch": 0.5032470420781069, + "grad_norm": 0.13291987776756287, + "learning_rate": 0.0005028465677519978, + "loss": 2.6527, + "step": 16971 + }, + { + "epoch": 0.5032766954304184, + "grad_norm": 0.12374426424503326, + "learning_rate": 0.0005027995175411584, + "loss": 2.6695, + "step": 16972 + }, + { + "epoch": 0.5033063487827298, + "grad_norm": 0.12793445587158203, + "learning_rate": 0.0005027524673055288, + "loss": 2.6526, + "step": 16973 + }, + { + "epoch": 0.5033360021350414, + "grad_norm": 0.11177685111761093, + "learning_rate": 0.0005027054170455256, + "loss": 2.5795, + "step": 16974 + }, + { + "epoch": 0.5033656554873528, + "grad_norm": 0.1223628893494606, + "learning_rate": 0.0005026583667615656, + "loss": 2.6473, + "step": 16975 + }, + { + "epoch": 0.5033953088396643, + "grad_norm": 0.11078982800245285, + "learning_rate": 0.0005026113164540651, + "loss": 2.6457, + "step": 16976 + }, + { + "epoch": 0.5034249621919759, + "grad_norm": 0.11013253033161163, + "learning_rate": 0.0005025642661234409, + "loss": 2.6301, + "step": 16977 + }, + { + "epoch": 0.5034546155442873, + "grad_norm": 0.1044585257768631, + "learning_rate": 0.0005025172157701099, + "loss": 2.6625, + "step": 16978 + }, + { + "epoch": 0.5034842688965988, + "grad_norm": 0.1173214241862297, + "learning_rate": 0.0005024701653944884, + "loss": 2.6473, + "step": 16979 + }, + { + "epoch": 0.5035139222489102, + "grad_norm": 0.10442598909139633, + "learning_rate": 0.0005024231149969934, + "loss": 2.6502, + "step": 16980 + }, + { + "epoch": 0.5035435756012218, + "grad_norm": 0.10727021098136902, + "learning_rate": 0.000502376064578041, + "loss": 2.6449, + "step": 16981 + }, + { + "epoch": 0.5035732289535332, + "grad_norm": 0.10342148691415787, + "learning_rate": 0.0005023290141380482, + "loss": 2.6362, + "step": 16982 + }, + { + "epoch": 0.5036028823058447, + "grad_norm": 0.11158411204814911, + "learning_rate": 0.0005022819636774316, + "loss": 2.6585, + "step": 16983 + }, + { + "epoch": 0.5036325356581561, + "grad_norm": 0.11965786665678024, + "learning_rate": 0.0005022349131966078, + "loss": 2.6734, + "step": 16984 + }, + { + "epoch": 0.5036621890104677, + "grad_norm": 0.11307431012392044, + "learning_rate": 0.0005021878626959936, + "loss": 2.6676, + "step": 16985 + }, + { + "epoch": 0.5036918423627791, + "grad_norm": 0.11817950010299683, + "learning_rate": 0.0005021408121760054, + "loss": 2.646, + "step": 16986 + }, + { + "epoch": 0.5037214957150906, + "grad_norm": 0.1273929625749588, + "learning_rate": 0.0005020937616370598, + "loss": 2.6996, + "step": 16987 + }, + { + "epoch": 0.503751149067402, + "grad_norm": 0.13792237639427185, + "learning_rate": 0.0005020467110795738, + "loss": 2.663, + "step": 16988 + }, + { + "epoch": 0.5037808024197136, + "grad_norm": 0.13313992321491241, + "learning_rate": 0.0005019996605039637, + "loss": 2.6639, + "step": 16989 + }, + { + "epoch": 0.503810455772025, + "grad_norm": 0.11854874342679977, + "learning_rate": 0.0005019526099106461, + "loss": 2.6787, + "step": 16990 + }, + { + "epoch": 0.5038401091243365, + "grad_norm": 0.13844384253025055, + "learning_rate": 0.0005019055593000382, + "loss": 2.6615, + "step": 16991 + }, + { + "epoch": 0.5038697624766479, + "grad_norm": 0.15774497389793396, + "learning_rate": 0.000501858508672556, + "loss": 2.6623, + "step": 16992 + }, + { + "epoch": 0.5038994158289595, + "grad_norm": 0.13672566413879395, + "learning_rate": 0.0005018114580286165, + "loss": 2.6389, + "step": 16993 + }, + { + "epoch": 0.5039290691812709, + "grad_norm": 0.13897185027599335, + "learning_rate": 0.0005017644073686361, + "loss": 2.6384, + "step": 16994 + }, + { + "epoch": 0.5039587225335824, + "grad_norm": 0.14240504801273346, + "learning_rate": 0.0005017173566930316, + "loss": 2.6629, + "step": 16995 + }, + { + "epoch": 0.5039883758858938, + "grad_norm": 0.11850490421056747, + "learning_rate": 0.0005016703060022197, + "loss": 2.6571, + "step": 16996 + }, + { + "epoch": 0.5040180292382054, + "grad_norm": 0.12074034661054611, + "learning_rate": 0.0005016232552966169, + "loss": 2.6444, + "step": 16997 + }, + { + "epoch": 0.5040476825905169, + "grad_norm": 0.12190556526184082, + "learning_rate": 0.0005015762045766399, + "loss": 2.6771, + "step": 16998 + }, + { + "epoch": 0.5040773359428283, + "grad_norm": 0.10189306735992432, + "learning_rate": 0.0005015291538427054, + "loss": 2.6598, + "step": 16999 + }, + { + "epoch": 0.5041069892951399, + "grad_norm": 0.11493877321481705, + "learning_rate": 0.0005014821030952299, + "loss": 2.649, + "step": 17000 + }, + { + "epoch": 0.5041366426474513, + "grad_norm": 0.12472280114889145, + "learning_rate": 0.0005014350523346301, + "loss": 2.6487, + "step": 17001 + }, + { + "epoch": 0.5041662959997628, + "grad_norm": 0.10836218297481537, + "learning_rate": 0.0005013880015613226, + "loss": 2.6777, + "step": 17002 + }, + { + "epoch": 0.5041959493520742, + "grad_norm": 0.13690374791622162, + "learning_rate": 0.0005013409507757243, + "loss": 2.7107, + "step": 17003 + }, + { + "epoch": 0.5042256027043858, + "grad_norm": 0.12848661839962006, + "learning_rate": 0.0005012938999782516, + "loss": 2.6506, + "step": 17004 + }, + { + "epoch": 0.5042552560566972, + "grad_norm": 0.11316554993391037, + "learning_rate": 0.000501246849169321, + "loss": 2.6139, + "step": 17005 + }, + { + "epoch": 0.5042849094090087, + "grad_norm": 0.10997689515352249, + "learning_rate": 0.0005011997983493495, + "loss": 2.6651, + "step": 17006 + }, + { + "epoch": 0.5043145627613201, + "grad_norm": 0.10342677682638168, + "learning_rate": 0.0005011527475187536, + "loss": 2.6628, + "step": 17007 + }, + { + "epoch": 0.5043442161136317, + "grad_norm": 0.1037956103682518, + "learning_rate": 0.00050110569667795, + "loss": 2.6506, + "step": 17008 + }, + { + "epoch": 0.5043738694659431, + "grad_norm": 0.11170383542776108, + "learning_rate": 0.000501058645827355, + "loss": 2.6264, + "step": 17009 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 0.11050731688737869, + "learning_rate": 0.0005010115949673858, + "loss": 2.6604, + "step": 17010 + }, + { + "epoch": 0.504433176170566, + "grad_norm": 0.09720779210329056, + "learning_rate": 0.0005009645440984586, + "loss": 2.6693, + "step": 17011 + }, + { + "epoch": 0.5044628295228776, + "grad_norm": 0.11032428592443466, + "learning_rate": 0.0005009174932209902, + "loss": 2.6829, + "step": 17012 + }, + { + "epoch": 0.504492482875189, + "grad_norm": 0.11307048052549362, + "learning_rate": 0.0005008704423353973, + "loss": 2.6505, + "step": 17013 + }, + { + "epoch": 0.5045221362275005, + "grad_norm": 0.09990722686052322, + "learning_rate": 0.0005008233914420966, + "loss": 2.6557, + "step": 17014 + }, + { + "epoch": 0.504551789579812, + "grad_norm": 0.09695124626159668, + "learning_rate": 0.0005007763405415044, + "loss": 2.5946, + "step": 17015 + }, + { + "epoch": 0.5045814429321235, + "grad_norm": 0.10354376584291458, + "learning_rate": 0.0005007292896340376, + "loss": 2.66, + "step": 17016 + }, + { + "epoch": 0.5046110962844349, + "grad_norm": 0.10041588544845581, + "learning_rate": 0.0005006822387201128, + "loss": 2.6222, + "step": 17017 + }, + { + "epoch": 0.5046407496367464, + "grad_norm": 0.10739095509052277, + "learning_rate": 0.0005006351878001467, + "loss": 2.6505, + "step": 17018 + }, + { + "epoch": 0.504670402989058, + "grad_norm": 0.1395796835422516, + "learning_rate": 0.0005005881368745559, + "loss": 2.6617, + "step": 17019 + }, + { + "epoch": 0.5047000563413694, + "grad_norm": 0.14475397765636444, + "learning_rate": 0.0005005410859437572, + "loss": 2.6554, + "step": 17020 + }, + { + "epoch": 0.5047297096936809, + "grad_norm": 0.13713210821151733, + "learning_rate": 0.0005004940350081669, + "loss": 2.6614, + "step": 17021 + }, + { + "epoch": 0.5047593630459923, + "grad_norm": 0.1258479207754135, + "learning_rate": 0.0005004469840682018, + "loss": 2.6354, + "step": 17022 + }, + { + "epoch": 0.5047890163983039, + "grad_norm": 0.10733863711357117, + "learning_rate": 0.0005003999331242788, + "loss": 2.6219, + "step": 17023 + }, + { + "epoch": 0.5048186697506153, + "grad_norm": 0.12852801382541656, + "learning_rate": 0.0005003528821768142, + "loss": 2.5913, + "step": 17024 + }, + { + "epoch": 0.5048483231029268, + "grad_norm": 0.10617940872907639, + "learning_rate": 0.0005003058312262247, + "loss": 2.6477, + "step": 17025 + }, + { + "epoch": 0.5048779764552382, + "grad_norm": 0.11820931732654572, + "learning_rate": 0.0005002587802729271, + "loss": 2.6671, + "step": 17026 + }, + { + "epoch": 0.5049076298075498, + "grad_norm": 0.11906421929597855, + "learning_rate": 0.0005002117293173379, + "loss": 2.6603, + "step": 17027 + }, + { + "epoch": 0.5049372831598612, + "grad_norm": 0.12185788154602051, + "learning_rate": 0.0005001646783598738, + "loss": 2.6657, + "step": 17028 + }, + { + "epoch": 0.5049669365121727, + "grad_norm": 0.1088290736079216, + "learning_rate": 0.0005001176274009514, + "loss": 2.6384, + "step": 17029 + }, + { + "epoch": 0.5049965898644841, + "grad_norm": 0.10035234689712524, + "learning_rate": 0.0005000705764409875, + "loss": 2.6508, + "step": 17030 + }, + { + "epoch": 0.5050262432167957, + "grad_norm": 0.09764999151229858, + "learning_rate": 0.0005000235254803986, + "loss": 2.6488, + "step": 17031 + }, + { + "epoch": 0.5050558965691071, + "grad_norm": 0.12182255834341049, + "learning_rate": 0.0004999764745196014, + "loss": 2.6602, + "step": 17032 + }, + { + "epoch": 0.5050855499214186, + "grad_norm": 0.1090313121676445, + "learning_rate": 0.0004999294235590125, + "loss": 2.6449, + "step": 17033 + }, + { + "epoch": 0.50511520327373, + "grad_norm": 0.10217040032148361, + "learning_rate": 0.0004998823725990486, + "loss": 2.6511, + "step": 17034 + }, + { + "epoch": 0.5051448566260416, + "grad_norm": 0.12999175488948822, + "learning_rate": 0.0004998353216401263, + "loss": 2.6414, + "step": 17035 + }, + { + "epoch": 0.505174509978353, + "grad_norm": 0.14279411733150482, + "learning_rate": 0.0004997882706826623, + "loss": 2.6694, + "step": 17036 + }, + { + "epoch": 0.5052041633306645, + "grad_norm": 0.1453888714313507, + "learning_rate": 0.0004997412197270732, + "loss": 2.651, + "step": 17037 + }, + { + "epoch": 0.505233816682976, + "grad_norm": 0.12007112801074982, + "learning_rate": 0.0004996941687737753, + "loss": 2.6342, + "step": 17038 + }, + { + "epoch": 0.5052634700352875, + "grad_norm": 0.15321218967437744, + "learning_rate": 0.000499647117823186, + "loss": 2.6582, + "step": 17039 + }, + { + "epoch": 0.505293123387599, + "grad_norm": 0.1677589863538742, + "learning_rate": 0.0004996000668757213, + "loss": 2.6481, + "step": 17040 + }, + { + "epoch": 0.5053227767399104, + "grad_norm": 0.14932937920093536, + "learning_rate": 0.0004995530159317982, + "loss": 2.662, + "step": 17041 + }, + { + "epoch": 0.505352430092222, + "grad_norm": 0.11896185576915741, + "learning_rate": 0.0004995059649918332, + "loss": 2.6272, + "step": 17042 + }, + { + "epoch": 0.5053820834445334, + "grad_norm": 0.13132058084011078, + "learning_rate": 0.000499458914056243, + "loss": 2.6761, + "step": 17043 + }, + { + "epoch": 0.5054117367968449, + "grad_norm": 0.1301865130662918, + "learning_rate": 0.0004994118631254441, + "loss": 2.625, + "step": 17044 + }, + { + "epoch": 0.5054413901491563, + "grad_norm": 0.12231624126434326, + "learning_rate": 0.0004993648121998534, + "loss": 2.6358, + "step": 17045 + }, + { + "epoch": 0.5054710435014679, + "grad_norm": 0.11461092531681061, + "learning_rate": 0.0004993177612798873, + "loss": 2.6393, + "step": 17046 + }, + { + "epoch": 0.5055006968537793, + "grad_norm": 0.10831968486309052, + "learning_rate": 0.0004992707103659626, + "loss": 2.6439, + "step": 17047 + }, + { + "epoch": 0.5055303502060908, + "grad_norm": 0.11528804153203964, + "learning_rate": 0.0004992236594584959, + "loss": 2.6662, + "step": 17048 + }, + { + "epoch": 0.5055600035584022, + "grad_norm": 0.128323495388031, + "learning_rate": 0.0004991766085579037, + "loss": 2.65, + "step": 17049 + }, + { + "epoch": 0.5055896569107138, + "grad_norm": 0.1121143102645874, + "learning_rate": 0.0004991295576646028, + "loss": 2.635, + "step": 17050 + }, + { + "epoch": 0.5056193102630252, + "grad_norm": 0.1325673758983612, + "learning_rate": 0.0004990825067790098, + "loss": 2.6668, + "step": 17051 + }, + { + "epoch": 0.5056489636153367, + "grad_norm": 0.1163388192653656, + "learning_rate": 0.0004990354559015413, + "loss": 2.628, + "step": 17052 + }, + { + "epoch": 0.5056786169676482, + "grad_norm": 0.1208794116973877, + "learning_rate": 0.0004989884050326142, + "loss": 2.6253, + "step": 17053 + }, + { + "epoch": 0.5057082703199597, + "grad_norm": 0.10263658314943314, + "learning_rate": 0.0004989413541726448, + "loss": 2.6669, + "step": 17054 + }, + { + "epoch": 0.5057379236722711, + "grad_norm": 0.10571911931037903, + "learning_rate": 0.00049889430332205, + "loss": 2.6751, + "step": 17055 + }, + { + "epoch": 0.5057675770245826, + "grad_norm": 0.12457720190286636, + "learning_rate": 0.0004988472524812464, + "loss": 2.6428, + "step": 17056 + }, + { + "epoch": 0.5057972303768941, + "grad_norm": 0.09722235798835754, + "learning_rate": 0.0004988002016506505, + "loss": 2.6591, + "step": 17057 + }, + { + "epoch": 0.5058268837292056, + "grad_norm": 0.11425261199474335, + "learning_rate": 0.000498753150830679, + "loss": 2.6548, + "step": 17058 + }, + { + "epoch": 0.505856537081517, + "grad_norm": 0.11528799682855606, + "learning_rate": 0.0004987061000217485, + "loss": 2.649, + "step": 17059 + }, + { + "epoch": 0.5058861904338285, + "grad_norm": 0.14230410754680634, + "learning_rate": 0.0004986590492242758, + "loss": 2.6625, + "step": 17060 + }, + { + "epoch": 0.5059158437861401, + "grad_norm": 0.1455865055322647, + "learning_rate": 0.0004986119984386774, + "loss": 2.6424, + "step": 17061 + }, + { + "epoch": 0.5059454971384515, + "grad_norm": 0.1630588322877884, + "learning_rate": 0.00049856494766537, + "loss": 2.65, + "step": 17062 + }, + { + "epoch": 0.505975150490763, + "grad_norm": 0.15674185752868652, + "learning_rate": 0.0004985178969047704, + "loss": 2.6382, + "step": 17063 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 0.12177979201078415, + "learning_rate": 0.0004984708461572946, + "loss": 2.6496, + "step": 17064 + }, + { + "epoch": 0.506034457195386, + "grad_norm": 0.13675476610660553, + "learning_rate": 0.0004984237954233601, + "loss": 2.6515, + "step": 17065 + }, + { + "epoch": 0.5060641105476974, + "grad_norm": 0.15652889013290405, + "learning_rate": 0.0004983767447033831, + "loss": 2.6545, + "step": 17066 + }, + { + "epoch": 0.5060937639000089, + "grad_norm": 0.15275511145591736, + "learning_rate": 0.0004983296939977804, + "loss": 2.6386, + "step": 17067 + }, + { + "epoch": 0.5061234172523204, + "grad_norm": 0.13635104894638062, + "learning_rate": 0.0004982826433069684, + "loss": 2.6551, + "step": 17068 + }, + { + "epoch": 0.5061530706046319, + "grad_norm": 0.13200241327285767, + "learning_rate": 0.0004982355926313639, + "loss": 2.6822, + "step": 17069 + }, + { + "epoch": 0.5061827239569433, + "grad_norm": 0.1326194554567337, + "learning_rate": 0.0004981885419713836, + "loss": 2.6237, + "step": 17070 + }, + { + "epoch": 0.5062123773092548, + "grad_norm": 0.11594197899103165, + "learning_rate": 0.000498141491327444, + "loss": 2.6637, + "step": 17071 + }, + { + "epoch": 0.5062420306615663, + "grad_norm": 0.13124430179595947, + "learning_rate": 0.0004980944406999619, + "loss": 2.6721, + "step": 17072 + }, + { + "epoch": 0.5062716840138778, + "grad_norm": 0.14627186954021454, + "learning_rate": 0.0004980473900893539, + "loss": 2.6338, + "step": 17073 + }, + { + "epoch": 0.5063013373661892, + "grad_norm": 0.11574770510196686, + "learning_rate": 0.0004980003394960365, + "loss": 2.6601, + "step": 17074 + }, + { + "epoch": 0.5063309907185007, + "grad_norm": 0.11005976796150208, + "learning_rate": 0.0004979532889204264, + "loss": 2.6599, + "step": 17075 + }, + { + "epoch": 0.5063606440708122, + "grad_norm": 0.12552227079868317, + "learning_rate": 0.0004979062383629404, + "loss": 2.6489, + "step": 17076 + }, + { + "epoch": 0.5063902974231237, + "grad_norm": 0.11333411931991577, + "learning_rate": 0.0004978591878239948, + "loss": 2.6538, + "step": 17077 + }, + { + "epoch": 0.5064199507754351, + "grad_norm": 0.10964903235435486, + "learning_rate": 0.0004978121373040065, + "loss": 2.681, + "step": 17078 + }, + { + "epoch": 0.5064496041277466, + "grad_norm": 0.11484343558549881, + "learning_rate": 0.0004977650868033922, + "loss": 2.6407, + "step": 17079 + }, + { + "epoch": 0.5064792574800582, + "grad_norm": 0.11578627675771713, + "learning_rate": 0.0004977180363225685, + "loss": 2.6245, + "step": 17080 + }, + { + "epoch": 0.5065089108323696, + "grad_norm": 0.11855221539735794, + "learning_rate": 0.0004976709858619518, + "loss": 2.6623, + "step": 17081 + }, + { + "epoch": 0.5065385641846811, + "grad_norm": 0.12183623015880585, + "learning_rate": 0.0004976239354219591, + "loss": 2.6196, + "step": 17082 + }, + { + "epoch": 0.5065682175369925, + "grad_norm": 0.13161851465702057, + "learning_rate": 0.0004975768850030068, + "loss": 2.6659, + "step": 17083 + }, + { + "epoch": 0.5065978708893041, + "grad_norm": 0.11918072402477264, + "learning_rate": 0.0004975298346055117, + "loss": 2.6486, + "step": 17084 + }, + { + "epoch": 0.5066275242416155, + "grad_norm": 0.12801210582256317, + "learning_rate": 0.0004974827842298903, + "loss": 2.6576, + "step": 17085 + }, + { + "epoch": 0.506657177593927, + "grad_norm": 0.12963595986366272, + "learning_rate": 0.0004974357338765591, + "loss": 2.6574, + "step": 17086 + }, + { + "epoch": 0.5066868309462385, + "grad_norm": 0.1155867651104927, + "learning_rate": 0.000497388683545935, + "loss": 2.6663, + "step": 17087 + }, + { + "epoch": 0.50671648429855, + "grad_norm": 0.14190176129341125, + "learning_rate": 0.0004973416332384347, + "loss": 2.6267, + "step": 17088 + }, + { + "epoch": 0.5067461376508614, + "grad_norm": 0.15681080520153046, + "learning_rate": 0.0004972945829544745, + "loss": 2.6662, + "step": 17089 + }, + { + "epoch": 0.5067757910031729, + "grad_norm": 0.11331711709499359, + "learning_rate": 0.0004972475326944712, + "loss": 2.611, + "step": 17090 + }, + { + "epoch": 0.5068054443554844, + "grad_norm": 0.10505126416683197, + "learning_rate": 0.0004972004824588416, + "loss": 2.6149, + "step": 17091 + }, + { + "epoch": 0.5068350977077959, + "grad_norm": 0.10949171334505081, + "learning_rate": 0.0004971534322480021, + "loss": 2.6993, + "step": 17092 + }, + { + "epoch": 0.5068647510601073, + "grad_norm": 0.12475109845399857, + "learning_rate": 0.0004971063820623696, + "loss": 2.6288, + "step": 17093 + }, + { + "epoch": 0.5068944044124188, + "grad_norm": 0.11588737368583679, + "learning_rate": 0.0004970593319023606, + "loss": 2.6413, + "step": 17094 + }, + { + "epoch": 0.5069240577647303, + "grad_norm": 0.1126447319984436, + "learning_rate": 0.0004970122817683916, + "loss": 2.6559, + "step": 17095 + }, + { + "epoch": 0.5069537111170418, + "grad_norm": 0.1078527495265007, + "learning_rate": 0.0004969652316608794, + "loss": 2.6523, + "step": 17096 + }, + { + "epoch": 0.5069833644693532, + "grad_norm": 0.12799392640590668, + "learning_rate": 0.0004969181815802406, + "loss": 2.6782, + "step": 17097 + }, + { + "epoch": 0.5070130178216647, + "grad_norm": 0.12212161719799042, + "learning_rate": 0.0004968711315268919, + "loss": 2.6472, + "step": 17098 + }, + { + "epoch": 0.5070426711739762, + "grad_norm": 0.11209389567375183, + "learning_rate": 0.0004968240815012497, + "loss": 2.6554, + "step": 17099 + }, + { + "epoch": 0.5070723245262877, + "grad_norm": 0.1360110640525818, + "learning_rate": 0.0004967770315037308, + "loss": 2.6505, + "step": 17100 + }, + { + "epoch": 0.5071019778785992, + "grad_norm": 0.13721393048763275, + "learning_rate": 0.0004967299815347521, + "loss": 2.6568, + "step": 17101 + }, + { + "epoch": 0.5071316312309107, + "grad_norm": 0.12079471349716187, + "learning_rate": 0.0004966829315947299, + "loss": 2.6678, + "step": 17102 + }, + { + "epoch": 0.5071612845832222, + "grad_norm": 0.10708027333021164, + "learning_rate": 0.0004966358816840805, + "loss": 2.6551, + "step": 17103 + }, + { + "epoch": 0.5071909379355336, + "grad_norm": 0.1018075942993164, + "learning_rate": 0.0004965888318032213, + "loss": 2.639, + "step": 17104 + }, + { + "epoch": 0.5072205912878451, + "grad_norm": 0.11060324311256409, + "learning_rate": 0.0004965417819525686, + "loss": 2.637, + "step": 17105 + }, + { + "epoch": 0.5072502446401566, + "grad_norm": 0.12660883367061615, + "learning_rate": 0.0004964947321325389, + "loss": 2.6422, + "step": 17106 + }, + { + "epoch": 0.5072798979924681, + "grad_norm": 0.12975038588047028, + "learning_rate": 0.0004964476823435491, + "loss": 2.6232, + "step": 17107 + }, + { + "epoch": 0.5073095513447795, + "grad_norm": 0.12500302493572235, + "learning_rate": 0.0004964006325860155, + "loss": 2.6181, + "step": 17108 + }, + { + "epoch": 0.507339204697091, + "grad_norm": 0.11820872128009796, + "learning_rate": 0.0004963535828603551, + "loss": 2.6189, + "step": 17109 + }, + { + "epoch": 0.5073688580494025, + "grad_norm": 0.09942157566547394, + "learning_rate": 0.0004963065331669842, + "loss": 2.6339, + "step": 17110 + }, + { + "epoch": 0.507398511401714, + "grad_norm": 0.11610140651464462, + "learning_rate": 0.0004962594835063197, + "loss": 2.6326, + "step": 17111 + }, + { + "epoch": 0.5074281647540254, + "grad_norm": 0.138010635972023, + "learning_rate": 0.0004962124338787781, + "loss": 2.6643, + "step": 17112 + }, + { + "epoch": 0.5074578181063369, + "grad_norm": 0.1611698716878891, + "learning_rate": 0.000496165384284776, + "loss": 2.6165, + "step": 17113 + }, + { + "epoch": 0.5074874714586484, + "grad_norm": 0.1396913230419159, + "learning_rate": 0.0004961183347247301, + "loss": 2.6539, + "step": 17114 + }, + { + "epoch": 0.5075171248109599, + "grad_norm": 0.12171056866645813, + "learning_rate": 0.0004960712851990569, + "loss": 2.6337, + "step": 17115 + }, + { + "epoch": 0.5075467781632713, + "grad_norm": 0.14251317083835602, + "learning_rate": 0.0004960242357081732, + "loss": 2.6427, + "step": 17116 + }, + { + "epoch": 0.5075764315155828, + "grad_norm": 0.1531708538532257, + "learning_rate": 0.0004959771862524955, + "loss": 2.6697, + "step": 17117 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 0.13436415791511536, + "learning_rate": 0.0004959301368324407, + "loss": 2.6718, + "step": 17118 + }, + { + "epoch": 0.5076357382202058, + "grad_norm": 0.13172736763954163, + "learning_rate": 0.0004958830874484252, + "loss": 2.6444, + "step": 17119 + }, + { + "epoch": 0.5076653915725172, + "grad_norm": 0.14195826649665833, + "learning_rate": 0.0004958360381008655, + "loss": 2.6518, + "step": 17120 + }, + { + "epoch": 0.5076950449248288, + "grad_norm": 0.1137697845697403, + "learning_rate": 0.0004957889887901786, + "loss": 2.6458, + "step": 17121 + }, + { + "epoch": 0.5077246982771403, + "grad_norm": 0.12862153351306915, + "learning_rate": 0.000495741939516781, + "loss": 2.6836, + "step": 17122 + }, + { + "epoch": 0.5077543516294517, + "grad_norm": 0.11859425902366638, + "learning_rate": 0.000495694890281089, + "loss": 2.6542, + "step": 17123 + }, + { + "epoch": 0.5077840049817632, + "grad_norm": 0.12381764501333237, + "learning_rate": 0.0004956478410835196, + "loss": 2.6548, + "step": 17124 + }, + { + "epoch": 0.5078136583340747, + "grad_norm": 0.12937615811824799, + "learning_rate": 0.0004956007919244892, + "loss": 2.6383, + "step": 17125 + }, + { + "epoch": 0.5078433116863862, + "grad_norm": 0.12288162857294083, + "learning_rate": 0.0004955537428044147, + "loss": 2.6422, + "step": 17126 + }, + { + "epoch": 0.5078729650386976, + "grad_norm": 0.13524599373340607, + "learning_rate": 0.0004955066937237125, + "loss": 2.7274, + "step": 17127 + }, + { + "epoch": 0.5079026183910091, + "grad_norm": 0.1386358141899109, + "learning_rate": 0.0004954596446827995, + "loss": 2.6556, + "step": 17128 + }, + { + "epoch": 0.5079322717433206, + "grad_norm": 0.11457838118076324, + "learning_rate": 0.0004954125956820916, + "loss": 2.6189, + "step": 17129 + }, + { + "epoch": 0.5079619250956321, + "grad_norm": 0.11239068955183029, + "learning_rate": 0.0004953655467220063, + "loss": 2.6256, + "step": 17130 + }, + { + "epoch": 0.5079915784479435, + "grad_norm": 0.1085270345211029, + "learning_rate": 0.0004953184978029599, + "loss": 2.6825, + "step": 17131 + }, + { + "epoch": 0.508021231800255, + "grad_norm": 0.1257808804512024, + "learning_rate": 0.0004952714489253691, + "loss": 2.6388, + "step": 17132 + }, + { + "epoch": 0.5080508851525665, + "grad_norm": 0.1248939037322998, + "learning_rate": 0.0004952244000896503, + "loss": 2.6256, + "step": 17133 + }, + { + "epoch": 0.508080538504878, + "grad_norm": 0.10066954791545868, + "learning_rate": 0.0004951773512962203, + "loss": 2.6497, + "step": 17134 + }, + { + "epoch": 0.5081101918571894, + "grad_norm": 0.11024513095617294, + "learning_rate": 0.0004951303025454956, + "loss": 2.6654, + "step": 17135 + }, + { + "epoch": 0.508139845209501, + "grad_norm": 0.12307461351156235, + "learning_rate": 0.000495083253837893, + "loss": 2.6483, + "step": 17136 + }, + { + "epoch": 0.5081694985618124, + "grad_norm": 0.11124832183122635, + "learning_rate": 0.0004950362051738289, + "loss": 2.6944, + "step": 17137 + }, + { + "epoch": 0.5081991519141239, + "grad_norm": 0.10009879618883133, + "learning_rate": 0.0004949891565537202, + "loss": 2.6398, + "step": 17138 + }, + { + "epoch": 0.5082288052664353, + "grad_norm": 0.12391871958971024, + "learning_rate": 0.0004949421079779834, + "loss": 2.627, + "step": 17139 + }, + { + "epoch": 0.5082584586187469, + "grad_norm": 0.11634382605552673, + "learning_rate": 0.000494895059447035, + "loss": 2.6418, + "step": 17140 + }, + { + "epoch": 0.5082881119710583, + "grad_norm": 0.11483067274093628, + "learning_rate": 0.0004948480109612918, + "loss": 2.6437, + "step": 17141 + }, + { + "epoch": 0.5083177653233698, + "grad_norm": 0.13262984156608582, + "learning_rate": 0.00049480096252117, + "loss": 2.6464, + "step": 17142 + }, + { + "epoch": 0.5083474186756813, + "grad_norm": 0.1313244253396988, + "learning_rate": 0.000494753914127087, + "loss": 2.6165, + "step": 17143 + }, + { + "epoch": 0.5083770720279928, + "grad_norm": 0.11843816936016083, + "learning_rate": 0.0004947068657794588, + "loss": 2.6265, + "step": 17144 + }, + { + "epoch": 0.5084067253803043, + "grad_norm": 0.12364132702350616, + "learning_rate": 0.0004946598174787023, + "loss": 2.6421, + "step": 17145 + }, + { + "epoch": 0.5084363787326157, + "grad_norm": 0.11761355400085449, + "learning_rate": 0.000494612769225234, + "loss": 2.6414, + "step": 17146 + }, + { + "epoch": 0.5084660320849272, + "grad_norm": 0.13064226508140564, + "learning_rate": 0.0004945657210194706, + "loss": 2.6481, + "step": 17147 + }, + { + "epoch": 0.5084956854372387, + "grad_norm": 0.13082033395767212, + "learning_rate": 0.0004945186728618285, + "loss": 2.6066, + "step": 17148 + }, + { + "epoch": 0.5085253387895502, + "grad_norm": 0.11612265557050705, + "learning_rate": 0.0004944716247527248, + "loss": 2.6383, + "step": 17149 + }, + { + "epoch": 0.5085549921418616, + "grad_norm": 0.10942504554986954, + "learning_rate": 0.0004944245766925757, + "loss": 2.6812, + "step": 17150 + }, + { + "epoch": 0.5085846454941731, + "grad_norm": 0.14457890391349792, + "learning_rate": 0.0004943775286817977, + "loss": 2.6459, + "step": 17151 + }, + { + "epoch": 0.5086142988464846, + "grad_norm": 0.15896357595920563, + "learning_rate": 0.0004943304807208079, + "loss": 2.6824, + "step": 17152 + }, + { + "epoch": 0.5086439521987961, + "grad_norm": 0.13635259866714478, + "learning_rate": 0.0004942834328100225, + "loss": 2.6467, + "step": 17153 + }, + { + "epoch": 0.5086736055511075, + "grad_norm": 0.13102559745311737, + "learning_rate": 0.0004942363849498585, + "loss": 2.6373, + "step": 17154 + }, + { + "epoch": 0.508703258903419, + "grad_norm": 0.10959319025278091, + "learning_rate": 0.0004941893371407319, + "loss": 2.6712, + "step": 17155 + }, + { + "epoch": 0.5087329122557305, + "grad_norm": 0.10616316646337509, + "learning_rate": 0.00049414228938306, + "loss": 2.6655, + "step": 17156 + }, + { + "epoch": 0.508762565608042, + "grad_norm": 0.10977350920438766, + "learning_rate": 0.0004940952416772591, + "loss": 2.6424, + "step": 17157 + }, + { + "epoch": 0.5087922189603534, + "grad_norm": 0.11749093979597092, + "learning_rate": 0.0004940481940237458, + "loss": 2.6154, + "step": 17158 + }, + { + "epoch": 0.508821872312665, + "grad_norm": 0.103972889482975, + "learning_rate": 0.0004940011464229369, + "loss": 2.668, + "step": 17159 + }, + { + "epoch": 0.5088515256649764, + "grad_norm": 0.12181870639324188, + "learning_rate": 0.0004939540988752489, + "loss": 2.6705, + "step": 17160 + }, + { + "epoch": 0.5088811790172879, + "grad_norm": 0.11566364020109177, + "learning_rate": 0.0004939070513810983, + "loss": 2.6315, + "step": 17161 + }, + { + "epoch": 0.5089108323695993, + "grad_norm": 0.10907218605279922, + "learning_rate": 0.0004938600039409018, + "loss": 2.634, + "step": 17162 + }, + { + "epoch": 0.5089404857219109, + "grad_norm": 0.10691971331834793, + "learning_rate": 0.000493812956555076, + "loss": 2.658, + "step": 17163 + }, + { + "epoch": 0.5089701390742224, + "grad_norm": 0.11433706432580948, + "learning_rate": 0.0004937659092240375, + "loss": 2.6518, + "step": 17164 + }, + { + "epoch": 0.5089997924265338, + "grad_norm": 0.1113133653998375, + "learning_rate": 0.000493718861948203, + "loss": 2.6363, + "step": 17165 + }, + { + "epoch": 0.5090294457788453, + "grad_norm": 0.10737162083387375, + "learning_rate": 0.0004936718147279891, + "loss": 2.661, + "step": 17166 + }, + { + "epoch": 0.5090590991311568, + "grad_norm": 0.10248072445392609, + "learning_rate": 0.0004936247675638123, + "loss": 2.6228, + "step": 17167 + }, + { + "epoch": 0.5090887524834683, + "grad_norm": 0.11214008927345276, + "learning_rate": 0.0004935777204560891, + "loss": 2.6355, + "step": 17168 + }, + { + "epoch": 0.5091184058357797, + "grad_norm": 0.10126639157533646, + "learning_rate": 0.0004935306734052365, + "loss": 2.6626, + "step": 17169 + }, + { + "epoch": 0.5091480591880913, + "grad_norm": 0.09866223484277725, + "learning_rate": 0.000493483626411671, + "loss": 2.6634, + "step": 17170 + }, + { + "epoch": 0.5091777125404027, + "grad_norm": 0.10152479261159897, + "learning_rate": 0.0004934365794758089, + "loss": 2.6466, + "step": 17171 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 0.11838258057832718, + "learning_rate": 0.0004933895325980672, + "loss": 2.652, + "step": 17172 + }, + { + "epoch": 0.5092370192450256, + "grad_norm": 0.11502708494663239, + "learning_rate": 0.0004933424857788621, + "loss": 2.6432, + "step": 17173 + }, + { + "epoch": 0.5092666725973372, + "grad_norm": 0.10239218920469284, + "learning_rate": 0.0004932954390186107, + "loss": 2.6403, + "step": 17174 + }, + { + "epoch": 0.5092963259496486, + "grad_norm": 0.11114884912967682, + "learning_rate": 0.0004932483923177291, + "loss": 2.6659, + "step": 17175 + }, + { + "epoch": 0.5093259793019601, + "grad_norm": 0.09335130453109741, + "learning_rate": 0.0004932013456766342, + "loss": 2.6422, + "step": 17176 + }, + { + "epoch": 0.5093556326542715, + "grad_norm": 0.11222957819700241, + "learning_rate": 0.0004931542990957427, + "loss": 2.6836, + "step": 17177 + }, + { + "epoch": 0.5093852860065831, + "grad_norm": 0.13962972164154053, + "learning_rate": 0.000493107252575471, + "loss": 2.6509, + "step": 17178 + }, + { + "epoch": 0.5094149393588945, + "grad_norm": 0.15001599490642548, + "learning_rate": 0.0004930602061162357, + "loss": 2.6469, + "step": 17179 + }, + { + "epoch": 0.509444592711206, + "grad_norm": 0.12653441727161407, + "learning_rate": 0.0004930131597184535, + "loss": 2.6483, + "step": 17180 + }, + { + "epoch": 0.5094742460635174, + "grad_norm": 0.10731307417154312, + "learning_rate": 0.0004929661133825407, + "loss": 2.6361, + "step": 17181 + }, + { + "epoch": 0.509503899415829, + "grad_norm": 0.1488928347826004, + "learning_rate": 0.0004929190671089145, + "loss": 2.6765, + "step": 17182 + }, + { + "epoch": 0.5095335527681404, + "grad_norm": 0.17501892149448395, + "learning_rate": 0.000492872020897991, + "loss": 2.6531, + "step": 17183 + }, + { + "epoch": 0.5095632061204519, + "grad_norm": 0.15528392791748047, + "learning_rate": 0.0004928249747501871, + "loss": 2.6114, + "step": 17184 + }, + { + "epoch": 0.5095928594727634, + "grad_norm": 0.1338358372449875, + "learning_rate": 0.0004927779286659192, + "loss": 2.6442, + "step": 17185 + }, + { + "epoch": 0.5096225128250749, + "grad_norm": 0.12338937073945999, + "learning_rate": 0.0004927308826456041, + "loss": 2.6608, + "step": 17186 + }, + { + "epoch": 0.5096521661773864, + "grad_norm": 0.1383715122938156, + "learning_rate": 0.0004926838366896584, + "loss": 2.6621, + "step": 17187 + }, + { + "epoch": 0.5096818195296978, + "grad_norm": 0.14760567247867584, + "learning_rate": 0.0004926367907984984, + "loss": 2.6044, + "step": 17188 + }, + { + "epoch": 0.5097114728820094, + "grad_norm": 0.11954233795404434, + "learning_rate": 0.0004925897449725408, + "loss": 2.6176, + "step": 17189 + }, + { + "epoch": 0.5097411262343208, + "grad_norm": 0.10815957188606262, + "learning_rate": 0.0004925426992122024, + "loss": 2.6864, + "step": 17190 + }, + { + "epoch": 0.5097707795866323, + "grad_norm": 0.11080613732337952, + "learning_rate": 0.0004924956535178995, + "loss": 2.6423, + "step": 17191 + }, + { + "epoch": 0.5098004329389437, + "grad_norm": 0.10978008061647415, + "learning_rate": 0.0004924486078900491, + "loss": 2.6257, + "step": 17192 + }, + { + "epoch": 0.5098300862912553, + "grad_norm": 0.11232069134712219, + "learning_rate": 0.0004924015623290675, + "loss": 2.636, + "step": 17193 + }, + { + "epoch": 0.5098597396435667, + "grad_norm": 0.11322075873613358, + "learning_rate": 0.0004923545168353712, + "loss": 2.6248, + "step": 17194 + }, + { + "epoch": 0.5098893929958782, + "grad_norm": 0.11007029563188553, + "learning_rate": 0.0004923074714093771, + "loss": 2.628, + "step": 17195 + }, + { + "epoch": 0.5099190463481896, + "grad_norm": 0.12709832191467285, + "learning_rate": 0.0004922604260515017, + "loss": 2.6599, + "step": 17196 + }, + { + "epoch": 0.5099486997005012, + "grad_norm": 0.11679598689079285, + "learning_rate": 0.0004922133807621615, + "loss": 2.6182, + "step": 17197 + }, + { + "epoch": 0.5099783530528126, + "grad_norm": 0.09735091030597687, + "learning_rate": 0.0004921663355417733, + "loss": 2.6345, + "step": 17198 + }, + { + "epoch": 0.5100080064051241, + "grad_norm": 0.10633069276809692, + "learning_rate": 0.0004921192903907534, + "loss": 2.6503, + "step": 17199 + }, + { + "epoch": 0.5100376597574355, + "grad_norm": 0.10572648048400879, + "learning_rate": 0.0004920722453095187, + "loss": 2.6808, + "step": 17200 + }, + { + "epoch": 0.5100673131097471, + "grad_norm": 0.11838128417730331, + "learning_rate": 0.0004920252002984854, + "loss": 2.661, + "step": 17201 + }, + { + "epoch": 0.5100969664620585, + "grad_norm": 0.1134922131896019, + "learning_rate": 0.0004919781553580704, + "loss": 2.6407, + "step": 17202 + }, + { + "epoch": 0.51012661981437, + "grad_norm": 0.10303660482168198, + "learning_rate": 0.0004919311104886902, + "loss": 2.6459, + "step": 17203 + }, + { + "epoch": 0.5101562731666814, + "grad_norm": 0.09743606299161911, + "learning_rate": 0.0004918840656907615, + "loss": 2.6406, + "step": 17204 + }, + { + "epoch": 0.510185926518993, + "grad_norm": 0.11215806007385254, + "learning_rate": 0.0004918370209647007, + "loss": 2.6106, + "step": 17205 + }, + { + "epoch": 0.5102155798713045, + "grad_norm": 0.1250530332326889, + "learning_rate": 0.0004917899763109245, + "loss": 2.6652, + "step": 17206 + }, + { + "epoch": 0.5102452332236159, + "grad_norm": 0.11335567384958267, + "learning_rate": 0.0004917429317298493, + "loss": 2.6268, + "step": 17207 + }, + { + "epoch": 0.5102748865759275, + "grad_norm": 0.12133960425853729, + "learning_rate": 0.0004916958872218921, + "loss": 2.6584, + "step": 17208 + }, + { + "epoch": 0.5103045399282389, + "grad_norm": 0.12954886257648468, + "learning_rate": 0.0004916488427874692, + "loss": 2.6388, + "step": 17209 + }, + { + "epoch": 0.5103341932805504, + "grad_norm": 0.14262163639068604, + "learning_rate": 0.0004916017984269972, + "loss": 2.6631, + "step": 17210 + }, + { + "epoch": 0.5103638466328618, + "grad_norm": 0.1292349100112915, + "learning_rate": 0.0004915547541408927, + "loss": 2.6583, + "step": 17211 + }, + { + "epoch": 0.5103934999851734, + "grad_norm": 0.10822470486164093, + "learning_rate": 0.0004915077099295723, + "loss": 2.6104, + "step": 17212 + }, + { + "epoch": 0.5104231533374848, + "grad_norm": 0.11484699696302414, + "learning_rate": 0.0004914606657934526, + "loss": 2.6514, + "step": 17213 + }, + { + "epoch": 0.5104528066897963, + "grad_norm": 0.11087854206562042, + "learning_rate": 0.0004914136217329502, + "loss": 2.6128, + "step": 17214 + }, + { + "epoch": 0.5104824600421077, + "grad_norm": 0.09847352653741837, + "learning_rate": 0.0004913665777484817, + "loss": 2.6162, + "step": 17215 + }, + { + "epoch": 0.5105121133944193, + "grad_norm": 0.12729555368423462, + "learning_rate": 0.0004913195338404635, + "loss": 2.6631, + "step": 17216 + }, + { + "epoch": 0.5105417667467307, + "grad_norm": 0.12960204482078552, + "learning_rate": 0.0004912724900093125, + "loss": 2.6538, + "step": 17217 + }, + { + "epoch": 0.5105714200990422, + "grad_norm": 0.10923700779676437, + "learning_rate": 0.000491225446255445, + "loss": 2.6612, + "step": 17218 + }, + { + "epoch": 0.5106010734513536, + "grad_norm": 0.13050498068332672, + "learning_rate": 0.0004911784025792775, + "loss": 2.6573, + "step": 17219 + }, + { + "epoch": 0.5106307268036652, + "grad_norm": 0.12108304351568222, + "learning_rate": 0.0004911313589812269, + "loss": 2.6577, + "step": 17220 + }, + { + "epoch": 0.5106603801559766, + "grad_norm": 0.11650855839252472, + "learning_rate": 0.0004910843154617096, + "loss": 2.6204, + "step": 17221 + }, + { + "epoch": 0.5106900335082881, + "grad_norm": 0.10902122408151627, + "learning_rate": 0.0004910372720211423, + "loss": 2.6348, + "step": 17222 + }, + { + "epoch": 0.5107196868605995, + "grad_norm": 0.11965013295412064, + "learning_rate": 0.0004909902286599415, + "loss": 2.6567, + "step": 17223 + }, + { + "epoch": 0.5107493402129111, + "grad_norm": 0.10876964777708054, + "learning_rate": 0.0004909431853785237, + "loss": 2.643, + "step": 17224 + }, + { + "epoch": 0.5107789935652225, + "grad_norm": 0.11333967000246048, + "learning_rate": 0.0004908961421773057, + "loss": 2.6431, + "step": 17225 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 0.1029343232512474, + "learning_rate": 0.0004908490990567038, + "loss": 2.6688, + "step": 17226 + }, + { + "epoch": 0.5108383002698456, + "grad_norm": 0.10399957746267319, + "learning_rate": 0.0004908020560171348, + "loss": 2.6912, + "step": 17227 + }, + { + "epoch": 0.510867953622157, + "grad_norm": 0.10777656733989716, + "learning_rate": 0.0004907550130590151, + "loss": 2.6638, + "step": 17228 + }, + { + "epoch": 0.5108976069744685, + "grad_norm": 0.10943695157766342, + "learning_rate": 0.0004907079701827612, + "loss": 2.683, + "step": 17229 + }, + { + "epoch": 0.5109272603267799, + "grad_norm": 0.11353693157434464, + "learning_rate": 0.00049066092738879, + "loss": 2.6412, + "step": 17230 + }, + { + "epoch": 0.5109569136790915, + "grad_norm": 0.11245053261518478, + "learning_rate": 0.0004906138846775179, + "loss": 2.6398, + "step": 17231 + }, + { + "epoch": 0.5109865670314029, + "grad_norm": 0.13001945614814758, + "learning_rate": 0.0004905668420493612, + "loss": 2.6593, + "step": 17232 + }, + { + "epoch": 0.5110162203837144, + "grad_norm": 0.11033880710601807, + "learning_rate": 0.000490519799504737, + "loss": 2.6763, + "step": 17233 + }, + { + "epoch": 0.5110458737360258, + "grad_norm": 0.1281115561723709, + "learning_rate": 0.0004904727570440615, + "loss": 2.6219, + "step": 17234 + }, + { + "epoch": 0.5110755270883374, + "grad_norm": 0.11869699507951736, + "learning_rate": 0.0004904257146677514, + "loss": 2.618, + "step": 17235 + }, + { + "epoch": 0.5111051804406488, + "grad_norm": 0.10991362482309341, + "learning_rate": 0.0004903786723762234, + "loss": 2.6553, + "step": 17236 + }, + { + "epoch": 0.5111348337929603, + "grad_norm": 0.1312592327594757, + "learning_rate": 0.0004903316301698937, + "loss": 2.6471, + "step": 17237 + }, + { + "epoch": 0.5111644871452717, + "grad_norm": 0.12014483660459518, + "learning_rate": 0.0004902845880491791, + "loss": 2.6145, + "step": 17238 + }, + { + "epoch": 0.5111941404975833, + "grad_norm": 0.11667684465646744, + "learning_rate": 0.0004902375460144962, + "loss": 2.6323, + "step": 17239 + }, + { + "epoch": 0.5112237938498947, + "grad_norm": 0.1115150898694992, + "learning_rate": 0.0004901905040662614, + "loss": 2.6413, + "step": 17240 + }, + { + "epoch": 0.5112534472022062, + "grad_norm": 0.11126608401536942, + "learning_rate": 0.0004901434622048915, + "loss": 2.6819, + "step": 17241 + }, + { + "epoch": 0.5112831005545176, + "grad_norm": 0.12185388803482056, + "learning_rate": 0.0004900964204308029, + "loss": 2.6123, + "step": 17242 + }, + { + "epoch": 0.5113127539068292, + "grad_norm": 0.11522393673658371, + "learning_rate": 0.0004900493787444123, + "loss": 2.6066, + "step": 17243 + }, + { + "epoch": 0.5113424072591406, + "grad_norm": 0.11744898557662964, + "learning_rate": 0.000490002337146136, + "loss": 2.6378, + "step": 17244 + }, + { + "epoch": 0.5113720606114521, + "grad_norm": 0.11222179234027863, + "learning_rate": 0.0004899552956363906, + "loss": 2.6556, + "step": 17245 + }, + { + "epoch": 0.5114017139637635, + "grad_norm": 0.10760253667831421, + "learning_rate": 0.000489908254215593, + "loss": 2.638, + "step": 17246 + }, + { + "epoch": 0.5114313673160751, + "grad_norm": 0.12364798784255981, + "learning_rate": 0.0004898612128841595, + "loss": 2.626, + "step": 17247 + }, + { + "epoch": 0.5114610206683866, + "grad_norm": 0.12587769329547882, + "learning_rate": 0.0004898141716425066, + "loss": 2.6453, + "step": 17248 + }, + { + "epoch": 0.511490674020698, + "grad_norm": 0.107386514544487, + "learning_rate": 0.0004897671304910511, + "loss": 2.6501, + "step": 17249 + }, + { + "epoch": 0.5115203273730096, + "grad_norm": 0.12138400226831436, + "learning_rate": 0.0004897200894302094, + "loss": 2.6365, + "step": 17250 + }, + { + "epoch": 0.511549980725321, + "grad_norm": 0.1250501573085785, + "learning_rate": 0.000489673048460398, + "loss": 2.6443, + "step": 17251 + }, + { + "epoch": 0.5115796340776325, + "grad_norm": 0.1184569001197815, + "learning_rate": 0.0004896260075820335, + "loss": 2.6351, + "step": 17252 + }, + { + "epoch": 0.5116092874299439, + "grad_norm": 0.1224566176533699, + "learning_rate": 0.0004895789667955327, + "loss": 2.6403, + "step": 17253 + }, + { + "epoch": 0.5116389407822555, + "grad_norm": 0.13389842212200165, + "learning_rate": 0.0004895319261013117, + "loss": 2.6276, + "step": 17254 + }, + { + "epoch": 0.5116685941345669, + "grad_norm": 0.14985504746437073, + "learning_rate": 0.0004894848854997874, + "loss": 2.6449, + "step": 17255 + }, + { + "epoch": 0.5116982474868784, + "grad_norm": 0.10202576965093613, + "learning_rate": 0.0004894378449913763, + "loss": 2.6395, + "step": 17256 + }, + { + "epoch": 0.5117279008391898, + "grad_norm": 0.11661679297685623, + "learning_rate": 0.0004893908045764948, + "loss": 2.636, + "step": 17257 + }, + { + "epoch": 0.5117575541915014, + "grad_norm": 0.1171361580491066, + "learning_rate": 0.0004893437642555594, + "loss": 2.6479, + "step": 17258 + }, + { + "epoch": 0.5117872075438128, + "grad_norm": 0.12463749200105667, + "learning_rate": 0.0004892967240289869, + "loss": 2.6318, + "step": 17259 + }, + { + "epoch": 0.5118168608961243, + "grad_norm": 0.1460927277803421, + "learning_rate": 0.0004892496838971938, + "loss": 2.6547, + "step": 17260 + }, + { + "epoch": 0.5118465142484357, + "grad_norm": 0.13116782903671265, + "learning_rate": 0.0004892026438605966, + "loss": 2.6364, + "step": 17261 + }, + { + "epoch": 0.5118761676007473, + "grad_norm": 0.12740667164325714, + "learning_rate": 0.0004891556039196118, + "loss": 2.6262, + "step": 17262 + }, + { + "epoch": 0.5119058209530587, + "grad_norm": 0.11328410357236862, + "learning_rate": 0.0004891085640746562, + "loss": 2.6596, + "step": 17263 + }, + { + "epoch": 0.5119354743053702, + "grad_norm": 0.11748631298542023, + "learning_rate": 0.0004890615243261459, + "loss": 2.6351, + "step": 17264 + }, + { + "epoch": 0.5119651276576817, + "grad_norm": 0.11364060640335083, + "learning_rate": 0.0004890144846744978, + "loss": 2.6667, + "step": 17265 + }, + { + "epoch": 0.5119947810099932, + "grad_norm": 0.11058134585618973, + "learning_rate": 0.0004889674451201282, + "loss": 2.6669, + "step": 17266 + }, + { + "epoch": 0.5120244343623046, + "grad_norm": 0.11729389429092407, + "learning_rate": 0.0004889204056634539, + "loss": 2.6489, + "step": 17267 + }, + { + "epoch": 0.5120540877146161, + "grad_norm": 0.1309797614812851, + "learning_rate": 0.0004888733663048912, + "loss": 2.619, + "step": 17268 + }, + { + "epoch": 0.5120837410669277, + "grad_norm": 0.13702550530433655, + "learning_rate": 0.0004888263270448567, + "loss": 2.6468, + "step": 17269 + }, + { + "epoch": 0.5121133944192391, + "grad_norm": 0.12273719906806946, + "learning_rate": 0.0004887792878837672, + "loss": 2.6642, + "step": 17270 + }, + { + "epoch": 0.5121430477715506, + "grad_norm": 0.11436596512794495, + "learning_rate": 0.0004887322488220388, + "loss": 2.644, + "step": 17271 + }, + { + "epoch": 0.512172701123862, + "grad_norm": 0.11192747950553894, + "learning_rate": 0.0004886852098600883, + "loss": 2.6609, + "step": 17272 + }, + { + "epoch": 0.5122023544761736, + "grad_norm": 0.10134855657815933, + "learning_rate": 0.0004886381709983323, + "loss": 2.6164, + "step": 17273 + }, + { + "epoch": 0.512232007828485, + "grad_norm": 0.09980373829603195, + "learning_rate": 0.0004885911322371874, + "loss": 2.6134, + "step": 17274 + }, + { + "epoch": 0.5122616611807965, + "grad_norm": 0.09985677152872086, + "learning_rate": 0.0004885440935770697, + "loss": 2.6442, + "step": 17275 + }, + { + "epoch": 0.5122913145331079, + "grad_norm": 0.10362769663333893, + "learning_rate": 0.0004884970550183962, + "loss": 2.6578, + "step": 17276 + }, + { + "epoch": 0.5123209678854195, + "grad_norm": 0.1272822916507721, + "learning_rate": 0.0004884500165615831, + "loss": 2.6362, + "step": 17277 + }, + { + "epoch": 0.5123506212377309, + "grad_norm": 0.1248043030500412, + "learning_rate": 0.0004884029782070472, + "loss": 2.7047, + "step": 17278 + }, + { + "epoch": 0.5123802745900424, + "grad_norm": 0.11146997660398483, + "learning_rate": 0.0004883559399552049, + "loss": 2.6086, + "step": 17279 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 0.10218314826488495, + "learning_rate": 0.0004883089018064728, + "loss": 2.6654, + "step": 17280 + }, + { + "epoch": 0.5124395812946654, + "grad_norm": 0.10432693362236023, + "learning_rate": 0.0004882618637612674, + "loss": 2.6545, + "step": 17281 + }, + { + "epoch": 0.5124692346469768, + "grad_norm": 0.1011495366692543, + "learning_rate": 0.00048821482582000515, + "loss": 2.6264, + "step": 17282 + }, + { + "epoch": 0.5124988879992883, + "grad_norm": 0.11041552573442459, + "learning_rate": 0.0004881677879831026, + "loss": 2.6726, + "step": 17283 + }, + { + "epoch": 0.5125285413515998, + "grad_norm": 0.10858593881130219, + "learning_rate": 0.0004881207502509763, + "loss": 2.6594, + "step": 17284 + }, + { + "epoch": 0.5125581947039113, + "grad_norm": 0.1153573989868164, + "learning_rate": 0.00048807371262404284, + "loss": 2.6601, + "step": 17285 + }, + { + "epoch": 0.5125878480562227, + "grad_norm": 0.11980558186769485, + "learning_rate": 0.0004880266751027187, + "loss": 2.6339, + "step": 17286 + }, + { + "epoch": 0.5126175014085342, + "grad_norm": 0.10514775663614273, + "learning_rate": 0.0004879796376874204, + "loss": 2.6619, + "step": 17287 + }, + { + "epoch": 0.5126471547608458, + "grad_norm": 0.10146746039390564, + "learning_rate": 0.00048793260037856447, + "loss": 2.664, + "step": 17288 + }, + { + "epoch": 0.5126768081131572, + "grad_norm": 0.11017337441444397, + "learning_rate": 0.00048788556317656747, + "loss": 2.6594, + "step": 17289 + }, + { + "epoch": 0.5127064614654687, + "grad_norm": 0.11394182592630386, + "learning_rate": 0.0004878385260818458, + "loss": 2.6287, + "step": 17290 + }, + { + "epoch": 0.5127361148177801, + "grad_norm": 0.12082083523273468, + "learning_rate": 0.0004877914890948163, + "loss": 2.6363, + "step": 17291 + }, + { + "epoch": 0.5127657681700917, + "grad_norm": 0.11464525759220123, + "learning_rate": 0.00048774445221589507, + "loss": 2.6565, + "step": 17292 + }, + { + "epoch": 0.5127954215224031, + "grad_norm": 0.10010717064142227, + "learning_rate": 0.00048769741544549885, + "loss": 2.6253, + "step": 17293 + }, + { + "epoch": 0.5128250748747146, + "grad_norm": 0.11393657326698303, + "learning_rate": 0.0004876503787840441, + "loss": 2.6415, + "step": 17294 + }, + { + "epoch": 0.512854728227026, + "grad_norm": 0.11760997772216797, + "learning_rate": 0.0004876033422319474, + "loss": 2.637, + "step": 17295 + }, + { + "epoch": 0.5128843815793376, + "grad_norm": 0.1072036549448967, + "learning_rate": 0.00048755630578962517, + "loss": 2.6021, + "step": 17296 + }, + { + "epoch": 0.512914034931649, + "grad_norm": 0.115837462246418, + "learning_rate": 0.000487509269457494, + "loss": 2.6506, + "step": 17297 + }, + { + "epoch": 0.5129436882839605, + "grad_norm": 0.11465481668710709, + "learning_rate": 0.0004874622332359704, + "loss": 2.6643, + "step": 17298 + }, + { + "epoch": 0.512973341636272, + "grad_norm": 0.11464477330446243, + "learning_rate": 0.0004874151971254709, + "loss": 2.6658, + "step": 17299 + }, + { + "epoch": 0.5130029949885835, + "grad_norm": 0.1299496293067932, + "learning_rate": 0.000487368161126412, + "loss": 2.6484, + "step": 17300 + }, + { + "epoch": 0.5130326483408949, + "grad_norm": 0.15366412699222565, + "learning_rate": 0.0004873211252392103, + "loss": 2.6715, + "step": 17301 + }, + { + "epoch": 0.5130623016932064, + "grad_norm": 0.15727797150611877, + "learning_rate": 0.00048727408946428206, + "loss": 2.6412, + "step": 17302 + }, + { + "epoch": 0.5130919550455179, + "grad_norm": 0.14277131855487823, + "learning_rate": 0.000487227053802044, + "loss": 2.6235, + "step": 17303 + }, + { + "epoch": 0.5131216083978294, + "grad_norm": 0.1290512979030609, + "learning_rate": 0.00048718001825291256, + "loss": 2.6249, + "step": 17304 + }, + { + "epoch": 0.5131512617501408, + "grad_norm": 0.12804199755191803, + "learning_rate": 0.0004871329828173043, + "loss": 2.6451, + "step": 17305 + }, + { + "epoch": 0.5131809151024523, + "grad_norm": 0.12171803414821625, + "learning_rate": 0.0004870859474956356, + "loss": 2.6286, + "step": 17306 + }, + { + "epoch": 0.5132105684547638, + "grad_norm": 0.12209201604127884, + "learning_rate": 0.00048703891228832314, + "loss": 2.6471, + "step": 17307 + }, + { + "epoch": 0.5132402218070753, + "grad_norm": 0.10533411800861359, + "learning_rate": 0.0004869918771957834, + "loss": 2.6244, + "step": 17308 + }, + { + "epoch": 0.5132698751593868, + "grad_norm": 0.1404459923505783, + "learning_rate": 0.0004869448422184327, + "loss": 2.6835, + "step": 17309 + }, + { + "epoch": 0.5132995285116982, + "grad_norm": 0.1356470137834549, + "learning_rate": 0.0004868978073566875, + "loss": 2.6355, + "step": 17310 + }, + { + "epoch": 0.5133291818640098, + "grad_norm": 0.11417006701231003, + "learning_rate": 0.0004868507726109646, + "loss": 2.6123, + "step": 17311 + }, + { + "epoch": 0.5133588352163212, + "grad_norm": 0.1216389387845993, + "learning_rate": 0.00048680373798168055, + "loss": 2.6158, + "step": 17312 + }, + { + "epoch": 0.5133884885686327, + "grad_norm": 0.11734971404075623, + "learning_rate": 0.0004867567034692515, + "loss": 2.6505, + "step": 17313 + }, + { + "epoch": 0.5134181419209441, + "grad_norm": 0.2941901385784149, + "learning_rate": 0.00048670966907409413, + "loss": 2.6254, + "step": 17314 + }, + { + "epoch": 0.5134477952732557, + "grad_norm": 0.23924726247787476, + "learning_rate": 0.00048666263479662494, + "loss": 2.6494, + "step": 17315 + }, + { + "epoch": 0.5134774486255671, + "grad_norm": 0.2020246833562851, + "learning_rate": 0.0004866156006372604, + "loss": 2.6568, + "step": 17316 + }, + { + "epoch": 0.5135071019778786, + "grad_norm": 0.12187641113996506, + "learning_rate": 0.000486568566596417, + "loss": 2.678, + "step": 17317 + }, + { + "epoch": 0.51353675533019, + "grad_norm": 0.16973215341567993, + "learning_rate": 0.0004865215326745113, + "loss": 2.6806, + "step": 17318 + }, + { + "epoch": 0.5135664086825016, + "grad_norm": 0.16956256330013275, + "learning_rate": 0.0004864744988719598, + "loss": 2.6073, + "step": 17319 + }, + { + "epoch": 0.513596062034813, + "grad_norm": 0.13130557537078857, + "learning_rate": 0.0004864274651891788, + "loss": 2.618, + "step": 17320 + }, + { + "epoch": 0.5136257153871245, + "grad_norm": 0.1425095796585083, + "learning_rate": 0.00048638043162658497, + "loss": 2.6495, + "step": 17321 + }, + { + "epoch": 0.513655368739436, + "grad_norm": 0.12994246184825897, + "learning_rate": 0.0004863333981845947, + "loss": 2.6128, + "step": 17322 + }, + { + "epoch": 0.5136850220917475, + "grad_norm": 0.11881346255540848, + "learning_rate": 0.0004862863648636245, + "loss": 2.6435, + "step": 17323 + }, + { + "epoch": 0.5137146754440589, + "grad_norm": 0.13862068951129913, + "learning_rate": 0.00048623933166409096, + "loss": 2.6352, + "step": 17324 + }, + { + "epoch": 0.5137443287963704, + "grad_norm": 0.11299237608909607, + "learning_rate": 0.0004861922985864105, + "loss": 2.636, + "step": 17325 + }, + { + "epoch": 0.5137739821486819, + "grad_norm": 0.11711611598730087, + "learning_rate": 0.0004861452656309996, + "loss": 2.6441, + "step": 17326 + }, + { + "epoch": 0.5138036355009934, + "grad_norm": 0.12290842086076736, + "learning_rate": 0.0004860982327982748, + "loss": 2.6673, + "step": 17327 + }, + { + "epoch": 0.5138332888533048, + "grad_norm": 0.11597312241792679, + "learning_rate": 0.0004860512000886525, + "loss": 2.6483, + "step": 17328 + }, + { + "epoch": 0.5138629422056163, + "grad_norm": 0.1108851507306099, + "learning_rate": 0.0004860041675025493, + "loss": 2.6189, + "step": 17329 + }, + { + "epoch": 0.5138925955579279, + "grad_norm": 0.10163544118404388, + "learning_rate": 0.00048595713504038157, + "loss": 2.6795, + "step": 17330 + }, + { + "epoch": 0.5139222489102393, + "grad_norm": 0.11593718826770782, + "learning_rate": 0.00048591010270256573, + "loss": 2.6319, + "step": 17331 + }, + { + "epoch": 0.5139519022625508, + "grad_norm": 0.108201764523983, + "learning_rate": 0.0004858630704895184, + "loss": 2.6395, + "step": 17332 + }, + { + "epoch": 0.5139815556148623, + "grad_norm": 0.1146807074546814, + "learning_rate": 0.000485816038401656, + "loss": 2.6424, + "step": 17333 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 0.11909318715333939, + "learning_rate": 0.000485769006439395, + "loss": 2.6132, + "step": 17334 + }, + { + "epoch": 0.5140408623194852, + "grad_norm": 0.11028913408517838, + "learning_rate": 0.0004857219746031519, + "loss": 2.633, + "step": 17335 + }, + { + "epoch": 0.5140705156717967, + "grad_norm": 0.11179937422275543, + "learning_rate": 0.0004856749428933431, + "loss": 2.6597, + "step": 17336 + }, + { + "epoch": 0.5141001690241082, + "grad_norm": 0.1038542091846466, + "learning_rate": 0.0004856279113103852, + "loss": 2.6716, + "step": 17337 + }, + { + "epoch": 0.5141298223764197, + "grad_norm": 0.11974523216485977, + "learning_rate": 0.00048558087985469463, + "loss": 2.6332, + "step": 17338 + }, + { + "epoch": 0.5141594757287311, + "grad_norm": 0.10715017467737198, + "learning_rate": 0.00048553384852668784, + "loss": 2.6054, + "step": 17339 + }, + { + "epoch": 0.5141891290810426, + "grad_norm": 0.10937076061964035, + "learning_rate": 0.00048548681732678143, + "loss": 2.6409, + "step": 17340 + }, + { + "epoch": 0.5142187824333541, + "grad_norm": 0.10953191667795181, + "learning_rate": 0.0004854397862553916, + "loss": 2.6514, + "step": 17341 + }, + { + "epoch": 0.5142484357856656, + "grad_norm": 0.11165981739759445, + "learning_rate": 0.000485392755312935, + "loss": 2.6554, + "step": 17342 + }, + { + "epoch": 0.514278089137977, + "grad_norm": 0.10070677101612091, + "learning_rate": 0.000485345724499828, + "loss": 2.6559, + "step": 17343 + }, + { + "epoch": 0.5143077424902885, + "grad_norm": 0.0971943736076355, + "learning_rate": 0.0004852986938164872, + "loss": 2.6394, + "step": 17344 + }, + { + "epoch": 0.5143373958426, + "grad_norm": 0.08979930728673935, + "learning_rate": 0.00048525166326332894, + "loss": 2.6289, + "step": 17345 + }, + { + "epoch": 0.5143670491949115, + "grad_norm": 0.10746315866708755, + "learning_rate": 0.00048520463284076984, + "loss": 2.6421, + "step": 17346 + }, + { + "epoch": 0.5143967025472229, + "grad_norm": 0.1139853373169899, + "learning_rate": 0.0004851576025492261, + "loss": 2.6829, + "step": 17347 + }, + { + "epoch": 0.5144263558995344, + "grad_norm": 0.11718464642763138, + "learning_rate": 0.00048511057238911443, + "loss": 2.64, + "step": 17348 + }, + { + "epoch": 0.5144560092518459, + "grad_norm": 0.09774130582809448, + "learning_rate": 0.00048506354236085093, + "loss": 2.6493, + "step": 17349 + }, + { + "epoch": 0.5144856626041574, + "grad_norm": 0.1027158796787262, + "learning_rate": 0.0004850165124648527, + "loss": 2.6306, + "step": 17350 + }, + { + "epoch": 0.5145153159564689, + "grad_norm": 0.11429992318153381, + "learning_rate": 0.00048496948270153567, + "loss": 2.6528, + "step": 17351 + }, + { + "epoch": 0.5145449693087804, + "grad_norm": 0.11685990542173386, + "learning_rate": 0.00048492245307131646, + "loss": 2.6557, + "step": 17352 + }, + { + "epoch": 0.5145746226610919, + "grad_norm": 0.1338595449924469, + "learning_rate": 0.0004848754235746115, + "loss": 2.644, + "step": 17353 + }, + { + "epoch": 0.5146042760134033, + "grad_norm": 0.10341611504554749, + "learning_rate": 0.0004848283942118373, + "loss": 2.6723, + "step": 17354 + }, + { + "epoch": 0.5146339293657148, + "grad_norm": 0.10014057904481888, + "learning_rate": 0.00048478136498341024, + "loss": 2.6175, + "step": 17355 + }, + { + "epoch": 0.5146635827180263, + "grad_norm": 0.12495765089988708, + "learning_rate": 0.0004847343358897468, + "loss": 2.6453, + "step": 17356 + }, + { + "epoch": 0.5146932360703378, + "grad_norm": 0.12621892988681793, + "learning_rate": 0.0004846873069312636, + "loss": 2.6608, + "step": 17357 + }, + { + "epoch": 0.5147228894226492, + "grad_norm": 0.12200133502483368, + "learning_rate": 0.00048464027810837676, + "loss": 2.6511, + "step": 17358 + }, + { + "epoch": 0.5147525427749607, + "grad_norm": 0.13076074421405792, + "learning_rate": 0.0004845932494215029, + "loss": 2.6121, + "step": 17359 + }, + { + "epoch": 0.5147821961272722, + "grad_norm": 0.09734684228897095, + "learning_rate": 0.0004845462208710585, + "loss": 2.6353, + "step": 17360 + }, + { + "epoch": 0.5148118494795837, + "grad_norm": 0.11536731570959091, + "learning_rate": 0.00048449919245745996, + "loss": 2.6405, + "step": 17361 + }, + { + "epoch": 0.5148415028318951, + "grad_norm": 0.13153712451457977, + "learning_rate": 0.0004844521641811236, + "loss": 2.6415, + "step": 17362 + }, + { + "epoch": 0.5148711561842066, + "grad_norm": 0.10811718553304672, + "learning_rate": 0.00048440513604246606, + "loss": 2.6604, + "step": 17363 + }, + { + "epoch": 0.5149008095365181, + "grad_norm": 0.1236269399523735, + "learning_rate": 0.00048435810804190377, + "loss": 2.6674, + "step": 17364 + }, + { + "epoch": 0.5149304628888296, + "grad_norm": 0.13691304624080658, + "learning_rate": 0.0004843110801798531, + "loss": 2.6691, + "step": 17365 + }, + { + "epoch": 0.514960116241141, + "grad_norm": 0.11908422410488129, + "learning_rate": 0.00048426405245673057, + "loss": 2.6035, + "step": 17366 + }, + { + "epoch": 0.5149897695934526, + "grad_norm": 0.09935563802719116, + "learning_rate": 0.0004842170248729526, + "loss": 2.6575, + "step": 17367 + }, + { + "epoch": 0.515019422945764, + "grad_norm": 0.10122629255056381, + "learning_rate": 0.00048416999742893546, + "loss": 2.6701, + "step": 17368 + }, + { + "epoch": 0.5150490762980755, + "grad_norm": 0.12815718352794647, + "learning_rate": 0.0004841229701250958, + "loss": 2.6554, + "step": 17369 + }, + { + "epoch": 0.5150787296503869, + "grad_norm": 0.13380049169063568, + "learning_rate": 0.00048407594296184987, + "loss": 2.6249, + "step": 17370 + }, + { + "epoch": 0.5151083830026985, + "grad_norm": 0.11446920782327652, + "learning_rate": 0.00048402891593961426, + "loss": 2.6419, + "step": 17371 + }, + { + "epoch": 0.51513803635501, + "grad_norm": 0.1186923012137413, + "learning_rate": 0.0004839818890588053, + "loss": 2.64, + "step": 17372 + }, + { + "epoch": 0.5151676897073214, + "grad_norm": 0.13044241070747375, + "learning_rate": 0.00048393486231983944, + "loss": 2.6228, + "step": 17373 + }, + { + "epoch": 0.5151973430596329, + "grad_norm": 0.1339295357465744, + "learning_rate": 0.0004838878357231333, + "loss": 2.6439, + "step": 17374 + }, + { + "epoch": 0.5152269964119444, + "grad_norm": 0.10975156724452972, + "learning_rate": 0.00048384080926910277, + "loss": 2.6618, + "step": 17375 + }, + { + "epoch": 0.5152566497642559, + "grad_norm": 0.12027100473642349, + "learning_rate": 0.0004837937829581649, + "loss": 2.649, + "step": 17376 + }, + { + "epoch": 0.5152863031165673, + "grad_norm": 0.12570419907569885, + "learning_rate": 0.00048374675679073583, + "loss": 2.5842, + "step": 17377 + }, + { + "epoch": 0.5153159564688788, + "grad_norm": 0.11339129507541656, + "learning_rate": 0.0004836997307672322, + "loss": 2.6565, + "step": 17378 + }, + { + "epoch": 0.5153456098211903, + "grad_norm": 0.11043508350849152, + "learning_rate": 0.00048365270488807006, + "loss": 2.6683, + "step": 17379 + }, + { + "epoch": 0.5153752631735018, + "grad_norm": 0.11311638355255127, + "learning_rate": 0.00048360567915366605, + "loss": 2.6076, + "step": 17380 + }, + { + "epoch": 0.5154049165258132, + "grad_norm": 0.14199736714363098, + "learning_rate": 0.00048355865356443655, + "loss": 2.6714, + "step": 17381 + }, + { + "epoch": 0.5154345698781247, + "grad_norm": 0.14205460250377655, + "learning_rate": 0.000483511628120798, + "loss": 2.6359, + "step": 17382 + }, + { + "epoch": 0.5154642232304362, + "grad_norm": 0.12298295646905899, + "learning_rate": 0.00048346460282316684, + "loss": 2.6448, + "step": 17383 + }, + { + "epoch": 0.5154938765827477, + "grad_norm": 0.13046370446681976, + "learning_rate": 0.0004834175776719596, + "loss": 2.6467, + "step": 17384 + }, + { + "epoch": 0.5155235299350591, + "grad_norm": 0.12859390676021576, + "learning_rate": 0.0004833705526675924, + "loss": 2.6527, + "step": 17385 + }, + { + "epoch": 0.5155531832873707, + "grad_norm": 0.10912932455539703, + "learning_rate": 0.00048332352781048176, + "loss": 2.6521, + "step": 17386 + }, + { + "epoch": 0.5155828366396821, + "grad_norm": 0.09737104177474976, + "learning_rate": 0.0004832765031010442, + "loss": 2.6187, + "step": 17387 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 0.12545105814933777, + "learning_rate": 0.000483229478539696, + "loss": 2.6498, + "step": 17388 + }, + { + "epoch": 0.515642143344305, + "grad_norm": 0.1458413451910019, + "learning_rate": 0.0004831824541268537, + "loss": 2.6748, + "step": 17389 + }, + { + "epoch": 0.5156717966966166, + "grad_norm": 0.14765159785747528, + "learning_rate": 0.0004831354298629337, + "loss": 2.6853, + "step": 17390 + }, + { + "epoch": 0.515701450048928, + "grad_norm": 0.1325836032629013, + "learning_rate": 0.00048308840574835235, + "loss": 2.6272, + "step": 17391 + }, + { + "epoch": 0.5157311034012395, + "grad_norm": 0.11980956792831421, + "learning_rate": 0.00048304138178352604, + "loss": 2.628, + "step": 17392 + }, + { + "epoch": 0.515760756753551, + "grad_norm": 0.11876930296421051, + "learning_rate": 0.0004829943579688712, + "loss": 2.6294, + "step": 17393 + }, + { + "epoch": 0.5157904101058625, + "grad_norm": 0.10931480675935745, + "learning_rate": 0.00048294733430480433, + "loss": 2.6275, + "step": 17394 + }, + { + "epoch": 0.515820063458174, + "grad_norm": 0.1169353798031807, + "learning_rate": 0.00048290031079174176, + "loss": 2.6293, + "step": 17395 + }, + { + "epoch": 0.5158497168104854, + "grad_norm": 0.10391870886087418, + "learning_rate": 0.0004828532874300998, + "loss": 2.6319, + "step": 17396 + }, + { + "epoch": 0.515879370162797, + "grad_norm": 0.10872507095336914, + "learning_rate": 0.0004828062642202949, + "loss": 2.6216, + "step": 17397 + }, + { + "epoch": 0.5159090235151084, + "grad_norm": 0.11350736767053604, + "learning_rate": 0.00048275924116274345, + "loss": 2.6394, + "step": 17398 + }, + { + "epoch": 0.5159386768674199, + "grad_norm": 0.11297476291656494, + "learning_rate": 0.0004827122182578619, + "loss": 2.6173, + "step": 17399 + }, + { + "epoch": 0.5159683302197313, + "grad_norm": 0.13362784683704376, + "learning_rate": 0.0004826651955060667, + "loss": 2.6562, + "step": 17400 + }, + { + "epoch": 0.5159979835720429, + "grad_norm": 0.11779244244098663, + "learning_rate": 0.000482618172907774, + "loss": 2.6698, + "step": 17401 + }, + { + "epoch": 0.5160276369243543, + "grad_norm": 0.11646579951047897, + "learning_rate": 0.0004825711504634004, + "loss": 2.6434, + "step": 17402 + }, + { + "epoch": 0.5160572902766658, + "grad_norm": 0.13070248067378998, + "learning_rate": 0.0004825241281733624, + "loss": 2.6324, + "step": 17403 + }, + { + "epoch": 0.5160869436289772, + "grad_norm": 0.13371746242046356, + "learning_rate": 0.00048247710603807614, + "loss": 2.6274, + "step": 17404 + }, + { + "epoch": 0.5161165969812888, + "grad_norm": 0.11867804080247879, + "learning_rate": 0.0004824300840579583, + "loss": 2.6471, + "step": 17405 + }, + { + "epoch": 0.5161462503336002, + "grad_norm": 0.1247006505727768, + "learning_rate": 0.0004823830622334249, + "loss": 2.6305, + "step": 17406 + }, + { + "epoch": 0.5161759036859117, + "grad_norm": 0.11923964321613312, + "learning_rate": 0.0004823360405648926, + "loss": 2.6163, + "step": 17407 + }, + { + "epoch": 0.5162055570382231, + "grad_norm": 0.14413218200206757, + "learning_rate": 0.0004822890190527776, + "loss": 2.6513, + "step": 17408 + }, + { + "epoch": 0.5162352103905347, + "grad_norm": 0.1458827257156372, + "learning_rate": 0.00048224199769749646, + "loss": 2.6009, + "step": 17409 + }, + { + "epoch": 0.5162648637428461, + "grad_norm": 0.15701709687709808, + "learning_rate": 0.00048219497649946535, + "loss": 2.6066, + "step": 17410 + }, + { + "epoch": 0.5162945170951576, + "grad_norm": 0.14158475399017334, + "learning_rate": 0.0004821479554591009, + "loss": 2.6635, + "step": 17411 + }, + { + "epoch": 0.516324170447469, + "grad_norm": 0.13162578642368317, + "learning_rate": 0.0004821009345768194, + "loss": 2.6319, + "step": 17412 + }, + { + "epoch": 0.5163538237997806, + "grad_norm": 0.11917340755462646, + "learning_rate": 0.00048205391385303695, + "loss": 2.6449, + "step": 17413 + }, + { + "epoch": 0.5163834771520921, + "grad_norm": 0.12750937044620514, + "learning_rate": 0.0004820068932881703, + "loss": 2.6492, + "step": 17414 + }, + { + "epoch": 0.5164131305044035, + "grad_norm": 0.13525471091270447, + "learning_rate": 0.00048195987288263576, + "loss": 2.6186, + "step": 17415 + }, + { + "epoch": 0.516442783856715, + "grad_norm": 0.1275801807641983, + "learning_rate": 0.00048191285263684977, + "loss": 2.6766, + "step": 17416 + }, + { + "epoch": 0.5164724372090265, + "grad_norm": 0.11610673367977142, + "learning_rate": 0.0004818658325512284, + "loss": 2.6237, + "step": 17417 + }, + { + "epoch": 0.516502090561338, + "grad_norm": 0.11715587228536606, + "learning_rate": 0.0004818188126261882, + "loss": 2.6253, + "step": 17418 + }, + { + "epoch": 0.5165317439136494, + "grad_norm": 0.11589358747005463, + "learning_rate": 0.00048177179286214555, + "loss": 2.6237, + "step": 17419 + }, + { + "epoch": 0.516561397265961, + "grad_norm": 0.12142173945903778, + "learning_rate": 0.00048172477325951685, + "loss": 2.6671, + "step": 17420 + }, + { + "epoch": 0.5165910506182724, + "grad_norm": 0.09618199616670609, + "learning_rate": 0.00048167775381871835, + "loss": 2.6306, + "step": 17421 + }, + { + "epoch": 0.5166207039705839, + "grad_norm": 0.10943622142076492, + "learning_rate": 0.00048163073454016663, + "loss": 2.6458, + "step": 17422 + }, + { + "epoch": 0.5166503573228953, + "grad_norm": 0.10330035537481308, + "learning_rate": 0.0004815837154242778, + "loss": 2.6439, + "step": 17423 + }, + { + "epoch": 0.5166800106752069, + "grad_norm": 0.10972415655851364, + "learning_rate": 0.00048153669647146835, + "loss": 2.5927, + "step": 17424 + }, + { + "epoch": 0.5167096640275183, + "grad_norm": 0.11372561752796173, + "learning_rate": 0.0004814896776821546, + "loss": 2.6584, + "step": 17425 + }, + { + "epoch": 0.5167393173798298, + "grad_norm": 0.11340635269880295, + "learning_rate": 0.0004814426590567528, + "loss": 2.657, + "step": 17426 + }, + { + "epoch": 0.5167689707321412, + "grad_norm": 0.10094472020864487, + "learning_rate": 0.0004813956405956796, + "loss": 2.6587, + "step": 17427 + }, + { + "epoch": 0.5167986240844528, + "grad_norm": 0.10450959205627441, + "learning_rate": 0.00048134862229935114, + "loss": 2.6215, + "step": 17428 + }, + { + "epoch": 0.5168282774367642, + "grad_norm": 0.12416616082191467, + "learning_rate": 0.00048130160416818386, + "loss": 2.6372, + "step": 17429 + }, + { + "epoch": 0.5168579307890757, + "grad_norm": 0.13298696279525757, + "learning_rate": 0.0004812545862025941, + "loss": 2.6719, + "step": 17430 + }, + { + "epoch": 0.5168875841413871, + "grad_norm": 0.1236528530716896, + "learning_rate": 0.00048120756840299816, + "loss": 2.6621, + "step": 17431 + }, + { + "epoch": 0.5169172374936987, + "grad_norm": 0.1260737031698227, + "learning_rate": 0.00048116055076981247, + "loss": 2.6394, + "step": 17432 + }, + { + "epoch": 0.5169468908460101, + "grad_norm": 0.1122315376996994, + "learning_rate": 0.0004811135333034534, + "loss": 2.6117, + "step": 17433 + }, + { + "epoch": 0.5169765441983216, + "grad_norm": 0.11610116809606552, + "learning_rate": 0.0004810665160043372, + "loss": 2.6597, + "step": 17434 + }, + { + "epoch": 0.5170061975506332, + "grad_norm": 0.12659263610839844, + "learning_rate": 0.00048101949887288014, + "loss": 2.6506, + "step": 17435 + }, + { + "epoch": 0.5170358509029446, + "grad_norm": 0.146744504570961, + "learning_rate": 0.00048097248190949877, + "loss": 2.6154, + "step": 17436 + }, + { + "epoch": 0.5170655042552561, + "grad_norm": 0.12339767068624496, + "learning_rate": 0.00048092546511460926, + "loss": 2.6443, + "step": 17437 + }, + { + "epoch": 0.5170951576075675, + "grad_norm": 0.11644066125154495, + "learning_rate": 0.0004808784484886281, + "loss": 2.6616, + "step": 17438 + }, + { + "epoch": 0.517124810959879, + "grad_norm": 0.12796053290367126, + "learning_rate": 0.00048083143203197143, + "loss": 2.6525, + "step": 17439 + }, + { + "epoch": 0.5171544643121905, + "grad_norm": 0.13267754018306732, + "learning_rate": 0.00048078441574505584, + "loss": 2.6201, + "step": 17440 + }, + { + "epoch": 0.517184117664502, + "grad_norm": 0.12611469626426697, + "learning_rate": 0.0004807373996282975, + "loss": 2.6493, + "step": 17441 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 0.14337123930454254, + "learning_rate": 0.0004806903836821128, + "loss": 2.6407, + "step": 17442 + }, + { + "epoch": 0.517243424369125, + "grad_norm": 0.14105091989040375, + "learning_rate": 0.0004806433679069182, + "loss": 2.6708, + "step": 17443 + }, + { + "epoch": 0.5172730777214364, + "grad_norm": 0.11929915845394135, + "learning_rate": 0.00048059635230312983, + "loss": 2.6613, + "step": 17444 + }, + { + "epoch": 0.5173027310737479, + "grad_norm": 0.10890960693359375, + "learning_rate": 0.00048054933687116403, + "loss": 2.6704, + "step": 17445 + }, + { + "epoch": 0.5173323844260593, + "grad_norm": 0.11944423615932465, + "learning_rate": 0.0004805023216114372, + "loss": 2.6442, + "step": 17446 + }, + { + "epoch": 0.5173620377783709, + "grad_norm": 0.11081281304359436, + "learning_rate": 0.0004804553065243657, + "loss": 2.6113, + "step": 17447 + }, + { + "epoch": 0.5173916911306823, + "grad_norm": 0.11026380956172943, + "learning_rate": 0.0004804082916103658, + "loss": 2.635, + "step": 17448 + }, + { + "epoch": 0.5174213444829938, + "grad_norm": 0.11356239765882492, + "learning_rate": 0.0004803612768698538, + "loss": 2.645, + "step": 17449 + }, + { + "epoch": 0.5174509978353052, + "grad_norm": 0.1010952815413475, + "learning_rate": 0.0004803142623032462, + "loss": 2.6468, + "step": 17450 + }, + { + "epoch": 0.5174806511876168, + "grad_norm": 0.10647127032279968, + "learning_rate": 0.0004802672479109591, + "loss": 2.6613, + "step": 17451 + }, + { + "epoch": 0.5175103045399282, + "grad_norm": 0.10880040377378464, + "learning_rate": 0.0004802202336934086, + "loss": 2.6328, + "step": 17452 + }, + { + "epoch": 0.5175399578922397, + "grad_norm": 0.10659405589103699, + "learning_rate": 0.00048017321965101165, + "loss": 2.6229, + "step": 17453 + }, + { + "epoch": 0.5175696112445511, + "grad_norm": 0.11508381366729736, + "learning_rate": 0.00048012620578418433, + "loss": 2.6042, + "step": 17454 + }, + { + "epoch": 0.5175992645968627, + "grad_norm": 0.13001003861427307, + "learning_rate": 0.0004800791920933427, + "loss": 2.643, + "step": 17455 + }, + { + "epoch": 0.5176289179491742, + "grad_norm": 0.14081157743930817, + "learning_rate": 0.0004800321785789033, + "loss": 2.6218, + "step": 17456 + }, + { + "epoch": 0.5176585713014856, + "grad_norm": 0.1315857619047165, + "learning_rate": 0.0004799851652412824, + "loss": 2.6245, + "step": 17457 + }, + { + "epoch": 0.5176882246537972, + "grad_norm": 0.11185820400714874, + "learning_rate": 0.00047993815208089623, + "loss": 2.6149, + "step": 17458 + }, + { + "epoch": 0.5177178780061086, + "grad_norm": 0.12157882004976273, + "learning_rate": 0.00047989113909816124, + "loss": 2.6606, + "step": 17459 + }, + { + "epoch": 0.5177475313584201, + "grad_norm": 0.1226380467414856, + "learning_rate": 0.0004798441262934937, + "loss": 2.6504, + "step": 17460 + }, + { + "epoch": 0.5177771847107315, + "grad_norm": 0.10859988629817963, + "learning_rate": 0.00047979711366730985, + "loss": 2.6446, + "step": 17461 + }, + { + "epoch": 0.5178068380630431, + "grad_norm": 0.09554794430732727, + "learning_rate": 0.000479750101220026, + "loss": 2.6534, + "step": 17462 + }, + { + "epoch": 0.5178364914153545, + "grad_norm": 0.1124674454331398, + "learning_rate": 0.00047970308895205844, + "loss": 2.6428, + "step": 17463 + }, + { + "epoch": 0.517866144767666, + "grad_norm": 0.14183804392814636, + "learning_rate": 0.0004796560768638236, + "loss": 2.691, + "step": 17464 + }, + { + "epoch": 0.5178957981199774, + "grad_norm": 0.12086307257413864, + "learning_rate": 0.0004796090649557375, + "loss": 2.6521, + "step": 17465 + }, + { + "epoch": 0.517925451472289, + "grad_norm": 0.09816930443048477, + "learning_rate": 0.0004795620532282167, + "loss": 2.6557, + "step": 17466 + }, + { + "epoch": 0.5179551048246004, + "grad_norm": 0.13453708589076996, + "learning_rate": 0.00047951504168167755, + "loss": 2.66, + "step": 17467 + }, + { + "epoch": 0.5179847581769119, + "grad_norm": 0.14007186889648438, + "learning_rate": 0.0004794680303165361, + "loss": 2.6547, + "step": 17468 + }, + { + "epoch": 0.5180144115292233, + "grad_norm": 0.11098934710025787, + "learning_rate": 0.00047942101913320885, + "loss": 2.6412, + "step": 17469 + }, + { + "epoch": 0.5180440648815349, + "grad_norm": 0.11318878084421158, + "learning_rate": 0.000479374008132112, + "loss": 2.6539, + "step": 17470 + }, + { + "epoch": 0.5180737182338463, + "grad_norm": 0.11897851526737213, + "learning_rate": 0.0004793269973136619, + "loss": 2.672, + "step": 17471 + }, + { + "epoch": 0.5181033715861578, + "grad_norm": 0.13012255728244781, + "learning_rate": 0.0004792799866782747, + "loss": 2.6634, + "step": 17472 + }, + { + "epoch": 0.5181330249384692, + "grad_norm": 0.1342669278383255, + "learning_rate": 0.00047923297622636684, + "loss": 2.623, + "step": 17473 + }, + { + "epoch": 0.5181626782907808, + "grad_norm": 0.14511795341968536, + "learning_rate": 0.0004791859659583545, + "loss": 2.6444, + "step": 17474 + }, + { + "epoch": 0.5181923316430922, + "grad_norm": 0.13362137973308563, + "learning_rate": 0.000479138955874654, + "loss": 2.6634, + "step": 17475 + }, + { + "epoch": 0.5182219849954037, + "grad_norm": 0.13564994931221008, + "learning_rate": 0.00047909194597568164, + "loss": 2.664, + "step": 17476 + }, + { + "epoch": 0.5182516383477153, + "grad_norm": 0.12094253301620483, + "learning_rate": 0.0004790449362618538, + "loss": 2.635, + "step": 17477 + }, + { + "epoch": 0.5182812917000267, + "grad_norm": 0.12198368459939957, + "learning_rate": 0.00047899792673358624, + "loss": 2.6711, + "step": 17478 + }, + { + "epoch": 0.5183109450523382, + "grad_norm": 0.12183486670255661, + "learning_rate": 0.0004789509173912959, + "loss": 2.638, + "step": 17479 + }, + { + "epoch": 0.5183405984046496, + "grad_norm": 0.1259600669145584, + "learning_rate": 0.00047890390823539887, + "loss": 2.6528, + "step": 17480 + }, + { + "epoch": 0.5183702517569612, + "grad_norm": 0.10921788960695267, + "learning_rate": 0.00047885689926631137, + "loss": 2.6558, + "step": 17481 + }, + { + "epoch": 0.5183999051092726, + "grad_norm": 0.11224652826786041, + "learning_rate": 0.0004788098904844496, + "loss": 2.6658, + "step": 17482 + }, + { + "epoch": 0.5184295584615841, + "grad_norm": 0.11715079098939896, + "learning_rate": 0.0004787628818902299, + "loss": 2.6453, + "step": 17483 + }, + { + "epoch": 0.5184592118138955, + "grad_norm": 0.12006713449954987, + "learning_rate": 0.0004787158734840685, + "loss": 2.6518, + "step": 17484 + }, + { + "epoch": 0.5184888651662071, + "grad_norm": 0.10826195031404495, + "learning_rate": 0.00047866886526638173, + "loss": 2.665, + "step": 17485 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.1206730529665947, + "learning_rate": 0.0004786218572375857, + "loss": 2.6465, + "step": 17486 + }, + { + "epoch": 0.51854817187083, + "grad_norm": 0.11704691499471664, + "learning_rate": 0.00047857484939809695, + "loss": 2.6626, + "step": 17487 + }, + { + "epoch": 0.5185778252231414, + "grad_norm": 0.12159210443496704, + "learning_rate": 0.0004785278417483316, + "loss": 2.5942, + "step": 17488 + }, + { + "epoch": 0.518607478575453, + "grad_norm": 0.12944412231445312, + "learning_rate": 0.0004784808342887058, + "loss": 2.6507, + "step": 17489 + }, + { + "epoch": 0.5186371319277644, + "grad_norm": 0.12157995253801346, + "learning_rate": 0.00047843382701963594, + "loss": 2.6625, + "step": 17490 + }, + { + "epoch": 0.5186667852800759, + "grad_norm": 0.10546798259019852, + "learning_rate": 0.000478386819941538, + "loss": 2.6492, + "step": 17491 + }, + { + "epoch": 0.5186964386323873, + "grad_norm": 0.11274387687444687, + "learning_rate": 0.0004783398130548288, + "loss": 2.6382, + "step": 17492 + }, + { + "epoch": 0.5187260919846989, + "grad_norm": 0.09760725498199463, + "learning_rate": 0.00047829280635992415, + "loss": 2.6377, + "step": 17493 + }, + { + "epoch": 0.5187557453370103, + "grad_norm": 0.11444874852895737, + "learning_rate": 0.00047824579985724047, + "loss": 2.6171, + "step": 17494 + }, + { + "epoch": 0.5187853986893218, + "grad_norm": 0.11242570728063583, + "learning_rate": 0.000478198793547194, + "loss": 2.6289, + "step": 17495 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 0.11212391406297684, + "learning_rate": 0.00047815178743020086, + "loss": 2.6497, + "step": 17496 + }, + { + "epoch": 0.5188447053939448, + "grad_norm": 0.1140584647655487, + "learning_rate": 0.00047810478150667743, + "loss": 2.6539, + "step": 17497 + }, + { + "epoch": 0.5188743587462563, + "grad_norm": 0.10366790741682053, + "learning_rate": 0.00047805777577704004, + "loss": 2.6564, + "step": 17498 + }, + { + "epoch": 0.5189040120985677, + "grad_norm": 0.10433302819728851, + "learning_rate": 0.0004780107702417047, + "loss": 2.6852, + "step": 17499 + }, + { + "epoch": 0.5189336654508793, + "grad_norm": 0.10811221599578857, + "learning_rate": 0.0004779637649010877, + "loss": 2.6322, + "step": 17500 + }, + { + "epoch": 0.5189633188031907, + "grad_norm": 0.09808991104364395, + "learning_rate": 0.00047791675975560543, + "loss": 2.622, + "step": 17501 + }, + { + "epoch": 0.5189929721555022, + "grad_norm": 0.10785866528749466, + "learning_rate": 0.00047786975480567403, + "loss": 2.6429, + "step": 17502 + }, + { + "epoch": 0.5190226255078136, + "grad_norm": 0.12117651849985123, + "learning_rate": 0.00047782275005170974, + "loss": 2.6562, + "step": 17503 + }, + { + "epoch": 0.5190522788601252, + "grad_norm": 0.11598886549472809, + "learning_rate": 0.0004777757454941287, + "loss": 2.6459, + "step": 17504 + }, + { + "epoch": 0.5190819322124366, + "grad_norm": 0.10852615535259247, + "learning_rate": 0.00047772874113334736, + "loss": 2.6284, + "step": 17505 + }, + { + "epoch": 0.5191115855647481, + "grad_norm": 0.09701612591743469, + "learning_rate": 0.0004776817369697818, + "loss": 2.5974, + "step": 17506 + }, + { + "epoch": 0.5191412389170595, + "grad_norm": 0.11340314149856567, + "learning_rate": 0.0004776347330038484, + "loss": 2.638, + "step": 17507 + }, + { + "epoch": 0.5191708922693711, + "grad_norm": 0.12205333262681961, + "learning_rate": 0.0004775877292359631, + "loss": 2.6699, + "step": 17508 + }, + { + "epoch": 0.5192005456216825, + "grad_norm": 0.11813914030790329, + "learning_rate": 0.0004775407256665426, + "loss": 2.6504, + "step": 17509 + }, + { + "epoch": 0.519230198973994, + "grad_norm": 0.10547875612974167, + "learning_rate": 0.0004774937222960026, + "loss": 2.6056, + "step": 17510 + }, + { + "epoch": 0.5192598523263054, + "grad_norm": 0.1076701208949089, + "learning_rate": 0.00047744671912475955, + "loss": 2.6512, + "step": 17511 + }, + { + "epoch": 0.519289505678617, + "grad_norm": 0.11997652798891068, + "learning_rate": 0.0004773997161532297, + "loss": 2.6639, + "step": 17512 + }, + { + "epoch": 0.5193191590309284, + "grad_norm": 0.11642984300851822, + "learning_rate": 0.0004773527133818293, + "loss": 2.6864, + "step": 17513 + }, + { + "epoch": 0.5193488123832399, + "grad_norm": 0.11972708255052567, + "learning_rate": 0.0004773057108109744, + "loss": 2.6432, + "step": 17514 + }, + { + "epoch": 0.5193784657355514, + "grad_norm": 0.11551132053136826, + "learning_rate": 0.00047725870844108156, + "loss": 2.6525, + "step": 17515 + }, + { + "epoch": 0.5194081190878629, + "grad_norm": 0.09382503479719162, + "learning_rate": 0.00047721170627256654, + "loss": 2.6653, + "step": 17516 + }, + { + "epoch": 0.5194377724401744, + "grad_norm": 0.11038144677877426, + "learning_rate": 0.0004771647043058456, + "loss": 2.6528, + "step": 17517 + }, + { + "epoch": 0.5194674257924858, + "grad_norm": 0.1177389919757843, + "learning_rate": 0.0004771177025413354, + "loss": 2.6283, + "step": 17518 + }, + { + "epoch": 0.5194970791447974, + "grad_norm": 0.12356976419687271, + "learning_rate": 0.00047707070097945185, + "loss": 2.6367, + "step": 17519 + }, + { + "epoch": 0.5195267324971088, + "grad_norm": 0.12129712849855423, + "learning_rate": 0.00047702369962061115, + "loss": 2.6334, + "step": 17520 + }, + { + "epoch": 0.5195563858494203, + "grad_norm": 0.10588964074850082, + "learning_rate": 0.00047697669846522955, + "loss": 2.6389, + "step": 17521 + }, + { + "epoch": 0.5195860392017317, + "grad_norm": 0.09516366571187973, + "learning_rate": 0.0004769296975137232, + "loss": 2.6356, + "step": 17522 + }, + { + "epoch": 0.5196156925540433, + "grad_norm": 0.09476576745510101, + "learning_rate": 0.00047688269676650835, + "loss": 2.6061, + "step": 17523 + }, + { + "epoch": 0.5196453459063547, + "grad_norm": 0.11139179766178131, + "learning_rate": 0.0004768356962240012, + "loss": 2.6799, + "step": 17524 + }, + { + "epoch": 0.5196749992586662, + "grad_norm": 0.1067991703748703, + "learning_rate": 0.00047678869588661793, + "loss": 2.6532, + "step": 17525 + }, + { + "epoch": 0.5197046526109776, + "grad_norm": 0.10536666959524155, + "learning_rate": 0.00047674169575477485, + "loss": 2.6579, + "step": 17526 + }, + { + "epoch": 0.5197343059632892, + "grad_norm": 0.109351746737957, + "learning_rate": 0.0004766946958288879, + "loss": 2.6165, + "step": 17527 + }, + { + "epoch": 0.5197639593156006, + "grad_norm": 0.10917574912309647, + "learning_rate": 0.00047664769610937345, + "loss": 2.6603, + "step": 17528 + }, + { + "epoch": 0.5197936126679121, + "grad_norm": 0.13203878700733185, + "learning_rate": 0.0004766006965966477, + "loss": 2.6103, + "step": 17529 + }, + { + "epoch": 0.5198232660202236, + "grad_norm": 0.14503033459186554, + "learning_rate": 0.00047655369729112664, + "loss": 2.6319, + "step": 17530 + }, + { + "epoch": 0.5198529193725351, + "grad_norm": 0.11628742516040802, + "learning_rate": 0.0004765066981932267, + "loss": 2.633, + "step": 17531 + }, + { + "epoch": 0.5198825727248465, + "grad_norm": 0.11159618943929672, + "learning_rate": 0.000476459699303364, + "loss": 2.6326, + "step": 17532 + }, + { + "epoch": 0.519912226077158, + "grad_norm": 0.10733210295438766, + "learning_rate": 0.0004764127006219547, + "loss": 2.6538, + "step": 17533 + }, + { + "epoch": 0.5199418794294695, + "grad_norm": 0.11430779099464417, + "learning_rate": 0.000476365702149415, + "loss": 2.641, + "step": 17534 + }, + { + "epoch": 0.519971532781781, + "grad_norm": 0.12024704366922379, + "learning_rate": 0.00047631870388616115, + "loss": 2.6128, + "step": 17535 + }, + { + "epoch": 0.5200011861340924, + "grad_norm": 0.1037059873342514, + "learning_rate": 0.0004762717058326093, + "loss": 2.6333, + "step": 17536 + }, + { + "epoch": 0.5200308394864039, + "grad_norm": 0.11927412450313568, + "learning_rate": 0.0004762247079891754, + "loss": 2.6466, + "step": 17537 + }, + { + "epoch": 0.5200604928387155, + "grad_norm": 0.10681012272834778, + "learning_rate": 0.00047617771035627585, + "loss": 2.6719, + "step": 17538 + }, + { + "epoch": 0.5200901461910269, + "grad_norm": 0.11258596181869507, + "learning_rate": 0.0004761307129343267, + "loss": 2.6069, + "step": 17539 + }, + { + "epoch": 0.5201197995433384, + "grad_norm": 0.09782971441745758, + "learning_rate": 0.0004760837157237442, + "loss": 2.6252, + "step": 17540 + }, + { + "epoch": 0.5201494528956498, + "grad_norm": 0.10812210291624069, + "learning_rate": 0.0004760367187249445, + "loss": 2.6763, + "step": 17541 + }, + { + "epoch": 0.5201791062479614, + "grad_norm": 0.13954980671405792, + "learning_rate": 0.0004759897219383438, + "loss": 2.6521, + "step": 17542 + }, + { + "epoch": 0.5202087596002728, + "grad_norm": 0.16462287306785583, + "learning_rate": 0.00047594272536435813, + "loss": 2.6697, + "step": 17543 + }, + { + "epoch": 0.5202384129525843, + "grad_norm": 0.17888143658638, + "learning_rate": 0.0004758957290034038, + "loss": 2.6942, + "step": 17544 + }, + { + "epoch": 0.5202680663048957, + "grad_norm": 0.14497052133083344, + "learning_rate": 0.000475848732855897, + "loss": 2.6402, + "step": 17545 + }, + { + "epoch": 0.5202977196572073, + "grad_norm": 0.107772096991539, + "learning_rate": 0.00047580173692225383, + "loss": 2.6604, + "step": 17546 + }, + { + "epoch": 0.5203273730095187, + "grad_norm": 0.1600838154554367, + "learning_rate": 0.00047575474120289046, + "loss": 2.6659, + "step": 17547 + }, + { + "epoch": 0.5203570263618302, + "grad_norm": 0.12357399612665176, + "learning_rate": 0.00047570774569822297, + "loss": 2.6356, + "step": 17548 + }, + { + "epoch": 0.5203866797141417, + "grad_norm": 0.133744016289711, + "learning_rate": 0.0004756607504086676, + "loss": 2.6467, + "step": 17549 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 0.14105680584907532, + "learning_rate": 0.0004756137553346404, + "loss": 2.6338, + "step": 17550 + }, + { + "epoch": 0.5204459864187646, + "grad_norm": 0.13396087288856506, + "learning_rate": 0.00047556676047655756, + "loss": 2.6118, + "step": 17551 + }, + { + "epoch": 0.5204756397710761, + "grad_norm": 0.11273328959941864, + "learning_rate": 0.00047551976583483526, + "loss": 2.6463, + "step": 17552 + }, + { + "epoch": 0.5205052931233876, + "grad_norm": 0.11191540956497192, + "learning_rate": 0.00047547277140988975, + "loss": 2.6462, + "step": 17553 + }, + { + "epoch": 0.5205349464756991, + "grad_norm": 0.12494157254695892, + "learning_rate": 0.000475425777202137, + "loss": 2.6407, + "step": 17554 + }, + { + "epoch": 0.5205645998280105, + "grad_norm": 0.13820189237594604, + "learning_rate": 0.0004753787832119932, + "loss": 2.6302, + "step": 17555 + }, + { + "epoch": 0.520594253180322, + "grad_norm": 0.11641399562358856, + "learning_rate": 0.0004753317894398742, + "loss": 2.631, + "step": 17556 + }, + { + "epoch": 0.5206239065326335, + "grad_norm": 0.10974215716123581, + "learning_rate": 0.0004752847958861968, + "loss": 2.6351, + "step": 17557 + }, + { + "epoch": 0.520653559884945, + "grad_norm": 0.11269479990005493, + "learning_rate": 0.00047523780255137675, + "loss": 2.6574, + "step": 17558 + }, + { + "epoch": 0.5206832132372565, + "grad_norm": 0.12786415219306946, + "learning_rate": 0.00047519080943583017, + "loss": 2.6554, + "step": 17559 + }, + { + "epoch": 0.520712866589568, + "grad_norm": 0.12853537499904633, + "learning_rate": 0.0004751438165399732, + "loss": 2.6448, + "step": 17560 + }, + { + "epoch": 0.5207425199418795, + "grad_norm": 0.11129574477672577, + "learning_rate": 0.000475096823864222, + "loss": 2.6383, + "step": 17561 + }, + { + "epoch": 0.5207721732941909, + "grad_norm": 0.12422381341457367, + "learning_rate": 0.0004750498314089927, + "loss": 2.637, + "step": 17562 + }, + { + "epoch": 0.5208018266465024, + "grad_norm": 0.12335354089736938, + "learning_rate": 0.00047500283917470144, + "loss": 2.6829, + "step": 17563 + }, + { + "epoch": 0.5208314799988139, + "grad_norm": 0.11644452810287476, + "learning_rate": 0.00047495584716176445, + "loss": 2.5994, + "step": 17564 + }, + { + "epoch": 0.5208611333511254, + "grad_norm": 0.12404326349496841, + "learning_rate": 0.00047490885537059755, + "loss": 2.6382, + "step": 17565 + }, + { + "epoch": 0.5208907867034368, + "grad_norm": 0.11156803369522095, + "learning_rate": 0.00047486186380161713, + "loss": 2.6424, + "step": 17566 + }, + { + "epoch": 0.5209204400557483, + "grad_norm": 0.11009467393159866, + "learning_rate": 0.0004748148724552392, + "loss": 2.6271, + "step": 17567 + }, + { + "epoch": 0.5209500934080598, + "grad_norm": 0.13424715399742126, + "learning_rate": 0.0004747678813318799, + "loss": 2.6387, + "step": 17568 + }, + { + "epoch": 0.5209797467603713, + "grad_norm": 0.11679479479789734, + "learning_rate": 0.0004747208904319552, + "loss": 2.6143, + "step": 17569 + }, + { + "epoch": 0.5210094001126827, + "grad_norm": 0.11147560924291611, + "learning_rate": 0.0004746738997558815, + "loss": 2.5936, + "step": 17570 + }, + { + "epoch": 0.5210390534649942, + "grad_norm": 0.09741301089525223, + "learning_rate": 0.0004746269093040747, + "loss": 2.6173, + "step": 17571 + }, + { + "epoch": 0.5210687068173057, + "grad_norm": 0.12217531353235245, + "learning_rate": 0.00047457991907695103, + "loss": 2.6361, + "step": 17572 + }, + { + "epoch": 0.5210983601696172, + "grad_norm": 0.12056026607751846, + "learning_rate": 0.0004745329290749266, + "loss": 2.6135, + "step": 17573 + }, + { + "epoch": 0.5211280135219286, + "grad_norm": 0.09648333489894867, + "learning_rate": 0.00047448593929841744, + "loss": 2.5871, + "step": 17574 + }, + { + "epoch": 0.5211576668742401, + "grad_norm": 0.11902270466089249, + "learning_rate": 0.00047443894974783966, + "loss": 2.6195, + "step": 17575 + }, + { + "epoch": 0.5211873202265516, + "grad_norm": 0.1303519755601883, + "learning_rate": 0.00047439196042360925, + "loss": 2.6414, + "step": 17576 + }, + { + "epoch": 0.5212169735788631, + "grad_norm": 0.12959745526313782, + "learning_rate": 0.00047434497132614254, + "loss": 2.6343, + "step": 17577 + }, + { + "epoch": 0.5212466269311745, + "grad_norm": 0.1345028579235077, + "learning_rate": 0.0004742979824558555, + "loss": 2.6535, + "step": 17578 + }, + { + "epoch": 0.521276280283486, + "grad_norm": 0.1154261901974678, + "learning_rate": 0.00047425099381316415, + "loss": 2.6235, + "step": 17579 + }, + { + "epoch": 0.5213059336357976, + "grad_norm": 0.11611298471689224, + "learning_rate": 0.00047420400539848476, + "loss": 2.6251, + "step": 17580 + }, + { + "epoch": 0.521335586988109, + "grad_norm": 0.10325796157121658, + "learning_rate": 0.0004741570172122334, + "loss": 2.645, + "step": 17581 + }, + { + "epoch": 0.5213652403404205, + "grad_norm": 0.11960218846797943, + "learning_rate": 0.00047411002925482575, + "loss": 2.6354, + "step": 17582 + }, + { + "epoch": 0.521394893692732, + "grad_norm": 0.11398600041866302, + "learning_rate": 0.00047406304152667846, + "loss": 2.6547, + "step": 17583 + }, + { + "epoch": 0.5214245470450435, + "grad_norm": 0.11656270176172256, + "learning_rate": 0.00047401605402820743, + "loss": 2.6315, + "step": 17584 + }, + { + "epoch": 0.5214542003973549, + "grad_norm": 0.11668023467063904, + "learning_rate": 0.00047396906675982876, + "loss": 2.6488, + "step": 17585 + }, + { + "epoch": 0.5214838537496664, + "grad_norm": 0.14532460272312164, + "learning_rate": 0.0004739220797219584, + "loss": 2.6487, + "step": 17586 + }, + { + "epoch": 0.5215135071019779, + "grad_norm": 0.13233377039432526, + "learning_rate": 0.00047387509291501245, + "loss": 2.6428, + "step": 17587 + }, + { + "epoch": 0.5215431604542894, + "grad_norm": 0.12153252959251404, + "learning_rate": 0.0004738281063394071, + "loss": 2.6867, + "step": 17588 + }, + { + "epoch": 0.5215728138066008, + "grad_norm": 0.13943853974342346, + "learning_rate": 0.00047378111999555835, + "loss": 2.649, + "step": 17589 + }, + { + "epoch": 0.5216024671589123, + "grad_norm": 0.1307966709136963, + "learning_rate": 0.0004737341338838822, + "loss": 2.6305, + "step": 17590 + }, + { + "epoch": 0.5216321205112238, + "grad_norm": 0.12433601915836334, + "learning_rate": 0.00047368714800479503, + "loss": 2.6563, + "step": 17591 + }, + { + "epoch": 0.5216617738635353, + "grad_norm": 0.11492587625980377, + "learning_rate": 0.00047364016235871246, + "loss": 2.645, + "step": 17592 + }, + { + "epoch": 0.5216914272158467, + "grad_norm": 0.11447131633758545, + "learning_rate": 0.0004735931769460509, + "loss": 2.6151, + "step": 17593 + }, + { + "epoch": 0.5217210805681582, + "grad_norm": 0.11876150220632553, + "learning_rate": 0.00047354619176722594, + "loss": 2.6492, + "step": 17594 + }, + { + "epoch": 0.5217507339204697, + "grad_norm": 0.12313195317983627, + "learning_rate": 0.0004734992068226544, + "loss": 2.6359, + "step": 17595 + }, + { + "epoch": 0.5217803872727812, + "grad_norm": 0.13214683532714844, + "learning_rate": 0.0004734522221127519, + "loss": 2.6352, + "step": 17596 + }, + { + "epoch": 0.5218100406250926, + "grad_norm": 0.11537489295005798, + "learning_rate": 0.0004734052376379344, + "loss": 2.6062, + "step": 17597 + }, + { + "epoch": 0.5218396939774042, + "grad_norm": 0.11541235446929932, + "learning_rate": 0.0004733582533986181, + "loss": 2.6116, + "step": 17598 + }, + { + "epoch": 0.5218693473297156, + "grad_norm": 0.11374381184577942, + "learning_rate": 0.00047331126939521905, + "loss": 2.6566, + "step": 17599 + }, + { + "epoch": 0.5218990006820271, + "grad_norm": 0.126090869307518, + "learning_rate": 0.00047326428562815333, + "loss": 2.6459, + "step": 17600 + }, + { + "epoch": 0.5219286540343386, + "grad_norm": 0.11562440544366837, + "learning_rate": 0.00047321730209783693, + "loss": 2.6664, + "step": 17601 + }, + { + "epoch": 0.5219583073866501, + "grad_norm": 0.10248184204101562, + "learning_rate": 0.00047317031880468596, + "loss": 2.6424, + "step": 17602 + }, + { + "epoch": 0.5219879607389616, + "grad_norm": 0.11337792873382568, + "learning_rate": 0.00047312333574911643, + "loss": 2.6336, + "step": 17603 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 0.11546844989061356, + "learning_rate": 0.0004730763529315443, + "loss": 2.6457, + "step": 17604 + }, + { + "epoch": 0.5220472674435845, + "grad_norm": 0.1088903471827507, + "learning_rate": 0.00047302937035238567, + "loss": 2.6547, + "step": 17605 + }, + { + "epoch": 0.522076920795896, + "grad_norm": 0.11680391430854797, + "learning_rate": 0.00047298238801205667, + "loss": 2.6442, + "step": 17606 + }, + { + "epoch": 0.5221065741482075, + "grad_norm": 0.10792072117328644, + "learning_rate": 0.00047293540591097314, + "loss": 2.6337, + "step": 17607 + }, + { + "epoch": 0.5221362275005189, + "grad_norm": 0.1116766631603241, + "learning_rate": 0.0004728884240495513, + "loss": 2.6171, + "step": 17608 + }, + { + "epoch": 0.5221658808528304, + "grad_norm": 0.11079990863800049, + "learning_rate": 0.00047284144242820713, + "loss": 2.6527, + "step": 17609 + }, + { + "epoch": 0.5221955342051419, + "grad_norm": 0.10819334536790848, + "learning_rate": 0.0004727944610473566, + "loss": 2.6819, + "step": 17610 + }, + { + "epoch": 0.5222251875574534, + "grad_norm": 0.11313030123710632, + "learning_rate": 0.0004727474799074159, + "loss": 2.6324, + "step": 17611 + }, + { + "epoch": 0.5222548409097648, + "grad_norm": 0.11845989525318146, + "learning_rate": 0.00047270049900880097, + "loss": 2.6364, + "step": 17612 + }, + { + "epoch": 0.5222844942620763, + "grad_norm": 0.09462862461805344, + "learning_rate": 0.00047265351835192775, + "loss": 2.6217, + "step": 17613 + }, + { + "epoch": 0.5223141476143878, + "grad_norm": 0.11579649150371552, + "learning_rate": 0.00047260653793721233, + "loss": 2.6485, + "step": 17614 + }, + { + "epoch": 0.5223438009666993, + "grad_norm": 0.10456931591033936, + "learning_rate": 0.0004725595577650706, + "loss": 2.6357, + "step": 17615 + }, + { + "epoch": 0.5223734543190107, + "grad_norm": 0.1049107164144516, + "learning_rate": 0.00047251257783591884, + "loss": 2.6232, + "step": 17616 + }, + { + "epoch": 0.5224031076713223, + "grad_norm": 0.11359279602766037, + "learning_rate": 0.0004724655981501728, + "loss": 2.6447, + "step": 17617 + }, + { + "epoch": 0.5224327610236337, + "grad_norm": 0.10730788856744766, + "learning_rate": 0.0004724186187082487, + "loss": 2.6194, + "step": 17618 + }, + { + "epoch": 0.5224624143759452, + "grad_norm": 0.09545815736055374, + "learning_rate": 0.0004723716395105626, + "loss": 2.6357, + "step": 17619 + }, + { + "epoch": 0.5224920677282566, + "grad_norm": 0.10610184818506241, + "learning_rate": 0.00047232466055752994, + "loss": 2.6409, + "step": 17620 + }, + { + "epoch": 0.5225217210805682, + "grad_norm": 0.10790801048278809, + "learning_rate": 0.0004722776818495674, + "loss": 2.6136, + "step": 17621 + }, + { + "epoch": 0.5225513744328797, + "grad_norm": 0.12418412417173386, + "learning_rate": 0.00047223070338709084, + "loss": 2.5767, + "step": 17622 + }, + { + "epoch": 0.5225810277851911, + "grad_norm": 0.14043235778808594, + "learning_rate": 0.00047218372517051624, + "loss": 2.6715, + "step": 17623 + }, + { + "epoch": 0.5226106811375026, + "grad_norm": 0.13983558118343353, + "learning_rate": 0.00047213674720025936, + "loss": 2.6387, + "step": 17624 + }, + { + "epoch": 0.5226403344898141, + "grad_norm": 0.11206603795289993, + "learning_rate": 0.0004720897694767364, + "loss": 2.6111, + "step": 17625 + }, + { + "epoch": 0.5226699878421256, + "grad_norm": 0.12022814154624939, + "learning_rate": 0.00047204279200036334, + "loss": 2.633, + "step": 17626 + }, + { + "epoch": 0.522699641194437, + "grad_norm": 0.1484701782464981, + "learning_rate": 0.0004719958147715561, + "loss": 2.6698, + "step": 17627 + }, + { + "epoch": 0.5227292945467485, + "grad_norm": 0.1427491009235382, + "learning_rate": 0.00047194883779073075, + "loss": 2.6498, + "step": 17628 + }, + { + "epoch": 0.52275894789906, + "grad_norm": 0.13306955993175507, + "learning_rate": 0.00047190186105830343, + "loss": 2.6377, + "step": 17629 + }, + { + "epoch": 0.5227886012513715, + "grad_norm": 0.11780587583780289, + "learning_rate": 0.00047185488457468977, + "loss": 2.6633, + "step": 17630 + }, + { + "epoch": 0.5228182546036829, + "grad_norm": 0.11143293976783752, + "learning_rate": 0.000471807908340306, + "loss": 2.6419, + "step": 17631 + }, + { + "epoch": 0.5228479079559945, + "grad_norm": 0.1747439056634903, + "learning_rate": 0.000471760932355568, + "loss": 2.6302, + "step": 17632 + }, + { + "epoch": 0.5228775613083059, + "grad_norm": 0.16494520008563995, + "learning_rate": 0.0004717139566208917, + "loss": 2.649, + "step": 17633 + }, + { + "epoch": 0.5229072146606174, + "grad_norm": 0.11155309528112411, + "learning_rate": 0.00047166698113669325, + "loss": 2.6962, + "step": 17634 + }, + { + "epoch": 0.5229368680129288, + "grad_norm": 0.13109305500984192, + "learning_rate": 0.0004716200059033885, + "loss": 2.6444, + "step": 17635 + }, + { + "epoch": 0.5229665213652404, + "grad_norm": 0.14896097779273987, + "learning_rate": 0.00047157303092139357, + "loss": 2.6615, + "step": 17636 + }, + { + "epoch": 0.5229961747175518, + "grad_norm": 0.12926314771175385, + "learning_rate": 0.00047152605619112434, + "loss": 2.6303, + "step": 17637 + }, + { + "epoch": 0.5230258280698633, + "grad_norm": 0.1142372190952301, + "learning_rate": 0.0004714790817129967, + "loss": 2.6194, + "step": 17638 + }, + { + "epoch": 0.5230554814221747, + "grad_norm": 0.13128435611724854, + "learning_rate": 0.0004714321074874267, + "loss": 2.6292, + "step": 17639 + }, + { + "epoch": 0.5230851347744863, + "grad_norm": 0.1256808489561081, + "learning_rate": 0.00047138513351483043, + "loss": 2.6302, + "step": 17640 + }, + { + "epoch": 0.5231147881267977, + "grad_norm": 0.12407093495130539, + "learning_rate": 0.00047133815979562353, + "loss": 2.6238, + "step": 17641 + }, + { + "epoch": 0.5231444414791092, + "grad_norm": 0.12153387814760208, + "learning_rate": 0.00047129118633022223, + "loss": 2.5937, + "step": 17642 + }, + { + "epoch": 0.5231740948314207, + "grad_norm": 0.11415506154298782, + "learning_rate": 0.0004712442131190424, + "loss": 2.6324, + "step": 17643 + }, + { + "epoch": 0.5232037481837322, + "grad_norm": 0.11997861415147781, + "learning_rate": 0.00047119724016249994, + "loss": 2.6216, + "step": 17644 + }, + { + "epoch": 0.5232334015360437, + "grad_norm": 0.11810167133808136, + "learning_rate": 0.0004711502674610109, + "loss": 2.6137, + "step": 17645 + }, + { + "epoch": 0.5232630548883551, + "grad_norm": 0.10824351757764816, + "learning_rate": 0.0004711032950149911, + "loss": 2.6632, + "step": 17646 + }, + { + "epoch": 0.5232927082406666, + "grad_norm": 0.10711539536714554, + "learning_rate": 0.00047105632282485666, + "loss": 2.6399, + "step": 17647 + }, + { + "epoch": 0.5233223615929781, + "grad_norm": 0.11037054657936096, + "learning_rate": 0.0004710093508910235, + "loss": 2.6204, + "step": 17648 + }, + { + "epoch": 0.5233520149452896, + "grad_norm": 0.10951051115989685, + "learning_rate": 0.00047096237921390746, + "loss": 2.6157, + "step": 17649 + }, + { + "epoch": 0.523381668297601, + "grad_norm": 0.11360863596200943, + "learning_rate": 0.00047091540779392463, + "loss": 2.6288, + "step": 17650 + }, + { + "epoch": 0.5234113216499126, + "grad_norm": 0.10735885053873062, + "learning_rate": 0.0004708684366314908, + "loss": 2.6474, + "step": 17651 + }, + { + "epoch": 0.523440975002224, + "grad_norm": 0.10512367635965347, + "learning_rate": 0.0004708214657270219, + "loss": 2.6232, + "step": 17652 + }, + { + "epoch": 0.5234706283545355, + "grad_norm": 0.10382776707410812, + "learning_rate": 0.00047077449508093396, + "loss": 2.6458, + "step": 17653 + }, + { + "epoch": 0.5235002817068469, + "grad_norm": 0.10067208111286163, + "learning_rate": 0.00047072752469364286, + "loss": 2.6337, + "step": 17654 + }, + { + "epoch": 0.5235299350591585, + "grad_norm": 0.0965699702501297, + "learning_rate": 0.00047068055456556465, + "loss": 2.6172, + "step": 17655 + }, + { + "epoch": 0.5235595884114699, + "grad_norm": 0.10901973396539688, + "learning_rate": 0.0004706335846971151, + "loss": 2.6517, + "step": 17656 + }, + { + "epoch": 0.5235892417637814, + "grad_norm": 0.1142980232834816, + "learning_rate": 0.0004705866150887103, + "loss": 2.6512, + "step": 17657 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 0.10958941280841827, + "learning_rate": 0.00047053964574076594, + "loss": 2.6822, + "step": 17658 + }, + { + "epoch": 0.5236485484684044, + "grad_norm": 0.09663635492324829, + "learning_rate": 0.0004704926766536979, + "loss": 2.6825, + "step": 17659 + }, + { + "epoch": 0.5236782018207158, + "grad_norm": 0.10601738840341568, + "learning_rate": 0.00047044570782792247, + "loss": 2.6624, + "step": 17660 + }, + { + "epoch": 0.5237078551730273, + "grad_norm": 0.09691781550645828, + "learning_rate": 0.00047039873926385545, + "loss": 2.6258, + "step": 17661 + }, + { + "epoch": 0.5237375085253387, + "grad_norm": 0.12021374702453613, + "learning_rate": 0.0004703517709619126, + "loss": 2.6466, + "step": 17662 + }, + { + "epoch": 0.5237671618776503, + "grad_norm": 0.12605950236320496, + "learning_rate": 0.00047030480292250995, + "loss": 2.6364, + "step": 17663 + }, + { + "epoch": 0.5237968152299618, + "grad_norm": 0.14805804193019867, + "learning_rate": 0.0004702578351460633, + "loss": 2.6527, + "step": 17664 + }, + { + "epoch": 0.5238264685822732, + "grad_norm": 0.14630824327468872, + "learning_rate": 0.00047021086763298866, + "loss": 2.6509, + "step": 17665 + }, + { + "epoch": 0.5238561219345848, + "grad_norm": 0.1377975046634674, + "learning_rate": 0.0004701639003837019, + "loss": 2.6229, + "step": 17666 + }, + { + "epoch": 0.5238857752868962, + "grad_norm": 0.11535824090242386, + "learning_rate": 0.000470116933398619, + "loss": 2.6554, + "step": 17667 + }, + { + "epoch": 0.5239154286392077, + "grad_norm": 0.12552714347839355, + "learning_rate": 0.0004700699666781557, + "loss": 2.6824, + "step": 17668 + }, + { + "epoch": 0.5239450819915191, + "grad_norm": 0.1326339691877365, + "learning_rate": 0.000470023000222728, + "loss": 2.6609, + "step": 17669 + }, + { + "epoch": 0.5239747353438307, + "grad_norm": 0.11351823806762695, + "learning_rate": 0.00046997603403275176, + "loss": 2.6283, + "step": 17670 + }, + { + "epoch": 0.5240043886961421, + "grad_norm": 0.12712480127811432, + "learning_rate": 0.00046992906810864286, + "loss": 2.65, + "step": 17671 + }, + { + "epoch": 0.5240340420484536, + "grad_norm": 0.12276414781808853, + "learning_rate": 0.0004698821024508171, + "loss": 2.6878, + "step": 17672 + }, + { + "epoch": 0.524063695400765, + "grad_norm": 0.10154356807470322, + "learning_rate": 0.00046983513705969074, + "loss": 2.6217, + "step": 17673 + }, + { + "epoch": 0.5240933487530766, + "grad_norm": 0.13050797581672668, + "learning_rate": 0.0004697881719356793, + "loss": 2.6564, + "step": 17674 + }, + { + "epoch": 0.524123002105388, + "grad_norm": 0.11493990570306778, + "learning_rate": 0.0004697412070791988, + "loss": 2.6617, + "step": 17675 + }, + { + "epoch": 0.5241526554576995, + "grad_norm": 0.11408587545156479, + "learning_rate": 0.0004696942424906651, + "loss": 2.6355, + "step": 17676 + }, + { + "epoch": 0.5241823088100109, + "grad_norm": 0.13050000369548798, + "learning_rate": 0.00046964727817049414, + "loss": 2.6309, + "step": 17677 + }, + { + "epoch": 0.5242119621623225, + "grad_norm": 0.12830817699432373, + "learning_rate": 0.00046960031411910175, + "loss": 2.6674, + "step": 17678 + }, + { + "epoch": 0.5242416155146339, + "grad_norm": 0.10366035997867584, + "learning_rate": 0.0004695533503369038, + "loss": 2.6268, + "step": 17679 + }, + { + "epoch": 0.5242712688669454, + "grad_norm": 0.12819930911064148, + "learning_rate": 0.0004695063868243161, + "loss": 2.6492, + "step": 17680 + }, + { + "epoch": 0.5243009222192568, + "grad_norm": 0.14453868567943573, + "learning_rate": 0.0004694594235817546, + "loss": 2.6416, + "step": 17681 + }, + { + "epoch": 0.5243305755715684, + "grad_norm": 0.10794536024332047, + "learning_rate": 0.0004694124606096351, + "loss": 2.6321, + "step": 17682 + }, + { + "epoch": 0.5243602289238798, + "grad_norm": 0.1158464178442955, + "learning_rate": 0.0004693654979083735, + "loss": 2.6184, + "step": 17683 + }, + { + "epoch": 0.5243898822761913, + "grad_norm": 0.11997734755277634, + "learning_rate": 0.0004693185354783858, + "loss": 2.6373, + "step": 17684 + }, + { + "epoch": 0.5244195356285029, + "grad_norm": 0.11642686277627945, + "learning_rate": 0.00046927157332008753, + "loss": 2.6446, + "step": 17685 + }, + { + "epoch": 0.5244491889808143, + "grad_norm": 0.12214846163988113, + "learning_rate": 0.00046922461143389485, + "loss": 2.6608, + "step": 17686 + }, + { + "epoch": 0.5244788423331258, + "grad_norm": 0.11702380329370499, + "learning_rate": 0.00046917764982022355, + "loss": 2.6344, + "step": 17687 + }, + { + "epoch": 0.5245084956854372, + "grad_norm": 0.11074909567832947, + "learning_rate": 0.0004691306884794895, + "loss": 2.6222, + "step": 17688 + }, + { + "epoch": 0.5245381490377488, + "grad_norm": 0.10770588368177414, + "learning_rate": 0.0004690837274121085, + "loss": 2.658, + "step": 17689 + }, + { + "epoch": 0.5245678023900602, + "grad_norm": 0.11313910782337189, + "learning_rate": 0.0004690367666184963, + "loss": 2.6213, + "step": 17690 + }, + { + "epoch": 0.5245974557423717, + "grad_norm": 0.11461616307497025, + "learning_rate": 0.00046898980609906896, + "loss": 2.6637, + "step": 17691 + }, + { + "epoch": 0.5246271090946831, + "grad_norm": 0.1152389645576477, + "learning_rate": 0.0004689428458542421, + "loss": 2.6842, + "step": 17692 + }, + { + "epoch": 0.5246567624469947, + "grad_norm": 0.12284567952156067, + "learning_rate": 0.0004688958858844317, + "loss": 2.6262, + "step": 17693 + }, + { + "epoch": 0.5246864157993061, + "grad_norm": 0.11009559035301208, + "learning_rate": 0.0004688489261900536, + "loss": 2.667, + "step": 17694 + }, + { + "epoch": 0.5247160691516176, + "grad_norm": 0.11155343055725098, + "learning_rate": 0.0004688019667715237, + "loss": 2.6483, + "step": 17695 + }, + { + "epoch": 0.524745722503929, + "grad_norm": 0.13881713151931763, + "learning_rate": 0.0004687550076292576, + "loss": 2.6532, + "step": 17696 + }, + { + "epoch": 0.5247753758562406, + "grad_norm": 0.14591629803180695, + "learning_rate": 0.0004687080487636713, + "loss": 2.6425, + "step": 17697 + }, + { + "epoch": 0.524805029208552, + "grad_norm": 0.1389932632446289, + "learning_rate": 0.00046866109017518036, + "loss": 2.6424, + "step": 17698 + }, + { + "epoch": 0.5248346825608635, + "grad_norm": 0.1261967569589615, + "learning_rate": 0.0004686141318642012, + "loss": 2.6051, + "step": 17699 + }, + { + "epoch": 0.5248643359131749, + "grad_norm": 0.13577905297279358, + "learning_rate": 0.0004685671738311492, + "loss": 2.6427, + "step": 17700 + }, + { + "epoch": 0.5248939892654865, + "grad_norm": 0.1522398442029953, + "learning_rate": 0.00046852021607644026, + "loss": 2.6428, + "step": 17701 + }, + { + "epoch": 0.5249236426177979, + "grad_norm": 0.13098092377185822, + "learning_rate": 0.00046847325860049016, + "loss": 2.6446, + "step": 17702 + }, + { + "epoch": 0.5249532959701094, + "grad_norm": 0.12340562790632248, + "learning_rate": 0.0004684263014037148, + "loss": 2.6764, + "step": 17703 + }, + { + "epoch": 0.5249829493224208, + "grad_norm": 0.11893027275800705, + "learning_rate": 0.00046837934448652996, + "loss": 2.6713, + "step": 17704 + }, + { + "epoch": 0.5250126026747324, + "grad_norm": 0.12121643126010895, + "learning_rate": 0.00046833238784935155, + "loss": 2.6389, + "step": 17705 + }, + { + "epoch": 0.5250422560270439, + "grad_norm": 0.13199663162231445, + "learning_rate": 0.0004682854314925952, + "loss": 2.6603, + "step": 17706 + }, + { + "epoch": 0.5250719093793553, + "grad_norm": 0.1142774149775505, + "learning_rate": 0.0004682384754166768, + "loss": 2.64, + "step": 17707 + }, + { + "epoch": 0.5251015627316669, + "grad_norm": 0.13814042508602142, + "learning_rate": 0.0004681915196220121, + "loss": 2.6467, + "step": 17708 + }, + { + "epoch": 0.5251312160839783, + "grad_norm": 0.1144995391368866, + "learning_rate": 0.000468144564109017, + "loss": 2.6579, + "step": 17709 + }, + { + "epoch": 0.5251608694362898, + "grad_norm": 0.12624603509902954, + "learning_rate": 0.0004680976088781073, + "loss": 2.6712, + "step": 17710 + }, + { + "epoch": 0.5251905227886012, + "grad_norm": 0.11906757950782776, + "learning_rate": 0.00046805065392969855, + "loss": 2.665, + "step": 17711 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 0.13261555135250092, + "learning_rate": 0.0004680036992642069, + "loss": 2.6471, + "step": 17712 + }, + { + "epoch": 0.5252498294932242, + "grad_norm": 0.12809503078460693, + "learning_rate": 0.0004679567448820479, + "loss": 2.628, + "step": 17713 + }, + { + "epoch": 0.5252794828455357, + "grad_norm": 0.12322698533535004, + "learning_rate": 0.00046790979078363754, + "loss": 2.6227, + "step": 17714 + }, + { + "epoch": 0.5253091361978471, + "grad_norm": 0.10670968145132065, + "learning_rate": 0.0004678628369693914, + "loss": 2.6624, + "step": 17715 + }, + { + "epoch": 0.5253387895501587, + "grad_norm": 0.11169419437646866, + "learning_rate": 0.0004678158834397255, + "loss": 2.6316, + "step": 17716 + }, + { + "epoch": 0.5253684429024701, + "grad_norm": 0.10303892195224762, + "learning_rate": 0.0004677689301950554, + "loss": 2.6372, + "step": 17717 + }, + { + "epoch": 0.5253980962547816, + "grad_norm": 0.10074521601200104, + "learning_rate": 0.00046772197723579694, + "loss": 2.6324, + "step": 17718 + }, + { + "epoch": 0.525427749607093, + "grad_norm": 0.10228198766708374, + "learning_rate": 0.00046767502456236595, + "loss": 2.6479, + "step": 17719 + }, + { + "epoch": 0.5254574029594046, + "grad_norm": 0.09925471246242523, + "learning_rate": 0.0004676280721751781, + "loss": 2.6118, + "step": 17720 + }, + { + "epoch": 0.525487056311716, + "grad_norm": 0.09790933132171631, + "learning_rate": 0.00046758112007464923, + "loss": 2.6402, + "step": 17721 + }, + { + "epoch": 0.5255167096640275, + "grad_norm": 0.10611460357904434, + "learning_rate": 0.0004675341682611951, + "loss": 2.6123, + "step": 17722 + }, + { + "epoch": 0.525546363016339, + "grad_norm": 0.09842677414417267, + "learning_rate": 0.00046748721673523164, + "loss": 2.6076, + "step": 17723 + }, + { + "epoch": 0.5255760163686505, + "grad_norm": 0.10635311156511307, + "learning_rate": 0.0004674402654971741, + "loss": 2.6182, + "step": 17724 + }, + { + "epoch": 0.525605669720962, + "grad_norm": 0.1038554310798645, + "learning_rate": 0.00046739331454743885, + "loss": 2.6219, + "step": 17725 + }, + { + "epoch": 0.5256353230732734, + "grad_norm": 0.10926210135221481, + "learning_rate": 0.0004673463638864415, + "loss": 2.6312, + "step": 17726 + }, + { + "epoch": 0.525664976425585, + "grad_norm": 0.08719585835933685, + "learning_rate": 0.00046729941351459747, + "loss": 2.6427, + "step": 17727 + }, + { + "epoch": 0.5256946297778964, + "grad_norm": 0.10021504014730453, + "learning_rate": 0.0004672524634323229, + "loss": 2.617, + "step": 17728 + }, + { + "epoch": 0.5257242831302079, + "grad_norm": 0.10024558752775192, + "learning_rate": 0.00046720551364003333, + "loss": 2.6676, + "step": 17729 + }, + { + "epoch": 0.5257539364825193, + "grad_norm": 0.10356369614601135, + "learning_rate": 0.0004671585641381446, + "loss": 2.6169, + "step": 17730 + }, + { + "epoch": 0.5257835898348309, + "grad_norm": 0.10730234533548355, + "learning_rate": 0.00046711161492707235, + "loss": 2.6433, + "step": 17731 + }, + { + "epoch": 0.5258132431871423, + "grad_norm": 0.11719505488872528, + "learning_rate": 0.0004670646660072324, + "loss": 2.6334, + "step": 17732 + }, + { + "epoch": 0.5258428965394538, + "grad_norm": 0.12475226074457169, + "learning_rate": 0.00046701771737904063, + "loss": 2.6804, + "step": 17733 + }, + { + "epoch": 0.5258725498917652, + "grad_norm": 0.1218903586268425, + "learning_rate": 0.00046697076904291256, + "loss": 2.6398, + "step": 17734 + }, + { + "epoch": 0.5259022032440768, + "grad_norm": 0.13842639327049255, + "learning_rate": 0.00046692382099926396, + "loss": 2.6446, + "step": 17735 + }, + { + "epoch": 0.5259318565963882, + "grad_norm": 0.1460326611995697, + "learning_rate": 0.0004668768732485106, + "loss": 2.645, + "step": 17736 + }, + { + "epoch": 0.5259615099486997, + "grad_norm": 0.1278313845396042, + "learning_rate": 0.0004668299257910681, + "loss": 2.627, + "step": 17737 + }, + { + "epoch": 0.5259911633010111, + "grad_norm": 0.11755029112100601, + "learning_rate": 0.0004667829786273524, + "loss": 2.631, + "step": 17738 + }, + { + "epoch": 0.5260208166533227, + "grad_norm": 0.12126617878675461, + "learning_rate": 0.00046673603175777917, + "loss": 2.6288, + "step": 17739 + }, + { + "epoch": 0.5260504700056341, + "grad_norm": 0.12133470177650452, + "learning_rate": 0.0004666890851827641, + "loss": 2.6397, + "step": 17740 + }, + { + "epoch": 0.5260801233579456, + "grad_norm": 0.12099641561508179, + "learning_rate": 0.00046664213890272284, + "loss": 2.6082, + "step": 17741 + }, + { + "epoch": 0.526109776710257, + "grad_norm": 0.0979737862944603, + "learning_rate": 0.00046659519291807115, + "loss": 2.6653, + "step": 17742 + }, + { + "epoch": 0.5261394300625686, + "grad_norm": 0.10507043451070786, + "learning_rate": 0.00046654824722922495, + "loss": 2.6599, + "step": 17743 + }, + { + "epoch": 0.52616908341488, + "grad_norm": 0.10630764067173004, + "learning_rate": 0.00046650130183659963, + "loss": 2.6392, + "step": 17744 + }, + { + "epoch": 0.5261987367671915, + "grad_norm": 0.1180274561047554, + "learning_rate": 0.000466454356740611, + "loss": 2.6207, + "step": 17745 + }, + { + "epoch": 0.5262283901195031, + "grad_norm": 0.10719520598649979, + "learning_rate": 0.0004664074119416748, + "loss": 2.6247, + "step": 17746 + }, + { + "epoch": 0.5262580434718145, + "grad_norm": 0.10915957391262054, + "learning_rate": 0.0004663604674402068, + "loss": 2.6556, + "step": 17747 + }, + { + "epoch": 0.526287696824126, + "grad_norm": 0.11674356460571289, + "learning_rate": 0.0004663135232366226, + "loss": 2.5964, + "step": 17748 + }, + { + "epoch": 0.5263173501764374, + "grad_norm": 0.09936419129371643, + "learning_rate": 0.00046626657933133785, + "loss": 2.613, + "step": 17749 + }, + { + "epoch": 0.526347003528749, + "grad_norm": 0.10785692185163498, + "learning_rate": 0.00046621963572476827, + "loss": 2.6741, + "step": 17750 + }, + { + "epoch": 0.5263766568810604, + "grad_norm": 0.11480775475502014, + "learning_rate": 0.00046617269241732974, + "loss": 2.6612, + "step": 17751 + }, + { + "epoch": 0.5264063102333719, + "grad_norm": 0.13478495180606842, + "learning_rate": 0.00046612574940943784, + "loss": 2.6507, + "step": 17752 + }, + { + "epoch": 0.5264359635856833, + "grad_norm": 0.12074296176433563, + "learning_rate": 0.00046607880670150825, + "loss": 2.6508, + "step": 17753 + }, + { + "epoch": 0.5264656169379949, + "grad_norm": 0.092006154358387, + "learning_rate": 0.0004660318642939567, + "loss": 2.6154, + "step": 17754 + }, + { + "epoch": 0.5264952702903063, + "grad_norm": 0.10968916118144989, + "learning_rate": 0.0004659849221871988, + "loss": 2.6571, + "step": 17755 + }, + { + "epoch": 0.5265249236426178, + "grad_norm": 0.11400245130062103, + "learning_rate": 0.0004659379803816502, + "loss": 2.6269, + "step": 17756 + }, + { + "epoch": 0.5265545769949292, + "grad_norm": 0.11755159497261047, + "learning_rate": 0.00046589103887772663, + "loss": 2.6537, + "step": 17757 + }, + { + "epoch": 0.5265842303472408, + "grad_norm": 0.11397604644298553, + "learning_rate": 0.00046584409767584374, + "loss": 2.6318, + "step": 17758 + }, + { + "epoch": 0.5266138836995522, + "grad_norm": 0.11369791626930237, + "learning_rate": 0.00046579715677641726, + "loss": 2.6425, + "step": 17759 + }, + { + "epoch": 0.5266435370518637, + "grad_norm": 0.11306731402873993, + "learning_rate": 0.0004657502161798629, + "loss": 2.6127, + "step": 17760 + }, + { + "epoch": 0.5266731904041752, + "grad_norm": 0.10797169804573059, + "learning_rate": 0.0004657032758865962, + "loss": 2.6367, + "step": 17761 + }, + { + "epoch": 0.5267028437564867, + "grad_norm": 0.13293789327144623, + "learning_rate": 0.0004656563358970329, + "loss": 2.6561, + "step": 17762 + }, + { + "epoch": 0.5267324971087981, + "grad_norm": 0.17164470255374908, + "learning_rate": 0.00046560939621158835, + "loss": 2.6617, + "step": 17763 + }, + { + "epoch": 0.5267621504611096, + "grad_norm": 0.17404869198799133, + "learning_rate": 0.0004655624568306789, + "loss": 2.6258, + "step": 17764 + }, + { + "epoch": 0.5267918038134211, + "grad_norm": 0.1274198293685913, + "learning_rate": 0.00046551551775471964, + "loss": 2.6313, + "step": 17765 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 0.11990828812122345, + "learning_rate": 0.00046546857898412635, + "loss": 2.6482, + "step": 17766 + }, + { + "epoch": 0.5268511105180441, + "grad_norm": 0.13608993589878082, + "learning_rate": 0.00046542164051931477, + "loss": 2.6415, + "step": 17767 + }, + { + "epoch": 0.5268807638703555, + "grad_norm": 0.12243889272212982, + "learning_rate": 0.0004653747023607006, + "loss": 2.6294, + "step": 17768 + }, + { + "epoch": 0.5269104172226671, + "grad_norm": 0.12178782373666763, + "learning_rate": 0.0004653277645086992, + "loss": 2.6449, + "step": 17769 + }, + { + "epoch": 0.5269400705749785, + "grad_norm": 0.11777975410223007, + "learning_rate": 0.00046528082696372655, + "loss": 2.6231, + "step": 17770 + }, + { + "epoch": 0.52696972392729, + "grad_norm": 0.11957603693008423, + "learning_rate": 0.0004652338897261981, + "loss": 2.6507, + "step": 17771 + }, + { + "epoch": 0.5269993772796014, + "grad_norm": 0.13101151585578918, + "learning_rate": 0.00046518695279652953, + "loss": 2.6375, + "step": 17772 + }, + { + "epoch": 0.527029030631913, + "grad_norm": 0.10551775246858597, + "learning_rate": 0.0004651400161751364, + "loss": 2.6569, + "step": 17773 + }, + { + "epoch": 0.5270586839842244, + "grad_norm": 0.1159881204366684, + "learning_rate": 0.0004650930798624345, + "loss": 2.6699, + "step": 17774 + }, + { + "epoch": 0.5270883373365359, + "grad_norm": 0.13514693081378937, + "learning_rate": 0.00046504614385883917, + "loss": 2.6371, + "step": 17775 + }, + { + "epoch": 0.5271179906888473, + "grad_norm": 0.12256330251693726, + "learning_rate": 0.00046499920816476636, + "loss": 2.6312, + "step": 17776 + }, + { + "epoch": 0.5271476440411589, + "grad_norm": 0.12723301351070404, + "learning_rate": 0.0004649522727806316, + "loss": 2.6861, + "step": 17777 + }, + { + "epoch": 0.5271772973934703, + "grad_norm": 0.13554853200912476, + "learning_rate": 0.00046490533770685043, + "loss": 2.6407, + "step": 17778 + }, + { + "epoch": 0.5272069507457818, + "grad_norm": 0.13429123163223267, + "learning_rate": 0.0004648584029438386, + "loss": 2.6594, + "step": 17779 + }, + { + "epoch": 0.5272366040980933, + "grad_norm": 0.11731292307376862, + "learning_rate": 0.0004648114684920116, + "loss": 2.6119, + "step": 17780 + }, + { + "epoch": 0.5272662574504048, + "grad_norm": 0.11977938562631607, + "learning_rate": 0.00046476453435178504, + "loss": 2.6262, + "step": 17781 + }, + { + "epoch": 0.5272959108027162, + "grad_norm": 0.12644024193286896, + "learning_rate": 0.00046471760052357473, + "loss": 2.6637, + "step": 17782 + }, + { + "epoch": 0.5273255641550277, + "grad_norm": 0.14176484942436218, + "learning_rate": 0.0004646706670077961, + "loss": 2.6308, + "step": 17783 + }, + { + "epoch": 0.5273552175073392, + "grad_norm": 0.1443796157836914, + "learning_rate": 0.0004646237338048647, + "loss": 2.5867, + "step": 17784 + }, + { + "epoch": 0.5273848708596507, + "grad_norm": 0.10771342366933823, + "learning_rate": 0.00046457680091519616, + "loss": 2.6437, + "step": 17785 + }, + { + "epoch": 0.5274145242119621, + "grad_norm": 0.12730807065963745, + "learning_rate": 0.0004645298683392062, + "loss": 2.6267, + "step": 17786 + }, + { + "epoch": 0.5274441775642736, + "grad_norm": 0.12870104610919952, + "learning_rate": 0.0004644829360773103, + "loss": 2.6488, + "step": 17787 + }, + { + "epoch": 0.5274738309165852, + "grad_norm": 0.11530576646327972, + "learning_rate": 0.000464436004129924, + "loss": 2.6429, + "step": 17788 + }, + { + "epoch": 0.5275034842688966, + "grad_norm": 0.13623254001140594, + "learning_rate": 0.0004643890724974631, + "loss": 2.6159, + "step": 17789 + }, + { + "epoch": 0.5275331376212081, + "grad_norm": 0.1447516232728958, + "learning_rate": 0.00046434214118034304, + "loss": 2.647, + "step": 17790 + }, + { + "epoch": 0.5275627909735195, + "grad_norm": 0.11850083619356155, + "learning_rate": 0.0004642952101789795, + "loss": 2.633, + "step": 17791 + }, + { + "epoch": 0.5275924443258311, + "grad_norm": 0.10105165094137192, + "learning_rate": 0.00046424827949378814, + "loss": 2.6662, + "step": 17792 + }, + { + "epoch": 0.5276220976781425, + "grad_norm": 0.1030043214559555, + "learning_rate": 0.0004642013491251842, + "loss": 2.6592, + "step": 17793 + }, + { + "epoch": 0.527651751030454, + "grad_norm": 0.1024162620306015, + "learning_rate": 0.00046415441907358355, + "loss": 2.6523, + "step": 17794 + }, + { + "epoch": 0.5276814043827655, + "grad_norm": 0.10794473439455032, + "learning_rate": 0.0004641074893394016, + "loss": 2.647, + "step": 17795 + }, + { + "epoch": 0.527711057735077, + "grad_norm": 0.10258165746927261, + "learning_rate": 0.00046406055992305397, + "loss": 2.6286, + "step": 17796 + }, + { + "epoch": 0.5277407110873884, + "grad_norm": 0.10528066754341125, + "learning_rate": 0.0004640136308249563, + "loss": 2.5966, + "step": 17797 + }, + { + "epoch": 0.5277703644396999, + "grad_norm": 0.12003479152917862, + "learning_rate": 0.0004639667020455241, + "loss": 2.6302, + "step": 17798 + }, + { + "epoch": 0.5278000177920114, + "grad_norm": 0.11070713400840759, + "learning_rate": 0.00046391977358517305, + "loss": 2.637, + "step": 17799 + }, + { + "epoch": 0.5278296711443229, + "grad_norm": 0.1060803011059761, + "learning_rate": 0.0004638728454443185, + "loss": 2.6278, + "step": 17800 + }, + { + "epoch": 0.5278593244966343, + "grad_norm": 0.10681694746017456, + "learning_rate": 0.0004638259176233759, + "loss": 2.6189, + "step": 17801 + }, + { + "epoch": 0.5278889778489458, + "grad_norm": 0.11471252888441086, + "learning_rate": 0.0004637789901227613, + "loss": 2.6735, + "step": 17802 + }, + { + "epoch": 0.5279186312012573, + "grad_norm": 0.11572388559579849, + "learning_rate": 0.00046373206294288984, + "loss": 2.6247, + "step": 17803 + }, + { + "epoch": 0.5279482845535688, + "grad_norm": 0.10635847598314285, + "learning_rate": 0.0004636851360841772, + "loss": 2.6739, + "step": 17804 + }, + { + "epoch": 0.5279779379058802, + "grad_norm": 0.12047867476940155, + "learning_rate": 0.00046363820954703895, + "loss": 2.6563, + "step": 17805 + }, + { + "epoch": 0.5280075912581917, + "grad_norm": 0.10866215825080872, + "learning_rate": 0.00046359128333189057, + "loss": 2.6284, + "step": 17806 + }, + { + "epoch": 0.5280372446105032, + "grad_norm": 0.11419177055358887, + "learning_rate": 0.00046354435743914765, + "loss": 2.6396, + "step": 17807 + }, + { + "epoch": 0.5280668979628147, + "grad_norm": 0.11747734248638153, + "learning_rate": 0.00046349743186922565, + "loss": 2.6468, + "step": 17808 + }, + { + "epoch": 0.5280965513151262, + "grad_norm": 0.10125869512557983, + "learning_rate": 0.00046345050662254027, + "loss": 2.6281, + "step": 17809 + }, + { + "epoch": 0.5281262046674376, + "grad_norm": 0.09744291007518768, + "learning_rate": 0.00046340358169950685, + "loss": 2.663, + "step": 17810 + }, + { + "epoch": 0.5281558580197492, + "grad_norm": 0.10011892020702362, + "learning_rate": 0.00046335665710054097, + "loss": 2.5811, + "step": 17811 + }, + { + "epoch": 0.5281855113720606, + "grad_norm": 0.09690800309181213, + "learning_rate": 0.0004633097328260582, + "loss": 2.6196, + "step": 17812 + }, + { + "epoch": 0.5282151647243721, + "grad_norm": 0.10188906639814377, + "learning_rate": 0.00046326280887647403, + "loss": 2.6306, + "step": 17813 + }, + { + "epoch": 0.5282448180766836, + "grad_norm": 0.1110464334487915, + "learning_rate": 0.0004632158852522039, + "loss": 2.652, + "step": 17814 + }, + { + "epoch": 0.5282744714289951, + "grad_norm": 0.10778941214084625, + "learning_rate": 0.00046316896195366356, + "loss": 2.6301, + "step": 17815 + }, + { + "epoch": 0.5283041247813065, + "grad_norm": 0.10632817447185516, + "learning_rate": 0.0004631220389812683, + "loss": 2.5862, + "step": 17816 + }, + { + "epoch": 0.528333778133618, + "grad_norm": 0.10181885212659836, + "learning_rate": 0.0004630751163354338, + "loss": 2.6143, + "step": 17817 + }, + { + "epoch": 0.5283634314859295, + "grad_norm": 0.1031697541475296, + "learning_rate": 0.0004630281940165754, + "loss": 2.6255, + "step": 17818 + }, + { + "epoch": 0.528393084838241, + "grad_norm": 0.12016711384057999, + "learning_rate": 0.00046298127202510877, + "loss": 2.6213, + "step": 17819 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 0.1166689321398735, + "learning_rate": 0.0004629343503614494, + "loss": 2.6354, + "step": 17820 + }, + { + "epoch": 0.5284523915428639, + "grad_norm": 0.1203920766711235, + "learning_rate": 0.0004628874290260126, + "loss": 2.6487, + "step": 17821 + }, + { + "epoch": 0.5284820448951754, + "grad_norm": 0.12009592354297638, + "learning_rate": 0.000462840508019214, + "loss": 2.6675, + "step": 17822 + }, + { + "epoch": 0.5285116982474869, + "grad_norm": 0.10261792689561844, + "learning_rate": 0.0004627935873414691, + "loss": 2.6154, + "step": 17823 + }, + { + "epoch": 0.5285413515997983, + "grad_norm": 0.10031913220882416, + "learning_rate": 0.00046274666699319336, + "loss": 2.6157, + "step": 17824 + }, + { + "epoch": 0.5285710049521098, + "grad_norm": 0.12053322792053223, + "learning_rate": 0.0004626997469748023, + "loss": 2.6333, + "step": 17825 + }, + { + "epoch": 0.5286006583044213, + "grad_norm": 0.11816118657588959, + "learning_rate": 0.00046265282728671144, + "loss": 2.6413, + "step": 17826 + }, + { + "epoch": 0.5286303116567328, + "grad_norm": 0.11381043493747711, + "learning_rate": 0.0004626059079293359, + "loss": 2.6169, + "step": 17827 + }, + { + "epoch": 0.5286599650090442, + "grad_norm": 0.11494295299053192, + "learning_rate": 0.0004625589889030917, + "loss": 2.6163, + "step": 17828 + }, + { + "epoch": 0.5286896183613558, + "grad_norm": 0.11703848838806152, + "learning_rate": 0.00046251207020839405, + "loss": 2.648, + "step": 17829 + }, + { + "epoch": 0.5287192717136673, + "grad_norm": 0.11073701828718185, + "learning_rate": 0.0004624651518456585, + "loss": 2.6688, + "step": 17830 + }, + { + "epoch": 0.5287489250659787, + "grad_norm": 0.1206808015704155, + "learning_rate": 0.0004624182338153005, + "loss": 2.6366, + "step": 17831 + }, + { + "epoch": 0.5287785784182902, + "grad_norm": 0.10255581140518188, + "learning_rate": 0.00046237131611773544, + "loss": 2.613, + "step": 17832 + }, + { + "epoch": 0.5288082317706017, + "grad_norm": 0.11556925624608994, + "learning_rate": 0.0004623243987533788, + "loss": 2.6354, + "step": 17833 + }, + { + "epoch": 0.5288378851229132, + "grad_norm": 0.11769488453865051, + "learning_rate": 0.0004622774817226461, + "loss": 2.6065, + "step": 17834 + }, + { + "epoch": 0.5288675384752246, + "grad_norm": 0.13026128709316254, + "learning_rate": 0.0004622305650259527, + "loss": 2.6313, + "step": 17835 + }, + { + "epoch": 0.5288971918275361, + "grad_norm": 0.14946779608726501, + "learning_rate": 0.0004621836486637143, + "loss": 2.647, + "step": 17836 + }, + { + "epoch": 0.5289268451798476, + "grad_norm": 0.14603166282176971, + "learning_rate": 0.00046213673263634616, + "loss": 2.6514, + "step": 17837 + }, + { + "epoch": 0.5289564985321591, + "grad_norm": 0.11298425495624542, + "learning_rate": 0.00046208981694426365, + "loss": 2.6521, + "step": 17838 + }, + { + "epoch": 0.5289861518844705, + "grad_norm": 0.11869028210639954, + "learning_rate": 0.0004620429015878824, + "loss": 2.64, + "step": 17839 + }, + { + "epoch": 0.529015805236782, + "grad_norm": 0.13878265023231506, + "learning_rate": 0.00046199598656761757, + "loss": 2.6112, + "step": 17840 + }, + { + "epoch": 0.5290454585890935, + "grad_norm": 0.11604002118110657, + "learning_rate": 0.000461949071883885, + "loss": 2.6419, + "step": 17841 + }, + { + "epoch": 0.529075111941405, + "grad_norm": 0.11603032052516937, + "learning_rate": 0.00046190215753709983, + "loss": 2.6339, + "step": 17842 + }, + { + "epoch": 0.5291047652937164, + "grad_norm": 0.13701549172401428, + "learning_rate": 0.0004618552435276777, + "loss": 2.6815, + "step": 17843 + }, + { + "epoch": 0.529134418646028, + "grad_norm": 0.1241966187953949, + "learning_rate": 0.00046180832985603384, + "loss": 2.6361, + "step": 17844 + }, + { + "epoch": 0.5291640719983394, + "grad_norm": 0.1193241998553276, + "learning_rate": 0.0004617614165225838, + "loss": 2.6237, + "step": 17845 + }, + { + "epoch": 0.5291937253506509, + "grad_norm": 0.123397596180439, + "learning_rate": 0.00046171450352774294, + "loss": 2.6035, + "step": 17846 + }, + { + "epoch": 0.5292233787029623, + "grad_norm": 0.1107330247759819, + "learning_rate": 0.00046166759087192693, + "loss": 2.6246, + "step": 17847 + }, + { + "epoch": 0.5292530320552739, + "grad_norm": 0.10704823583364487, + "learning_rate": 0.0004616206785555508, + "loss": 2.6737, + "step": 17848 + }, + { + "epoch": 0.5292826854075853, + "grad_norm": 0.12056029587984085, + "learning_rate": 0.00046157376657903024, + "loss": 2.6162, + "step": 17849 + }, + { + "epoch": 0.5293123387598968, + "grad_norm": 0.13284583389759064, + "learning_rate": 0.00046152685494278044, + "loss": 2.6285, + "step": 17850 + }, + { + "epoch": 0.5293419921122083, + "grad_norm": 0.12262994050979614, + "learning_rate": 0.000461479943647217, + "loss": 2.6483, + "step": 17851 + }, + { + "epoch": 0.5293716454645198, + "grad_norm": 0.10274462401866913, + "learning_rate": 0.0004614330326927553, + "loss": 2.6357, + "step": 17852 + }, + { + "epoch": 0.5294012988168313, + "grad_norm": 0.10979313403367996, + "learning_rate": 0.0004613861220798106, + "loss": 2.6151, + "step": 17853 + }, + { + "epoch": 0.5294309521691427, + "grad_norm": 0.11312887817621231, + "learning_rate": 0.0004613392118087986, + "loss": 2.6502, + "step": 17854 + }, + { + "epoch": 0.5294606055214542, + "grad_norm": 0.1192888617515564, + "learning_rate": 0.00046129230188013436, + "loss": 2.6546, + "step": 17855 + }, + { + "epoch": 0.5294902588737657, + "grad_norm": 0.10346247255802155, + "learning_rate": 0.0004612453922942335, + "loss": 2.6627, + "step": 17856 + }, + { + "epoch": 0.5295199122260772, + "grad_norm": 0.11910486221313477, + "learning_rate": 0.00046119848305151135, + "loss": 2.6236, + "step": 17857 + }, + { + "epoch": 0.5295495655783886, + "grad_norm": 0.10129901021718979, + "learning_rate": 0.0004611515741523834, + "loss": 2.6712, + "step": 17858 + }, + { + "epoch": 0.5295792189307001, + "grad_norm": 0.09693746268749237, + "learning_rate": 0.00046110466559726485, + "loss": 2.6356, + "step": 17859 + }, + { + "epoch": 0.5296088722830116, + "grad_norm": 0.11266235262155533, + "learning_rate": 0.00046105775738657106, + "loss": 2.6128, + "step": 17860 + }, + { + "epoch": 0.5296385256353231, + "grad_norm": 0.11692958325147629, + "learning_rate": 0.00046101084952071764, + "loss": 2.6058, + "step": 17861 + }, + { + "epoch": 0.5296681789876345, + "grad_norm": 0.10591836273670197, + "learning_rate": 0.0004609639420001198, + "loss": 2.6748, + "step": 17862 + }, + { + "epoch": 0.529697832339946, + "grad_norm": 0.1104547381401062, + "learning_rate": 0.00046091703482519295, + "loss": 2.6475, + "step": 17863 + }, + { + "epoch": 0.5297274856922575, + "grad_norm": 0.11021792888641357, + "learning_rate": 0.00046087012799635255, + "loss": 2.6088, + "step": 17864 + }, + { + "epoch": 0.529757139044569, + "grad_norm": 0.10916832834482193, + "learning_rate": 0.00046082322151401375, + "loss": 2.6502, + "step": 17865 + }, + { + "epoch": 0.5297867923968804, + "grad_norm": 0.12093547731637955, + "learning_rate": 0.0004607763153785919, + "loss": 2.6422, + "step": 17866 + }, + { + "epoch": 0.529816445749192, + "grad_norm": 0.11998270452022552, + "learning_rate": 0.0004607294095905027, + "loss": 2.6397, + "step": 17867 + }, + { + "epoch": 0.5298460991015034, + "grad_norm": 0.13913388550281525, + "learning_rate": 0.00046068250415016136, + "loss": 2.6236, + "step": 17868 + }, + { + "epoch": 0.5298757524538149, + "grad_norm": 0.14801841974258423, + "learning_rate": 0.0004606355990579832, + "loss": 2.6621, + "step": 17869 + }, + { + "epoch": 0.5299054058061263, + "grad_norm": 0.1232537180185318, + "learning_rate": 0.0004605886943143835, + "loss": 2.6168, + "step": 17870 + }, + { + "epoch": 0.5299350591584379, + "grad_norm": 0.1133413016796112, + "learning_rate": 0.00046054178991977767, + "loss": 2.6234, + "step": 17871 + }, + { + "epoch": 0.5299647125107494, + "grad_norm": 0.11244075000286102, + "learning_rate": 0.0004604948858745811, + "loss": 2.6249, + "step": 17872 + }, + { + "epoch": 0.5299943658630608, + "grad_norm": 0.11981581151485443, + "learning_rate": 0.00046044798217920906, + "loss": 2.6672, + "step": 17873 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 0.11659209430217743, + "learning_rate": 0.00046040107883407695, + "loss": 2.6414, + "step": 17874 + }, + { + "epoch": 0.5300536725676838, + "grad_norm": 0.10490280389785767, + "learning_rate": 0.0004603541758396002, + "loss": 2.6062, + "step": 17875 + }, + { + "epoch": 0.5300833259199953, + "grad_norm": 0.11098089814186096, + "learning_rate": 0.00046030727319619393, + "loss": 2.6231, + "step": 17876 + }, + { + "epoch": 0.5301129792723067, + "grad_norm": 0.11104441434144974, + "learning_rate": 0.00046026037090427354, + "loss": 2.6503, + "step": 17877 + }, + { + "epoch": 0.5301426326246182, + "grad_norm": 0.11289634555578232, + "learning_rate": 0.00046021346896425437, + "loss": 2.6331, + "step": 17878 + }, + { + "epoch": 0.5301722859769297, + "grad_norm": 0.11790551990270615, + "learning_rate": 0.0004601665673765517, + "loss": 2.6429, + "step": 17879 + }, + { + "epoch": 0.5302019393292412, + "grad_norm": 0.12711775302886963, + "learning_rate": 0.000460119666141581, + "loss": 2.6628, + "step": 17880 + }, + { + "epoch": 0.5302315926815526, + "grad_norm": 0.13610994815826416, + "learning_rate": 0.0004600727652597575, + "loss": 2.673, + "step": 17881 + }, + { + "epoch": 0.5302612460338642, + "grad_norm": 0.1150282546877861, + "learning_rate": 0.00046002586473149656, + "loss": 2.6051, + "step": 17882 + }, + { + "epoch": 0.5302908993861756, + "grad_norm": 0.11542847752571106, + "learning_rate": 0.0004599789645572134, + "loss": 2.6073, + "step": 17883 + }, + { + "epoch": 0.5303205527384871, + "grad_norm": 0.11094418168067932, + "learning_rate": 0.00045993206473732333, + "loss": 2.6364, + "step": 17884 + }, + { + "epoch": 0.5303502060907985, + "grad_norm": 0.11755242198705673, + "learning_rate": 0.0004598851652722419, + "loss": 2.6422, + "step": 17885 + }, + { + "epoch": 0.5303798594431101, + "grad_norm": 0.11561029404401779, + "learning_rate": 0.0004598382661623841, + "loss": 2.6449, + "step": 17886 + }, + { + "epoch": 0.5304095127954215, + "grad_norm": 0.11801157146692276, + "learning_rate": 0.0004597913674081653, + "loss": 2.6265, + "step": 17887 + }, + { + "epoch": 0.530439166147733, + "grad_norm": 0.10281168669462204, + "learning_rate": 0.00045974446901000086, + "loss": 2.6444, + "step": 17888 + }, + { + "epoch": 0.5304688195000444, + "grad_norm": 0.10953362286090851, + "learning_rate": 0.00045969757096830607, + "loss": 2.6192, + "step": 17889 + }, + { + "epoch": 0.530498472852356, + "grad_norm": 0.10691395401954651, + "learning_rate": 0.0004596506732834962, + "loss": 2.6393, + "step": 17890 + }, + { + "epoch": 0.5305281262046674, + "grad_norm": 0.10786907374858856, + "learning_rate": 0.0004596037759559866, + "loss": 2.6158, + "step": 17891 + }, + { + "epoch": 0.5305577795569789, + "grad_norm": 0.11408572643995285, + "learning_rate": 0.0004595568789861922, + "loss": 2.6399, + "step": 17892 + }, + { + "epoch": 0.5305874329092904, + "grad_norm": 0.12651920318603516, + "learning_rate": 0.0004595099823745289, + "loss": 2.6515, + "step": 17893 + }, + { + "epoch": 0.5306170862616019, + "grad_norm": 0.12271323055028915, + "learning_rate": 0.00045946308612141156, + "loss": 2.6258, + "step": 17894 + }, + { + "epoch": 0.5306467396139134, + "grad_norm": 0.12994860112667084, + "learning_rate": 0.0004594161902272556, + "loss": 2.645, + "step": 17895 + }, + { + "epoch": 0.5306763929662248, + "grad_norm": 0.11711817234754562, + "learning_rate": 0.0004593692946924763, + "loss": 2.659, + "step": 17896 + }, + { + "epoch": 0.5307060463185364, + "grad_norm": 0.1169971376657486, + "learning_rate": 0.00045932239951748877, + "loss": 2.6178, + "step": 17897 + }, + { + "epoch": 0.5307356996708478, + "grad_norm": 0.13328589498996735, + "learning_rate": 0.00045927550470270843, + "loss": 2.6055, + "step": 17898 + }, + { + "epoch": 0.5307653530231593, + "grad_norm": 0.10675624012947083, + "learning_rate": 0.00045922861024855037, + "loss": 2.6373, + "step": 17899 + }, + { + "epoch": 0.5307950063754707, + "grad_norm": 0.11595282703638077, + "learning_rate": 0.00045918171615543006, + "loss": 2.6283, + "step": 17900 + }, + { + "epoch": 0.5308246597277823, + "grad_norm": 0.10742530971765518, + "learning_rate": 0.00045913482242376265, + "loss": 2.6097, + "step": 17901 + }, + { + "epoch": 0.5308543130800937, + "grad_norm": 0.11511535197496414, + "learning_rate": 0.00045908792905396354, + "loss": 2.621, + "step": 17902 + }, + { + "epoch": 0.5308839664324052, + "grad_norm": 0.13242188096046448, + "learning_rate": 0.00045904103604644766, + "loss": 2.669, + "step": 17903 + }, + { + "epoch": 0.5309136197847166, + "grad_norm": 0.12803924083709717, + "learning_rate": 0.0004589941434016304, + "loss": 2.6527, + "step": 17904 + }, + { + "epoch": 0.5309432731370282, + "grad_norm": 0.12600351870059967, + "learning_rate": 0.0004589472511199269, + "loss": 2.6571, + "step": 17905 + }, + { + "epoch": 0.5309729264893396, + "grad_norm": 0.1103910431265831, + "learning_rate": 0.00045890035920175286, + "loss": 2.6694, + "step": 17906 + }, + { + "epoch": 0.5310025798416511, + "grad_norm": 0.11460141837596893, + "learning_rate": 0.0004588534676475231, + "loss": 2.6426, + "step": 17907 + }, + { + "epoch": 0.5310322331939625, + "grad_norm": 0.12191545218229294, + "learning_rate": 0.0004588065764576529, + "loss": 2.6367, + "step": 17908 + }, + { + "epoch": 0.5310618865462741, + "grad_norm": 0.13344934582710266, + "learning_rate": 0.0004587596856325576, + "loss": 2.6254, + "step": 17909 + }, + { + "epoch": 0.5310915398985855, + "grad_norm": 0.14304907619953156, + "learning_rate": 0.0004587127951726523, + "loss": 2.6807, + "step": 17910 + }, + { + "epoch": 0.531121193250897, + "grad_norm": 0.14831165969371796, + "learning_rate": 0.0004586659050783523, + "loss": 2.6502, + "step": 17911 + }, + { + "epoch": 0.5311508466032084, + "grad_norm": 0.11444906890392303, + "learning_rate": 0.00045861901535007284, + "loss": 2.6681, + "step": 17912 + }, + { + "epoch": 0.53118049995552, + "grad_norm": 0.13032804429531097, + "learning_rate": 0.00045857212598822915, + "loss": 2.6069, + "step": 17913 + }, + { + "epoch": 0.5312101533078315, + "grad_norm": 0.15750300884246826, + "learning_rate": 0.00045852523699323633, + "loss": 2.6364, + "step": 17914 + }, + { + "epoch": 0.5312398066601429, + "grad_norm": 0.12228266149759293, + "learning_rate": 0.0004584783483655096, + "loss": 2.6371, + "step": 17915 + }, + { + "epoch": 0.5312694600124545, + "grad_norm": 0.12540864944458008, + "learning_rate": 0.00045843146010546434, + "loss": 2.6526, + "step": 17916 + }, + { + "epoch": 0.5312991133647659, + "grad_norm": 0.1191539466381073, + "learning_rate": 0.00045838457221351555, + "loss": 2.6385, + "step": 17917 + }, + { + "epoch": 0.5313287667170774, + "grad_norm": 0.10947238653898239, + "learning_rate": 0.0004583376846900784, + "loss": 2.6517, + "step": 17918 + }, + { + "epoch": 0.5313584200693888, + "grad_norm": 0.10383349657058716, + "learning_rate": 0.0004582907975355683, + "loss": 2.6016, + "step": 17919 + }, + { + "epoch": 0.5313880734217004, + "grad_norm": 0.11395114660263062, + "learning_rate": 0.0004582439107504004, + "loss": 2.6115, + "step": 17920 + }, + { + "epoch": 0.5314177267740118, + "grad_norm": 0.10885925590991974, + "learning_rate": 0.00045819702433498984, + "loss": 2.6156, + "step": 17921 + }, + { + "epoch": 0.5314473801263233, + "grad_norm": 0.10548912733793259, + "learning_rate": 0.00045815013828975177, + "loss": 2.6578, + "step": 17922 + }, + { + "epoch": 0.5314770334786347, + "grad_norm": 0.1197163388133049, + "learning_rate": 0.00045810325261510154, + "loss": 2.6478, + "step": 17923 + }, + { + "epoch": 0.5315066868309463, + "grad_norm": 0.13598844408988953, + "learning_rate": 0.0004580563673114541, + "loss": 2.6952, + "step": 17924 + }, + { + "epoch": 0.5315363401832577, + "grad_norm": 0.1493803858757019, + "learning_rate": 0.00045800948237922467, + "loss": 2.6531, + "step": 17925 + }, + { + "epoch": 0.5315659935355692, + "grad_norm": 0.12443718314170837, + "learning_rate": 0.00045796259781882853, + "loss": 2.6587, + "step": 17926 + }, + { + "epoch": 0.5315956468878806, + "grad_norm": 0.11538030207157135, + "learning_rate": 0.0004579157136306808, + "loss": 2.6484, + "step": 17927 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 0.16531234979629517, + "learning_rate": 0.0004578688298151966, + "loss": 2.6402, + "step": 17928 + }, + { + "epoch": 0.5316549535925036, + "grad_norm": 0.13215649127960205, + "learning_rate": 0.0004578219463727912, + "loss": 2.6515, + "step": 17929 + }, + { + "epoch": 0.5316846069448151, + "grad_norm": 0.11441627144813538, + "learning_rate": 0.0004577750633038798, + "loss": 2.6023, + "step": 17930 + }, + { + "epoch": 0.5317142602971265, + "grad_norm": 0.1232134997844696, + "learning_rate": 0.0004577281806088771, + "loss": 2.6357, + "step": 17931 + }, + { + "epoch": 0.5317439136494381, + "grad_norm": 0.1209527775645256, + "learning_rate": 0.00045768129828819887, + "loss": 2.5997, + "step": 17932 + }, + { + "epoch": 0.5317735670017496, + "grad_norm": 0.11838463693857193, + "learning_rate": 0.00045763441634226, + "loss": 2.6652, + "step": 17933 + }, + { + "epoch": 0.531803220354061, + "grad_norm": 0.1169729232788086, + "learning_rate": 0.0004575875347714758, + "loss": 2.6613, + "step": 17934 + }, + { + "epoch": 0.5318328737063726, + "grad_norm": 0.11632998287677765, + "learning_rate": 0.0004575406535762611, + "loss": 2.642, + "step": 17935 + }, + { + "epoch": 0.531862527058684, + "grad_norm": 0.1074141189455986, + "learning_rate": 0.00045749377275703117, + "loss": 2.6522, + "step": 17936 + }, + { + "epoch": 0.5318921804109955, + "grad_norm": 0.11604836583137512, + "learning_rate": 0.00045744689231420123, + "loss": 2.6129, + "step": 17937 + }, + { + "epoch": 0.5319218337633069, + "grad_norm": 0.11759105324745178, + "learning_rate": 0.0004574000122481864, + "loss": 2.638, + "step": 17938 + }, + { + "epoch": 0.5319514871156185, + "grad_norm": 0.12450037896633148, + "learning_rate": 0.0004573531325594017, + "loss": 2.6064, + "step": 17939 + }, + { + "epoch": 0.5319811404679299, + "grad_norm": 0.14754386246204376, + "learning_rate": 0.00045730625324826246, + "loss": 2.6478, + "step": 17940 + }, + { + "epoch": 0.5320107938202414, + "grad_norm": 0.1192915216088295, + "learning_rate": 0.00045725937431518357, + "loss": 2.6162, + "step": 17941 + }, + { + "epoch": 0.5320404471725528, + "grad_norm": 0.11773353070020676, + "learning_rate": 0.0004572124957605803, + "loss": 2.6381, + "step": 17942 + }, + { + "epoch": 0.5320701005248644, + "grad_norm": 0.12266897410154343, + "learning_rate": 0.0004571656175848676, + "loss": 2.6607, + "step": 17943 + }, + { + "epoch": 0.5320997538771758, + "grad_norm": 0.11260256916284561, + "learning_rate": 0.00045711873978846075, + "loss": 2.6485, + "step": 17944 + }, + { + "epoch": 0.5321294072294873, + "grad_norm": 0.10106469690799713, + "learning_rate": 0.0004570718623717748, + "loss": 2.6448, + "step": 17945 + }, + { + "epoch": 0.5321590605817987, + "grad_norm": 0.1098065972328186, + "learning_rate": 0.00045702498533522497, + "loss": 2.6378, + "step": 17946 + }, + { + "epoch": 0.5321887139341103, + "grad_norm": 0.11326467990875244, + "learning_rate": 0.00045697810867922624, + "loss": 2.6412, + "step": 17947 + }, + { + "epoch": 0.5322183672864217, + "grad_norm": 0.10377342998981476, + "learning_rate": 0.00045693123240419376, + "loss": 2.684, + "step": 17948 + }, + { + "epoch": 0.5322480206387332, + "grad_norm": 0.11231732368469238, + "learning_rate": 0.00045688435651054256, + "loss": 2.6238, + "step": 17949 + }, + { + "epoch": 0.5322776739910446, + "grad_norm": 0.11201854795217514, + "learning_rate": 0.00045683748099868785, + "loss": 2.6434, + "step": 17950 + }, + { + "epoch": 0.5323073273433562, + "grad_norm": 0.10852807760238647, + "learning_rate": 0.0004567906058690447, + "loss": 2.6755, + "step": 17951 + }, + { + "epoch": 0.5323369806956676, + "grad_norm": 0.11569111049175262, + "learning_rate": 0.0004567437311220281, + "loss": 2.6547, + "step": 17952 + }, + { + "epoch": 0.5323666340479791, + "grad_norm": 0.11741551011800766, + "learning_rate": 0.00045669685675805315, + "loss": 2.6278, + "step": 17953 + }, + { + "epoch": 0.5323962874002907, + "grad_norm": 0.12708330154418945, + "learning_rate": 0.00045664998277753497, + "loss": 2.6553, + "step": 17954 + }, + { + "epoch": 0.5324259407526021, + "grad_norm": 0.14230576157569885, + "learning_rate": 0.00045660310918088865, + "loss": 2.6265, + "step": 17955 + }, + { + "epoch": 0.5324555941049136, + "grad_norm": 0.11773985624313354, + "learning_rate": 0.0004565562359685291, + "loss": 2.6452, + "step": 17956 + }, + { + "epoch": 0.532485247457225, + "grad_norm": 0.1400613784790039, + "learning_rate": 0.00045650936314087166, + "loss": 2.6199, + "step": 17957 + }, + { + "epoch": 0.5325149008095366, + "grad_norm": 0.17214958369731903, + "learning_rate": 0.0004564624906983313, + "loss": 2.641, + "step": 17958 + }, + { + "epoch": 0.532544554161848, + "grad_norm": 0.1300162971019745, + "learning_rate": 0.000456415618641323, + "loss": 2.598, + "step": 17959 + }, + { + "epoch": 0.5325742075141595, + "grad_norm": 0.12556053698062897, + "learning_rate": 0.000456368746970262, + "loss": 2.6073, + "step": 17960 + }, + { + "epoch": 0.5326038608664709, + "grad_norm": 0.13003449141979218, + "learning_rate": 0.0004563218756855632, + "loss": 2.6498, + "step": 17961 + }, + { + "epoch": 0.5326335142187825, + "grad_norm": 0.11225450038909912, + "learning_rate": 0.00045627500478764166, + "loss": 2.6183, + "step": 17962 + }, + { + "epoch": 0.5326631675710939, + "grad_norm": 0.1318463534116745, + "learning_rate": 0.00045622813427691243, + "loss": 2.6547, + "step": 17963 + }, + { + "epoch": 0.5326928209234054, + "grad_norm": 0.12280544638633728, + "learning_rate": 0.00045618126415379064, + "loss": 2.6651, + "step": 17964 + }, + { + "epoch": 0.5327224742757168, + "grad_norm": 0.10865677148103714, + "learning_rate": 0.0004561343944186912, + "loss": 2.632, + "step": 17965 + }, + { + "epoch": 0.5327521276280284, + "grad_norm": 0.11727804690599442, + "learning_rate": 0.0004560875250720293, + "loss": 2.6276, + "step": 17966 + }, + { + "epoch": 0.5327817809803398, + "grad_norm": 0.11194168031215668, + "learning_rate": 0.00045604065611421987, + "loss": 2.6331, + "step": 17967 + }, + { + "epoch": 0.5328114343326513, + "grad_norm": 0.12664158642292023, + "learning_rate": 0.00045599378754567805, + "loss": 2.6499, + "step": 17968 + }, + { + "epoch": 0.5328410876849627, + "grad_norm": 0.11965014785528183, + "learning_rate": 0.00045594691936681856, + "loss": 2.6162, + "step": 17969 + }, + { + "epoch": 0.5328707410372743, + "grad_norm": 0.11913285404443741, + "learning_rate": 0.00045590005157805674, + "loss": 2.6158, + "step": 17970 + }, + { + "epoch": 0.5329003943895857, + "grad_norm": 0.11514899879693985, + "learning_rate": 0.0004558531841798076, + "loss": 2.64, + "step": 17971 + }, + { + "epoch": 0.5329300477418972, + "grad_norm": 0.11269070953130722, + "learning_rate": 0.0004558063171724862, + "loss": 2.6664, + "step": 17972 + }, + { + "epoch": 0.5329597010942086, + "grad_norm": 0.12895028293132782, + "learning_rate": 0.00045575945055650744, + "loss": 2.6353, + "step": 17973 + }, + { + "epoch": 0.5329893544465202, + "grad_norm": 0.10402725636959076, + "learning_rate": 0.00045571258433228616, + "loss": 2.6479, + "step": 17974 + }, + { + "epoch": 0.5330190077988317, + "grad_norm": 0.09987323731184006, + "learning_rate": 0.00045566571850023767, + "loss": 2.6315, + "step": 17975 + }, + { + "epoch": 0.5330486611511431, + "grad_norm": 0.11276581883430481, + "learning_rate": 0.00045561885306077683, + "loss": 2.6096, + "step": 17976 + }, + { + "epoch": 0.5330783145034547, + "grad_norm": 0.11444536596536636, + "learning_rate": 0.0004555719880143186, + "loss": 2.6314, + "step": 17977 + }, + { + "epoch": 0.5331079678557661, + "grad_norm": 0.10846299678087234, + "learning_rate": 0.00045552512336127825, + "loss": 2.6279, + "step": 17978 + }, + { + "epoch": 0.5331376212080776, + "grad_norm": 0.11327574402093887, + "learning_rate": 0.0004554782591020704, + "loss": 2.6084, + "step": 17979 + }, + { + "epoch": 0.533167274560389, + "grad_norm": 0.10649966448545456, + "learning_rate": 0.00045543139523711025, + "loss": 2.6437, + "step": 17980 + }, + { + "epoch": 0.5331969279127006, + "grad_norm": 0.12393831461668015, + "learning_rate": 0.00045538453176681274, + "loss": 2.6541, + "step": 17981 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 0.10624528676271439, + "learning_rate": 0.00045533766869159265, + "loss": 2.6334, + "step": 17982 + }, + { + "epoch": 0.5332562346173235, + "grad_norm": 0.10158787667751312, + "learning_rate": 0.00045529080601186534, + "loss": 2.6077, + "step": 17983 + }, + { + "epoch": 0.5332858879696349, + "grad_norm": 0.10522504150867462, + "learning_rate": 0.00045524394372804567, + "loss": 2.6363, + "step": 17984 + }, + { + "epoch": 0.5333155413219465, + "grad_norm": 0.11456578969955444, + "learning_rate": 0.0004551970818405485, + "loss": 2.6284, + "step": 17985 + }, + { + "epoch": 0.5333451946742579, + "grad_norm": 0.11922149360179901, + "learning_rate": 0.00045515022034978885, + "loss": 2.6484, + "step": 17986 + }, + { + "epoch": 0.5333748480265694, + "grad_norm": 0.09941709786653519, + "learning_rate": 0.0004551033592561817, + "loss": 2.6342, + "step": 17987 + }, + { + "epoch": 0.5334045013788808, + "grad_norm": 0.10724233090877533, + "learning_rate": 0.0004550564985601421, + "loss": 2.6466, + "step": 17988 + }, + { + "epoch": 0.5334341547311924, + "grad_norm": 0.1157127395272255, + "learning_rate": 0.000455009638262085, + "loss": 2.6579, + "step": 17989 + }, + { + "epoch": 0.5334638080835038, + "grad_norm": 0.1157548651099205, + "learning_rate": 0.00045496277836242513, + "loss": 2.6484, + "step": 17990 + }, + { + "epoch": 0.5334934614358153, + "grad_norm": 0.11688395589590073, + "learning_rate": 0.00045491591886157756, + "loss": 2.5833, + "step": 17991 + }, + { + "epoch": 0.5335231147881268, + "grad_norm": 0.12423516064882278, + "learning_rate": 0.0004548690597599573, + "loss": 2.6361, + "step": 17992 + }, + { + "epoch": 0.5335527681404383, + "grad_norm": 0.12650354206562042, + "learning_rate": 0.00045482220105797926, + "loss": 2.6414, + "step": 17993 + }, + { + "epoch": 0.5335824214927497, + "grad_norm": 0.12561044096946716, + "learning_rate": 0.0004547753427560584, + "loss": 2.648, + "step": 17994 + }, + { + "epoch": 0.5336120748450612, + "grad_norm": 0.12020843476057053, + "learning_rate": 0.0004547284848546095, + "loss": 2.6749, + "step": 17995 + }, + { + "epoch": 0.5336417281973728, + "grad_norm": 0.12499696016311646, + "learning_rate": 0.0004546816273540478, + "loss": 2.6344, + "step": 17996 + }, + { + "epoch": 0.5336713815496842, + "grad_norm": 0.12965793907642365, + "learning_rate": 0.000454634770254788, + "loss": 2.6275, + "step": 17997 + }, + { + "epoch": 0.5337010349019957, + "grad_norm": 0.12169414013624191, + "learning_rate": 0.00045458791355724516, + "loss": 2.6547, + "step": 17998 + }, + { + "epoch": 0.5337306882543071, + "grad_norm": 0.12876565754413605, + "learning_rate": 0.0004545410572618342, + "loss": 2.6005, + "step": 17999 + }, + { + "epoch": 0.5337603416066187, + "grad_norm": 0.11915191262960434, + "learning_rate": 0.0004544942013689699, + "loss": 2.6075, + "step": 18000 + }, + { + "epoch": 0.5337899949589301, + "grad_norm": 0.11145491898059845, + "learning_rate": 0.0004544473458790672, + "loss": 2.6686, + "step": 18001 + }, + { + "epoch": 0.5338196483112416, + "grad_norm": 0.10333938151597977, + "learning_rate": 0.00045440049079254114, + "loss": 2.6221, + "step": 18002 + }, + { + "epoch": 0.533849301663553, + "grad_norm": 0.10533232241868973, + "learning_rate": 0.00045435363610980654, + "loss": 2.6091, + "step": 18003 + }, + { + "epoch": 0.5338789550158646, + "grad_norm": 0.12033043056726456, + "learning_rate": 0.00045430678183127834, + "loss": 2.6033, + "step": 18004 + }, + { + "epoch": 0.533908608368176, + "grad_norm": 0.13476376235485077, + "learning_rate": 0.0004542599279573714, + "loss": 2.6117, + "step": 18005 + }, + { + "epoch": 0.5339382617204875, + "grad_norm": 0.1293848156929016, + "learning_rate": 0.0004542130744885008, + "loss": 2.6186, + "step": 18006 + }, + { + "epoch": 0.533967915072799, + "grad_norm": 0.12856096029281616, + "learning_rate": 0.0004541662214250811, + "loss": 2.6085, + "step": 18007 + }, + { + "epoch": 0.5339975684251105, + "grad_norm": 0.12811872363090515, + "learning_rate": 0.00045411936876752726, + "loss": 2.6074, + "step": 18008 + }, + { + "epoch": 0.5340272217774219, + "grad_norm": 0.09557980298995972, + "learning_rate": 0.0004540725165162545, + "loss": 2.6058, + "step": 18009 + }, + { + "epoch": 0.5340568751297334, + "grad_norm": 0.1121826097369194, + "learning_rate": 0.0004540256646716775, + "loss": 2.6564, + "step": 18010 + }, + { + "epoch": 0.5340865284820449, + "grad_norm": 0.13689684867858887, + "learning_rate": 0.0004539788132342111, + "loss": 2.6228, + "step": 18011 + }, + { + "epoch": 0.5341161818343564, + "grad_norm": 0.12174029648303986, + "learning_rate": 0.0004539319622042701, + "loss": 2.6176, + "step": 18012 + }, + { + "epoch": 0.5341458351866678, + "grad_norm": 0.11675608158111572, + "learning_rate": 0.00045388511158226964, + "loss": 2.6698, + "step": 18013 + }, + { + "epoch": 0.5341754885389793, + "grad_norm": 0.10239534080028534, + "learning_rate": 0.0004538382613686243, + "loss": 2.6102, + "step": 18014 + }, + { + "epoch": 0.5342051418912908, + "grad_norm": 0.10230505466461182, + "learning_rate": 0.0004537914115637492, + "loss": 2.645, + "step": 18015 + }, + { + "epoch": 0.5342347952436023, + "grad_norm": 0.10718468576669693, + "learning_rate": 0.0004537445621680591, + "loss": 2.6418, + "step": 18016 + }, + { + "epoch": 0.5342644485959138, + "grad_norm": 0.10085774958133698, + "learning_rate": 0.0004536977131819688, + "loss": 2.6322, + "step": 18017 + }, + { + "epoch": 0.5342941019482252, + "grad_norm": 0.10811367630958557, + "learning_rate": 0.0004536508646058931, + "loss": 2.6034, + "step": 18018 + }, + { + "epoch": 0.5343237553005368, + "grad_norm": 0.10526273399591446, + "learning_rate": 0.00045360401644024703, + "loss": 2.6392, + "step": 18019 + }, + { + "epoch": 0.5343534086528482, + "grad_norm": 0.10747962445020676, + "learning_rate": 0.0004535571686854453, + "loss": 2.6519, + "step": 18020 + }, + { + "epoch": 0.5343830620051597, + "grad_norm": 0.10862866789102554, + "learning_rate": 0.0004535103213419028, + "loss": 2.6273, + "step": 18021 + }, + { + "epoch": 0.5344127153574711, + "grad_norm": 0.10301882028579712, + "learning_rate": 0.0004534634744100344, + "loss": 2.6118, + "step": 18022 + }, + { + "epoch": 0.5344423687097827, + "grad_norm": 0.10641878843307495, + "learning_rate": 0.000453416627890255, + "loss": 2.6427, + "step": 18023 + }, + { + "epoch": 0.5344720220620941, + "grad_norm": 0.146262064576149, + "learning_rate": 0.0004533697817829793, + "loss": 2.6376, + "step": 18024 + }, + { + "epoch": 0.5345016754144056, + "grad_norm": 0.146211639046669, + "learning_rate": 0.0004533229360886222, + "loss": 2.6063, + "step": 18025 + }, + { + "epoch": 0.534531328766717, + "grad_norm": 0.12129580974578857, + "learning_rate": 0.0004532760908075985, + "loss": 2.6192, + "step": 18026 + }, + { + "epoch": 0.5345609821190286, + "grad_norm": 0.115235336124897, + "learning_rate": 0.0004532292459403231, + "loss": 2.6278, + "step": 18027 + }, + { + "epoch": 0.53459063547134, + "grad_norm": 0.12417687475681305, + "learning_rate": 0.0004531824014872107, + "loss": 2.662, + "step": 18028 + }, + { + "epoch": 0.5346202888236515, + "grad_norm": 0.11504726856946945, + "learning_rate": 0.00045313555744867616, + "loss": 2.6393, + "step": 18029 + }, + { + "epoch": 0.534649942175963, + "grad_norm": 0.11309683322906494, + "learning_rate": 0.0004530887138251344, + "loss": 2.6179, + "step": 18030 + }, + { + "epoch": 0.5346795955282745, + "grad_norm": 0.11739884316921234, + "learning_rate": 0.00045304187061700004, + "loss": 2.6609, + "step": 18031 + }, + { + "epoch": 0.5347092488805859, + "grad_norm": 0.11355030536651611, + "learning_rate": 0.00045299502782468796, + "loss": 2.6671, + "step": 18032 + }, + { + "epoch": 0.5347389022328974, + "grad_norm": 0.1292552351951599, + "learning_rate": 0.0004529481854486131, + "loss": 2.6702, + "step": 18033 + }, + { + "epoch": 0.5347685555852089, + "grad_norm": 0.11854624003171921, + "learning_rate": 0.0004529013434891898, + "loss": 2.6397, + "step": 18034 + }, + { + "epoch": 0.5347982089375204, + "grad_norm": 0.12020119279623032, + "learning_rate": 0.0004528545019468334, + "loss": 2.6294, + "step": 18035 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 0.11070319265127182, + "learning_rate": 0.0004528076608219585, + "loss": 2.6467, + "step": 18036 + }, + { + "epoch": 0.5348575156421433, + "grad_norm": 0.12442508339881897, + "learning_rate": 0.00045276082011497996, + "loss": 2.6322, + "step": 18037 + }, + { + "epoch": 0.5348871689944549, + "grad_norm": 0.11446134746074677, + "learning_rate": 0.0004527139798263124, + "loss": 2.6405, + "step": 18038 + }, + { + "epoch": 0.5349168223467663, + "grad_norm": 0.11205567419528961, + "learning_rate": 0.0004526671399563707, + "loss": 2.6289, + "step": 18039 + }, + { + "epoch": 0.5349464756990778, + "grad_norm": 0.10850705951452255, + "learning_rate": 0.00045262030050556947, + "loss": 2.6062, + "step": 18040 + }, + { + "epoch": 0.5349761290513892, + "grad_norm": 0.12232567369937897, + "learning_rate": 0.0004525734614743237, + "loss": 2.6703, + "step": 18041 + }, + { + "epoch": 0.5350057824037008, + "grad_norm": 0.11386381089687347, + "learning_rate": 0.000452526622863048, + "loss": 2.608, + "step": 18042 + }, + { + "epoch": 0.5350354357560122, + "grad_norm": 0.10572772473096848, + "learning_rate": 0.00045247978467215726, + "loss": 2.6001, + "step": 18043 + }, + { + "epoch": 0.5350650891083237, + "grad_norm": 0.11139515787363052, + "learning_rate": 0.0004524329469020663, + "loss": 2.6601, + "step": 18044 + }, + { + "epoch": 0.5350947424606352, + "grad_norm": 0.11142603307962418, + "learning_rate": 0.00045238610955318964, + "loss": 2.6526, + "step": 18045 + }, + { + "epoch": 0.5351243958129467, + "grad_norm": 0.11206411570310593, + "learning_rate": 0.0004523392726259422, + "loss": 2.6246, + "step": 18046 + }, + { + "epoch": 0.5351540491652581, + "grad_norm": 0.10702409595251083, + "learning_rate": 0.00045229243612073834, + "loss": 2.6039, + "step": 18047 + }, + { + "epoch": 0.5351837025175696, + "grad_norm": 0.12322724610567093, + "learning_rate": 0.0004522456000379935, + "loss": 2.6201, + "step": 18048 + }, + { + "epoch": 0.5352133558698811, + "grad_norm": 0.13858045637607574, + "learning_rate": 0.00045219876437812206, + "loss": 2.6512, + "step": 18049 + }, + { + "epoch": 0.5352430092221926, + "grad_norm": 0.12927976250648499, + "learning_rate": 0.0004521519291415387, + "loss": 2.6553, + "step": 18050 + }, + { + "epoch": 0.535272662574504, + "grad_norm": 0.11729983240365982, + "learning_rate": 0.00045210509432865823, + "loss": 2.6141, + "step": 18051 + }, + { + "epoch": 0.5353023159268155, + "grad_norm": 0.10656388849020004, + "learning_rate": 0.0004520582599398954, + "loss": 2.6553, + "step": 18052 + }, + { + "epoch": 0.535331969279127, + "grad_norm": 0.10753444582223892, + "learning_rate": 0.00045201142597566484, + "loss": 2.6539, + "step": 18053 + }, + { + "epoch": 0.5353616226314385, + "grad_norm": 0.11055570840835571, + "learning_rate": 0.0004519645924363815, + "loss": 2.6577, + "step": 18054 + }, + { + "epoch": 0.5353912759837499, + "grad_norm": 0.1313696801662445, + "learning_rate": 0.0004519177593224598, + "loss": 2.6673, + "step": 18055 + }, + { + "epoch": 0.5354209293360614, + "grad_norm": 0.12269733846187592, + "learning_rate": 0.00045187092663431463, + "loss": 2.6389, + "step": 18056 + }, + { + "epoch": 0.5354505826883729, + "grad_norm": 0.12223321944475174, + "learning_rate": 0.00045182409437236065, + "loss": 2.6152, + "step": 18057 + }, + { + "epoch": 0.5354802360406844, + "grad_norm": 0.1147298812866211, + "learning_rate": 0.0004517772625370126, + "loss": 2.6661, + "step": 18058 + }, + { + "epoch": 0.5355098893929959, + "grad_norm": 0.12689824402332306, + "learning_rate": 0.0004517304311286851, + "loss": 2.5943, + "step": 18059 + }, + { + "epoch": 0.5355395427453074, + "grad_norm": 0.12282317876815796, + "learning_rate": 0.0004516836001477929, + "loss": 2.6853, + "step": 18060 + }, + { + "epoch": 0.5355691960976189, + "grad_norm": 0.13312144577503204, + "learning_rate": 0.00045163676959475077, + "loss": 2.6207, + "step": 18061 + }, + { + "epoch": 0.5355988494499303, + "grad_norm": 0.13639554381370544, + "learning_rate": 0.00045158993946997335, + "loss": 2.6061, + "step": 18062 + }, + { + "epoch": 0.5356285028022418, + "grad_norm": 0.1292237639427185, + "learning_rate": 0.0004515431097738753, + "loss": 2.6756, + "step": 18063 + }, + { + "epoch": 0.5356581561545533, + "grad_norm": 0.10972331464290619, + "learning_rate": 0.0004514962805068714, + "loss": 2.6505, + "step": 18064 + }, + { + "epoch": 0.5356878095068648, + "grad_norm": 0.11549784243106842, + "learning_rate": 0.00045144945166937636, + "loss": 2.6086, + "step": 18065 + }, + { + "epoch": 0.5357174628591762, + "grad_norm": 0.14149624109268188, + "learning_rate": 0.0004514026232618046, + "loss": 2.6219, + "step": 18066 + }, + { + "epoch": 0.5357471162114877, + "grad_norm": 0.12741418182849884, + "learning_rate": 0.0004513557952845709, + "loss": 2.6287, + "step": 18067 + }, + { + "epoch": 0.5357767695637992, + "grad_norm": 0.11383731663227081, + "learning_rate": 0.0004513089677380901, + "loss": 2.6316, + "step": 18068 + }, + { + "epoch": 0.5358064229161107, + "grad_norm": 0.12221955507993698, + "learning_rate": 0.00045126214062277673, + "loss": 2.6353, + "step": 18069 + }, + { + "epoch": 0.5358360762684221, + "grad_norm": 0.11709775775671005, + "learning_rate": 0.0004512153139390454, + "loss": 2.6792, + "step": 18070 + }, + { + "epoch": 0.5358657296207336, + "grad_norm": 0.1150515228509903, + "learning_rate": 0.000451168487687311, + "loss": 2.6928, + "step": 18071 + }, + { + "epoch": 0.5358953829730451, + "grad_norm": 0.12413225322961807, + "learning_rate": 0.0004511216618679879, + "loss": 2.6207, + "step": 18072 + }, + { + "epoch": 0.5359250363253566, + "grad_norm": 0.1337328851222992, + "learning_rate": 0.0004510748364814907, + "loss": 2.6239, + "step": 18073 + }, + { + "epoch": 0.535954689677668, + "grad_norm": 0.13496801257133484, + "learning_rate": 0.00045102801152823444, + "loss": 2.6722, + "step": 18074 + }, + { + "epoch": 0.5359843430299795, + "grad_norm": 0.12448512017726898, + "learning_rate": 0.0004509811870086336, + "loss": 2.6397, + "step": 18075 + }, + { + "epoch": 0.536013996382291, + "grad_norm": 0.1230330839753151, + "learning_rate": 0.00045093436292310265, + "loss": 2.6576, + "step": 18076 + }, + { + "epoch": 0.5360436497346025, + "grad_norm": 0.10837101191282272, + "learning_rate": 0.0004508875392720564, + "loss": 2.5973, + "step": 18077 + }, + { + "epoch": 0.5360733030869139, + "grad_norm": 0.112737275660038, + "learning_rate": 0.0004508407160559094, + "loss": 2.6598, + "step": 18078 + }, + { + "epoch": 0.5361029564392255, + "grad_norm": 0.1288313865661621, + "learning_rate": 0.00045079389327507625, + "loss": 2.6163, + "step": 18079 + }, + { + "epoch": 0.536132609791537, + "grad_norm": 0.11564379930496216, + "learning_rate": 0.0004507470709299716, + "loss": 2.6238, + "step": 18080 + }, + { + "epoch": 0.5361622631438484, + "grad_norm": 0.10953576862812042, + "learning_rate": 0.00045070024902101014, + "loss": 2.6506, + "step": 18081 + }, + { + "epoch": 0.5361919164961599, + "grad_norm": 0.11226913332939148, + "learning_rate": 0.00045065342754860657, + "loss": 2.6349, + "step": 18082 + }, + { + "epoch": 0.5362215698484714, + "grad_norm": 0.12051746994256973, + "learning_rate": 0.0004506066065131752, + "loss": 2.6091, + "step": 18083 + }, + { + "epoch": 0.5362512232007829, + "grad_norm": 0.09906381368637085, + "learning_rate": 0.00045055978591513083, + "loss": 2.6661, + "step": 18084 + }, + { + "epoch": 0.5362808765530943, + "grad_norm": 0.12182258069515228, + "learning_rate": 0.0004505129657548881, + "loss": 2.6119, + "step": 18085 + }, + { + "epoch": 0.5363105299054058, + "grad_norm": 0.11748848110437393, + "learning_rate": 0.00045046614603286143, + "loss": 2.6107, + "step": 18086 + }, + { + "epoch": 0.5363401832577173, + "grad_norm": 0.13665954768657684, + "learning_rate": 0.00045041932674946556, + "loss": 2.6608, + "step": 18087 + }, + { + "epoch": 0.5363698366100288, + "grad_norm": 0.12126392871141434, + "learning_rate": 0.00045037250790511515, + "loss": 2.6169, + "step": 18088 + }, + { + "epoch": 0.5363994899623402, + "grad_norm": 0.10242730379104614, + "learning_rate": 0.00045032568950022467, + "loss": 2.6466, + "step": 18089 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 0.12358061969280243, + "learning_rate": 0.0004502788715352087, + "loss": 2.635, + "step": 18090 + }, + { + "epoch": 0.5364587966669632, + "grad_norm": 0.13078652322292328, + "learning_rate": 0.0004502320540104819, + "loss": 2.6293, + "step": 18091 + }, + { + "epoch": 0.5364884500192747, + "grad_norm": 0.12384752929210663, + "learning_rate": 0.0004501852369264589, + "loss": 2.604, + "step": 18092 + }, + { + "epoch": 0.5365181033715861, + "grad_norm": 0.1121516227722168, + "learning_rate": 0.0004501384202835541, + "loss": 2.6259, + "step": 18093 + }, + { + "epoch": 0.5365477567238977, + "grad_norm": 0.11899670958518982, + "learning_rate": 0.00045009160408218213, + "loss": 2.6507, + "step": 18094 + }, + { + "epoch": 0.5365774100762091, + "grad_norm": 0.10556291788816452, + "learning_rate": 0.0004500447883227575, + "loss": 2.6286, + "step": 18095 + }, + { + "epoch": 0.5366070634285206, + "grad_norm": 0.13240157067775726, + "learning_rate": 0.00044999797300569494, + "loss": 2.627, + "step": 18096 + }, + { + "epoch": 0.536636716780832, + "grad_norm": 0.1256875842809677, + "learning_rate": 0.00044995115813140893, + "loss": 2.646, + "step": 18097 + }, + { + "epoch": 0.5366663701331436, + "grad_norm": 0.10033585131168365, + "learning_rate": 0.0004499043437003139, + "loss": 2.6608, + "step": 18098 + }, + { + "epoch": 0.536696023485455, + "grad_norm": 0.12105108052492142, + "learning_rate": 0.00044985752971282445, + "loss": 2.6337, + "step": 18099 + }, + { + "epoch": 0.5367256768377665, + "grad_norm": 0.11546912044286728, + "learning_rate": 0.0004498107161693553, + "loss": 2.6334, + "step": 18100 + }, + { + "epoch": 0.536755330190078, + "grad_norm": 0.12474959343671799, + "learning_rate": 0.0004497639030703209, + "loss": 2.6268, + "step": 18101 + }, + { + "epoch": 0.5367849835423895, + "grad_norm": 0.11364457756280899, + "learning_rate": 0.0004497170904161357, + "loss": 2.6561, + "step": 18102 + }, + { + "epoch": 0.536814636894701, + "grad_norm": 0.10559730976819992, + "learning_rate": 0.0004496702782072145, + "loss": 2.6473, + "step": 18103 + }, + { + "epoch": 0.5368442902470124, + "grad_norm": 0.10737238079309464, + "learning_rate": 0.0004496234664439714, + "loss": 2.6065, + "step": 18104 + }, + { + "epoch": 0.5368739435993239, + "grad_norm": 0.10616786777973175, + "learning_rate": 0.00044957665512682115, + "loss": 2.6191, + "step": 18105 + }, + { + "epoch": 0.5369035969516354, + "grad_norm": 0.09964247792959213, + "learning_rate": 0.00044952984425617837, + "loss": 2.6449, + "step": 18106 + }, + { + "epoch": 0.5369332503039469, + "grad_norm": 0.12988992035388947, + "learning_rate": 0.00044948303383245743, + "loss": 2.6396, + "step": 18107 + }, + { + "epoch": 0.5369629036562583, + "grad_norm": 0.12674683332443237, + "learning_rate": 0.0004494362238560729, + "loss": 2.641, + "step": 18108 + }, + { + "epoch": 0.5369925570085698, + "grad_norm": 0.11387678980827332, + "learning_rate": 0.0004493894143274394, + "loss": 2.6469, + "step": 18109 + }, + { + "epoch": 0.5370222103608813, + "grad_norm": 0.11316605657339096, + "learning_rate": 0.00044934260524697123, + "loss": 2.6762, + "step": 18110 + }, + { + "epoch": 0.5370518637131928, + "grad_norm": 0.12143408507108688, + "learning_rate": 0.000449295796615083, + "loss": 2.6143, + "step": 18111 + }, + { + "epoch": 0.5370815170655042, + "grad_norm": 0.12266873568296432, + "learning_rate": 0.0004492489884321889, + "loss": 2.6146, + "step": 18112 + }, + { + "epoch": 0.5371111704178158, + "grad_norm": 0.11263533681631088, + "learning_rate": 0.00044920218069870405, + "loss": 2.6105, + "step": 18113 + }, + { + "epoch": 0.5371408237701272, + "grad_norm": 0.11369909346103668, + "learning_rate": 0.0004491553734150426, + "loss": 2.6416, + "step": 18114 + }, + { + "epoch": 0.5371704771224387, + "grad_norm": 0.1279900074005127, + "learning_rate": 0.00044910856658161895, + "loss": 2.6364, + "step": 18115 + }, + { + "epoch": 0.5372001304747501, + "grad_norm": 0.14351092278957367, + "learning_rate": 0.0004490617601988477, + "loss": 2.6366, + "step": 18116 + }, + { + "epoch": 0.5372297838270617, + "grad_norm": 0.09687473624944687, + "learning_rate": 0.00044901495426714335, + "loss": 2.6209, + "step": 18117 + }, + { + "epoch": 0.5372594371793731, + "grad_norm": 0.11108440905809402, + "learning_rate": 0.0004489681487869203, + "loss": 2.6346, + "step": 18118 + }, + { + "epoch": 0.5372890905316846, + "grad_norm": 0.1074260026216507, + "learning_rate": 0.00044892134375859306, + "loss": 2.6203, + "step": 18119 + }, + { + "epoch": 0.537318743883996, + "grad_norm": 0.10470003634691238, + "learning_rate": 0.0004488745391825761, + "loss": 2.6065, + "step": 18120 + }, + { + "epoch": 0.5373483972363076, + "grad_norm": 0.1058957502245903, + "learning_rate": 0.00044882773505928395, + "loss": 2.6385, + "step": 18121 + }, + { + "epoch": 0.5373780505886191, + "grad_norm": 0.11118050664663315, + "learning_rate": 0.0004487809313891309, + "loss": 2.6474, + "step": 18122 + }, + { + "epoch": 0.5374077039409305, + "grad_norm": 0.1259259730577469, + "learning_rate": 0.00044873412817253146, + "loss": 2.674, + "step": 18123 + }, + { + "epoch": 0.537437357293242, + "grad_norm": 0.11244810372591019, + "learning_rate": 0.00044868732540990017, + "loss": 2.6324, + "step": 18124 + }, + { + "epoch": 0.5374670106455535, + "grad_norm": 0.12137872725725174, + "learning_rate": 0.0004486405231016513, + "loss": 2.6247, + "step": 18125 + }, + { + "epoch": 0.537496663997865, + "grad_norm": 0.13653793931007385, + "learning_rate": 0.00044859372124819946, + "loss": 2.6496, + "step": 18126 + }, + { + "epoch": 0.5375263173501764, + "grad_norm": 0.1444750726222992, + "learning_rate": 0.0004485469198499591, + "loss": 2.5957, + "step": 18127 + }, + { + "epoch": 0.537555970702488, + "grad_norm": 0.11724267154932022, + "learning_rate": 0.0004485001189073446, + "loss": 2.6119, + "step": 18128 + }, + { + "epoch": 0.5375856240547994, + "grad_norm": 0.11598508059978485, + "learning_rate": 0.00044845331842077035, + "loss": 2.628, + "step": 18129 + }, + { + "epoch": 0.5376152774071109, + "grad_norm": 0.1473156213760376, + "learning_rate": 0.0004484065183906509, + "loss": 2.6457, + "step": 18130 + }, + { + "epoch": 0.5376449307594223, + "grad_norm": 0.14412769675254822, + "learning_rate": 0.0004483597188174006, + "loss": 2.616, + "step": 18131 + }, + { + "epoch": 0.5376745841117339, + "grad_norm": 0.13701507449150085, + "learning_rate": 0.00044831291970143374, + "loss": 2.6315, + "step": 18132 + }, + { + "epoch": 0.5377042374640453, + "grad_norm": 0.11355333775281906, + "learning_rate": 0.00044826612104316486, + "loss": 2.6504, + "step": 18133 + }, + { + "epoch": 0.5377338908163568, + "grad_norm": 0.1275993138551712, + "learning_rate": 0.00044821932284300843, + "loss": 2.6542, + "step": 18134 + }, + { + "epoch": 0.5377635441686682, + "grad_norm": 0.1493908166885376, + "learning_rate": 0.0004481725251013787, + "loss": 2.6549, + "step": 18135 + }, + { + "epoch": 0.5377931975209798, + "grad_norm": 0.14348863065242767, + "learning_rate": 0.0004481257278186902, + "loss": 2.6474, + "step": 18136 + }, + { + "epoch": 0.5378228508732912, + "grad_norm": 0.11343253403902054, + "learning_rate": 0.0004480789309953574, + "loss": 2.644, + "step": 18137 + }, + { + "epoch": 0.5378525042256027, + "grad_norm": 0.1320989727973938, + "learning_rate": 0.0004480321346317942, + "loss": 2.6969, + "step": 18138 + }, + { + "epoch": 0.5378821575779141, + "grad_norm": 0.12848153710365295, + "learning_rate": 0.0004479853387284156, + "loss": 2.6631, + "step": 18139 + }, + { + "epoch": 0.5379118109302257, + "grad_norm": 0.11066436022520065, + "learning_rate": 0.0004479385432856358, + "loss": 2.6166, + "step": 18140 + }, + { + "epoch": 0.5379414642825372, + "grad_norm": 0.1164795309305191, + "learning_rate": 0.00044789174830386924, + "loss": 2.6326, + "step": 18141 + }, + { + "epoch": 0.5379711176348486, + "grad_norm": 0.12896223366260529, + "learning_rate": 0.00044784495378353005, + "loss": 2.6352, + "step": 18142 + }, + { + "epoch": 0.5380007709871601, + "grad_norm": 0.11685841530561447, + "learning_rate": 0.0004477981597250328, + "loss": 2.6368, + "step": 18143 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 0.1152304857969284, + "learning_rate": 0.00044775136612879176, + "loss": 2.6737, + "step": 18144 + }, + { + "epoch": 0.5380600776917831, + "grad_norm": 0.10058356821537018, + "learning_rate": 0.0004477045729952213, + "loss": 2.6074, + "step": 18145 + }, + { + "epoch": 0.5380897310440945, + "grad_norm": 0.10516233742237091, + "learning_rate": 0.00044765778032473594, + "loss": 2.6349, + "step": 18146 + }, + { + "epoch": 0.538119384396406, + "grad_norm": 0.11658605188131332, + "learning_rate": 0.00044761098811774993, + "loss": 2.661, + "step": 18147 + }, + { + "epoch": 0.5381490377487175, + "grad_norm": 0.10854899883270264, + "learning_rate": 0.00044756419637467755, + "loss": 2.6442, + "step": 18148 + }, + { + "epoch": 0.538178691101029, + "grad_norm": 0.1035173311829567, + "learning_rate": 0.0004475174050959332, + "loss": 2.6443, + "step": 18149 + }, + { + "epoch": 0.5382083444533404, + "grad_norm": 0.10914686322212219, + "learning_rate": 0.000447470614281931, + "loss": 2.6415, + "step": 18150 + }, + { + "epoch": 0.538237997805652, + "grad_norm": 0.1078614741563797, + "learning_rate": 0.00044742382393308584, + "loss": 2.6635, + "step": 18151 + }, + { + "epoch": 0.5382676511579634, + "grad_norm": 0.12115474045276642, + "learning_rate": 0.0004473770340498117, + "loss": 2.6201, + "step": 18152 + }, + { + "epoch": 0.5382973045102749, + "grad_norm": 0.11743827164173126, + "learning_rate": 0.0004473302446325229, + "loss": 2.6434, + "step": 18153 + }, + { + "epoch": 0.5383269578625863, + "grad_norm": 0.11381888389587402, + "learning_rate": 0.00044728345568163384, + "loss": 2.5999, + "step": 18154 + }, + { + "epoch": 0.5383566112148979, + "grad_norm": 0.09348822385072708, + "learning_rate": 0.00044723666719755886, + "loss": 2.6326, + "step": 18155 + }, + { + "epoch": 0.5383862645672093, + "grad_norm": 0.10007999837398529, + "learning_rate": 0.0004471898791807122, + "loss": 2.637, + "step": 18156 + }, + { + "epoch": 0.5384159179195208, + "grad_norm": 0.10915787518024445, + "learning_rate": 0.00044714309163150826, + "loss": 2.6367, + "step": 18157 + }, + { + "epoch": 0.5384455712718322, + "grad_norm": 0.10966838151216507, + "learning_rate": 0.0004470963045503614, + "loss": 2.643, + "step": 18158 + }, + { + "epoch": 0.5384752246241438, + "grad_norm": 0.1131882444024086, + "learning_rate": 0.0004470495179376857, + "loss": 2.6412, + "step": 18159 + }, + { + "epoch": 0.5385048779764552, + "grad_norm": 0.11654188483953476, + "learning_rate": 0.00044700273179389566, + "loss": 2.6437, + "step": 18160 + }, + { + "epoch": 0.5385345313287667, + "grad_norm": 0.11512688547372818, + "learning_rate": 0.00044695594611940555, + "loss": 2.6515, + "step": 18161 + }, + { + "epoch": 0.5385641846810783, + "grad_norm": 0.10885966569185257, + "learning_rate": 0.00044690916091462966, + "loss": 2.6111, + "step": 18162 + }, + { + "epoch": 0.5385938380333897, + "grad_norm": 0.10775826126337051, + "learning_rate": 0.00044686237617998204, + "loss": 2.6337, + "step": 18163 + }, + { + "epoch": 0.5386234913857012, + "grad_norm": 0.10935874283313751, + "learning_rate": 0.0004468155919158774, + "loss": 2.6258, + "step": 18164 + }, + { + "epoch": 0.5386531447380126, + "grad_norm": 0.1129760667681694, + "learning_rate": 0.0004467688081227298, + "loss": 2.6404, + "step": 18165 + }, + { + "epoch": 0.5386827980903242, + "grad_norm": 0.11669381707906723, + "learning_rate": 0.00044672202480095354, + "loss": 2.6661, + "step": 18166 + }, + { + "epoch": 0.5387124514426356, + "grad_norm": 0.1259840577840805, + "learning_rate": 0.00044667524195096285, + "loss": 2.6621, + "step": 18167 + }, + { + "epoch": 0.5387421047949471, + "grad_norm": 0.1166142076253891, + "learning_rate": 0.00044662845957317224, + "loss": 2.5982, + "step": 18168 + }, + { + "epoch": 0.5387717581472585, + "grad_norm": 0.1323712170124054, + "learning_rate": 0.0004465816776679956, + "loss": 2.6134, + "step": 18169 + }, + { + "epoch": 0.5388014114995701, + "grad_norm": 0.141449972987175, + "learning_rate": 0.0004465348962358474, + "loss": 2.6419, + "step": 18170 + }, + { + "epoch": 0.5388310648518815, + "grad_norm": 0.12112928926944733, + "learning_rate": 0.0004464881152771419, + "loss": 2.6479, + "step": 18171 + }, + { + "epoch": 0.538860718204193, + "grad_norm": 0.10497056692838669, + "learning_rate": 0.00044644133479229334, + "loss": 2.6249, + "step": 18172 + }, + { + "epoch": 0.5388903715565044, + "grad_norm": 0.13371513783931732, + "learning_rate": 0.00044639455478171587, + "loss": 2.6161, + "step": 18173 + }, + { + "epoch": 0.538920024908816, + "grad_norm": 0.12327202409505844, + "learning_rate": 0.00044634777524582385, + "loss": 2.6553, + "step": 18174 + }, + { + "epoch": 0.5389496782611274, + "grad_norm": 0.11393274366855621, + "learning_rate": 0.0004463009961850316, + "loss": 2.6449, + "step": 18175 + }, + { + "epoch": 0.5389793316134389, + "grad_norm": 0.13282445073127747, + "learning_rate": 0.00044625421759975293, + "loss": 2.6525, + "step": 18176 + }, + { + "epoch": 0.5390089849657503, + "grad_norm": 0.10676195472478867, + "learning_rate": 0.0004462074394904026, + "loss": 2.6631, + "step": 18177 + }, + { + "epoch": 0.5390386383180619, + "grad_norm": 0.1059640645980835, + "learning_rate": 0.00044616066185739465, + "loss": 2.5947, + "step": 18178 + }, + { + "epoch": 0.5390682916703733, + "grad_norm": 0.10476872324943542, + "learning_rate": 0.0004461138847011434, + "loss": 2.6548, + "step": 18179 + }, + { + "epoch": 0.5390979450226848, + "grad_norm": 0.10812485218048096, + "learning_rate": 0.0004460671080220628, + "loss": 2.6024, + "step": 18180 + }, + { + "epoch": 0.5391275983749962, + "grad_norm": 0.11778853088617325, + "learning_rate": 0.0004460203318205672, + "loss": 2.6187, + "step": 18181 + }, + { + "epoch": 0.5391572517273078, + "grad_norm": 0.10526986420154572, + "learning_rate": 0.00044597355609707085, + "loss": 2.6533, + "step": 18182 + }, + { + "epoch": 0.5391869050796193, + "grad_norm": 0.10743337124586105, + "learning_rate": 0.00044592678085198797, + "loss": 2.6445, + "step": 18183 + }, + { + "epoch": 0.5392165584319307, + "grad_norm": 0.1112120971083641, + "learning_rate": 0.00044588000608573273, + "loss": 2.6405, + "step": 18184 + }, + { + "epoch": 0.5392462117842423, + "grad_norm": 0.1415233612060547, + "learning_rate": 0.00044583323179871935, + "loss": 2.6193, + "step": 18185 + }, + { + "epoch": 0.5392758651365537, + "grad_norm": 0.13015465438365936, + "learning_rate": 0.00044578645799136195, + "loss": 2.638, + "step": 18186 + }, + { + "epoch": 0.5393055184888652, + "grad_norm": 0.11233220994472504, + "learning_rate": 0.0004457396846640748, + "loss": 2.6211, + "step": 18187 + }, + { + "epoch": 0.5393351718411766, + "grad_norm": 0.12842755019664764, + "learning_rate": 0.000445692911817272, + "loss": 2.6137, + "step": 18188 + }, + { + "epoch": 0.5393648251934882, + "grad_norm": 0.13177436590194702, + "learning_rate": 0.00044564613945136767, + "loss": 2.6579, + "step": 18189 + }, + { + "epoch": 0.5393944785457996, + "grad_norm": 0.10730837285518646, + "learning_rate": 0.00044559936756677625, + "loss": 2.6533, + "step": 18190 + }, + { + "epoch": 0.5394241318981111, + "grad_norm": 0.11348885297775269, + "learning_rate": 0.00044555259616391174, + "loss": 2.6122, + "step": 18191 + }, + { + "epoch": 0.5394537852504225, + "grad_norm": 0.1284162700176239, + "learning_rate": 0.0004455058252431884, + "loss": 2.6399, + "step": 18192 + }, + { + "epoch": 0.5394834386027341, + "grad_norm": 0.1326686590909958, + "learning_rate": 0.0004454590548050203, + "loss": 2.6414, + "step": 18193 + }, + { + "epoch": 0.5395130919550455, + "grad_norm": 0.13720819354057312, + "learning_rate": 0.0004454122848498216, + "loss": 2.6602, + "step": 18194 + }, + { + "epoch": 0.539542745307357, + "grad_norm": 0.11076032370328903, + "learning_rate": 0.0004453655153780065, + "loss": 2.641, + "step": 18195 + }, + { + "epoch": 0.5395723986596684, + "grad_norm": 0.12615875899791718, + "learning_rate": 0.00044531874638998925, + "loss": 2.6129, + "step": 18196 + }, + { + "epoch": 0.53960205201198, + "grad_norm": 0.12654311954975128, + "learning_rate": 0.00044527197788618377, + "loss": 2.6355, + "step": 18197 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 0.1308903992176056, + "learning_rate": 0.00044522520986700436, + "loss": 2.598, + "step": 18198 + }, + { + "epoch": 0.5396613587166029, + "grad_norm": 0.11938652396202087, + "learning_rate": 0.00044517844233286506, + "loss": 2.6658, + "step": 18199 + }, + { + "epoch": 0.5396910120689143, + "grad_norm": 0.135152667760849, + "learning_rate": 0.0004451316752841801, + "loss": 2.6442, + "step": 18200 + }, + { + "epoch": 0.5397206654212259, + "grad_norm": 0.13716906309127808, + "learning_rate": 0.0004450849087213636, + "loss": 2.6217, + "step": 18201 + }, + { + "epoch": 0.5397503187735373, + "grad_norm": 0.11780283600091934, + "learning_rate": 0.0004450381426448295, + "loss": 2.6255, + "step": 18202 + }, + { + "epoch": 0.5397799721258488, + "grad_norm": 0.11089544743299484, + "learning_rate": 0.0004449913770549922, + "loss": 2.6783, + "step": 18203 + }, + { + "epoch": 0.5398096254781604, + "grad_norm": 0.13282829523086548, + "learning_rate": 0.00044494461195226573, + "loss": 2.6358, + "step": 18204 + }, + { + "epoch": 0.5398392788304718, + "grad_norm": 0.11604262888431549, + "learning_rate": 0.0004448978473370641, + "loss": 2.6294, + "step": 18205 + }, + { + "epoch": 0.5398689321827833, + "grad_norm": 0.1274775266647339, + "learning_rate": 0.00044485108320980164, + "loss": 2.6508, + "step": 18206 + }, + { + "epoch": 0.5398985855350947, + "grad_norm": 0.10164223611354828, + "learning_rate": 0.00044480431957089224, + "loss": 2.6057, + "step": 18207 + }, + { + "epoch": 0.5399282388874063, + "grad_norm": 0.11043485999107361, + "learning_rate": 0.0004447575564207501, + "loss": 2.6346, + "step": 18208 + }, + { + "epoch": 0.5399578922397177, + "grad_norm": 0.10568800568580627, + "learning_rate": 0.00044471079375978914, + "loss": 2.6764, + "step": 18209 + }, + { + "epoch": 0.5399875455920292, + "grad_norm": 0.11662488430738449, + "learning_rate": 0.00044466403158842366, + "loss": 2.6376, + "step": 18210 + }, + { + "epoch": 0.5400171989443406, + "grad_norm": 0.10168306529521942, + "learning_rate": 0.0004446172699070677, + "loss": 2.6418, + "step": 18211 + }, + { + "epoch": 0.5400468522966522, + "grad_norm": 0.10182472318410873, + "learning_rate": 0.0004445705087161353, + "loss": 2.6055, + "step": 18212 + }, + { + "epoch": 0.5400765056489636, + "grad_norm": 0.10955499112606049, + "learning_rate": 0.00044452374801604065, + "loss": 2.6388, + "step": 18213 + }, + { + "epoch": 0.5401061590012751, + "grad_norm": 0.10122236609458923, + "learning_rate": 0.00044447698780719766, + "loss": 2.6491, + "step": 18214 + }, + { + "epoch": 0.5401358123535865, + "grad_norm": 0.11586097627878189, + "learning_rate": 0.00044443022809002023, + "loss": 2.607, + "step": 18215 + }, + { + "epoch": 0.5401654657058981, + "grad_norm": 0.11254080384969711, + "learning_rate": 0.00044438346886492287, + "loss": 2.6683, + "step": 18216 + }, + { + "epoch": 0.5401951190582095, + "grad_norm": 0.1205325722694397, + "learning_rate": 0.0004443367101323196, + "loss": 2.6492, + "step": 18217 + }, + { + "epoch": 0.540224772410521, + "grad_norm": 0.1092129498720169, + "learning_rate": 0.0004442899518926242, + "loss": 2.6346, + "step": 18218 + }, + { + "epoch": 0.5402544257628324, + "grad_norm": 0.10504857450723648, + "learning_rate": 0.0004442431941462508, + "loss": 2.6438, + "step": 18219 + }, + { + "epoch": 0.540284079115144, + "grad_norm": 0.09810450673103333, + "learning_rate": 0.0004441964368936135, + "loss": 2.6066, + "step": 18220 + }, + { + "epoch": 0.5403137324674554, + "grad_norm": 0.10286590456962585, + "learning_rate": 0.0004441496801351263, + "loss": 2.632, + "step": 18221 + }, + { + "epoch": 0.5403433858197669, + "grad_norm": 0.11542796343564987, + "learning_rate": 0.0004441029238712033, + "loss": 2.6598, + "step": 18222 + }, + { + "epoch": 0.5403730391720784, + "grad_norm": 0.10097559541463852, + "learning_rate": 0.00044405616810225853, + "loss": 2.6609, + "step": 18223 + }, + { + "epoch": 0.5404026925243899, + "grad_norm": 0.10498026013374329, + "learning_rate": 0.0004440094128287061, + "loss": 2.637, + "step": 18224 + }, + { + "epoch": 0.5404323458767014, + "grad_norm": 0.11233930289745331, + "learning_rate": 0.00044396265805095975, + "loss": 2.6523, + "step": 18225 + }, + { + "epoch": 0.5404619992290128, + "grad_norm": 0.10596907883882523, + "learning_rate": 0.00044391590376943377, + "loss": 2.6626, + "step": 18226 + }, + { + "epoch": 0.5404916525813244, + "grad_norm": 0.10135332494974136, + "learning_rate": 0.00044386914998454204, + "loss": 2.6249, + "step": 18227 + }, + { + "epoch": 0.5405213059336358, + "grad_norm": 0.10510135442018509, + "learning_rate": 0.00044382239669669857, + "loss": 2.6335, + "step": 18228 + }, + { + "epoch": 0.5405509592859473, + "grad_norm": 0.11981853097677231, + "learning_rate": 0.0004437756439063174, + "loss": 2.6361, + "step": 18229 + }, + { + "epoch": 0.5405806126382587, + "grad_norm": 0.11302221566438675, + "learning_rate": 0.0004437288916138127, + "loss": 2.6284, + "step": 18230 + }, + { + "epoch": 0.5406102659905703, + "grad_norm": 0.09458396583795547, + "learning_rate": 0.00044368213981959827, + "loss": 2.5827, + "step": 18231 + }, + { + "epoch": 0.5406399193428817, + "grad_norm": 0.10705042630434036, + "learning_rate": 0.0004436353885240881, + "loss": 2.6345, + "step": 18232 + }, + { + "epoch": 0.5406695726951932, + "grad_norm": 0.10467985272407532, + "learning_rate": 0.0004435886377276963, + "loss": 2.6566, + "step": 18233 + }, + { + "epoch": 0.5406992260475046, + "grad_norm": 0.09532076120376587, + "learning_rate": 0.0004435418874308369, + "loss": 2.6332, + "step": 18234 + }, + { + "epoch": 0.5407288793998162, + "grad_norm": 0.10974147170782089, + "learning_rate": 0.0004434951376339237, + "loss": 2.6344, + "step": 18235 + }, + { + "epoch": 0.5407585327521276, + "grad_norm": 0.10551971197128296, + "learning_rate": 0.00044344838833737066, + "loss": 2.656, + "step": 18236 + }, + { + "epoch": 0.5407881861044391, + "grad_norm": 0.0971008837223053, + "learning_rate": 0.0004434016395415919, + "loss": 2.6244, + "step": 18237 + }, + { + "epoch": 0.5408178394567505, + "grad_norm": 0.10117591917514801, + "learning_rate": 0.00044335489124700135, + "loss": 2.6229, + "step": 18238 + }, + { + "epoch": 0.5408474928090621, + "grad_norm": 0.09580963850021362, + "learning_rate": 0.000443308143454013, + "loss": 2.6482, + "step": 18239 + }, + { + "epoch": 0.5408771461613735, + "grad_norm": 0.10369884967803955, + "learning_rate": 0.00044326139616304066, + "loss": 2.603, + "step": 18240 + }, + { + "epoch": 0.540906799513685, + "grad_norm": 0.11010183393955231, + "learning_rate": 0.0004432146493744984, + "loss": 2.6299, + "step": 18241 + }, + { + "epoch": 0.5409364528659965, + "grad_norm": 0.12082378566265106, + "learning_rate": 0.00044316790308880026, + "loss": 2.6577, + "step": 18242 + }, + { + "epoch": 0.540966106218308, + "grad_norm": 0.12293849140405655, + "learning_rate": 0.00044312115730636, + "loss": 2.6379, + "step": 18243 + }, + { + "epoch": 0.5409957595706194, + "grad_norm": 0.11369821429252625, + "learning_rate": 0.00044307441202759185, + "loss": 2.6545, + "step": 18244 + }, + { + "epoch": 0.5410254129229309, + "grad_norm": 0.0926406979560852, + "learning_rate": 0.00044302766725290936, + "loss": 2.5973, + "step": 18245 + }, + { + "epoch": 0.5410550662752425, + "grad_norm": 0.1043878123164177, + "learning_rate": 0.00044298092298272675, + "loss": 2.6492, + "step": 18246 + }, + { + "epoch": 0.5410847196275539, + "grad_norm": 0.1274053156375885, + "learning_rate": 0.00044293417921745785, + "loss": 2.608, + "step": 18247 + }, + { + "epoch": 0.5411143729798654, + "grad_norm": 0.11822306364774704, + "learning_rate": 0.0004428874359575165, + "loss": 2.6098, + "step": 18248 + }, + { + "epoch": 0.5411440263321768, + "grad_norm": 0.104182668030262, + "learning_rate": 0.00044284069320331675, + "loss": 2.5864, + "step": 18249 + }, + { + "epoch": 0.5411736796844884, + "grad_norm": 0.14220260083675385, + "learning_rate": 0.0004427939509552725, + "loss": 2.6406, + "step": 18250 + }, + { + "epoch": 0.5412033330367998, + "grad_norm": 0.17692330479621887, + "learning_rate": 0.0004427472092137977, + "loss": 2.6171, + "step": 18251 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 0.16257891058921814, + "learning_rate": 0.0004427004679793062, + "loss": 2.6301, + "step": 18252 + }, + { + "epoch": 0.5412626397414227, + "grad_norm": 0.13582874834537506, + "learning_rate": 0.00044265372725221173, + "loss": 2.6264, + "step": 18253 + }, + { + "epoch": 0.5412922930937343, + "grad_norm": 0.10533333569765091, + "learning_rate": 0.0004426069870329282, + "loss": 2.6445, + "step": 18254 + }, + { + "epoch": 0.5413219464460457, + "grad_norm": 0.12045764178037643, + "learning_rate": 0.00044256024732187006, + "loss": 2.6429, + "step": 18255 + }, + { + "epoch": 0.5413515997983572, + "grad_norm": 0.13184654712677002, + "learning_rate": 0.00044251350811945055, + "loss": 2.6372, + "step": 18256 + }, + { + "epoch": 0.5413812531506687, + "grad_norm": 0.12435963749885559, + "learning_rate": 0.00044246676942608386, + "loss": 2.6496, + "step": 18257 + }, + { + "epoch": 0.5414109065029802, + "grad_norm": 0.11000128835439682, + "learning_rate": 0.00044242003124218376, + "loss": 2.6507, + "step": 18258 + }, + { + "epoch": 0.5414405598552916, + "grad_norm": 0.11586353927850723, + "learning_rate": 0.00044237329356816427, + "loss": 2.6094, + "step": 18259 + }, + { + "epoch": 0.5414702132076031, + "grad_norm": 0.09395962208509445, + "learning_rate": 0.000442326556404439, + "loss": 2.6432, + "step": 18260 + }, + { + "epoch": 0.5414998665599146, + "grad_norm": 0.10611295700073242, + "learning_rate": 0.0004422798197514221, + "loss": 2.6142, + "step": 18261 + }, + { + "epoch": 0.5415295199122261, + "grad_norm": 0.10101938992738724, + "learning_rate": 0.00044223308360952733, + "loss": 2.6639, + "step": 18262 + }, + { + "epoch": 0.5415591732645375, + "grad_norm": 0.10303737223148346, + "learning_rate": 0.00044218634797916844, + "loss": 2.6305, + "step": 18263 + }, + { + "epoch": 0.541588826616849, + "grad_norm": 0.11663514375686646, + "learning_rate": 0.00044213961286075935, + "loss": 2.6507, + "step": 18264 + }, + { + "epoch": 0.5416184799691605, + "grad_norm": 0.09610956907272339, + "learning_rate": 0.000442092878254714, + "loss": 2.6167, + "step": 18265 + }, + { + "epoch": 0.541648133321472, + "grad_norm": 0.09673521667718887, + "learning_rate": 0.00044204614416144605, + "loss": 2.6049, + "step": 18266 + }, + { + "epoch": 0.5416777866737835, + "grad_norm": 0.10588543117046356, + "learning_rate": 0.0004419994105813694, + "loss": 2.6317, + "step": 18267 + }, + { + "epoch": 0.5417074400260949, + "grad_norm": 0.10059372335672379, + "learning_rate": 0.000441952677514898, + "loss": 2.599, + "step": 18268 + }, + { + "epoch": 0.5417370933784065, + "grad_norm": 0.10458680242300034, + "learning_rate": 0.0004419059449624456, + "loss": 2.6603, + "step": 18269 + }, + { + "epoch": 0.5417667467307179, + "grad_norm": 0.09959510713815689, + "learning_rate": 0.00044185921292442604, + "loss": 2.6127, + "step": 18270 + }, + { + "epoch": 0.5417964000830294, + "grad_norm": 0.09950290620326996, + "learning_rate": 0.00044181248140125315, + "loss": 2.6358, + "step": 18271 + }, + { + "epoch": 0.5418260534353408, + "grad_norm": 0.12241671979427338, + "learning_rate": 0.00044176575039334083, + "loss": 2.6439, + "step": 18272 + }, + { + "epoch": 0.5418557067876524, + "grad_norm": 0.13828402757644653, + "learning_rate": 0.00044171901990110266, + "loss": 2.652, + "step": 18273 + }, + { + "epoch": 0.5418853601399638, + "grad_norm": 0.1309887170791626, + "learning_rate": 0.0004416722899249526, + "loss": 2.6211, + "step": 18274 + }, + { + "epoch": 0.5419150134922753, + "grad_norm": 0.12107448279857635, + "learning_rate": 0.00044162556046530434, + "loss": 2.6583, + "step": 18275 + }, + { + "epoch": 0.5419446668445868, + "grad_norm": 0.11734997481107712, + "learning_rate": 0.00044157883152257186, + "loss": 2.6287, + "step": 18276 + }, + { + "epoch": 0.5419743201968983, + "grad_norm": 0.1291077882051468, + "learning_rate": 0.0004415321030971688, + "loss": 2.6541, + "step": 18277 + }, + { + "epoch": 0.5420039735492097, + "grad_norm": 0.1327509880065918, + "learning_rate": 0.000441485375189509, + "loss": 2.6243, + "step": 18278 + }, + { + "epoch": 0.5420336269015212, + "grad_norm": 0.10246167331933975, + "learning_rate": 0.00044143864780000634, + "loss": 2.633, + "step": 18279 + }, + { + "epoch": 0.5420632802538327, + "grad_norm": 0.11958271265029907, + "learning_rate": 0.0004413919209290743, + "loss": 2.6228, + "step": 18280 + }, + { + "epoch": 0.5420929336061442, + "grad_norm": 0.10926004499197006, + "learning_rate": 0.000441345194577127, + "loss": 2.6494, + "step": 18281 + }, + { + "epoch": 0.5421225869584556, + "grad_norm": 0.11528633534908295, + "learning_rate": 0.0004412984687445781, + "loss": 2.6498, + "step": 18282 + }, + { + "epoch": 0.5421522403107671, + "grad_norm": 0.134368434548378, + "learning_rate": 0.00044125174343184135, + "loss": 2.6456, + "step": 18283 + }, + { + "epoch": 0.5421818936630786, + "grad_norm": 0.1150423064827919, + "learning_rate": 0.0004412050186393304, + "loss": 2.643, + "step": 18284 + }, + { + "epoch": 0.5422115470153901, + "grad_norm": 0.11283425241708755, + "learning_rate": 0.00044115829436745915, + "loss": 2.627, + "step": 18285 + }, + { + "epoch": 0.5422412003677015, + "grad_norm": 0.12860338389873505, + "learning_rate": 0.0004411115706166413, + "loss": 2.6268, + "step": 18286 + }, + { + "epoch": 0.542270853720013, + "grad_norm": 0.11101768165826797, + "learning_rate": 0.00044106484738729056, + "loss": 2.6309, + "step": 18287 + }, + { + "epoch": 0.5423005070723246, + "grad_norm": 0.10791701823472977, + "learning_rate": 0.0004410181246798208, + "loss": 2.6778, + "step": 18288 + }, + { + "epoch": 0.542330160424636, + "grad_norm": 0.1341399848461151, + "learning_rate": 0.00044097140249464576, + "loss": 2.6234, + "step": 18289 + }, + { + "epoch": 0.5423598137769475, + "grad_norm": 0.09938772022724152, + "learning_rate": 0.0004409246808321789, + "loss": 2.6485, + "step": 18290 + }, + { + "epoch": 0.542389467129259, + "grad_norm": 0.11291494220495224, + "learning_rate": 0.00044087795969283415, + "loss": 2.621, + "step": 18291 + }, + { + "epoch": 0.5424191204815705, + "grad_norm": 0.10502516478300095, + "learning_rate": 0.00044083123907702525, + "loss": 2.6385, + "step": 18292 + }, + { + "epoch": 0.5424487738338819, + "grad_norm": 0.09963113814592361, + "learning_rate": 0.00044078451898516577, + "loss": 2.6093, + "step": 18293 + }, + { + "epoch": 0.5424784271861934, + "grad_norm": 0.10107407718896866, + "learning_rate": 0.0004407377994176696, + "loss": 2.6369, + "step": 18294 + }, + { + "epoch": 0.5425080805385049, + "grad_norm": 0.1034003347158432, + "learning_rate": 0.00044069108037495046, + "loss": 2.6192, + "step": 18295 + }, + { + "epoch": 0.5425377338908164, + "grad_norm": 0.10329017788171768, + "learning_rate": 0.000440644361857422, + "loss": 2.6322, + "step": 18296 + }, + { + "epoch": 0.5425673872431278, + "grad_norm": 0.09779192507266998, + "learning_rate": 0.00044059764386549785, + "loss": 2.6174, + "step": 18297 + }, + { + "epoch": 0.5425970405954393, + "grad_norm": 0.11791303753852844, + "learning_rate": 0.0004405509263995917, + "loss": 2.6106, + "step": 18298 + }, + { + "epoch": 0.5426266939477508, + "grad_norm": 0.11271791160106659, + "learning_rate": 0.00044050420946011736, + "loss": 2.6269, + "step": 18299 + }, + { + "epoch": 0.5426563473000623, + "grad_norm": 0.11963629722595215, + "learning_rate": 0.00044045749304748863, + "loss": 2.654, + "step": 18300 + }, + { + "epoch": 0.5426860006523737, + "grad_norm": 0.10830126702785492, + "learning_rate": 0.00044041077716211886, + "loss": 2.6129, + "step": 18301 + }, + { + "epoch": 0.5427156540046852, + "grad_norm": 0.10513581335544586, + "learning_rate": 0.0004403640618044218, + "loss": 2.6502, + "step": 18302 + }, + { + "epoch": 0.5427453073569967, + "grad_norm": 0.11287734657526016, + "learning_rate": 0.00044031734697481137, + "loss": 2.6395, + "step": 18303 + }, + { + "epoch": 0.5427749607093082, + "grad_norm": 0.09800613671541214, + "learning_rate": 0.000440270632673701, + "loss": 2.6241, + "step": 18304 + }, + { + "epoch": 0.5428046140616196, + "grad_norm": 0.12107139080762863, + "learning_rate": 0.0004402239189015044, + "loss": 2.6648, + "step": 18305 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 0.11175952851772308, + "learning_rate": 0.0004401772056586352, + "loss": 2.6324, + "step": 18306 + }, + { + "epoch": 0.5428639207662426, + "grad_norm": 0.11115331947803497, + "learning_rate": 0.00044013049294550717, + "loss": 2.6278, + "step": 18307 + }, + { + "epoch": 0.5428935741185541, + "grad_norm": 0.11271417140960693, + "learning_rate": 0.00044008378076253397, + "loss": 2.6432, + "step": 18308 + }, + { + "epoch": 0.5429232274708656, + "grad_norm": 0.1283039003610611, + "learning_rate": 0.0004400370691101291, + "loss": 2.6346, + "step": 18309 + }, + { + "epoch": 0.542952880823177, + "grad_norm": 0.11937719583511353, + "learning_rate": 0.00043999035798870646, + "loss": 2.6038, + "step": 18310 + }, + { + "epoch": 0.5429825341754886, + "grad_norm": 0.12065363675355911, + "learning_rate": 0.0004399436473986793, + "loss": 2.661, + "step": 18311 + }, + { + "epoch": 0.5430121875278, + "grad_norm": 0.11201220005750656, + "learning_rate": 0.00043989693734046155, + "loss": 2.643, + "step": 18312 + }, + { + "epoch": 0.5430418408801115, + "grad_norm": 0.09888055920600891, + "learning_rate": 0.00043985022781446666, + "loss": 2.6495, + "step": 18313 + }, + { + "epoch": 0.543071494232423, + "grad_norm": 0.11166324466466904, + "learning_rate": 0.00043980351882110837, + "loss": 2.5829, + "step": 18314 + }, + { + "epoch": 0.5431011475847345, + "grad_norm": 0.11319296061992645, + "learning_rate": 0.00043975681036080024, + "loss": 2.6751, + "step": 18315 + }, + { + "epoch": 0.5431308009370459, + "grad_norm": 0.12239116430282593, + "learning_rate": 0.00043971010243395585, + "loss": 2.646, + "step": 18316 + }, + { + "epoch": 0.5431604542893574, + "grad_norm": 0.13174128532409668, + "learning_rate": 0.0004396633950409891, + "loss": 2.6657, + "step": 18317 + }, + { + "epoch": 0.5431901076416689, + "grad_norm": 0.13562428951263428, + "learning_rate": 0.0004396166881823131, + "loss": 2.6441, + "step": 18318 + }, + { + "epoch": 0.5432197609939804, + "grad_norm": 0.13711166381835938, + "learning_rate": 0.0004395699818583415, + "loss": 2.6207, + "step": 18319 + }, + { + "epoch": 0.5432494143462918, + "grad_norm": 0.1324835866689682, + "learning_rate": 0.00043952327606948844, + "loss": 2.6519, + "step": 18320 + }, + { + "epoch": 0.5432790676986033, + "grad_norm": 0.13376404345035553, + "learning_rate": 0.00043947657081616696, + "loss": 2.6297, + "step": 18321 + }, + { + "epoch": 0.5433087210509148, + "grad_norm": 0.13779014348983765, + "learning_rate": 0.0004394298660987909, + "loss": 2.6285, + "step": 18322 + }, + { + "epoch": 0.5433383744032263, + "grad_norm": 0.12534938752651215, + "learning_rate": 0.00043938316191777384, + "loss": 2.6347, + "step": 18323 + }, + { + "epoch": 0.5433680277555377, + "grad_norm": 0.10970767587423325, + "learning_rate": 0.0004393364582735292, + "loss": 2.6575, + "step": 18324 + }, + { + "epoch": 0.5433976811078493, + "grad_norm": 0.1319226324558258, + "learning_rate": 0.00043928975516647063, + "loss": 2.6226, + "step": 18325 + }, + { + "epoch": 0.5434273344601607, + "grad_norm": 0.1334536373615265, + "learning_rate": 0.00043924305259701173, + "loss": 2.6213, + "step": 18326 + }, + { + "epoch": 0.5434569878124722, + "grad_norm": 0.09775572270154953, + "learning_rate": 0.0004391963505655661, + "loss": 2.6263, + "step": 18327 + }, + { + "epoch": 0.5434866411647836, + "grad_norm": 0.1233823224902153, + "learning_rate": 0.0004391496490725472, + "loss": 2.6508, + "step": 18328 + }, + { + "epoch": 0.5435162945170952, + "grad_norm": 0.13666190207004547, + "learning_rate": 0.00043910294811836856, + "loss": 2.6545, + "step": 18329 + }, + { + "epoch": 0.5435459478694067, + "grad_norm": 0.12118276953697205, + "learning_rate": 0.0004390562477034437, + "loss": 2.6643, + "step": 18330 + }, + { + "epoch": 0.5435756012217181, + "grad_norm": 0.11732381582260132, + "learning_rate": 0.0004390095478281862, + "loss": 2.6671, + "step": 18331 + }, + { + "epoch": 0.5436052545740296, + "grad_norm": 0.11921926587820053, + "learning_rate": 0.00043896284849300973, + "loss": 2.6252, + "step": 18332 + }, + { + "epoch": 0.5436349079263411, + "grad_norm": 0.11706046760082245, + "learning_rate": 0.00043891614969832765, + "loss": 2.6335, + "step": 18333 + }, + { + "epoch": 0.5436645612786526, + "grad_norm": 0.11710352450609207, + "learning_rate": 0.0004388694514445536, + "loss": 2.6606, + "step": 18334 + }, + { + "epoch": 0.543694214630964, + "grad_norm": 0.11560333520174026, + "learning_rate": 0.000438822753732101, + "loss": 2.6345, + "step": 18335 + }, + { + "epoch": 0.5437238679832755, + "grad_norm": 0.11522389203310013, + "learning_rate": 0.0004387760565613835, + "loss": 2.6532, + "step": 18336 + }, + { + "epoch": 0.543753521335587, + "grad_norm": 0.11552202701568604, + "learning_rate": 0.0004387293599328144, + "loss": 2.6791, + "step": 18337 + }, + { + "epoch": 0.5437831746878985, + "grad_norm": 0.1253070831298828, + "learning_rate": 0.0004386826638468076, + "loss": 2.674, + "step": 18338 + }, + { + "epoch": 0.5438128280402099, + "grad_norm": 0.13718686997890472, + "learning_rate": 0.00043863596830377613, + "loss": 2.6245, + "step": 18339 + }, + { + "epoch": 0.5438424813925214, + "grad_norm": 0.14494845271110535, + "learning_rate": 0.00043858927330413374, + "loss": 2.6356, + "step": 18340 + }, + { + "epoch": 0.5438721347448329, + "grad_norm": 0.12140189856290817, + "learning_rate": 0.0004385425788482938, + "loss": 2.6352, + "step": 18341 + }, + { + "epoch": 0.5439017880971444, + "grad_norm": 0.10639018565416336, + "learning_rate": 0.0004384958849366699, + "loss": 2.6333, + "step": 18342 + }, + { + "epoch": 0.5439314414494558, + "grad_norm": 0.1183648407459259, + "learning_rate": 0.00043844919156967553, + "loss": 2.6628, + "step": 18343 + }, + { + "epoch": 0.5439610948017674, + "grad_norm": 0.10603868216276169, + "learning_rate": 0.000438402498747724, + "loss": 2.632, + "step": 18344 + }, + { + "epoch": 0.5439907481540788, + "grad_norm": 0.09627113491296768, + "learning_rate": 0.00043835580647122907, + "loss": 2.6198, + "step": 18345 + }, + { + "epoch": 0.5440204015063903, + "grad_norm": 0.10601621866226196, + "learning_rate": 0.000438309114740604, + "loss": 2.6208, + "step": 18346 + }, + { + "epoch": 0.5440500548587017, + "grad_norm": 0.09981570392847061, + "learning_rate": 0.0004382624235562624, + "loss": 2.6652, + "step": 18347 + }, + { + "epoch": 0.5440797082110133, + "grad_norm": 0.10014618188142776, + "learning_rate": 0.00043821573291861766, + "loss": 2.6363, + "step": 18348 + }, + { + "epoch": 0.5441093615633248, + "grad_norm": 0.10913921892642975, + "learning_rate": 0.0004381690428280831, + "loss": 2.5771, + "step": 18349 + }, + { + "epoch": 0.5441390149156362, + "grad_norm": 0.10809508711099625, + "learning_rate": 0.0004381223532850723, + "loss": 2.6227, + "step": 18350 + }, + { + "epoch": 0.5441686682679477, + "grad_norm": 0.11320554465055466, + "learning_rate": 0.00043807566428999867, + "loss": 2.6534, + "step": 18351 + }, + { + "epoch": 0.5441983216202592, + "grad_norm": 0.10916171222925186, + "learning_rate": 0.00043802897584327573, + "loss": 2.6263, + "step": 18352 + }, + { + "epoch": 0.5442279749725707, + "grad_norm": 0.12229348719120026, + "learning_rate": 0.00043798228794531674, + "loss": 2.606, + "step": 18353 + }, + { + "epoch": 0.5442576283248821, + "grad_norm": 0.11084499955177307, + "learning_rate": 0.0004379356005965353, + "loss": 2.6442, + "step": 18354 + }, + { + "epoch": 0.5442872816771936, + "grad_norm": 0.10779757052659988, + "learning_rate": 0.00043788891379734486, + "loss": 2.6549, + "step": 18355 + }, + { + "epoch": 0.5443169350295051, + "grad_norm": 0.123152956366539, + "learning_rate": 0.0004378422275481587, + "loss": 2.6575, + "step": 18356 + }, + { + "epoch": 0.5443465883818166, + "grad_norm": 0.14772021770477295, + "learning_rate": 0.00043779554184938995, + "loss": 2.6237, + "step": 18357 + }, + { + "epoch": 0.544376241734128, + "grad_norm": 0.13135923445224762, + "learning_rate": 0.00043774885670145274, + "loss": 2.6277, + "step": 18358 + }, + { + "epoch": 0.5444058950864396, + "grad_norm": 0.11882241815328598, + "learning_rate": 0.00043770217210476, + "loss": 2.6295, + "step": 18359 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 0.15246355533599854, + "learning_rate": 0.00043765548805972524, + "loss": 2.6566, + "step": 18360 + }, + { + "epoch": 0.5444652017910625, + "grad_norm": 0.16817548871040344, + "learning_rate": 0.00043760880456676177, + "loss": 2.6336, + "step": 18361 + }, + { + "epoch": 0.5444948551433739, + "grad_norm": 0.13596723973751068, + "learning_rate": 0.00043756212162628306, + "loss": 2.6269, + "step": 18362 + }, + { + "epoch": 0.5445245084956855, + "grad_norm": 0.14511467516422272, + "learning_rate": 0.00043751543923870246, + "loss": 2.653, + "step": 18363 + }, + { + "epoch": 0.5445541618479969, + "grad_norm": 0.15153184533119202, + "learning_rate": 0.00043746875740443344, + "loss": 2.6517, + "step": 18364 + }, + { + "epoch": 0.5445838152003084, + "grad_norm": 0.13604669272899628, + "learning_rate": 0.00043742207612388934, + "loss": 2.6448, + "step": 18365 + }, + { + "epoch": 0.5446134685526198, + "grad_norm": 0.13115713000297546, + "learning_rate": 0.00043737539539748346, + "loss": 2.6184, + "step": 18366 + }, + { + "epoch": 0.5446431219049314, + "grad_norm": 0.11507011950016022, + "learning_rate": 0.00043732871522562917, + "loss": 2.6326, + "step": 18367 + }, + { + "epoch": 0.5446727752572428, + "grad_norm": 0.1356215476989746, + "learning_rate": 0.00043728203560873985, + "loss": 2.6494, + "step": 18368 + }, + { + "epoch": 0.5447024286095543, + "grad_norm": 0.13907599449157715, + "learning_rate": 0.0004372353565472289, + "loss": 2.6445, + "step": 18369 + }, + { + "epoch": 0.5447320819618658, + "grad_norm": 0.11282114684581757, + "learning_rate": 0.0004371886780415095, + "loss": 2.629, + "step": 18370 + }, + { + "epoch": 0.5447617353141773, + "grad_norm": 0.12051945179700851, + "learning_rate": 0.0004371420000919952, + "loss": 2.6197, + "step": 18371 + }, + { + "epoch": 0.5447913886664888, + "grad_norm": 0.11020185798406601, + "learning_rate": 0.0004370953226990994, + "loss": 2.6312, + "step": 18372 + }, + { + "epoch": 0.5448210420188002, + "grad_norm": 0.13405048847198486, + "learning_rate": 0.00043704864586323523, + "loss": 2.607, + "step": 18373 + }, + { + "epoch": 0.5448506953711117, + "grad_norm": 0.11618918180465698, + "learning_rate": 0.00043700196958481615, + "loss": 2.6142, + "step": 18374 + }, + { + "epoch": 0.5448803487234232, + "grad_norm": 0.11415787041187286, + "learning_rate": 0.0004369552938642554, + "loss": 2.6227, + "step": 18375 + }, + { + "epoch": 0.5449100020757347, + "grad_norm": 0.12099926173686981, + "learning_rate": 0.0004369086187019665, + "loss": 2.6208, + "step": 18376 + }, + { + "epoch": 0.5449396554280461, + "grad_norm": 0.10831522196531296, + "learning_rate": 0.0004368619440983625, + "loss": 2.6512, + "step": 18377 + }, + { + "epoch": 0.5449693087803577, + "grad_norm": 0.11410339921712875, + "learning_rate": 0.0004368152700538568, + "loss": 2.6219, + "step": 18378 + }, + { + "epoch": 0.5449989621326691, + "grad_norm": 0.11392831802368164, + "learning_rate": 0.0004367685965688627, + "loss": 2.5942, + "step": 18379 + }, + { + "epoch": 0.5450286154849806, + "grad_norm": 0.115813709795475, + "learning_rate": 0.0004367219236437936, + "loss": 2.6394, + "step": 18380 + }, + { + "epoch": 0.545058268837292, + "grad_norm": 0.11865410953760147, + "learning_rate": 0.0004366752512790627, + "loss": 2.6194, + "step": 18381 + }, + { + "epoch": 0.5450879221896036, + "grad_norm": 0.11907906085252762, + "learning_rate": 0.00043662857947508336, + "loss": 2.5967, + "step": 18382 + }, + { + "epoch": 0.545117575541915, + "grad_norm": 0.11170641332864761, + "learning_rate": 0.0004365819082322686, + "loss": 2.6398, + "step": 18383 + }, + { + "epoch": 0.5451472288942265, + "grad_norm": 0.11347637325525284, + "learning_rate": 0.00043653523755103206, + "loss": 2.6448, + "step": 18384 + }, + { + "epoch": 0.5451768822465379, + "grad_norm": 0.12205217778682709, + "learning_rate": 0.000436488567431787, + "loss": 2.6366, + "step": 18385 + }, + { + "epoch": 0.5452065355988495, + "grad_norm": 0.12629836797714233, + "learning_rate": 0.00043644189787494657, + "loss": 2.6615, + "step": 18386 + }, + { + "epoch": 0.5452361889511609, + "grad_norm": 0.12833891808986664, + "learning_rate": 0.000436395228880924, + "loss": 2.597, + "step": 18387 + }, + { + "epoch": 0.5452658423034724, + "grad_norm": 0.12058670818805695, + "learning_rate": 0.00043634856045013257, + "loss": 2.6116, + "step": 18388 + }, + { + "epoch": 0.5452954956557838, + "grad_norm": 0.09931033104658127, + "learning_rate": 0.0004363018925829856, + "loss": 2.6339, + "step": 18389 + }, + { + "epoch": 0.5453251490080954, + "grad_norm": 0.11020349711179733, + "learning_rate": 0.0004362552252798963, + "loss": 2.6021, + "step": 18390 + }, + { + "epoch": 0.5453548023604069, + "grad_norm": 0.103573739528656, + "learning_rate": 0.00043620855854127784, + "loss": 2.6038, + "step": 18391 + }, + { + "epoch": 0.5453844557127183, + "grad_norm": 0.09452494233846664, + "learning_rate": 0.0004361618923675436, + "loss": 2.6227, + "step": 18392 + }, + { + "epoch": 0.5454141090650299, + "grad_norm": 0.11182212829589844, + "learning_rate": 0.0004361152267591069, + "loss": 2.6366, + "step": 18393 + }, + { + "epoch": 0.5454437624173413, + "grad_norm": 0.11347296088933945, + "learning_rate": 0.00043606856171638067, + "loss": 2.6272, + "step": 18394 + }, + { + "epoch": 0.5454734157696528, + "grad_norm": 0.11623123288154602, + "learning_rate": 0.00043602189723977833, + "loss": 2.6517, + "step": 18395 + }, + { + "epoch": 0.5455030691219642, + "grad_norm": 0.10725667327642441, + "learning_rate": 0.0004359752333297128, + "loss": 2.5848, + "step": 18396 + }, + { + "epoch": 0.5455327224742758, + "grad_norm": 0.11149626970291138, + "learning_rate": 0.000435928569986598, + "loss": 2.6331, + "step": 18397 + }, + { + "epoch": 0.5455623758265872, + "grad_norm": 0.11349857598543167, + "learning_rate": 0.00043588190721084654, + "loss": 2.6294, + "step": 18398 + }, + { + "epoch": 0.5455920291788987, + "grad_norm": 0.11685199290513992, + "learning_rate": 0.00043583524500287175, + "loss": 2.6276, + "step": 18399 + }, + { + "epoch": 0.5456216825312101, + "grad_norm": 0.10430093109607697, + "learning_rate": 0.00043578858336308694, + "loss": 2.6726, + "step": 18400 + }, + { + "epoch": 0.5456513358835217, + "grad_norm": 0.11187440901994705, + "learning_rate": 0.00043574192229190524, + "loss": 2.6505, + "step": 18401 + }, + { + "epoch": 0.5456809892358331, + "grad_norm": 0.12013157457113266, + "learning_rate": 0.00043569526178973984, + "loss": 2.6365, + "step": 18402 + }, + { + "epoch": 0.5457106425881446, + "grad_norm": 0.12045580893754959, + "learning_rate": 0.0004356486018570041, + "loss": 2.625, + "step": 18403 + }, + { + "epoch": 0.545740295940456, + "grad_norm": 0.11424220353364944, + "learning_rate": 0.00043560194249411084, + "loss": 2.6287, + "step": 18404 + }, + { + "epoch": 0.5457699492927676, + "grad_norm": 0.12453751266002655, + "learning_rate": 0.00043555528370147346, + "loss": 2.6353, + "step": 18405 + }, + { + "epoch": 0.545799602645079, + "grad_norm": 0.11731654405593872, + "learning_rate": 0.0004355086254795051, + "loss": 2.6593, + "step": 18406 + }, + { + "epoch": 0.5458292559973905, + "grad_norm": 0.10003288090229034, + "learning_rate": 0.00043546196782861895, + "loss": 2.6522, + "step": 18407 + }, + { + "epoch": 0.5458589093497019, + "grad_norm": 0.1304635852575302, + "learning_rate": 0.00043541531074922814, + "loss": 2.6246, + "step": 18408 + }, + { + "epoch": 0.5458885627020135, + "grad_norm": 0.12291212379932404, + "learning_rate": 0.0004353686542417458, + "loss": 2.6302, + "step": 18409 + }, + { + "epoch": 0.5459182160543249, + "grad_norm": 0.12768280506134033, + "learning_rate": 0.0004353219983065851, + "loss": 2.6268, + "step": 18410 + }, + { + "epoch": 0.5459478694066364, + "grad_norm": 0.13560926914215088, + "learning_rate": 0.0004352753429441593, + "loss": 2.6322, + "step": 18411 + }, + { + "epoch": 0.545977522758948, + "grad_norm": 0.11883575469255447, + "learning_rate": 0.00043522868815488135, + "loss": 2.5955, + "step": 18412 + }, + { + "epoch": 0.5460071761112594, + "grad_norm": 0.12255915254354477, + "learning_rate": 0.0004351820339391646, + "loss": 2.6148, + "step": 18413 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 0.11624626815319061, + "learning_rate": 0.00043513538029742215, + "loss": 2.6679, + "step": 18414 + }, + { + "epoch": 0.5460664828158823, + "grad_norm": 0.11795011907815933, + "learning_rate": 0.0004350887272300669, + "loss": 2.6184, + "step": 18415 + }, + { + "epoch": 0.5460961361681939, + "grad_norm": 0.10895279049873352, + "learning_rate": 0.0004350420747375121, + "loss": 2.6267, + "step": 18416 + }, + { + "epoch": 0.5461257895205053, + "grad_norm": 0.1277415007352829, + "learning_rate": 0.0004349954228201709, + "loss": 2.6246, + "step": 18417 + }, + { + "epoch": 0.5461554428728168, + "grad_norm": 0.1296154111623764, + "learning_rate": 0.0004349487714784564, + "loss": 2.6554, + "step": 18418 + }, + { + "epoch": 0.5461850962251282, + "grad_norm": 0.09843409806489944, + "learning_rate": 0.00043490212071278165, + "loss": 2.6267, + "step": 18419 + }, + { + "epoch": 0.5462147495774398, + "grad_norm": 0.12069212645292282, + "learning_rate": 0.00043485547052356, + "loss": 2.648, + "step": 18420 + }, + { + "epoch": 0.5462444029297512, + "grad_norm": 0.12166214734315872, + "learning_rate": 0.00043480882091120415, + "loss": 2.5943, + "step": 18421 + }, + { + "epoch": 0.5462740562820627, + "grad_norm": 0.11242639273405075, + "learning_rate": 0.0004347621718761272, + "loss": 2.608, + "step": 18422 + }, + { + "epoch": 0.5463037096343741, + "grad_norm": 0.09431380778551102, + "learning_rate": 0.0004347155234187426, + "loss": 2.6337, + "step": 18423 + }, + { + "epoch": 0.5463333629866857, + "grad_norm": 0.09942370653152466, + "learning_rate": 0.0004346688755394634, + "loss": 2.6166, + "step": 18424 + }, + { + "epoch": 0.5463630163389971, + "grad_norm": 0.08911038935184479, + "learning_rate": 0.00043462222823870237, + "loss": 2.6395, + "step": 18425 + }, + { + "epoch": 0.5463926696913086, + "grad_norm": 0.11014942824840546, + "learning_rate": 0.0004345755815168727, + "loss": 2.6724, + "step": 18426 + }, + { + "epoch": 0.54642232304362, + "grad_norm": 0.128860205411911, + "learning_rate": 0.00043452893537438753, + "loss": 2.6433, + "step": 18427 + }, + { + "epoch": 0.5464519763959316, + "grad_norm": 0.1078413799405098, + "learning_rate": 0.00043448228981165995, + "loss": 2.6164, + "step": 18428 + }, + { + "epoch": 0.546481629748243, + "grad_norm": 0.10701350122690201, + "learning_rate": 0.0004344356448291028, + "loss": 2.6211, + "step": 18429 + }, + { + "epoch": 0.5465112831005545, + "grad_norm": 0.10537400096654892, + "learning_rate": 0.0004343890004271294, + "loss": 2.6142, + "step": 18430 + }, + { + "epoch": 0.5465409364528659, + "grad_norm": 0.13171379268169403, + "learning_rate": 0.0004343423566061527, + "loss": 2.6819, + "step": 18431 + }, + { + "epoch": 0.5465705898051775, + "grad_norm": 0.12284159660339355, + "learning_rate": 0.0004342957133665856, + "loss": 2.6027, + "step": 18432 + }, + { + "epoch": 0.546600243157489, + "grad_norm": 0.10927252471446991, + "learning_rate": 0.00043424907070884123, + "loss": 2.646, + "step": 18433 + }, + { + "epoch": 0.5466298965098004, + "grad_norm": 0.10512549430131912, + "learning_rate": 0.0004342024286333326, + "loss": 2.6338, + "step": 18434 + }, + { + "epoch": 0.546659549862112, + "grad_norm": 0.10693187266588211, + "learning_rate": 0.0004341557871404727, + "loss": 2.6326, + "step": 18435 + }, + { + "epoch": 0.5466892032144234, + "grad_norm": 0.11130741238594055, + "learning_rate": 0.00043410914623067466, + "loss": 2.6522, + "step": 18436 + }, + { + "epoch": 0.5467188565667349, + "grad_norm": 0.11083313077688217, + "learning_rate": 0.0004340625059043515, + "loss": 2.6411, + "step": 18437 + }, + { + "epoch": 0.5467485099190463, + "grad_norm": 0.1001487448811531, + "learning_rate": 0.0004340158661619161, + "loss": 2.6224, + "step": 18438 + }, + { + "epoch": 0.5467781632713579, + "grad_norm": 0.09813033044338226, + "learning_rate": 0.0004339692270037816, + "loss": 2.6044, + "step": 18439 + }, + { + "epoch": 0.5468078166236693, + "grad_norm": 0.11591480672359467, + "learning_rate": 0.0004339225884303609, + "loss": 2.618, + "step": 18440 + }, + { + "epoch": 0.5468374699759808, + "grad_norm": 0.12029323726892471, + "learning_rate": 0.0004338759504420672, + "loss": 2.6349, + "step": 18441 + }, + { + "epoch": 0.5468671233282922, + "grad_norm": 0.12367207556962967, + "learning_rate": 0.0004338293130393131, + "loss": 2.6391, + "step": 18442 + }, + { + "epoch": 0.5468967766806038, + "grad_norm": 0.12825827300548553, + "learning_rate": 0.0004337826762225118, + "loss": 2.5903, + "step": 18443 + }, + { + "epoch": 0.5469264300329152, + "grad_norm": 0.1448698192834854, + "learning_rate": 0.0004337360399920763, + "loss": 2.6208, + "step": 18444 + }, + { + "epoch": 0.5469560833852267, + "grad_norm": 0.1123063862323761, + "learning_rate": 0.0004336894043484195, + "loss": 2.6283, + "step": 18445 + }, + { + "epoch": 0.5469857367375381, + "grad_norm": 0.10787449777126312, + "learning_rate": 0.0004336427692919545, + "loss": 2.6152, + "step": 18446 + }, + { + "epoch": 0.5470153900898497, + "grad_norm": 0.11957802623510361, + "learning_rate": 0.0004335961348230941, + "loss": 2.6241, + "step": 18447 + }, + { + "epoch": 0.5470450434421611, + "grad_norm": 0.13094118237495422, + "learning_rate": 0.0004335495009422512, + "loss": 2.6437, + "step": 18448 + }, + { + "epoch": 0.5470746967944726, + "grad_norm": 0.10831762850284576, + "learning_rate": 0.000433502867649839, + "loss": 2.6194, + "step": 18449 + }, + { + "epoch": 0.547104350146784, + "grad_norm": 0.099501833319664, + "learning_rate": 0.00043345623494627037, + "loss": 2.6046, + "step": 18450 + }, + { + "epoch": 0.5471340034990956, + "grad_norm": 0.11290669441223145, + "learning_rate": 0.0004334096028319582, + "loss": 2.635, + "step": 18451 + }, + { + "epoch": 0.547163656851407, + "grad_norm": 0.11117023229598999, + "learning_rate": 0.00043336297130731546, + "loss": 2.643, + "step": 18452 + }, + { + "epoch": 0.5471933102037185, + "grad_norm": 0.09727330505847931, + "learning_rate": 0.000433316340372755, + "loss": 2.615, + "step": 18453 + }, + { + "epoch": 0.5472229635560301, + "grad_norm": 0.1099662333726883, + "learning_rate": 0.0004332697100286898, + "loss": 2.6169, + "step": 18454 + }, + { + "epoch": 0.5472526169083415, + "grad_norm": 0.12463922053575516, + "learning_rate": 0.0004332230802755327, + "loss": 2.6605, + "step": 18455 + }, + { + "epoch": 0.547282270260653, + "grad_norm": 0.13122007250785828, + "learning_rate": 0.0004331764511136967, + "loss": 2.6161, + "step": 18456 + }, + { + "epoch": 0.5473119236129644, + "grad_norm": 0.13211379945278168, + "learning_rate": 0.00043312982254359474, + "loss": 2.6417, + "step": 18457 + }, + { + "epoch": 0.547341576965276, + "grad_norm": 0.12341523170471191, + "learning_rate": 0.00043308319456563976, + "loss": 2.6212, + "step": 18458 + }, + { + "epoch": 0.5473712303175874, + "grad_norm": 0.12066858261823654, + "learning_rate": 0.0004330365671802445, + "loss": 2.6074, + "step": 18459 + }, + { + "epoch": 0.5474008836698989, + "grad_norm": 0.11404156684875488, + "learning_rate": 0.0004329899403878219, + "loss": 2.658, + "step": 18460 + }, + { + "epoch": 0.5474305370222103, + "grad_norm": 0.12191285192966461, + "learning_rate": 0.0004329433141887846, + "loss": 2.6304, + "step": 18461 + }, + { + "epoch": 0.5474601903745219, + "grad_norm": 0.11766018718481064, + "learning_rate": 0.0004328966885835462, + "loss": 2.6098, + "step": 18462 + }, + { + "epoch": 0.5474898437268333, + "grad_norm": 0.10834070295095444, + "learning_rate": 0.00043285006357251897, + "loss": 2.6637, + "step": 18463 + }, + { + "epoch": 0.5475194970791448, + "grad_norm": 0.12314370274543762, + "learning_rate": 0.00043280343915611604, + "loss": 2.6195, + "step": 18464 + }, + { + "epoch": 0.5475491504314562, + "grad_norm": 0.1146920770406723, + "learning_rate": 0.0004327568153347501, + "loss": 2.5924, + "step": 18465 + }, + { + "epoch": 0.5475788037837678, + "grad_norm": 0.10489402711391449, + "learning_rate": 0.0004327101921088341, + "loss": 2.61, + "step": 18466 + }, + { + "epoch": 0.5476084571360792, + "grad_norm": 0.11455672979354858, + "learning_rate": 0.00043266356947878093, + "loss": 2.6396, + "step": 18467 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 0.1135922446846962, + "learning_rate": 0.00043261694744500345, + "loss": 2.6263, + "step": 18468 + }, + { + "epoch": 0.5476677638407021, + "grad_norm": 0.10104551911354065, + "learning_rate": 0.0004325703260079145, + "loss": 2.6564, + "step": 18469 + }, + { + "epoch": 0.5476974171930137, + "grad_norm": 0.11155449599027634, + "learning_rate": 0.00043252370516792685, + "loss": 2.6029, + "step": 18470 + }, + { + "epoch": 0.5477270705453251, + "grad_norm": 0.11547049880027771, + "learning_rate": 0.00043247708492545335, + "loss": 2.6294, + "step": 18471 + }, + { + "epoch": 0.5477567238976366, + "grad_norm": 0.10180388391017914, + "learning_rate": 0.00043243046528090677, + "loss": 2.6135, + "step": 18472 + }, + { + "epoch": 0.547786377249948, + "grad_norm": 0.0961441695690155, + "learning_rate": 0.0004323838462347001, + "loss": 2.5792, + "step": 18473 + }, + { + "epoch": 0.5478160306022596, + "grad_norm": 0.10327061265707016, + "learning_rate": 0.00043233722778724597, + "loss": 2.6495, + "step": 18474 + }, + { + "epoch": 0.5478456839545711, + "grad_norm": 0.0966700091958046, + "learning_rate": 0.0004322906099389574, + "loss": 2.6555, + "step": 18475 + }, + { + "epoch": 0.5478753373068825, + "grad_norm": 0.10878454893827438, + "learning_rate": 0.00043224399269024713, + "loss": 2.6572, + "step": 18476 + }, + { + "epoch": 0.5479049906591941, + "grad_norm": 0.1031298041343689, + "learning_rate": 0.0004321973760415279, + "loss": 2.6316, + "step": 18477 + }, + { + "epoch": 0.5479346440115055, + "grad_norm": 0.10327780246734619, + "learning_rate": 0.00043215075999321253, + "loss": 2.6332, + "step": 18478 + }, + { + "epoch": 0.547964297363817, + "grad_norm": 0.09615027159452438, + "learning_rate": 0.00043210414454571393, + "loss": 2.6258, + "step": 18479 + }, + { + "epoch": 0.5479939507161284, + "grad_norm": 0.09622400254011154, + "learning_rate": 0.00043205752969944475, + "loss": 2.63, + "step": 18480 + }, + { + "epoch": 0.54802360406844, + "grad_norm": 0.1157105416059494, + "learning_rate": 0.0004320109154548177, + "loss": 2.639, + "step": 18481 + }, + { + "epoch": 0.5480532574207514, + "grad_norm": 0.09966166317462921, + "learning_rate": 0.0004319643018122458, + "loss": 2.606, + "step": 18482 + }, + { + "epoch": 0.5480829107730629, + "grad_norm": 0.10641749948263168, + "learning_rate": 0.00043191768877214157, + "loss": 2.6324, + "step": 18483 + }, + { + "epoch": 0.5481125641253743, + "grad_norm": 0.1028217151761055, + "learning_rate": 0.0004318710763349179, + "loss": 2.6633, + "step": 18484 + }, + { + "epoch": 0.5481422174776859, + "grad_norm": 0.12815900146961212, + "learning_rate": 0.00043182446450098756, + "loss": 2.6466, + "step": 18485 + }, + { + "epoch": 0.5481718708299973, + "grad_norm": 0.14881575107574463, + "learning_rate": 0.0004317778532707634, + "loss": 2.6249, + "step": 18486 + }, + { + "epoch": 0.5482015241823088, + "grad_norm": 0.11976838111877441, + "learning_rate": 0.00043173124264465776, + "loss": 2.6303, + "step": 18487 + }, + { + "epoch": 0.5482311775346203, + "grad_norm": 0.09969201683998108, + "learning_rate": 0.0004316846326230839, + "loss": 2.6352, + "step": 18488 + }, + { + "epoch": 0.5482608308869318, + "grad_norm": 0.1296699345111847, + "learning_rate": 0.0004316380232064543, + "loss": 2.6519, + "step": 18489 + }, + { + "epoch": 0.5482904842392432, + "grad_norm": 0.1247962936758995, + "learning_rate": 0.0004315914143951819, + "loss": 2.5966, + "step": 18490 + }, + { + "epoch": 0.5483201375915547, + "grad_norm": 0.10326685011386871, + "learning_rate": 0.0004315448061896791, + "loss": 2.609, + "step": 18491 + }, + { + "epoch": 0.5483497909438662, + "grad_norm": 0.1307612955570221, + "learning_rate": 0.00043149819859035883, + "loss": 2.6431, + "step": 18492 + }, + { + "epoch": 0.5483794442961777, + "grad_norm": 0.14528368413448334, + "learning_rate": 0.00043145159159763374, + "loss": 2.6229, + "step": 18493 + }, + { + "epoch": 0.5484090976484891, + "grad_norm": 0.13873390853405, + "learning_rate": 0.00043140498521191664, + "loss": 2.647, + "step": 18494 + }, + { + "epoch": 0.5484387510008006, + "grad_norm": 0.11851073801517487, + "learning_rate": 0.0004313583794336201, + "loss": 2.6269, + "step": 18495 + }, + { + "epoch": 0.5484684043531122, + "grad_norm": 0.1187511458992958, + "learning_rate": 0.00043131177426315704, + "loss": 2.627, + "step": 18496 + }, + { + "epoch": 0.5484980577054236, + "grad_norm": 0.12014622986316681, + "learning_rate": 0.0004312651697009399, + "loss": 2.6594, + "step": 18497 + }, + { + "epoch": 0.5485277110577351, + "grad_norm": 0.12020602822303772, + "learning_rate": 0.00043121856574738143, + "loss": 2.6385, + "step": 18498 + }, + { + "epoch": 0.5485573644100465, + "grad_norm": 0.12027912586927414, + "learning_rate": 0.00043117196240289446, + "loss": 2.5909, + "step": 18499 + }, + { + "epoch": 0.5485870177623581, + "grad_norm": 0.10765132308006287, + "learning_rate": 0.0004311253596678914, + "loss": 2.6344, + "step": 18500 + }, + { + "epoch": 0.5486166711146695, + "grad_norm": 0.10307115316390991, + "learning_rate": 0.00043107875754278517, + "loss": 2.6328, + "step": 18501 + }, + { + "epoch": 0.548646324466981, + "grad_norm": 0.10411392152309418, + "learning_rate": 0.00043103215602798846, + "loss": 2.6274, + "step": 18502 + }, + { + "epoch": 0.5486759778192924, + "grad_norm": 0.11618343740701675, + "learning_rate": 0.0004309855551239138, + "loss": 2.6659, + "step": 18503 + }, + { + "epoch": 0.548705631171604, + "grad_norm": 0.1294659674167633, + "learning_rate": 0.0004309389548309739, + "loss": 2.6207, + "step": 18504 + }, + { + "epoch": 0.5487352845239154, + "grad_norm": 0.12486127018928528, + "learning_rate": 0.00043089235514958145, + "loss": 2.6155, + "step": 18505 + }, + { + "epoch": 0.5487649378762269, + "grad_norm": 0.1202327162027359, + "learning_rate": 0.000430845756080149, + "loss": 2.6172, + "step": 18506 + }, + { + "epoch": 0.5487945912285384, + "grad_norm": 0.10772000998258591, + "learning_rate": 0.00043079915762308943, + "loss": 2.6366, + "step": 18507 + }, + { + "epoch": 0.5488242445808499, + "grad_norm": 0.10030227899551392, + "learning_rate": 0.000430752559778815, + "loss": 2.6278, + "step": 18508 + }, + { + "epoch": 0.5488538979331613, + "grad_norm": 0.11727546155452728, + "learning_rate": 0.0004307059625477386, + "loss": 2.6307, + "step": 18509 + }, + { + "epoch": 0.5488835512854728, + "grad_norm": 0.11948752403259277, + "learning_rate": 0.00043065936593027275, + "loss": 2.6367, + "step": 18510 + }, + { + "epoch": 0.5489132046377843, + "grad_norm": 0.12084127217531204, + "learning_rate": 0.00043061276992683017, + "loss": 2.6526, + "step": 18511 + }, + { + "epoch": 0.5489428579900958, + "grad_norm": 0.11001016199588776, + "learning_rate": 0.00043056617453782333, + "loss": 2.6415, + "step": 18512 + }, + { + "epoch": 0.5489725113424072, + "grad_norm": 0.10128548741340637, + "learning_rate": 0.00043051957976366495, + "loss": 2.6383, + "step": 18513 + }, + { + "epoch": 0.5490021646947187, + "grad_norm": 0.11557342857122421, + "learning_rate": 0.00043047298560476766, + "loss": 2.6542, + "step": 18514 + }, + { + "epoch": 0.5490318180470302, + "grad_norm": 0.13303592801094055, + "learning_rate": 0.00043042639206154407, + "loss": 2.642, + "step": 18515 + }, + { + "epoch": 0.5490614713993417, + "grad_norm": 0.11329936236143112, + "learning_rate": 0.0004303797991344066, + "loss": 2.6202, + "step": 18516 + }, + { + "epoch": 0.5490911247516532, + "grad_norm": 0.09971155226230621, + "learning_rate": 0.0004303332068237682, + "loss": 2.642, + "step": 18517 + }, + { + "epoch": 0.5491207781039646, + "grad_norm": 0.1217888742685318, + "learning_rate": 0.000430286615130041, + "loss": 2.6365, + "step": 18518 + }, + { + "epoch": 0.5491504314562762, + "grad_norm": 0.12324625253677368, + "learning_rate": 0.00043024002405363785, + "loss": 2.6289, + "step": 18519 + }, + { + "epoch": 0.5491800848085876, + "grad_norm": 0.11592970788478851, + "learning_rate": 0.0004301934335949713, + "loss": 2.6552, + "step": 18520 + }, + { + "epoch": 0.5492097381608991, + "grad_norm": 0.13201625645160675, + "learning_rate": 0.0004301468437544538, + "loss": 2.6646, + "step": 18521 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 0.1577618569135666, + "learning_rate": 0.000430100254532498, + "loss": 2.6364, + "step": 18522 + }, + { + "epoch": 0.5492690448655221, + "grad_norm": 0.15921920537948608, + "learning_rate": 0.00043005366592951646, + "loss": 2.6425, + "step": 18523 + }, + { + "epoch": 0.5492986982178335, + "grad_norm": 0.11501365154981613, + "learning_rate": 0.0004300070779459218, + "loss": 2.6213, + "step": 18524 + }, + { + "epoch": 0.549328351570145, + "grad_norm": 0.11388186365365982, + "learning_rate": 0.00042996049058212615, + "loss": 2.6461, + "step": 18525 + }, + { + "epoch": 0.5493580049224565, + "grad_norm": 0.1299101710319519, + "learning_rate": 0.0004299139038385426, + "loss": 2.6372, + "step": 18526 + }, + { + "epoch": 0.549387658274768, + "grad_norm": 0.12024157494306564, + "learning_rate": 0.0004298673177155835, + "loss": 2.6598, + "step": 18527 + }, + { + "epoch": 0.5494173116270794, + "grad_norm": 0.1303664594888687, + "learning_rate": 0.0004298207322136615, + "loss": 2.6259, + "step": 18528 + }, + { + "epoch": 0.5494469649793909, + "grad_norm": 0.10531673580408096, + "learning_rate": 0.00042977414733318874, + "loss": 2.6503, + "step": 18529 + }, + { + "epoch": 0.5494766183317024, + "grad_norm": 0.1087576225399971, + "learning_rate": 0.000429727563074578, + "loss": 2.6077, + "step": 18530 + }, + { + "epoch": 0.5495062716840139, + "grad_norm": 0.11916261166334152, + "learning_rate": 0.00042968097943824177, + "loss": 2.6529, + "step": 18531 + }, + { + "epoch": 0.5495359250363253, + "grad_norm": 0.12406395375728607, + "learning_rate": 0.00042963439642459245, + "loss": 2.6442, + "step": 18532 + }, + { + "epoch": 0.5495655783886368, + "grad_norm": 0.1300632357597351, + "learning_rate": 0.00042958781403404275, + "loss": 2.6115, + "step": 18533 + }, + { + "epoch": 0.5495952317409483, + "grad_norm": 0.1286117136478424, + "learning_rate": 0.0004295412322670051, + "loss": 2.6188, + "step": 18534 + }, + { + "epoch": 0.5496248850932598, + "grad_norm": 0.10788841545581818, + "learning_rate": 0.0004294946511238918, + "loss": 2.644, + "step": 18535 + }, + { + "epoch": 0.5496545384455712, + "grad_norm": 0.11443184316158295, + "learning_rate": 0.0004294480706051155, + "loss": 2.6396, + "step": 18536 + }, + { + "epoch": 0.5496841917978827, + "grad_norm": 0.12812986969947815, + "learning_rate": 0.0004294014907110886, + "loss": 2.6077, + "step": 18537 + }, + { + "epoch": 0.5497138451501943, + "grad_norm": 0.11265704035758972, + "learning_rate": 0.00042935491144222357, + "loss": 2.6191, + "step": 18538 + }, + { + "epoch": 0.5497434985025057, + "grad_norm": 0.10292447358369827, + "learning_rate": 0.000429308332798933, + "loss": 2.6256, + "step": 18539 + }, + { + "epoch": 0.5497731518548172, + "grad_norm": 0.12811030447483063, + "learning_rate": 0.00042926175478162924, + "loss": 2.5963, + "step": 18540 + }, + { + "epoch": 0.5498028052071287, + "grad_norm": 0.1309477984905243, + "learning_rate": 0.0004292151773907249, + "loss": 2.5986, + "step": 18541 + }, + { + "epoch": 0.5498324585594402, + "grad_norm": 0.12067500501871109, + "learning_rate": 0.0004291686006266322, + "loss": 2.6176, + "step": 18542 + }, + { + "epoch": 0.5498621119117516, + "grad_norm": 0.10521215945482254, + "learning_rate": 0.0004291220244897637, + "loss": 2.657, + "step": 18543 + }, + { + "epoch": 0.5498917652640631, + "grad_norm": 0.13114497065544128, + "learning_rate": 0.0004290754489805319, + "loss": 2.6424, + "step": 18544 + }, + { + "epoch": 0.5499214186163746, + "grad_norm": 0.1236487329006195, + "learning_rate": 0.00042902887409934924, + "loss": 2.5984, + "step": 18545 + }, + { + "epoch": 0.5499510719686861, + "grad_norm": 0.13829456269741058, + "learning_rate": 0.00042898229984662806, + "loss": 2.6164, + "step": 18546 + }, + { + "epoch": 0.5499807253209975, + "grad_norm": 0.12357231974601746, + "learning_rate": 0.0004289357262227807, + "loss": 2.5954, + "step": 18547 + }, + { + "epoch": 0.550010378673309, + "grad_norm": 0.11360856890678406, + "learning_rate": 0.0004288891532282198, + "loss": 2.6378, + "step": 18548 + }, + { + "epoch": 0.5500400320256205, + "grad_norm": 0.12046617269515991, + "learning_rate": 0.0004288425808633575, + "loss": 2.5883, + "step": 18549 + }, + { + "epoch": 0.550069685377932, + "grad_norm": 0.12433476001024246, + "learning_rate": 0.00042879600912860646, + "loss": 2.6037, + "step": 18550 + }, + { + "epoch": 0.5500993387302434, + "grad_norm": 0.11783239990472794, + "learning_rate": 0.00042874943802437884, + "loss": 2.648, + "step": 18551 + }, + { + "epoch": 0.550128992082555, + "grad_norm": 0.10332165658473969, + "learning_rate": 0.0004287028675510873, + "loss": 2.6302, + "step": 18552 + }, + { + "epoch": 0.5501586454348664, + "grad_norm": 0.12212593108415604, + "learning_rate": 0.0004286562977091441, + "loss": 2.6174, + "step": 18553 + }, + { + "epoch": 0.5501882987871779, + "grad_norm": 0.1288546323776245, + "learning_rate": 0.0004286097284989616, + "loss": 2.6447, + "step": 18554 + }, + { + "epoch": 0.5502179521394893, + "grad_norm": 0.10934550315141678, + "learning_rate": 0.00042856315992095237, + "loss": 2.6208, + "step": 18555 + }, + { + "epoch": 0.5502476054918009, + "grad_norm": 0.12373384833335876, + "learning_rate": 0.0004285165919755285, + "loss": 2.5939, + "step": 18556 + }, + { + "epoch": 0.5502772588441123, + "grad_norm": 0.12146458029747009, + "learning_rate": 0.0004284700246631025, + "loss": 2.6275, + "step": 18557 + }, + { + "epoch": 0.5503069121964238, + "grad_norm": 0.12243364751338959, + "learning_rate": 0.0004284234579840866, + "loss": 2.6133, + "step": 18558 + }, + { + "epoch": 0.5503365655487353, + "grad_norm": 0.11457331478595734, + "learning_rate": 0.0004283768919388934, + "loss": 2.6649, + "step": 18559 + }, + { + "epoch": 0.5503662189010468, + "grad_norm": 0.1078655868768692, + "learning_rate": 0.00042833032652793505, + "loss": 2.6014, + "step": 18560 + }, + { + "epoch": 0.5503958722533583, + "grad_norm": 0.09885235875844955, + "learning_rate": 0.0004282837617516239, + "loss": 2.6409, + "step": 18561 + }, + { + "epoch": 0.5504255256056697, + "grad_norm": 0.10826652497053146, + "learning_rate": 0.00042823719761037255, + "loss": 2.6204, + "step": 18562 + }, + { + "epoch": 0.5504551789579812, + "grad_norm": 0.10777998715639114, + "learning_rate": 0.000428190634104593, + "loss": 2.6519, + "step": 18563 + }, + { + "epoch": 0.5504848323102927, + "grad_norm": 0.08810853213071823, + "learning_rate": 0.0004281440712346975, + "loss": 2.6239, + "step": 18564 + }, + { + "epoch": 0.5505144856626042, + "grad_norm": 0.11462540179491043, + "learning_rate": 0.00042809750900109876, + "loss": 2.6635, + "step": 18565 + }, + { + "epoch": 0.5505441390149156, + "grad_norm": 0.09519058465957642, + "learning_rate": 0.00042805094740420914, + "loss": 2.592, + "step": 18566 + }, + { + "epoch": 0.5505737923672271, + "grad_norm": 0.09943366050720215, + "learning_rate": 0.0004280043864444405, + "loss": 2.6771, + "step": 18567 + }, + { + "epoch": 0.5506034457195386, + "grad_norm": 0.11466316878795624, + "learning_rate": 0.00042795782612220544, + "loss": 2.6293, + "step": 18568 + }, + { + "epoch": 0.5506330990718501, + "grad_norm": 0.08831131458282471, + "learning_rate": 0.00042791126643791617, + "loss": 2.6369, + "step": 18569 + }, + { + "epoch": 0.5506627524241615, + "grad_norm": 0.09716516733169556, + "learning_rate": 0.000427864707391985, + "loss": 2.6396, + "step": 18570 + }, + { + "epoch": 0.550692405776473, + "grad_norm": 0.09012757241725922, + "learning_rate": 0.00042781814898482426, + "loss": 2.6764, + "step": 18571 + }, + { + "epoch": 0.5507220591287845, + "grad_norm": 0.097968690097332, + "learning_rate": 0.0004277715912168463, + "loss": 2.6294, + "step": 18572 + }, + { + "epoch": 0.550751712481096, + "grad_norm": 0.10258522629737854, + "learning_rate": 0.0004277250340884632, + "loss": 2.6267, + "step": 18573 + }, + { + "epoch": 0.5507813658334074, + "grad_norm": 0.10777699202299118, + "learning_rate": 0.00042767847760008727, + "loss": 2.6537, + "step": 18574 + }, + { + "epoch": 0.550811019185719, + "grad_norm": 0.1324540227651596, + "learning_rate": 0.0004276319217521309, + "loss": 2.6606, + "step": 18575 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 0.1385170817375183, + "learning_rate": 0.00042758536654500623, + "loss": 2.6267, + "step": 18576 + }, + { + "epoch": 0.5508703258903419, + "grad_norm": 0.12167384475469589, + "learning_rate": 0.00042753881197912547, + "loss": 2.6238, + "step": 18577 + }, + { + "epoch": 0.5508999792426534, + "grad_norm": 0.11729961633682251, + "learning_rate": 0.0004274922580549011, + "loss": 2.6526, + "step": 18578 + }, + { + "epoch": 0.5509296325949649, + "grad_norm": 0.11073361337184906, + "learning_rate": 0.00042744570477274525, + "loss": 2.628, + "step": 18579 + }, + { + "epoch": 0.5509592859472764, + "grad_norm": 0.10642112791538239, + "learning_rate": 0.0004273991521330701, + "loss": 2.6352, + "step": 18580 + }, + { + "epoch": 0.5509889392995878, + "grad_norm": 0.10305269807577133, + "learning_rate": 0.000427352600136288, + "loss": 2.6346, + "step": 18581 + }, + { + "epoch": 0.5510185926518993, + "grad_norm": 0.11488258838653564, + "learning_rate": 0.000427306048782811, + "loss": 2.638, + "step": 18582 + }, + { + "epoch": 0.5510482460042108, + "grad_norm": 0.10472575575113297, + "learning_rate": 0.00042725949807305154, + "loss": 2.6059, + "step": 18583 + }, + { + "epoch": 0.5510778993565223, + "grad_norm": 0.1047949492931366, + "learning_rate": 0.0004272129480074216, + "loss": 2.6324, + "step": 18584 + }, + { + "epoch": 0.5511075527088337, + "grad_norm": 0.11030346900224686, + "learning_rate": 0.00042716639858633357, + "loss": 2.6631, + "step": 18585 + }, + { + "epoch": 0.5511372060611452, + "grad_norm": 0.11206136643886566, + "learning_rate": 0.0004271198498101995, + "loss": 2.6346, + "step": 18586 + }, + { + "epoch": 0.5511668594134567, + "grad_norm": 0.10398031771183014, + "learning_rate": 0.0004270733016794317, + "loss": 2.6394, + "step": 18587 + }, + { + "epoch": 0.5511965127657682, + "grad_norm": 0.11664806306362152, + "learning_rate": 0.0004270267541944423, + "loss": 2.6312, + "step": 18588 + }, + { + "epoch": 0.5512261661180796, + "grad_norm": 0.10869406163692474, + "learning_rate": 0.0004269802073556437, + "loss": 2.657, + "step": 18589 + }, + { + "epoch": 0.5512558194703912, + "grad_norm": 0.11298476159572601, + "learning_rate": 0.0004269336611634475, + "loss": 2.6623, + "step": 18590 + }, + { + "epoch": 0.5512854728227026, + "grad_norm": 0.12427869439125061, + "learning_rate": 0.0004268871156182665, + "loss": 2.6256, + "step": 18591 + }, + { + "epoch": 0.5513151261750141, + "grad_norm": 0.12567612528800964, + "learning_rate": 0.0004268405707205127, + "loss": 2.6252, + "step": 18592 + }, + { + "epoch": 0.5513447795273255, + "grad_norm": 0.12224765121936798, + "learning_rate": 0.00042679402647059826, + "loss": 2.6074, + "step": 18593 + }, + { + "epoch": 0.551374432879637, + "grad_norm": 0.12249502539634705, + "learning_rate": 0.00042674748286893516, + "loss": 2.643, + "step": 18594 + }, + { + "epoch": 0.5514040862319485, + "grad_norm": 0.11781086772680283, + "learning_rate": 0.00042670093991593574, + "loss": 2.6237, + "step": 18595 + }, + { + "epoch": 0.55143373958426, + "grad_norm": 0.12033823132514954, + "learning_rate": 0.00042665439761201194, + "loss": 2.6052, + "step": 18596 + }, + { + "epoch": 0.5514633929365714, + "grad_norm": 0.1384948343038559, + "learning_rate": 0.00042660785595757616, + "loss": 2.6309, + "step": 18597 + }, + { + "epoch": 0.551493046288883, + "grad_norm": 0.12350521236658096, + "learning_rate": 0.0004265613149530404, + "loss": 2.657, + "step": 18598 + }, + { + "epoch": 0.5515226996411945, + "grad_norm": 0.10047397762537003, + "learning_rate": 0.0004265147745988168, + "loss": 2.6145, + "step": 18599 + }, + { + "epoch": 0.5515523529935059, + "grad_norm": 0.10228107869625092, + "learning_rate": 0.0004264682348953176, + "loss": 2.6586, + "step": 18600 + }, + { + "epoch": 0.5515820063458174, + "grad_norm": 0.12105954438447952, + "learning_rate": 0.00042642169584295467, + "loss": 2.6248, + "step": 18601 + }, + { + "epoch": 0.5516116596981289, + "grad_norm": 0.12148828059434891, + "learning_rate": 0.0004263751574421402, + "loss": 2.6507, + "step": 18602 + }, + { + "epoch": 0.5516413130504404, + "grad_norm": 0.11292377859354019, + "learning_rate": 0.00042632861969328623, + "loss": 2.6178, + "step": 18603 + }, + { + "epoch": 0.5516709664027518, + "grad_norm": 0.12434003502130508, + "learning_rate": 0.0004262820825968052, + "loss": 2.6363, + "step": 18604 + }, + { + "epoch": 0.5517006197550633, + "grad_norm": 0.1250091791152954, + "learning_rate": 0.000426235546153109, + "loss": 2.6187, + "step": 18605 + }, + { + "epoch": 0.5517302731073748, + "grad_norm": 0.12228372693061829, + "learning_rate": 0.00042618901036260964, + "loss": 2.6613, + "step": 18606 + }, + { + "epoch": 0.5517599264596863, + "grad_norm": 0.1026625782251358, + "learning_rate": 0.00042614247522571925, + "loss": 2.635, + "step": 18607 + }, + { + "epoch": 0.5517895798119977, + "grad_norm": 0.12746691703796387, + "learning_rate": 0.00042609594074284997, + "loss": 2.6483, + "step": 18608 + }, + { + "epoch": 0.5518192331643093, + "grad_norm": 0.11866526305675507, + "learning_rate": 0.00042604940691441374, + "loss": 2.6314, + "step": 18609 + }, + { + "epoch": 0.5518488865166207, + "grad_norm": 0.11087644845247269, + "learning_rate": 0.00042600287374082284, + "loss": 2.63, + "step": 18610 + }, + { + "epoch": 0.5518785398689322, + "grad_norm": 0.1096191480755806, + "learning_rate": 0.00042595634122248913, + "loss": 2.6305, + "step": 18611 + }, + { + "epoch": 0.5519081932212436, + "grad_norm": 0.11791148781776428, + "learning_rate": 0.00042590980935982463, + "loss": 2.6381, + "step": 18612 + }, + { + "epoch": 0.5519378465735552, + "grad_norm": 0.12545345723628998, + "learning_rate": 0.0004258632781532416, + "loss": 2.6436, + "step": 18613 + }, + { + "epoch": 0.5519674999258666, + "grad_norm": 0.1110881119966507, + "learning_rate": 0.00042581674760315194, + "loss": 2.6687, + "step": 18614 + }, + { + "epoch": 0.5519971532781781, + "grad_norm": 0.11693218350410461, + "learning_rate": 0.00042577021770996763, + "loss": 2.6246, + "step": 18615 + }, + { + "epoch": 0.5520268066304895, + "grad_norm": 0.114615797996521, + "learning_rate": 0.00042572368847410073, + "loss": 2.6512, + "step": 18616 + }, + { + "epoch": 0.5520564599828011, + "grad_norm": 0.11343314498662949, + "learning_rate": 0.00042567715989596345, + "loss": 2.6357, + "step": 18617 + }, + { + "epoch": 0.5520861133351125, + "grad_norm": 0.11154796183109283, + "learning_rate": 0.0004256306319759677, + "loss": 2.6441, + "step": 18618 + }, + { + "epoch": 0.552115766687424, + "grad_norm": 0.12404942512512207, + "learning_rate": 0.0004255841047145254, + "loss": 2.6218, + "step": 18619 + }, + { + "epoch": 0.5521454200397355, + "grad_norm": 0.1302698254585266, + "learning_rate": 0.0004255375781120486, + "loss": 2.6409, + "step": 18620 + }, + { + "epoch": 0.552175073392047, + "grad_norm": 0.12413450330495834, + "learning_rate": 0.00042549105216894956, + "loss": 2.6419, + "step": 18621 + }, + { + "epoch": 0.5522047267443585, + "grad_norm": 0.11001242697238922, + "learning_rate": 0.00042544452688563986, + "loss": 2.6074, + "step": 18622 + }, + { + "epoch": 0.5522343800966699, + "grad_norm": 0.11877425760030746, + "learning_rate": 0.00042539800226253164, + "loss": 2.6365, + "step": 18623 + }, + { + "epoch": 0.5522640334489815, + "grad_norm": 0.10926567763090134, + "learning_rate": 0.000425351478300037, + "loss": 2.6126, + "step": 18624 + }, + { + "epoch": 0.5522936868012929, + "grad_norm": 0.12130124866962433, + "learning_rate": 0.00042530495499856774, + "loss": 2.6195, + "step": 18625 + }, + { + "epoch": 0.5523233401536044, + "grad_norm": 0.11449197679758072, + "learning_rate": 0.000425258432358536, + "loss": 2.6413, + "step": 18626 + }, + { + "epoch": 0.5523529935059158, + "grad_norm": 0.11376489698886871, + "learning_rate": 0.0004252119103803537, + "loss": 2.6398, + "step": 18627 + }, + { + "epoch": 0.5523826468582274, + "grad_norm": 0.1184564158320427, + "learning_rate": 0.00042516538906443277, + "loss": 2.644, + "step": 18628 + }, + { + "epoch": 0.5524123002105388, + "grad_norm": 0.11106058955192566, + "learning_rate": 0.00042511886841118486, + "loss": 2.61, + "step": 18629 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 0.10225334763526917, + "learning_rate": 0.0004250723484210225, + "loss": 2.6394, + "step": 18630 + }, + { + "epoch": 0.5524716069151617, + "grad_norm": 0.10885953158140182, + "learning_rate": 0.0004250258290943574, + "loss": 2.656, + "step": 18631 + }, + { + "epoch": 0.5525012602674733, + "grad_norm": 0.13008040189743042, + "learning_rate": 0.00042497931043160135, + "loss": 2.6226, + "step": 18632 + }, + { + "epoch": 0.5525309136197847, + "grad_norm": 0.11483224481344223, + "learning_rate": 0.00042493279243316644, + "loss": 2.5811, + "step": 18633 + }, + { + "epoch": 0.5525605669720962, + "grad_norm": 0.10414605587720871, + "learning_rate": 0.00042488627509946455, + "loss": 2.6292, + "step": 18634 + }, + { + "epoch": 0.5525902203244076, + "grad_norm": 0.1286221444606781, + "learning_rate": 0.0004248397584309075, + "loss": 2.6164, + "step": 18635 + }, + { + "epoch": 0.5526198736767192, + "grad_norm": 0.14603056013584137, + "learning_rate": 0.00042479324242790736, + "loss": 2.633, + "step": 18636 + }, + { + "epoch": 0.5526495270290306, + "grad_norm": 0.12708497047424316, + "learning_rate": 0.00042474672709087594, + "loss": 2.6195, + "step": 18637 + }, + { + "epoch": 0.5526791803813421, + "grad_norm": 0.1304491013288498, + "learning_rate": 0.00042470021242022526, + "loss": 2.6434, + "step": 18638 + }, + { + "epoch": 0.5527088337336535, + "grad_norm": 0.1282973438501358, + "learning_rate": 0.00042465369841636705, + "loss": 2.6422, + "step": 18639 + }, + { + "epoch": 0.5527384870859651, + "grad_norm": 0.12961646914482117, + "learning_rate": 0.00042460718507971324, + "loss": 2.6294, + "step": 18640 + }, + { + "epoch": 0.5527681404382766, + "grad_norm": 0.09663937240839005, + "learning_rate": 0.00042456067241067574, + "loss": 2.6158, + "step": 18641 + }, + { + "epoch": 0.552797793790588, + "grad_norm": 0.1172022596001625, + "learning_rate": 0.0004245141604096664, + "loss": 2.6578, + "step": 18642 + }, + { + "epoch": 0.5528274471428996, + "grad_norm": 0.13310526311397552, + "learning_rate": 0.0004244676490770972, + "loss": 2.6036, + "step": 18643 + }, + { + "epoch": 0.552857100495211, + "grad_norm": 0.11185944825410843, + "learning_rate": 0.0004244211384133799, + "loss": 2.6202, + "step": 18644 + }, + { + "epoch": 0.5528867538475225, + "grad_norm": 0.11698145419359207, + "learning_rate": 0.00042437462841892637, + "loss": 2.6385, + "step": 18645 + }, + { + "epoch": 0.5529164071998339, + "grad_norm": 0.1200370118021965, + "learning_rate": 0.00042432811909414857, + "loss": 2.6557, + "step": 18646 + }, + { + "epoch": 0.5529460605521455, + "grad_norm": 0.13680848479270935, + "learning_rate": 0.00042428161043945824, + "loss": 2.6036, + "step": 18647 + }, + { + "epoch": 0.5529757139044569, + "grad_norm": 0.10895653814077377, + "learning_rate": 0.0004242351024552673, + "loss": 2.6353, + "step": 18648 + }, + { + "epoch": 0.5530053672567684, + "grad_norm": 0.1106213703751564, + "learning_rate": 0.0004241885951419875, + "loss": 2.6644, + "step": 18649 + }, + { + "epoch": 0.5530350206090798, + "grad_norm": 0.12059660255908966, + "learning_rate": 0.0004241420885000307, + "loss": 2.6416, + "step": 18650 + }, + { + "epoch": 0.5530646739613914, + "grad_norm": 0.11566143482923508, + "learning_rate": 0.0004240955825298086, + "loss": 2.641, + "step": 18651 + }, + { + "epoch": 0.5530943273137028, + "grad_norm": 0.1107197180390358, + "learning_rate": 0.00042404907723173323, + "loss": 2.6522, + "step": 18652 + }, + { + "epoch": 0.5531239806660143, + "grad_norm": 0.11022644490003586, + "learning_rate": 0.0004240025726062164, + "loss": 2.6093, + "step": 18653 + }, + { + "epoch": 0.5531536340183257, + "grad_norm": 0.12357369810342789, + "learning_rate": 0.00042395606865366974, + "loss": 2.5988, + "step": 18654 + }, + { + "epoch": 0.5531832873706373, + "grad_norm": 0.1387176662683487, + "learning_rate": 0.000423909565374505, + "loss": 2.6585, + "step": 18655 + }, + { + "epoch": 0.5532129407229487, + "grad_norm": 0.1314588338136673, + "learning_rate": 0.0004238630627691343, + "loss": 2.639, + "step": 18656 + }, + { + "epoch": 0.5532425940752602, + "grad_norm": 0.10025542229413986, + "learning_rate": 0.00042381656083796926, + "loss": 2.619, + "step": 18657 + }, + { + "epoch": 0.5532722474275716, + "grad_norm": 0.11961928009986877, + "learning_rate": 0.00042377005958142163, + "loss": 2.6501, + "step": 18658 + }, + { + "epoch": 0.5533019007798832, + "grad_norm": 0.12576191127300262, + "learning_rate": 0.0004237235589999033, + "loss": 2.6039, + "step": 18659 + }, + { + "epoch": 0.5533315541321946, + "grad_norm": 0.11931605637073517, + "learning_rate": 0.0004236770590938259, + "loss": 2.6749, + "step": 18660 + }, + { + "epoch": 0.5533612074845061, + "grad_norm": 0.1215255856513977, + "learning_rate": 0.00042363055986360115, + "loss": 2.6452, + "step": 18661 + }, + { + "epoch": 0.5533908608368177, + "grad_norm": 0.1155654788017273, + "learning_rate": 0.0004235840613096409, + "loss": 2.619, + "step": 18662 + }, + { + "epoch": 0.5534205141891291, + "grad_norm": 0.12122892588376999, + "learning_rate": 0.00042353756343235696, + "loss": 2.6555, + "step": 18663 + }, + { + "epoch": 0.5534501675414406, + "grad_norm": 0.12492076307535172, + "learning_rate": 0.00042349106623216105, + "loss": 2.6529, + "step": 18664 + }, + { + "epoch": 0.553479820893752, + "grad_norm": 0.13151702284812927, + "learning_rate": 0.0004234445697094648, + "loss": 2.6354, + "step": 18665 + }, + { + "epoch": 0.5535094742460636, + "grad_norm": 0.09742703288793564, + "learning_rate": 0.00042339807386468023, + "loss": 2.5949, + "step": 18666 + }, + { + "epoch": 0.553539127598375, + "grad_norm": 0.11779662221670151, + "learning_rate": 0.00042335157869821866, + "loss": 2.6128, + "step": 18667 + }, + { + "epoch": 0.5535687809506865, + "grad_norm": 0.1327436864376068, + "learning_rate": 0.00042330508421049184, + "loss": 2.6251, + "step": 18668 + }, + { + "epoch": 0.5535984343029979, + "grad_norm": 0.11332324892282486, + "learning_rate": 0.00042325859040191196, + "loss": 2.6355, + "step": 18669 + }, + { + "epoch": 0.5536280876553095, + "grad_norm": 0.10992010682821274, + "learning_rate": 0.00042321209727289033, + "loss": 2.6538, + "step": 18670 + }, + { + "epoch": 0.5536577410076209, + "grad_norm": 0.10031113773584366, + "learning_rate": 0.00042316560482383883, + "loss": 2.5875, + "step": 18671 + }, + { + "epoch": 0.5536873943599324, + "grad_norm": 0.11288565397262573, + "learning_rate": 0.000423119113055169, + "loss": 2.62, + "step": 18672 + }, + { + "epoch": 0.5537170477122438, + "grad_norm": 0.12375899404287338, + "learning_rate": 0.0004230726219672927, + "loss": 2.6311, + "step": 18673 + }, + { + "epoch": 0.5537467010645554, + "grad_norm": 0.11336330324411392, + "learning_rate": 0.00042302613156062146, + "loss": 2.6131, + "step": 18674 + }, + { + "epoch": 0.5537763544168668, + "grad_norm": 0.11566636711359024, + "learning_rate": 0.0004229796418355671, + "loss": 2.6285, + "step": 18675 + }, + { + "epoch": 0.5538060077691783, + "grad_norm": 0.12544722855091095, + "learning_rate": 0.00042293315279254127, + "loss": 2.6224, + "step": 18676 + }, + { + "epoch": 0.5538356611214897, + "grad_norm": 0.11253753304481506, + "learning_rate": 0.00042288666443195556, + "loss": 2.6136, + "step": 18677 + }, + { + "epoch": 0.5538653144738013, + "grad_norm": 0.1131400465965271, + "learning_rate": 0.0004228401767542217, + "loss": 2.6469, + "step": 18678 + }, + { + "epoch": 0.5538949678261127, + "grad_norm": 0.12642653286457062, + "learning_rate": 0.0004227936897597512, + "loss": 2.6422, + "step": 18679 + }, + { + "epoch": 0.5539246211784242, + "grad_norm": 0.11737983673810959, + "learning_rate": 0.0004227472034489559, + "loss": 2.6156, + "step": 18680 + }, + { + "epoch": 0.5539542745307356, + "grad_norm": 0.10884208977222443, + "learning_rate": 0.0004227007178222473, + "loss": 2.6394, + "step": 18681 + }, + { + "epoch": 0.5539839278830472, + "grad_norm": 0.12319625914096832, + "learning_rate": 0.0004226542328800372, + "loss": 2.6344, + "step": 18682 + }, + { + "epoch": 0.5540135812353587, + "grad_norm": 0.12044539302587509, + "learning_rate": 0.00042260774862273707, + "loss": 2.6161, + "step": 18683 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 0.1332305669784546, + "learning_rate": 0.0004225612650507587, + "loss": 2.6507, + "step": 18684 + }, + { + "epoch": 0.5540728879399817, + "grad_norm": 0.1033577024936676, + "learning_rate": 0.0004225147821645136, + "loss": 2.6483, + "step": 18685 + }, + { + "epoch": 0.5541025412922931, + "grad_norm": 0.1097007468342781, + "learning_rate": 0.0004224682999644135, + "loss": 2.6274, + "step": 18686 + }, + { + "epoch": 0.5541321946446046, + "grad_norm": 0.12371201813220978, + "learning_rate": 0.0004224218184508698, + "loss": 2.6355, + "step": 18687 + }, + { + "epoch": 0.554161847996916, + "grad_norm": 0.1243833526968956, + "learning_rate": 0.0004223753376242942, + "loss": 2.6146, + "step": 18688 + }, + { + "epoch": 0.5541915013492276, + "grad_norm": 0.10726342350244522, + "learning_rate": 0.0004223288574850983, + "loss": 2.6453, + "step": 18689 + }, + { + "epoch": 0.554221154701539, + "grad_norm": 0.11557565629482269, + "learning_rate": 0.0004222823780336937, + "loss": 2.6202, + "step": 18690 + }, + { + "epoch": 0.5542508080538505, + "grad_norm": 0.11741675436496735, + "learning_rate": 0.00042223589927049203, + "loss": 2.6532, + "step": 18691 + }, + { + "epoch": 0.5542804614061619, + "grad_norm": 0.0968046560883522, + "learning_rate": 0.0004221894211959048, + "loss": 2.6285, + "step": 18692 + }, + { + "epoch": 0.5543101147584735, + "grad_norm": 0.11838172376155853, + "learning_rate": 0.0004221429438103435, + "loss": 2.6682, + "step": 18693 + }, + { + "epoch": 0.5543397681107849, + "grad_norm": 0.12558899819850922, + "learning_rate": 0.00042209646711421987, + "loss": 2.6269, + "step": 18694 + }, + { + "epoch": 0.5543694214630964, + "grad_norm": 0.09619912505149841, + "learning_rate": 0.00042204999110794547, + "loss": 2.6227, + "step": 18695 + }, + { + "epoch": 0.5543990748154078, + "grad_norm": 0.11089397221803665, + "learning_rate": 0.00042200351579193174, + "loss": 2.683, + "step": 18696 + }, + { + "epoch": 0.5544287281677194, + "grad_norm": 0.11429610103368759, + "learning_rate": 0.00042195704116659036, + "loss": 2.625, + "step": 18697 + }, + { + "epoch": 0.5544583815200308, + "grad_norm": 0.09995698928833008, + "learning_rate": 0.00042191056723233267, + "loss": 2.6189, + "step": 18698 + }, + { + "epoch": 0.5544880348723423, + "grad_norm": 0.10308416187763214, + "learning_rate": 0.0004218640939895703, + "loss": 2.6293, + "step": 18699 + }, + { + "epoch": 0.5545176882246537, + "grad_norm": 0.09535088390111923, + "learning_rate": 0.00042181762143871484, + "loss": 2.6358, + "step": 18700 + }, + { + "epoch": 0.5545473415769653, + "grad_norm": 0.11527196317911148, + "learning_rate": 0.0004217711495801777, + "loss": 2.6275, + "step": 18701 + }, + { + "epoch": 0.5545769949292767, + "grad_norm": 0.1128486841917038, + "learning_rate": 0.0004217246784143705, + "loss": 2.6437, + "step": 18702 + }, + { + "epoch": 0.5546066482815882, + "grad_norm": 0.098993681371212, + "learning_rate": 0.00042167820794170464, + "loss": 2.6381, + "step": 18703 + }, + { + "epoch": 0.5546363016338998, + "grad_norm": 0.10480498522520065, + "learning_rate": 0.00042163173816259187, + "loss": 2.5928, + "step": 18704 + }, + { + "epoch": 0.5546659549862112, + "grad_norm": 0.10003996640443802, + "learning_rate": 0.00042158526907744336, + "loss": 2.6384, + "step": 18705 + }, + { + "epoch": 0.5546956083385227, + "grad_norm": 0.09244886785745621, + "learning_rate": 0.0004215388006866706, + "loss": 2.6082, + "step": 18706 + }, + { + "epoch": 0.5547252616908341, + "grad_norm": 0.0989738255739212, + "learning_rate": 0.0004214923329906855, + "loss": 2.6532, + "step": 18707 + }, + { + "epoch": 0.5547549150431457, + "grad_norm": 0.10440783947706223, + "learning_rate": 0.00042144586598989915, + "loss": 2.6094, + "step": 18708 + }, + { + "epoch": 0.5547845683954571, + "grad_norm": 0.1084279790520668, + "learning_rate": 0.0004213993996847232, + "loss": 2.6623, + "step": 18709 + }, + { + "epoch": 0.5548142217477686, + "grad_norm": 0.10410177707672119, + "learning_rate": 0.00042135293407556895, + "loss": 2.6165, + "step": 18710 + }, + { + "epoch": 0.55484387510008, + "grad_norm": 0.1158638671040535, + "learning_rate": 0.0004213064691628481, + "loss": 2.6218, + "step": 18711 + }, + { + "epoch": 0.5548735284523916, + "grad_norm": 0.125890851020813, + "learning_rate": 0.00042126000494697194, + "loss": 2.6306, + "step": 18712 + }, + { + "epoch": 0.554903181804703, + "grad_norm": 0.151423841714859, + "learning_rate": 0.0004212135414283519, + "loss": 2.6608, + "step": 18713 + }, + { + "epoch": 0.5549328351570145, + "grad_norm": 0.13633599877357483, + "learning_rate": 0.0004211670786073996, + "loss": 2.5711, + "step": 18714 + }, + { + "epoch": 0.554962488509326, + "grad_norm": 0.09583527594804764, + "learning_rate": 0.0004211206164845262, + "loss": 2.6196, + "step": 18715 + }, + { + "epoch": 0.5549921418616375, + "grad_norm": 0.1151750311255455, + "learning_rate": 0.00042107415506014334, + "loss": 2.6355, + "step": 18716 + }, + { + "epoch": 0.5550217952139489, + "grad_norm": 0.15846888720989227, + "learning_rate": 0.0004210276943346624, + "loss": 2.6697, + "step": 18717 + }, + { + "epoch": 0.5550514485662604, + "grad_norm": 0.14240172505378723, + "learning_rate": 0.0004209812343084947, + "loss": 2.6348, + "step": 18718 + }, + { + "epoch": 0.5550811019185719, + "grad_norm": 0.11968570202589035, + "learning_rate": 0.0004209347749820517, + "loss": 2.6087, + "step": 18719 + }, + { + "epoch": 0.5551107552708834, + "grad_norm": 0.13546012341976166, + "learning_rate": 0.00042088831635574494, + "loss": 2.6079, + "step": 18720 + }, + { + "epoch": 0.5551404086231948, + "grad_norm": 0.11670311540365219, + "learning_rate": 0.00042084185842998566, + "loss": 2.6507, + "step": 18721 + }, + { + "epoch": 0.5551700619755063, + "grad_norm": 0.12302719056606293, + "learning_rate": 0.0004207954012051854, + "loss": 2.6499, + "step": 18722 + }, + { + "epoch": 0.5551997153278178, + "grad_norm": 0.11366588622331619, + "learning_rate": 0.0004207489446817554, + "loss": 2.6255, + "step": 18723 + }, + { + "epoch": 0.5552293686801293, + "grad_norm": 0.11476831883192062, + "learning_rate": 0.0004207024888601072, + "loss": 2.6317, + "step": 18724 + }, + { + "epoch": 0.5552590220324408, + "grad_norm": 0.1089971587061882, + "learning_rate": 0.000420656033740652, + "loss": 2.6522, + "step": 18725 + }, + { + "epoch": 0.5552886753847522, + "grad_norm": 0.12494248896837234, + "learning_rate": 0.00042060957932380124, + "loss": 2.635, + "step": 18726 + }, + { + "epoch": 0.5553183287370638, + "grad_norm": 0.11642595380544662, + "learning_rate": 0.0004205631256099662, + "loss": 2.6148, + "step": 18727 + }, + { + "epoch": 0.5553479820893752, + "grad_norm": 0.1063271015882492, + "learning_rate": 0.00042051667259955847, + "loss": 2.6513, + "step": 18728 + }, + { + "epoch": 0.5553776354416867, + "grad_norm": 0.1232687383890152, + "learning_rate": 0.00042047022029298914, + "loss": 2.6436, + "step": 18729 + }, + { + "epoch": 0.5554072887939981, + "grad_norm": 0.11108417063951492, + "learning_rate": 0.00042042376869066967, + "loss": 2.6581, + "step": 18730 + }, + { + "epoch": 0.5554369421463097, + "grad_norm": 0.12349552661180496, + "learning_rate": 0.0004203773177930115, + "loss": 2.6328, + "step": 18731 + }, + { + "epoch": 0.5554665954986211, + "grad_norm": 0.11795786768198013, + "learning_rate": 0.00042033086760042554, + "loss": 2.6298, + "step": 18732 + }, + { + "epoch": 0.5554962488509326, + "grad_norm": 0.10813242942094803, + "learning_rate": 0.00042028441811332363, + "loss": 2.642, + "step": 18733 + }, + { + "epoch": 0.555525902203244, + "grad_norm": 0.12733495235443115, + "learning_rate": 0.00042023796933211686, + "loss": 2.6195, + "step": 18734 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.12210708856582642, + "learning_rate": 0.0004201915212572167, + "loss": 2.6577, + "step": 18735 + }, + { + "epoch": 0.555585208907867, + "grad_norm": 0.10032917559146881, + "learning_rate": 0.00042014507388903424, + "loss": 2.6346, + "step": 18736 + }, + { + "epoch": 0.5556148622601785, + "grad_norm": 0.11866938322782516, + "learning_rate": 0.00042009862722798075, + "loss": 2.5958, + "step": 18737 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 0.11983969807624817, + "learning_rate": 0.00042005218127446774, + "loss": 2.6126, + "step": 18738 + }, + { + "epoch": 0.5556741689648015, + "grad_norm": 0.10614064335823059, + "learning_rate": 0.0004200057360289063, + "loss": 2.5803, + "step": 18739 + }, + { + "epoch": 0.5557038223171129, + "grad_norm": 0.10987724363803864, + "learning_rate": 0.00041995929149170786, + "loss": 2.639, + "step": 18740 + }, + { + "epoch": 0.5557334756694244, + "grad_norm": 0.10507781058549881, + "learning_rate": 0.00041991284766328363, + "loss": 2.618, + "step": 18741 + }, + { + "epoch": 0.5557631290217359, + "grad_norm": 0.11576693505048752, + "learning_rate": 0.000419866404544045, + "loss": 2.6601, + "step": 18742 + }, + { + "epoch": 0.5557927823740474, + "grad_norm": 0.1344432532787323, + "learning_rate": 0.000419819962134403, + "loss": 2.6538, + "step": 18743 + }, + { + "epoch": 0.5558224357263588, + "grad_norm": 0.13515889644622803, + "learning_rate": 0.000419773520434769, + "loss": 2.6298, + "step": 18744 + }, + { + "epoch": 0.5558520890786703, + "grad_norm": 0.12372622638940811, + "learning_rate": 0.00041972707944555403, + "loss": 2.6102, + "step": 18745 + }, + { + "epoch": 0.5558817424309819, + "grad_norm": 0.11843127012252808, + "learning_rate": 0.0004196806391671698, + "loss": 2.5993, + "step": 18746 + }, + { + "epoch": 0.5559113957832933, + "grad_norm": 0.12498432397842407, + "learning_rate": 0.00041963419960002726, + "loss": 2.6267, + "step": 18747 + }, + { + "epoch": 0.5559410491356048, + "grad_norm": 0.10258075594902039, + "learning_rate": 0.00041958776074453764, + "loss": 2.6241, + "step": 18748 + }, + { + "epoch": 0.5559707024879162, + "grad_norm": 0.10505376756191254, + "learning_rate": 0.0004195413226011122, + "loss": 2.5989, + "step": 18749 + }, + { + "epoch": 0.5560003558402278, + "grad_norm": 0.12100726366043091, + "learning_rate": 0.00041949488517016223, + "loss": 2.6204, + "step": 18750 + }, + { + "epoch": 0.5560300091925392, + "grad_norm": 0.1103656068444252, + "learning_rate": 0.0004194484484520988, + "loss": 2.6735, + "step": 18751 + }, + { + "epoch": 0.5560596625448507, + "grad_norm": 0.10372789204120636, + "learning_rate": 0.0004194020124473333, + "loss": 2.6568, + "step": 18752 + }, + { + "epoch": 0.5560893158971622, + "grad_norm": 0.11739484965801239, + "learning_rate": 0.0004193555771562767, + "loss": 2.6397, + "step": 18753 + }, + { + "epoch": 0.5561189692494737, + "grad_norm": 0.09311852604150772, + "learning_rate": 0.00041930914257934035, + "loss": 2.6822, + "step": 18754 + }, + { + "epoch": 0.5561486226017851, + "grad_norm": 0.11437568068504333, + "learning_rate": 0.00041926270871693534, + "loss": 2.6235, + "step": 18755 + }, + { + "epoch": 0.5561782759540966, + "grad_norm": 0.13380469381809235, + "learning_rate": 0.0004192162755694729, + "loss": 2.6411, + "step": 18756 + }, + { + "epoch": 0.556207929306408, + "grad_norm": 0.12182486802339554, + "learning_rate": 0.0004191698431373643, + "loss": 2.6387, + "step": 18757 + }, + { + "epoch": 0.5562375826587196, + "grad_norm": 0.10784993320703506, + "learning_rate": 0.0004191234114210204, + "loss": 2.6428, + "step": 18758 + }, + { + "epoch": 0.556267236011031, + "grad_norm": 0.12841735780239105, + "learning_rate": 0.00041907698042085274, + "loss": 2.6455, + "step": 18759 + }, + { + "epoch": 0.5562968893633425, + "grad_norm": 0.10868146270513535, + "learning_rate": 0.0004190305501372723, + "loss": 2.6194, + "step": 18760 + }, + { + "epoch": 0.556326542715654, + "grad_norm": 0.10161834955215454, + "learning_rate": 0.0004189841205706902, + "loss": 2.6403, + "step": 18761 + }, + { + "epoch": 0.5563561960679655, + "grad_norm": 0.11095315217971802, + "learning_rate": 0.0004189376917215178, + "loss": 2.6272, + "step": 18762 + }, + { + "epoch": 0.5563858494202769, + "grad_norm": 0.12358806282281876, + "learning_rate": 0.0004188912635901659, + "loss": 2.6217, + "step": 18763 + }, + { + "epoch": 0.5564155027725884, + "grad_norm": 0.12208975851535797, + "learning_rate": 0.00041884483617704577, + "loss": 2.6169, + "step": 18764 + }, + { + "epoch": 0.5564451561248999, + "grad_norm": 0.12035904824733734, + "learning_rate": 0.0004187984094825686, + "loss": 2.6782, + "step": 18765 + }, + { + "epoch": 0.5564748094772114, + "grad_norm": 0.10523252934217453, + "learning_rate": 0.0004187519835071454, + "loss": 2.5915, + "step": 18766 + }, + { + "epoch": 0.5565044628295229, + "grad_norm": 0.0967857837677002, + "learning_rate": 0.0004187055582511873, + "loss": 2.6193, + "step": 18767 + }, + { + "epoch": 0.5565341161818343, + "grad_norm": 0.11547385901212692, + "learning_rate": 0.00041865913371510555, + "loss": 2.61, + "step": 18768 + }, + { + "epoch": 0.5565637695341459, + "grad_norm": 0.1126682460308075, + "learning_rate": 0.00041861270989931117, + "loss": 2.6377, + "step": 18769 + }, + { + "epoch": 0.5565934228864573, + "grad_norm": 0.10663796216249466, + "learning_rate": 0.0004185662868042151, + "loss": 2.6119, + "step": 18770 + }, + { + "epoch": 0.5566230762387688, + "grad_norm": 0.0939309373497963, + "learning_rate": 0.0004185198644302283, + "loss": 2.6622, + "step": 18771 + }, + { + "epoch": 0.5566527295910803, + "grad_norm": 0.10416680574417114, + "learning_rate": 0.00041847344277776236, + "loss": 2.6021, + "step": 18772 + }, + { + "epoch": 0.5566823829433918, + "grad_norm": 0.1053992286324501, + "learning_rate": 0.00041842702184722815, + "loss": 2.6322, + "step": 18773 + }, + { + "epoch": 0.5567120362957032, + "grad_norm": 0.11050260066986084, + "learning_rate": 0.0004183806016390366, + "loss": 2.6139, + "step": 18774 + }, + { + "epoch": 0.5567416896480147, + "grad_norm": 0.10752695798873901, + "learning_rate": 0.00041833418215359876, + "loss": 2.6398, + "step": 18775 + }, + { + "epoch": 0.5567713430003262, + "grad_norm": 0.10148844122886658, + "learning_rate": 0.0004182877633913258, + "loss": 2.6032, + "step": 18776 + }, + { + "epoch": 0.5568009963526377, + "grad_norm": 0.09983266144990921, + "learning_rate": 0.0004182413453526287, + "loss": 2.6376, + "step": 18777 + }, + { + "epoch": 0.5568306497049491, + "grad_norm": 0.11088931560516357, + "learning_rate": 0.00041819492803791853, + "loss": 2.5929, + "step": 18778 + }, + { + "epoch": 0.5568603030572606, + "grad_norm": 0.10970446467399597, + "learning_rate": 0.00041814851144760635, + "loss": 2.6341, + "step": 18779 + }, + { + "epoch": 0.5568899564095721, + "grad_norm": 0.12242743372917175, + "learning_rate": 0.0004181020955821032, + "loss": 2.6415, + "step": 18780 + }, + { + "epoch": 0.5569196097618836, + "grad_norm": 0.09564337879419327, + "learning_rate": 0.00041805568044182, + "loss": 2.6124, + "step": 18781 + }, + { + "epoch": 0.556949263114195, + "grad_norm": 0.10472874343395233, + "learning_rate": 0.00041800926602716774, + "loss": 2.6111, + "step": 18782 + }, + { + "epoch": 0.5569789164665065, + "grad_norm": 0.09679063409566879, + "learning_rate": 0.00041796285233855756, + "loss": 2.6513, + "step": 18783 + }, + { + "epoch": 0.557008569818818, + "grad_norm": 0.09976937621831894, + "learning_rate": 0.0004179164393764003, + "loss": 2.6007, + "step": 18784 + }, + { + "epoch": 0.5570382231711295, + "grad_norm": 0.12189284712076187, + "learning_rate": 0.0004178700271411071, + "loss": 2.626, + "step": 18785 + }, + { + "epoch": 0.557067876523441, + "grad_norm": 0.1323767453432083, + "learning_rate": 0.000417823615633089, + "loss": 2.5946, + "step": 18786 + }, + { + "epoch": 0.5570975298757525, + "grad_norm": 0.1302819401025772, + "learning_rate": 0.0004177772048527568, + "loss": 2.6295, + "step": 18787 + }, + { + "epoch": 0.557127183228064, + "grad_norm": 0.10304300487041473, + "learning_rate": 0.0004177307948005217, + "loss": 2.6322, + "step": 18788 + }, + { + "epoch": 0.5571568365803754, + "grad_norm": 0.1106395572423935, + "learning_rate": 0.00041768438547679445, + "loss": 2.6382, + "step": 18789 + }, + { + "epoch": 0.5571864899326869, + "grad_norm": 0.12610235810279846, + "learning_rate": 0.0004176379768819861, + "loss": 2.6668, + "step": 18790 + }, + { + "epoch": 0.5572161432849984, + "grad_norm": 0.11408630013465881, + "learning_rate": 0.0004175915690165076, + "loss": 2.6152, + "step": 18791 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 0.11883127689361572, + "learning_rate": 0.00041754516188077, + "loss": 2.6342, + "step": 18792 + }, + { + "epoch": 0.5572754499896213, + "grad_norm": 0.11750076711177826, + "learning_rate": 0.000417498755475184, + "loss": 2.6318, + "step": 18793 + }, + { + "epoch": 0.5573051033419328, + "grad_norm": 0.11434348672628403, + "learning_rate": 0.0004174523498001607, + "loss": 2.607, + "step": 18794 + }, + { + "epoch": 0.5573347566942443, + "grad_norm": 0.10403399169445038, + "learning_rate": 0.00041740594485611103, + "loss": 2.6418, + "step": 18795 + }, + { + "epoch": 0.5573644100465558, + "grad_norm": 0.10109470039606094, + "learning_rate": 0.0004173595406434459, + "loss": 2.6285, + "step": 18796 + }, + { + "epoch": 0.5573940633988672, + "grad_norm": 0.1160343736410141, + "learning_rate": 0.0004173131371625761, + "loss": 2.6168, + "step": 18797 + }, + { + "epoch": 0.5574237167511787, + "grad_norm": 0.10640701651573181, + "learning_rate": 0.0004172667344139128, + "loss": 2.598, + "step": 18798 + }, + { + "epoch": 0.5574533701034902, + "grad_norm": 0.11367788165807724, + "learning_rate": 0.0004172203323978667, + "loss": 2.686, + "step": 18799 + }, + { + "epoch": 0.5574830234558017, + "grad_norm": 0.10155583173036575, + "learning_rate": 0.00041717393111484897, + "loss": 2.6287, + "step": 18800 + }, + { + "epoch": 0.5575126768081131, + "grad_norm": 0.10876781493425369, + "learning_rate": 0.00041712753056527015, + "loss": 2.6319, + "step": 18801 + }, + { + "epoch": 0.5575423301604246, + "grad_norm": 0.11592157185077667, + "learning_rate": 0.0004170811307495412, + "loss": 2.6498, + "step": 18802 + }, + { + "epoch": 0.5575719835127361, + "grad_norm": 0.10157734900712967, + "learning_rate": 0.00041703473166807323, + "loss": 2.6303, + "step": 18803 + }, + { + "epoch": 0.5576016368650476, + "grad_norm": 0.10276944190263748, + "learning_rate": 0.00041698833332127686, + "loss": 2.6234, + "step": 18804 + }, + { + "epoch": 0.557631290217359, + "grad_norm": 0.13283415138721466, + "learning_rate": 0.000416941935709563, + "loss": 2.6298, + "step": 18805 + }, + { + "epoch": 0.5576609435696706, + "grad_norm": 0.14958035945892334, + "learning_rate": 0.00041689553883334266, + "loss": 2.6485, + "step": 18806 + }, + { + "epoch": 0.5576905969219821, + "grad_norm": 0.13085012137889862, + "learning_rate": 0.0004168491426930266, + "loss": 2.645, + "step": 18807 + }, + { + "epoch": 0.5577202502742935, + "grad_norm": 0.10479545593261719, + "learning_rate": 0.00041680274728902555, + "loss": 2.6601, + "step": 18808 + }, + { + "epoch": 0.557749903626605, + "grad_norm": 0.10479957610368729, + "learning_rate": 0.0004167563526217505, + "loss": 2.6267, + "step": 18809 + }, + { + "epoch": 0.5577795569789165, + "grad_norm": 0.11640430986881256, + "learning_rate": 0.000416709958691612, + "loss": 2.5936, + "step": 18810 + }, + { + "epoch": 0.557809210331228, + "grad_norm": 0.12443163990974426, + "learning_rate": 0.0004166635654990215, + "loss": 2.6376, + "step": 18811 + }, + { + "epoch": 0.5578388636835394, + "grad_norm": 0.12129126489162445, + "learning_rate": 0.00041661717304438924, + "loss": 2.6615, + "step": 18812 + }, + { + "epoch": 0.5578685170358509, + "grad_norm": 0.1143658459186554, + "learning_rate": 0.0004165707813281262, + "loss": 2.6149, + "step": 18813 + }, + { + "epoch": 0.5578981703881624, + "grad_norm": 0.11353075504302979, + "learning_rate": 0.0004165243903506433, + "loss": 2.6306, + "step": 18814 + }, + { + "epoch": 0.5579278237404739, + "grad_norm": 0.10603955388069153, + "learning_rate": 0.00041647800011235123, + "loss": 2.6313, + "step": 18815 + }, + { + "epoch": 0.5579574770927853, + "grad_norm": 0.1304384171962738, + "learning_rate": 0.00041643161061366075, + "loss": 2.6251, + "step": 18816 + }, + { + "epoch": 0.5579871304450968, + "grad_norm": 0.12653444707393646, + "learning_rate": 0.00041638522185498275, + "loss": 2.6087, + "step": 18817 + }, + { + "epoch": 0.5580167837974083, + "grad_norm": 0.1196853369474411, + "learning_rate": 0.0004163388338367281, + "loss": 2.6541, + "step": 18818 + }, + { + "epoch": 0.5580464371497198, + "grad_norm": 0.10583149641752243, + "learning_rate": 0.00041629244655930724, + "loss": 2.6674, + "step": 18819 + }, + { + "epoch": 0.5580760905020312, + "grad_norm": 0.10759937018156052, + "learning_rate": 0.0004162460600231312, + "loss": 2.6321, + "step": 18820 + }, + { + "epoch": 0.5581057438543428, + "grad_norm": 0.11179091036319733, + "learning_rate": 0.0004161996742286107, + "loss": 2.6163, + "step": 18821 + }, + { + "epoch": 0.5581353972066542, + "grad_norm": 0.10577116906642914, + "learning_rate": 0.00041615328917615643, + "loss": 2.6257, + "step": 18822 + }, + { + "epoch": 0.5581650505589657, + "grad_norm": 0.11133243888616562, + "learning_rate": 0.000416106904866179, + "loss": 2.596, + "step": 18823 + }, + { + "epoch": 0.5581947039112771, + "grad_norm": 0.12810559570789337, + "learning_rate": 0.00041606052129908956, + "loss": 2.6246, + "step": 18824 + }, + { + "epoch": 0.5582243572635887, + "grad_norm": 0.11743111908435822, + "learning_rate": 0.0004160141384752986, + "loss": 2.6146, + "step": 18825 + }, + { + "epoch": 0.5582540106159001, + "grad_norm": 0.1109895333647728, + "learning_rate": 0.0004159677563952168, + "loss": 2.6024, + "step": 18826 + }, + { + "epoch": 0.5582836639682116, + "grad_norm": 0.11962971836328506, + "learning_rate": 0.0004159213750592549, + "loss": 2.6482, + "step": 18827 + }, + { + "epoch": 0.5583133173205231, + "grad_norm": 0.12534202635288239, + "learning_rate": 0.00041587499446782384, + "loss": 2.6609, + "step": 18828 + }, + { + "epoch": 0.5583429706728346, + "grad_norm": 0.11160596460103989, + "learning_rate": 0.00041582861462133406, + "loss": 2.6346, + "step": 18829 + }, + { + "epoch": 0.5583726240251461, + "grad_norm": 0.10385674238204956, + "learning_rate": 0.00041578223552019624, + "loss": 2.6172, + "step": 18830 + }, + { + "epoch": 0.5584022773774575, + "grad_norm": 0.11004969477653503, + "learning_rate": 0.00041573585716482125, + "loss": 2.6144, + "step": 18831 + }, + { + "epoch": 0.558431930729769, + "grad_norm": 0.10986938327550888, + "learning_rate": 0.00041568947955561967, + "loss": 2.6529, + "step": 18832 + }, + { + "epoch": 0.5584615840820805, + "grad_norm": 0.10862839967012405, + "learning_rate": 0.00041564310269300226, + "loss": 2.6278, + "step": 18833 + }, + { + "epoch": 0.558491237434392, + "grad_norm": 0.1184835359454155, + "learning_rate": 0.0004155967265773797, + "loss": 2.6477, + "step": 18834 + }, + { + "epoch": 0.5585208907867034, + "grad_norm": 0.10585707426071167, + "learning_rate": 0.0004155503512091626, + "loss": 2.6363, + "step": 18835 + }, + { + "epoch": 0.558550544139015, + "grad_norm": 0.11880814284086227, + "learning_rate": 0.00041550397658876137, + "loss": 2.641, + "step": 18836 + }, + { + "epoch": 0.5585801974913264, + "grad_norm": 0.10720047354698181, + "learning_rate": 0.0004154576027165872, + "loss": 2.6301, + "step": 18837 + }, + { + "epoch": 0.5586098508436379, + "grad_norm": 0.10819493234157562, + "learning_rate": 0.00041541122959305043, + "loss": 2.6302, + "step": 18838 + }, + { + "epoch": 0.5586395041959493, + "grad_norm": 0.14903949201107025, + "learning_rate": 0.0004153648572185618, + "loss": 2.6765, + "step": 18839 + }, + { + "epoch": 0.5586691575482609, + "grad_norm": 0.13074931502342224, + "learning_rate": 0.0004153184855935319, + "loss": 2.6538, + "step": 18840 + }, + { + "epoch": 0.5586988109005723, + "grad_norm": 0.11242443323135376, + "learning_rate": 0.00041527211471837125, + "loss": 2.6166, + "step": 18841 + }, + { + "epoch": 0.5587284642528838, + "grad_norm": 0.1210075169801712, + "learning_rate": 0.00041522574459349057, + "loss": 2.6313, + "step": 18842 + }, + { + "epoch": 0.5587581176051952, + "grad_norm": 0.1340833455324173, + "learning_rate": 0.0004151793752193005, + "loss": 2.6333, + "step": 18843 + }, + { + "epoch": 0.5587877709575068, + "grad_norm": 0.1453208178281784, + "learning_rate": 0.0004151330065962116, + "loss": 2.6642, + "step": 18844 + }, + { + "epoch": 0.5588174243098182, + "grad_norm": 0.12666641175746918, + "learning_rate": 0.0004150866387246346, + "loss": 2.6336, + "step": 18845 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 0.12377406656742096, + "learning_rate": 0.0004150402716049799, + "loss": 2.5969, + "step": 18846 + }, + { + "epoch": 0.5588767310144411, + "grad_norm": 0.12251865118741989, + "learning_rate": 0.00041499390523765816, + "loss": 2.6043, + "step": 18847 + }, + { + "epoch": 0.5589063843667527, + "grad_norm": 0.13308097422122955, + "learning_rate": 0.00041494753962308004, + "loss": 2.6312, + "step": 18848 + }, + { + "epoch": 0.5589360377190642, + "grad_norm": 0.12263192236423492, + "learning_rate": 0.0004149011747616559, + "loss": 2.6228, + "step": 18849 + }, + { + "epoch": 0.5589656910713756, + "grad_norm": 0.10239193588495255, + "learning_rate": 0.00041485481065379657, + "loss": 2.6527, + "step": 18850 + }, + { + "epoch": 0.5589953444236871, + "grad_norm": 0.1091141477227211, + "learning_rate": 0.0004148084472999125, + "loss": 2.5876, + "step": 18851 + }, + { + "epoch": 0.5590249977759986, + "grad_norm": 0.11647723615169525, + "learning_rate": 0.00041476208470041427, + "loss": 2.6303, + "step": 18852 + }, + { + "epoch": 0.5590546511283101, + "grad_norm": 0.10077078640460968, + "learning_rate": 0.00041471572285571237, + "loss": 2.6212, + "step": 18853 + }, + { + "epoch": 0.5590843044806215, + "grad_norm": 0.1004086434841156, + "learning_rate": 0.00041466936176621746, + "loss": 2.6049, + "step": 18854 + }, + { + "epoch": 0.559113957832933, + "grad_norm": 0.11651048809289932, + "learning_rate": 0.00041462300143233996, + "loss": 2.5811, + "step": 18855 + }, + { + "epoch": 0.5591436111852445, + "grad_norm": 0.09649036824703217, + "learning_rate": 0.0004145766418544905, + "loss": 2.6097, + "step": 18856 + }, + { + "epoch": 0.559173264537556, + "grad_norm": 0.10652663558721542, + "learning_rate": 0.0004145302830330795, + "loss": 2.6052, + "step": 18857 + }, + { + "epoch": 0.5592029178898674, + "grad_norm": 0.11134850978851318, + "learning_rate": 0.00041448392496851747, + "loss": 2.6152, + "step": 18858 + }, + { + "epoch": 0.559232571242179, + "grad_norm": 0.11332815885543823, + "learning_rate": 0.000414437567661215, + "loss": 2.6619, + "step": 18859 + }, + { + "epoch": 0.5592622245944904, + "grad_norm": 0.10322414338588715, + "learning_rate": 0.0004143912111115825, + "loss": 2.6021, + "step": 18860 + }, + { + "epoch": 0.5592918779468019, + "grad_norm": 0.11366944760084152, + "learning_rate": 0.00041434485532003054, + "loss": 2.6304, + "step": 18861 + }, + { + "epoch": 0.5593215312991133, + "grad_norm": 0.096608005464077, + "learning_rate": 0.00041429850028696947, + "loss": 2.6337, + "step": 18862 + }, + { + "epoch": 0.5593511846514249, + "grad_norm": 0.10516959428787231, + "learning_rate": 0.00041425214601281, + "loss": 2.6186, + "step": 18863 + }, + { + "epoch": 0.5593808380037363, + "grad_norm": 0.11801543086767197, + "learning_rate": 0.0004142057924979626, + "loss": 2.6569, + "step": 18864 + }, + { + "epoch": 0.5594104913560478, + "grad_norm": 0.12306713312864304, + "learning_rate": 0.00041415943974283757, + "loss": 2.67, + "step": 18865 + }, + { + "epoch": 0.5594401447083592, + "grad_norm": 0.125011146068573, + "learning_rate": 0.0004141130877478455, + "loss": 2.6296, + "step": 18866 + }, + { + "epoch": 0.5594697980606708, + "grad_norm": 0.12690334022045135, + "learning_rate": 0.00041406673651339665, + "loss": 2.6379, + "step": 18867 + }, + { + "epoch": 0.5594994514129822, + "grad_norm": 0.1107446476817131, + "learning_rate": 0.0004140203860399017, + "loss": 2.6153, + "step": 18868 + }, + { + "epoch": 0.5595291047652937, + "grad_norm": 0.10174649953842163, + "learning_rate": 0.00041397403632777093, + "loss": 2.6372, + "step": 18869 + }, + { + "epoch": 0.5595587581176052, + "grad_norm": 0.09612715989351273, + "learning_rate": 0.00041392768737741483, + "loss": 2.5911, + "step": 18870 + }, + { + "epoch": 0.5595884114699167, + "grad_norm": 0.10109127312898636, + "learning_rate": 0.0004138813391892438, + "loss": 2.6398, + "step": 18871 + }, + { + "epoch": 0.5596180648222282, + "grad_norm": 0.11620572209358215, + "learning_rate": 0.00041383499176366834, + "loss": 2.62, + "step": 18872 + }, + { + "epoch": 0.5596477181745396, + "grad_norm": 0.10387971252202988, + "learning_rate": 0.00041378864510109895, + "loss": 2.6088, + "step": 18873 + }, + { + "epoch": 0.5596773715268512, + "grad_norm": 0.10809135437011719, + "learning_rate": 0.0004137422992019458, + "loss": 2.5898, + "step": 18874 + }, + { + "epoch": 0.5597070248791626, + "grad_norm": 0.0894465297460556, + "learning_rate": 0.00041369595406661906, + "loss": 2.6396, + "step": 18875 + }, + { + "epoch": 0.5597366782314741, + "grad_norm": 0.09630560874938965, + "learning_rate": 0.00041364960969552983, + "loss": 2.6829, + "step": 18876 + }, + { + "epoch": 0.5597663315837855, + "grad_norm": 0.11072684824466705, + "learning_rate": 0.00041360326608908817, + "loss": 2.6668, + "step": 18877 + }, + { + "epoch": 0.5597959849360971, + "grad_norm": 0.1203867644071579, + "learning_rate": 0.00041355692324770434, + "loss": 2.6647, + "step": 18878 + }, + { + "epoch": 0.5598256382884085, + "grad_norm": 0.1216026097536087, + "learning_rate": 0.00041351058117178877, + "loss": 2.6028, + "step": 18879 + }, + { + "epoch": 0.55985529164072, + "grad_norm": 0.11201739311218262, + "learning_rate": 0.0004134642398617518, + "loss": 2.6215, + "step": 18880 + }, + { + "epoch": 0.5598849449930314, + "grad_norm": 0.1105634868144989, + "learning_rate": 0.00041341789931800387, + "loss": 2.6068, + "step": 18881 + }, + { + "epoch": 0.559914598345343, + "grad_norm": 0.10346051305532455, + "learning_rate": 0.00041337155954095534, + "loss": 2.6422, + "step": 18882 + }, + { + "epoch": 0.5599442516976544, + "grad_norm": 0.11902236193418503, + "learning_rate": 0.0004133252205310166, + "loss": 2.6497, + "step": 18883 + }, + { + "epoch": 0.5599739050499659, + "grad_norm": 0.11962109804153442, + "learning_rate": 0.00041327888228859775, + "loss": 2.6328, + "step": 18884 + }, + { + "epoch": 0.5600035584022773, + "grad_norm": 0.11613652855157852, + "learning_rate": 0.00041323254481410944, + "loss": 2.6547, + "step": 18885 + }, + { + "epoch": 0.5600332117545889, + "grad_norm": 0.1337198168039322, + "learning_rate": 0.0004131862081079617, + "loss": 2.6594, + "step": 18886 + }, + { + "epoch": 0.5600628651069003, + "grad_norm": 0.13209925591945648, + "learning_rate": 0.0004131398721705649, + "loss": 2.6209, + "step": 18887 + }, + { + "epoch": 0.5600925184592118, + "grad_norm": 0.1273723989725113, + "learning_rate": 0.0004130935370023296, + "loss": 2.6436, + "step": 18888 + }, + { + "epoch": 0.5601221718115232, + "grad_norm": 0.12186168879270554, + "learning_rate": 0.00041304720260366593, + "loss": 2.6508, + "step": 18889 + }, + { + "epoch": 0.5601518251638348, + "grad_norm": 0.11419924348592758, + "learning_rate": 0.00041300086897498416, + "loss": 2.6349, + "step": 18890 + }, + { + "epoch": 0.5601814785161463, + "grad_norm": 0.1155834048986435, + "learning_rate": 0.00041295453611669465, + "loss": 2.6272, + "step": 18891 + }, + { + "epoch": 0.5602111318684577, + "grad_norm": 0.12488547712564468, + "learning_rate": 0.0004129082040292077, + "loss": 2.5981, + "step": 18892 + }, + { + "epoch": 0.5602407852207693, + "grad_norm": 0.09666134417057037, + "learning_rate": 0.00041286187271293354, + "loss": 2.6291, + "step": 18893 + }, + { + "epoch": 0.5602704385730807, + "grad_norm": 0.11346320807933807, + "learning_rate": 0.00041281554216828257, + "loss": 2.6312, + "step": 18894 + }, + { + "epoch": 0.5603000919253922, + "grad_norm": 0.10251946747303009, + "learning_rate": 0.0004127692123956648, + "loss": 2.6293, + "step": 18895 + }, + { + "epoch": 0.5603297452777036, + "grad_norm": 0.11734305322170258, + "learning_rate": 0.00041272288339549057, + "loss": 2.6463, + "step": 18896 + }, + { + "epoch": 0.5603593986300152, + "grad_norm": 0.11978886276483536, + "learning_rate": 0.0004126765551681703, + "loss": 2.6144, + "step": 18897 + }, + { + "epoch": 0.5603890519823266, + "grad_norm": 0.1145436242222786, + "learning_rate": 0.0004126302277141141, + "loss": 2.6391, + "step": 18898 + }, + { + "epoch": 0.5604187053346381, + "grad_norm": 0.12354462593793869, + "learning_rate": 0.00041258390103373217, + "loss": 2.6308, + "step": 18899 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 0.10176996141672134, + "learning_rate": 0.0004125375751274347, + "loss": 2.6505, + "step": 18900 + }, + { + "epoch": 0.5604780120392611, + "grad_norm": 0.10772272944450378, + "learning_rate": 0.0004124912499956322, + "loss": 2.6065, + "step": 18901 + }, + { + "epoch": 0.5605076653915725, + "grad_norm": 0.11397580057382584, + "learning_rate": 0.00041244492563873464, + "loss": 2.6232, + "step": 18902 + }, + { + "epoch": 0.560537318743884, + "grad_norm": 0.11448017507791519, + "learning_rate": 0.0004123986020571523, + "loss": 2.6182, + "step": 18903 + }, + { + "epoch": 0.5605669720961954, + "grad_norm": 0.1077178418636322, + "learning_rate": 0.00041235227925129553, + "loss": 2.6288, + "step": 18904 + }, + { + "epoch": 0.560596625448507, + "grad_norm": 0.09582841396331787, + "learning_rate": 0.0004123059572215742, + "loss": 2.6031, + "step": 18905 + }, + { + "epoch": 0.5606262788008184, + "grad_norm": 0.10480354726314545, + "learning_rate": 0.0004122596359683987, + "loss": 2.6268, + "step": 18906 + }, + { + "epoch": 0.5606559321531299, + "grad_norm": 0.10920702666044235, + "learning_rate": 0.00041221331549217923, + "loss": 2.6648, + "step": 18907 + }, + { + "epoch": 0.5606855855054413, + "grad_norm": 0.12031126767396927, + "learning_rate": 0.00041216699579332583, + "loss": 2.6251, + "step": 18908 + }, + { + "epoch": 0.5607152388577529, + "grad_norm": 0.12389940023422241, + "learning_rate": 0.0004121206768722488, + "loss": 2.5913, + "step": 18909 + }, + { + "epoch": 0.5607448922100643, + "grad_norm": 0.10410219430923462, + "learning_rate": 0.0004120743587293583, + "loss": 2.6373, + "step": 18910 + }, + { + "epoch": 0.5607745455623758, + "grad_norm": 0.0968792662024498, + "learning_rate": 0.0004120280413650645, + "loss": 2.6238, + "step": 18911 + }, + { + "epoch": 0.5608041989146874, + "grad_norm": 0.11451348662376404, + "learning_rate": 0.00041198172477977733, + "loss": 2.65, + "step": 18912 + }, + { + "epoch": 0.5608338522669988, + "grad_norm": 0.12185480445623398, + "learning_rate": 0.00041193540897390703, + "loss": 2.6258, + "step": 18913 + }, + { + "epoch": 0.5608635056193103, + "grad_norm": 0.10375528037548065, + "learning_rate": 0.0004118890939478639, + "loss": 2.6367, + "step": 18914 + }, + { + "epoch": 0.5608931589716217, + "grad_norm": 0.10139358043670654, + "learning_rate": 0.0004118427797020581, + "loss": 2.6432, + "step": 18915 + }, + { + "epoch": 0.5609228123239333, + "grad_norm": 0.10568100959062576, + "learning_rate": 0.00041179646623689954, + "loss": 2.6078, + "step": 18916 + }, + { + "epoch": 0.5609524656762447, + "grad_norm": 0.11741248518228531, + "learning_rate": 0.00041175015355279836, + "loss": 2.6303, + "step": 18917 + }, + { + "epoch": 0.5609821190285562, + "grad_norm": 0.13417603075504303, + "learning_rate": 0.0004117038416501648, + "loss": 2.6667, + "step": 18918 + }, + { + "epoch": 0.5610117723808676, + "grad_norm": 0.1333068311214447, + "learning_rate": 0.0004116575305294088, + "loss": 2.6293, + "step": 18919 + }, + { + "epoch": 0.5610414257331792, + "grad_norm": 0.12366009503602982, + "learning_rate": 0.00041161122019094055, + "loss": 2.6029, + "step": 18920 + }, + { + "epoch": 0.5610710790854906, + "grad_norm": 0.11291687190532684, + "learning_rate": 0.00041156491063517025, + "loss": 2.6476, + "step": 18921 + }, + { + "epoch": 0.5611007324378021, + "grad_norm": 0.11464616656303406, + "learning_rate": 0.00041151860186250773, + "loss": 2.6171, + "step": 18922 + }, + { + "epoch": 0.5611303857901135, + "grad_norm": 0.11279932409524918, + "learning_rate": 0.00041147229387336316, + "loss": 2.6095, + "step": 18923 + }, + { + "epoch": 0.5611600391424251, + "grad_norm": 0.11455446481704712, + "learning_rate": 0.00041142598666814664, + "loss": 2.579, + "step": 18924 + }, + { + "epoch": 0.5611896924947365, + "grad_norm": 0.11379318684339523, + "learning_rate": 0.0004113796802472682, + "loss": 2.6149, + "step": 18925 + }, + { + "epoch": 0.561219345847048, + "grad_norm": 0.11692999303340912, + "learning_rate": 0.0004113333746111378, + "loss": 2.581, + "step": 18926 + }, + { + "epoch": 0.5612489991993594, + "grad_norm": 0.1257937103509903, + "learning_rate": 0.0004112870697601657, + "loss": 2.5836, + "step": 18927 + }, + { + "epoch": 0.561278652551671, + "grad_norm": 0.12365053594112396, + "learning_rate": 0.0004112407656947618, + "loss": 2.6165, + "step": 18928 + }, + { + "epoch": 0.5613083059039824, + "grad_norm": 0.12866821885108948, + "learning_rate": 0.0004111944624153362, + "loss": 2.6554, + "step": 18929 + }, + { + "epoch": 0.5613379592562939, + "grad_norm": 0.11005175113677979, + "learning_rate": 0.00041114815992229883, + "loss": 2.6187, + "step": 18930 + }, + { + "epoch": 0.5613676126086053, + "grad_norm": 0.1105593889951706, + "learning_rate": 0.0004111018582160598, + "loss": 2.6373, + "step": 18931 + }, + { + "epoch": 0.5613972659609169, + "grad_norm": 0.1231537014245987, + "learning_rate": 0.0004110555572970291, + "loss": 2.583, + "step": 18932 + }, + { + "epoch": 0.5614269193132284, + "grad_norm": 0.12887173891067505, + "learning_rate": 0.0004110092571656167, + "loss": 2.656, + "step": 18933 + }, + { + "epoch": 0.5614565726655398, + "grad_norm": 0.11583458632230759, + "learning_rate": 0.00041096295782223257, + "loss": 2.6553, + "step": 18934 + }, + { + "epoch": 0.5614862260178514, + "grad_norm": 0.12035351246595383, + "learning_rate": 0.0004109166592672867, + "loss": 2.6274, + "step": 18935 + }, + { + "epoch": 0.5615158793701628, + "grad_norm": 0.1098397746682167, + "learning_rate": 0.00041087036150118915, + "loss": 2.5973, + "step": 18936 + }, + { + "epoch": 0.5615455327224743, + "grad_norm": 0.11834444850683212, + "learning_rate": 0.00041082406452434985, + "loss": 2.6296, + "step": 18937 + }, + { + "epoch": 0.5615751860747857, + "grad_norm": 0.1026311069726944, + "learning_rate": 0.00041077776833717885, + "loss": 2.6256, + "step": 18938 + }, + { + "epoch": 0.5616048394270973, + "grad_norm": 0.11228308081626892, + "learning_rate": 0.00041073147294008565, + "loss": 2.608, + "step": 18939 + }, + { + "epoch": 0.5616344927794087, + "grad_norm": 0.11160928010940552, + "learning_rate": 0.00041068517833348084, + "loss": 2.6183, + "step": 18940 + }, + { + "epoch": 0.5616641461317202, + "grad_norm": 0.11306633800268173, + "learning_rate": 0.00041063888451777414, + "loss": 2.6082, + "step": 18941 + }, + { + "epoch": 0.5616937994840316, + "grad_norm": 0.12252465635538101, + "learning_rate": 0.0004105925914933756, + "loss": 2.6295, + "step": 18942 + }, + { + "epoch": 0.5617234528363432, + "grad_norm": 0.12494060397148132, + "learning_rate": 0.00041054629926069475, + "loss": 2.6472, + "step": 18943 + }, + { + "epoch": 0.5617531061886546, + "grad_norm": 0.13353495299816132, + "learning_rate": 0.0004105000078201419, + "loss": 2.6383, + "step": 18944 + }, + { + "epoch": 0.5617827595409661, + "grad_norm": 0.14601799845695496, + "learning_rate": 0.00041045371717212683, + "loss": 2.6426, + "step": 18945 + }, + { + "epoch": 0.5618124128932775, + "grad_norm": 0.12430321425199509, + "learning_rate": 0.0004104074273170594, + "loss": 2.6857, + "step": 18946 + }, + { + "epoch": 0.5618420662455891, + "grad_norm": 0.11803906410932541, + "learning_rate": 0.00041036113825534964, + "loss": 2.6253, + "step": 18947 + }, + { + "epoch": 0.5618717195979005, + "grad_norm": 0.12001872062683105, + "learning_rate": 0.0004103148499874074, + "loss": 2.6062, + "step": 18948 + }, + { + "epoch": 0.561901372950212, + "grad_norm": 0.11681258678436279, + "learning_rate": 0.0004102685625136426, + "loss": 2.6338, + "step": 18949 + }, + { + "epoch": 0.5619310263025235, + "grad_norm": 0.12376219034194946, + "learning_rate": 0.00041022227583446504, + "loss": 2.5936, + "step": 18950 + }, + { + "epoch": 0.561960679654835, + "grad_norm": 0.11102853715419769, + "learning_rate": 0.0004101759899502846, + "loss": 2.592, + "step": 18951 + }, + { + "epoch": 0.5619903330071464, + "grad_norm": 0.11597459763288498, + "learning_rate": 0.0004101297048615109, + "loss": 2.661, + "step": 18952 + }, + { + "epoch": 0.5620199863594579, + "grad_norm": 0.1286362260580063, + "learning_rate": 0.00041008342056855453, + "loss": 2.6388, + "step": 18953 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 0.1105758398771286, + "learning_rate": 0.0004100371370718248, + "loss": 2.6121, + "step": 18954 + }, + { + "epoch": 0.5620792930640809, + "grad_norm": 0.10730656236410141, + "learning_rate": 0.00040999085437173156, + "loss": 2.6101, + "step": 18955 + }, + { + "epoch": 0.5621089464163924, + "grad_norm": 0.11605320870876312, + "learning_rate": 0.00040994457246868485, + "loss": 2.6758, + "step": 18956 + }, + { + "epoch": 0.5621385997687038, + "grad_norm": 0.1086762398481369, + "learning_rate": 0.00040989829136309436, + "loss": 2.615, + "step": 18957 + }, + { + "epoch": 0.5621682531210154, + "grad_norm": 0.10221142321825027, + "learning_rate": 0.00040985201105536994, + "loss": 2.6436, + "step": 18958 + }, + { + "epoch": 0.5621979064733268, + "grad_norm": 0.11681848019361496, + "learning_rate": 0.0004098057315459216, + "loss": 2.6291, + "step": 18959 + }, + { + "epoch": 0.5622275598256383, + "grad_norm": 0.1087031215429306, + "learning_rate": 0.00040975945283515885, + "loss": 2.6457, + "step": 18960 + }, + { + "epoch": 0.5622572131779497, + "grad_norm": 0.11461305618286133, + "learning_rate": 0.0004097131749234917, + "loss": 2.6038, + "step": 18961 + }, + { + "epoch": 0.5622868665302613, + "grad_norm": 0.10319861769676208, + "learning_rate": 0.00040966689781132983, + "loss": 2.6314, + "step": 18962 + }, + { + "epoch": 0.5623165198825727, + "grad_norm": 0.10303369164466858, + "learning_rate": 0.00040962062149908307, + "loss": 2.6192, + "step": 18963 + }, + { + "epoch": 0.5623461732348842, + "grad_norm": 0.11453785002231598, + "learning_rate": 0.0004095743459871612, + "loss": 2.6139, + "step": 18964 + }, + { + "epoch": 0.5623758265871956, + "grad_norm": 0.10968343168497086, + "learning_rate": 0.000409528071275974, + "loss": 2.5997, + "step": 18965 + }, + { + "epoch": 0.5624054799395072, + "grad_norm": 0.10310769081115723, + "learning_rate": 0.00040948179736593126, + "loss": 2.6138, + "step": 18966 + }, + { + "epoch": 0.5624351332918186, + "grad_norm": 0.11846549808979034, + "learning_rate": 0.0004094355242574428, + "loss": 2.597, + "step": 18967 + }, + { + "epoch": 0.5624647866441301, + "grad_norm": 0.12614020705223083, + "learning_rate": 0.0004093892519509183, + "loss": 2.659, + "step": 18968 + }, + { + "epoch": 0.5624944399964416, + "grad_norm": 0.12316606938838959, + "learning_rate": 0.00040934298044676754, + "loss": 2.6149, + "step": 18969 + }, + { + "epoch": 0.5625240933487531, + "grad_norm": 0.11932778358459473, + "learning_rate": 0.00040929670974540033, + "loss": 2.6197, + "step": 18970 + }, + { + "epoch": 0.5625537467010645, + "grad_norm": 0.11719073355197906, + "learning_rate": 0.0004092504398472263, + "loss": 2.6224, + "step": 18971 + }, + { + "epoch": 0.562583400053376, + "grad_norm": 0.10988657176494598, + "learning_rate": 0.00040920417075265516, + "loss": 2.6189, + "step": 18972 + }, + { + "epoch": 0.5626130534056875, + "grad_norm": 0.11480952799320221, + "learning_rate": 0.0004091579024620966, + "loss": 2.6477, + "step": 18973 + }, + { + "epoch": 0.562642706757999, + "grad_norm": 0.104139044880867, + "learning_rate": 0.00040911163497596046, + "loss": 2.6424, + "step": 18974 + }, + { + "epoch": 0.5626723601103105, + "grad_norm": 0.11131826043128967, + "learning_rate": 0.00040906536829465645, + "loss": 2.6335, + "step": 18975 + }, + { + "epoch": 0.5627020134626219, + "grad_norm": 0.10202238708734512, + "learning_rate": 0.00040901910241859427, + "loss": 2.6076, + "step": 18976 + }, + { + "epoch": 0.5627316668149335, + "grad_norm": 0.105076365172863, + "learning_rate": 0.0004089728373481834, + "loss": 2.5965, + "step": 18977 + }, + { + "epoch": 0.5627613201672449, + "grad_norm": 0.09731917083263397, + "learning_rate": 0.00040892657308383353, + "loss": 2.6426, + "step": 18978 + }, + { + "epoch": 0.5627909735195564, + "grad_norm": 0.1163652166724205, + "learning_rate": 0.0004088803096259547, + "loss": 2.6042, + "step": 18979 + }, + { + "epoch": 0.5628206268718678, + "grad_norm": 0.11421257257461548, + "learning_rate": 0.00040883404697495643, + "loss": 2.5953, + "step": 18980 + }, + { + "epoch": 0.5628502802241794, + "grad_norm": 0.10595270991325378, + "learning_rate": 0.00040878778513124824, + "loss": 2.6185, + "step": 18981 + }, + { + "epoch": 0.5628799335764908, + "grad_norm": 0.13529792428016663, + "learning_rate": 0.0004087415240952399, + "loss": 2.6428, + "step": 18982 + }, + { + "epoch": 0.5629095869288023, + "grad_norm": 0.1162852942943573, + "learning_rate": 0.00040869526386734094, + "loss": 2.6223, + "step": 18983 + }, + { + "epoch": 0.5629392402811138, + "grad_norm": 0.10527706891298294, + "learning_rate": 0.0004086490044479612, + "loss": 2.6349, + "step": 18984 + }, + { + "epoch": 0.5629688936334253, + "grad_norm": 0.11477299779653549, + "learning_rate": 0.0004086027458375101, + "loss": 2.6242, + "step": 18985 + }, + { + "epoch": 0.5629985469857367, + "grad_norm": 0.11856286227703094, + "learning_rate": 0.00040855648803639745, + "loss": 2.6679, + "step": 18986 + }, + { + "epoch": 0.5630282003380482, + "grad_norm": 0.11684408783912659, + "learning_rate": 0.00040851023104503294, + "loss": 2.6361, + "step": 18987 + }, + { + "epoch": 0.5630578536903597, + "grad_norm": 0.11113162338733673, + "learning_rate": 0.00040846397486382586, + "loss": 2.6406, + "step": 18988 + }, + { + "epoch": 0.5630875070426712, + "grad_norm": 0.11035581678152084, + "learning_rate": 0.00040841771949318595, + "loss": 2.6501, + "step": 18989 + }, + { + "epoch": 0.5631171603949826, + "grad_norm": 0.12092713266611099, + "learning_rate": 0.0004083714649335229, + "loss": 2.6134, + "step": 18990 + }, + { + "epoch": 0.5631468137472941, + "grad_norm": 0.12432562559843063, + "learning_rate": 0.00040832521118524623, + "loss": 2.6562, + "step": 18991 + }, + { + "epoch": 0.5631764670996056, + "grad_norm": 0.13069044053554535, + "learning_rate": 0.0004082789582487656, + "loss": 2.6464, + "step": 18992 + }, + { + "epoch": 0.5632061204519171, + "grad_norm": 0.12086419016122818, + "learning_rate": 0.0004082327061244905, + "loss": 2.6066, + "step": 18993 + }, + { + "epoch": 0.5632357738042286, + "grad_norm": 0.11632389575242996, + "learning_rate": 0.00040818645481283057, + "loss": 2.6291, + "step": 18994 + }, + { + "epoch": 0.56326542715654, + "grad_norm": 0.11398275941610336, + "learning_rate": 0.0004081402043141953, + "loss": 2.6327, + "step": 18995 + }, + { + "epoch": 0.5632950805088516, + "grad_norm": 0.12141840159893036, + "learning_rate": 0.00040809395462899434, + "loss": 2.6583, + "step": 18996 + }, + { + "epoch": 0.563324733861163, + "grad_norm": 0.11018131673336029, + "learning_rate": 0.00040804770575763726, + "loss": 2.6343, + "step": 18997 + }, + { + "epoch": 0.5633543872134745, + "grad_norm": 0.10942398011684418, + "learning_rate": 0.00040800145770053347, + "loss": 2.6393, + "step": 18998 + }, + { + "epoch": 0.563384040565786, + "grad_norm": 0.1058901995420456, + "learning_rate": 0.0004079552104580925, + "loss": 2.6332, + "step": 18999 + }, + { + "epoch": 0.5634136939180975, + "grad_norm": 0.1491490751504898, + "learning_rate": 0.000407908964030724, + "loss": 2.6441, + "step": 19000 + }, + { + "epoch": 0.5634433472704089, + "grad_norm": 0.1453363001346588, + "learning_rate": 0.00040786271841883743, + "loss": 2.6277, + "step": 19001 + }, + { + "epoch": 0.5634730006227204, + "grad_norm": 0.13334062695503235, + "learning_rate": 0.00040781647362284225, + "loss": 2.6358, + "step": 19002 + }, + { + "epoch": 0.5635026539750319, + "grad_norm": 0.111796073615551, + "learning_rate": 0.0004077702296431481, + "loss": 2.612, + "step": 19003 + }, + { + "epoch": 0.5635323073273434, + "grad_norm": 0.11042072623968124, + "learning_rate": 0.0004077239864801642, + "loss": 2.6322, + "step": 19004 + }, + { + "epoch": 0.5635619606796548, + "grad_norm": 0.12956732511520386, + "learning_rate": 0.00040767774413430036, + "loss": 2.6365, + "step": 19005 + }, + { + "epoch": 0.5635916140319663, + "grad_norm": 0.11074211448431015, + "learning_rate": 0.000407631502605966, + "loss": 2.6172, + "step": 19006 + }, + { + "epoch": 0.5636212673842778, + "grad_norm": 0.11118894815444946, + "learning_rate": 0.00040758526189557046, + "loss": 2.664, + "step": 19007 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 0.102165587246418, + "learning_rate": 0.0004075390220035235, + "loss": 2.641, + "step": 19008 + }, + { + "epoch": 0.5636805740889007, + "grad_norm": 0.1201067715883255, + "learning_rate": 0.0004074927829302342, + "loss": 2.6121, + "step": 19009 + }, + { + "epoch": 0.5637102274412122, + "grad_norm": 0.11632876098155975, + "learning_rate": 0.0004074465446761122, + "loss": 2.6127, + "step": 19010 + }, + { + "epoch": 0.5637398807935237, + "grad_norm": 0.10011996328830719, + "learning_rate": 0.0004074003072415669, + "loss": 2.6103, + "step": 19011 + }, + { + "epoch": 0.5637695341458352, + "grad_norm": 0.1087542399764061, + "learning_rate": 0.0004073540706270078, + "loss": 2.6401, + "step": 19012 + }, + { + "epoch": 0.5637991874981466, + "grad_norm": 0.10775480419397354, + "learning_rate": 0.00040730783483284433, + "loss": 2.6367, + "step": 19013 + }, + { + "epoch": 0.5638288408504581, + "grad_norm": 0.12030422687530518, + "learning_rate": 0.00040726159985948594, + "loss": 2.5764, + "step": 19014 + }, + { + "epoch": 0.5638584942027697, + "grad_norm": 0.10811713337898254, + "learning_rate": 0.00040721536570734196, + "loss": 2.612, + "step": 19015 + }, + { + "epoch": 0.5638881475550811, + "grad_norm": 0.10788432508707047, + "learning_rate": 0.00040716913237682183, + "loss": 2.5961, + "step": 19016 + }, + { + "epoch": 0.5639178009073926, + "grad_norm": 0.13027866184711456, + "learning_rate": 0.00040712289986833474, + "loss": 2.6079, + "step": 19017 + }, + { + "epoch": 0.563947454259704, + "grad_norm": 0.12161950021982193, + "learning_rate": 0.0004070766681822906, + "loss": 2.6453, + "step": 19018 + }, + { + "epoch": 0.5639771076120156, + "grad_norm": 0.0985507071018219, + "learning_rate": 0.0004070304373190984, + "loss": 2.6172, + "step": 19019 + }, + { + "epoch": 0.564006760964327, + "grad_norm": 0.12814980745315552, + "learning_rate": 0.0004069842072791677, + "loss": 2.6589, + "step": 19020 + }, + { + "epoch": 0.5640364143166385, + "grad_norm": 0.11841762810945511, + "learning_rate": 0.00040693797806290786, + "loss": 2.623, + "step": 19021 + }, + { + "epoch": 0.56406606766895, + "grad_norm": 0.10529198497533798, + "learning_rate": 0.00040689174967072805, + "loss": 2.6171, + "step": 19022 + }, + { + "epoch": 0.5640957210212615, + "grad_norm": 0.10953761637210846, + "learning_rate": 0.00040684552210303786, + "loss": 2.6011, + "step": 19023 + }, + { + "epoch": 0.5641253743735729, + "grad_norm": 0.10111331939697266, + "learning_rate": 0.0004067992953602466, + "loss": 2.6537, + "step": 19024 + }, + { + "epoch": 0.5641550277258844, + "grad_norm": 0.11884070932865143, + "learning_rate": 0.00040675306944276365, + "loss": 2.6094, + "step": 19025 + }, + { + "epoch": 0.5641846810781959, + "grad_norm": 0.08993038535118103, + "learning_rate": 0.00040670684435099814, + "loss": 2.6192, + "step": 19026 + }, + { + "epoch": 0.5642143344305074, + "grad_norm": 0.10424207895994186, + "learning_rate": 0.0004066606200853595, + "loss": 2.6047, + "step": 19027 + }, + { + "epoch": 0.5642439877828188, + "grad_norm": 0.11540724337100983, + "learning_rate": 0.0004066143966462572, + "loss": 2.6399, + "step": 19028 + }, + { + "epoch": 0.5642736411351303, + "grad_norm": 0.10999798029661179, + "learning_rate": 0.0004065681740341004, + "loss": 2.6353, + "step": 19029 + }, + { + "epoch": 0.5643032944874418, + "grad_norm": 0.10782724618911743, + "learning_rate": 0.0004065219522492983, + "loss": 2.6324, + "step": 19030 + }, + { + "epoch": 0.5643329478397533, + "grad_norm": 0.11448168754577637, + "learning_rate": 0.0004064757312922605, + "loss": 2.6211, + "step": 19031 + }, + { + "epoch": 0.5643626011920647, + "grad_norm": 0.12091011554002762, + "learning_rate": 0.00040642951116339615, + "loss": 2.6319, + "step": 19032 + }, + { + "epoch": 0.5643922545443762, + "grad_norm": 0.12835316359996796, + "learning_rate": 0.0004063832918631146, + "loss": 2.6166, + "step": 19033 + }, + { + "epoch": 0.5644219078966877, + "grad_norm": 0.1149815171957016, + "learning_rate": 0.000406337073391825, + "loss": 2.6436, + "step": 19034 + }, + { + "epoch": 0.5644515612489992, + "grad_norm": 0.11681484431028366, + "learning_rate": 0.0004062908557499368, + "loss": 2.613, + "step": 19035 + }, + { + "epoch": 0.5644812146013107, + "grad_norm": 0.0983089730143547, + "learning_rate": 0.00040624463893785905, + "loss": 2.6431, + "step": 19036 + }, + { + "epoch": 0.5645108679536222, + "grad_norm": 0.12225448340177536, + "learning_rate": 0.0004061984229560012, + "loss": 2.6088, + "step": 19037 + }, + { + "epoch": 0.5645405213059337, + "grad_norm": 0.1175539642572403, + "learning_rate": 0.0004061522078047723, + "loss": 2.6158, + "step": 19038 + }, + { + "epoch": 0.5645701746582451, + "grad_norm": 0.10956747829914093, + "learning_rate": 0.0004061059934845818, + "loss": 2.5916, + "step": 19039 + }, + { + "epoch": 0.5645998280105566, + "grad_norm": 0.13134172558784485, + "learning_rate": 0.00040605977999583876, + "loss": 2.6248, + "step": 19040 + }, + { + "epoch": 0.5646294813628681, + "grad_norm": 0.10574144124984741, + "learning_rate": 0.00040601356733895255, + "loss": 2.6413, + "step": 19041 + }, + { + "epoch": 0.5646591347151796, + "grad_norm": 0.10193860530853271, + "learning_rate": 0.0004059673555143324, + "loss": 2.6444, + "step": 19042 + }, + { + "epoch": 0.564688788067491, + "grad_norm": 0.11370982229709625, + "learning_rate": 0.00040592114452238717, + "loss": 2.6093, + "step": 19043 + }, + { + "epoch": 0.5647184414198025, + "grad_norm": 0.09462298452854156, + "learning_rate": 0.0004058749343635265, + "loss": 2.625, + "step": 19044 + }, + { + "epoch": 0.564748094772114, + "grad_norm": 0.10552898794412613, + "learning_rate": 0.00040582872503815956, + "loss": 2.6245, + "step": 19045 + }, + { + "epoch": 0.5647777481244255, + "grad_norm": 0.105945885181427, + "learning_rate": 0.00040578251654669543, + "loss": 2.5958, + "step": 19046 + }, + { + "epoch": 0.5648074014767369, + "grad_norm": 0.10910198837518692, + "learning_rate": 0.00040573630888954325, + "loss": 2.601, + "step": 19047 + }, + { + "epoch": 0.5648370548290484, + "grad_norm": 0.10927297919988632, + "learning_rate": 0.0004056901020671122, + "loss": 2.6394, + "step": 19048 + }, + { + "epoch": 0.5648667081813599, + "grad_norm": 0.1434730887413025, + "learning_rate": 0.0004056438960798115, + "loss": 2.6547, + "step": 19049 + }, + { + "epoch": 0.5648963615336714, + "grad_norm": 0.1370810717344284, + "learning_rate": 0.00040559769092805034, + "loss": 2.5974, + "step": 19050 + }, + { + "epoch": 0.5649260148859828, + "grad_norm": 0.12399504333734512, + "learning_rate": 0.00040555148661223773, + "loss": 2.6384, + "step": 19051 + }, + { + "epoch": 0.5649556682382944, + "grad_norm": 0.1272740364074707, + "learning_rate": 0.0004055052831327831, + "loss": 2.6576, + "step": 19052 + }, + { + "epoch": 0.5649853215906058, + "grad_norm": 0.11590198427438736, + "learning_rate": 0.00040545908049009527, + "loss": 2.6456, + "step": 19053 + }, + { + "epoch": 0.5650149749429173, + "grad_norm": 0.11474543809890747, + "learning_rate": 0.0004054128786845835, + "loss": 2.6703, + "step": 19054 + }, + { + "epoch": 0.5650446282952287, + "grad_norm": 0.1254907250404358, + "learning_rate": 0.00040536667771665693, + "loss": 2.65, + "step": 19055 + }, + { + "epoch": 0.5650742816475403, + "grad_norm": 0.12316172569990158, + "learning_rate": 0.0004053204775867246, + "loss": 2.6382, + "step": 19056 + }, + { + "epoch": 0.5651039349998518, + "grad_norm": 0.10221853107213974, + "learning_rate": 0.00040527427829519576, + "loss": 2.6525, + "step": 19057 + }, + { + "epoch": 0.5651335883521632, + "grad_norm": 0.12192022800445557, + "learning_rate": 0.00040522807984247946, + "loss": 2.6282, + "step": 19058 + }, + { + "epoch": 0.5651632417044747, + "grad_norm": 0.11491847783327103, + "learning_rate": 0.00040518188222898475, + "loss": 2.6555, + "step": 19059 + }, + { + "epoch": 0.5651928950567862, + "grad_norm": 0.10034020990133286, + "learning_rate": 0.0004051356854551208, + "loss": 2.6259, + "step": 19060 + }, + { + "epoch": 0.5652225484090977, + "grad_norm": 0.10276400297880173, + "learning_rate": 0.0004050894895212966, + "loss": 2.6121, + "step": 19061 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 0.10556352883577347, + "learning_rate": 0.00040504329442792134, + "loss": 2.6161, + "step": 19062 + }, + { + "epoch": 0.5652818551137206, + "grad_norm": 0.10013511031866074, + "learning_rate": 0.00040499710017540404, + "loss": 2.6062, + "step": 19063 + }, + { + "epoch": 0.5653115084660321, + "grad_norm": 0.1041366308927536, + "learning_rate": 0.00040495090676415357, + "loss": 2.6487, + "step": 19064 + }, + { + "epoch": 0.5653411618183436, + "grad_norm": 0.10691593587398529, + "learning_rate": 0.00040490471419457917, + "loss": 2.626, + "step": 19065 + }, + { + "epoch": 0.565370815170655, + "grad_norm": 0.12760834395885468, + "learning_rate": 0.00040485852246708996, + "loss": 2.6331, + "step": 19066 + }, + { + "epoch": 0.5654004685229665, + "grad_norm": 0.10329218953847885, + "learning_rate": 0.00040481233158209473, + "loss": 2.6264, + "step": 19067 + }, + { + "epoch": 0.565430121875278, + "grad_norm": 0.10868894308805466, + "learning_rate": 0.00040476614154000257, + "loss": 2.6446, + "step": 19068 + }, + { + "epoch": 0.5654597752275895, + "grad_norm": 0.10834791511297226, + "learning_rate": 0.0004047199523412227, + "loss": 2.6183, + "step": 19069 + }, + { + "epoch": 0.5654894285799009, + "grad_norm": 0.10290102660655975, + "learning_rate": 0.000404673763986164, + "loss": 2.625, + "step": 19070 + }, + { + "epoch": 0.5655190819322125, + "grad_norm": 0.1007181853055954, + "learning_rate": 0.0004046275764752355, + "loss": 2.6382, + "step": 19071 + }, + { + "epoch": 0.5655487352845239, + "grad_norm": 0.09794840216636658, + "learning_rate": 0.00040458138980884617, + "loss": 2.6335, + "step": 19072 + }, + { + "epoch": 0.5655783886368354, + "grad_norm": 0.11671032011508942, + "learning_rate": 0.0004045352039874051, + "loss": 2.6105, + "step": 19073 + }, + { + "epoch": 0.5656080419891468, + "grad_norm": 0.1112799420952797, + "learning_rate": 0.0004044890190113211, + "loss": 2.6211, + "step": 19074 + }, + { + "epoch": 0.5656376953414584, + "grad_norm": 0.10076778382062912, + "learning_rate": 0.0004044428348810032, + "loss": 2.6391, + "step": 19075 + }, + { + "epoch": 0.5656673486937698, + "grad_norm": 0.09249670803546906, + "learning_rate": 0.00040439665159686046, + "loss": 2.6, + "step": 19076 + }, + { + "epoch": 0.5656970020460813, + "grad_norm": 0.10664033889770508, + "learning_rate": 0.00040435046915930173, + "loss": 2.6375, + "step": 19077 + }, + { + "epoch": 0.5657266553983928, + "grad_norm": 0.10772982984781265, + "learning_rate": 0.00040430428756873605, + "loss": 2.6031, + "step": 19078 + }, + { + "epoch": 0.5657563087507043, + "grad_norm": 0.11979466676712036, + "learning_rate": 0.0004042581068255723, + "loss": 2.6526, + "step": 19079 + }, + { + "epoch": 0.5657859621030158, + "grad_norm": 0.09940182417631149, + "learning_rate": 0.00040421192693021956, + "loss": 2.6292, + "step": 19080 + }, + { + "epoch": 0.5658156154553272, + "grad_norm": 0.10984736680984497, + "learning_rate": 0.0004041657478830863, + "loss": 2.6367, + "step": 19081 + }, + { + "epoch": 0.5658452688076387, + "grad_norm": 0.09891404211521149, + "learning_rate": 0.0004041195696845821, + "loss": 2.608, + "step": 19082 + }, + { + "epoch": 0.5658749221599502, + "grad_norm": 0.10637130588293076, + "learning_rate": 0.0004040733923351154, + "loss": 2.647, + "step": 19083 + }, + { + "epoch": 0.5659045755122617, + "grad_norm": 0.12608057260513306, + "learning_rate": 0.00040402721583509556, + "loss": 2.6481, + "step": 19084 + }, + { + "epoch": 0.5659342288645731, + "grad_norm": 0.14338825643062592, + "learning_rate": 0.000403981040184931, + "loss": 2.6229, + "step": 19085 + }, + { + "epoch": 0.5659638822168847, + "grad_norm": 0.11900264769792557, + "learning_rate": 0.00040393486538503083, + "loss": 2.644, + "step": 19086 + }, + { + "epoch": 0.5659935355691961, + "grad_norm": 0.11052127182483673, + "learning_rate": 0.0004038886914358039, + "loss": 2.5661, + "step": 19087 + }, + { + "epoch": 0.5660231889215076, + "grad_norm": 0.11476616561412811, + "learning_rate": 0.0004038425183376591, + "loss": 2.6192, + "step": 19088 + }, + { + "epoch": 0.566052842273819, + "grad_norm": 0.12503555417060852, + "learning_rate": 0.00040379634609100535, + "loss": 2.5917, + "step": 19089 + }, + { + "epoch": 0.5660824956261306, + "grad_norm": 0.1059931069612503, + "learning_rate": 0.0004037501746962515, + "loss": 2.6137, + "step": 19090 + }, + { + "epoch": 0.566112148978442, + "grad_norm": 0.12075456231832504, + "learning_rate": 0.0004037040041538064, + "loss": 2.6472, + "step": 19091 + }, + { + "epoch": 0.5661418023307535, + "grad_norm": 0.11774582415819168, + "learning_rate": 0.00040365783446407876, + "loss": 2.6296, + "step": 19092 + }, + { + "epoch": 0.5661714556830649, + "grad_norm": 0.10269773006439209, + "learning_rate": 0.0004036116656274775, + "loss": 2.6419, + "step": 19093 + }, + { + "epoch": 0.5662011090353765, + "grad_norm": 0.10452041774988174, + "learning_rate": 0.00040356549764441145, + "loss": 2.6277, + "step": 19094 + }, + { + "epoch": 0.5662307623876879, + "grad_norm": 0.1397305279970169, + "learning_rate": 0.0004035193305152896, + "loss": 2.6307, + "step": 19095 + }, + { + "epoch": 0.5662604157399994, + "grad_norm": 0.12093479186296463, + "learning_rate": 0.0004034731642405206, + "loss": 2.6098, + "step": 19096 + }, + { + "epoch": 0.5662900690923108, + "grad_norm": 0.11085182428359985, + "learning_rate": 0.0004034269988205133, + "loss": 2.6543, + "step": 19097 + }, + { + "epoch": 0.5663197224446224, + "grad_norm": 0.12418191134929657, + "learning_rate": 0.00040338083425567653, + "loss": 2.6361, + "step": 19098 + }, + { + "epoch": 0.5663493757969339, + "grad_norm": 0.1032695323228836, + "learning_rate": 0.00040333467054641905, + "loss": 2.6239, + "step": 19099 + }, + { + "epoch": 0.5663790291492453, + "grad_norm": 0.09700077027082443, + "learning_rate": 0.00040328850769314964, + "loss": 2.5939, + "step": 19100 + }, + { + "epoch": 0.5664086825015568, + "grad_norm": 0.10878986120223999, + "learning_rate": 0.00040324234569627716, + "loss": 2.6244, + "step": 19101 + }, + { + "epoch": 0.5664383358538683, + "grad_norm": 0.10662465542554855, + "learning_rate": 0.00040319618455621025, + "loss": 2.6318, + "step": 19102 + }, + { + "epoch": 0.5664679892061798, + "grad_norm": 0.12230346351861954, + "learning_rate": 0.0004031500242733578, + "loss": 2.6183, + "step": 19103 + }, + { + "epoch": 0.5664976425584912, + "grad_norm": 0.11624372005462646, + "learning_rate": 0.0004031038648481284, + "loss": 2.6632, + "step": 19104 + }, + { + "epoch": 0.5665272959108028, + "grad_norm": 0.1135626807808876, + "learning_rate": 0.00040305770628093094, + "loss": 2.6174, + "step": 19105 + }, + { + "epoch": 0.5665569492631142, + "grad_norm": 0.11168026179075241, + "learning_rate": 0.00040301154857217417, + "loss": 2.6347, + "step": 19106 + }, + { + "epoch": 0.5665866026154257, + "grad_norm": 0.10125599801540375, + "learning_rate": 0.00040296539172226663, + "loss": 2.5976, + "step": 19107 + }, + { + "epoch": 0.5666162559677371, + "grad_norm": 0.10628367960453033, + "learning_rate": 0.0004029192357316174, + "loss": 2.6107, + "step": 19108 + }, + { + "epoch": 0.5666459093200487, + "grad_norm": 0.09985224902629852, + "learning_rate": 0.00040287308060063493, + "loss": 2.6156, + "step": 19109 + }, + { + "epoch": 0.5666755626723601, + "grad_norm": 0.09644865244626999, + "learning_rate": 0.000402826926329728, + "loss": 2.6395, + "step": 19110 + }, + { + "epoch": 0.5667052160246716, + "grad_norm": 0.11580684781074524, + "learning_rate": 0.0004027807729193054, + "loss": 2.6081, + "step": 19111 + }, + { + "epoch": 0.566734869376983, + "grad_norm": 0.11300560086965561, + "learning_rate": 0.0004027346203697757, + "loss": 2.6184, + "step": 19112 + }, + { + "epoch": 0.5667645227292946, + "grad_norm": 0.10563000291585922, + "learning_rate": 0.0004026884686815476, + "loss": 2.6517, + "step": 19113 + }, + { + "epoch": 0.566794176081606, + "grad_norm": 0.10127708315849304, + "learning_rate": 0.0004026423178550298, + "loss": 2.595, + "step": 19114 + }, + { + "epoch": 0.5668238294339175, + "grad_norm": 0.11229588836431503, + "learning_rate": 0.00040259616789063105, + "loss": 2.6071, + "step": 19115 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 0.13206890225410461, + "learning_rate": 0.0004025500187887599, + "loss": 2.6262, + "step": 19116 + }, + { + "epoch": 0.5668831361385405, + "grad_norm": 0.1156122013926506, + "learning_rate": 0.00040250387054982506, + "loss": 2.6491, + "step": 19117 + }, + { + "epoch": 0.5669127894908519, + "grad_norm": 0.1199849545955658, + "learning_rate": 0.0004024577231742353, + "loss": 2.6212, + "step": 19118 + }, + { + "epoch": 0.5669424428431634, + "grad_norm": 0.12087779492139816, + "learning_rate": 0.000402411576662399, + "loss": 2.6264, + "step": 19119 + }, + { + "epoch": 0.566972096195475, + "grad_norm": 0.11442023515701294, + "learning_rate": 0.0004023654310147248, + "loss": 2.6283, + "step": 19120 + }, + { + "epoch": 0.5670017495477864, + "grad_norm": 0.09735508263111115, + "learning_rate": 0.0004023192862316216, + "loss": 2.6125, + "step": 19121 + }, + { + "epoch": 0.5670314029000979, + "grad_norm": 0.12164374440908432, + "learning_rate": 0.0004022731423134981, + "loss": 2.6212, + "step": 19122 + }, + { + "epoch": 0.5670610562524093, + "grad_norm": 0.11806659400463104, + "learning_rate": 0.0004022269992607625, + "loss": 2.6416, + "step": 19123 + }, + { + "epoch": 0.5670907096047209, + "grad_norm": 0.12432623654603958, + "learning_rate": 0.0004021808570738236, + "loss": 2.6103, + "step": 19124 + }, + { + "epoch": 0.5671203629570323, + "grad_norm": 0.11561322212219238, + "learning_rate": 0.00040213471575309005, + "loss": 2.636, + "step": 19125 + }, + { + "epoch": 0.5671500163093438, + "grad_norm": 0.09493350982666016, + "learning_rate": 0.00040208857529897034, + "loss": 2.6148, + "step": 19126 + }, + { + "epoch": 0.5671796696616552, + "grad_norm": 0.0998557060956955, + "learning_rate": 0.0004020424357118732, + "loss": 2.6126, + "step": 19127 + }, + { + "epoch": 0.5672093230139668, + "grad_norm": 0.10523819178342819, + "learning_rate": 0.00040199629699220714, + "loss": 2.6198, + "step": 19128 + }, + { + "epoch": 0.5672389763662782, + "grad_norm": 0.10187608003616333, + "learning_rate": 0.0004019501591403806, + "loss": 2.6333, + "step": 19129 + }, + { + "epoch": 0.5672686297185897, + "grad_norm": 0.09184988588094711, + "learning_rate": 0.00040190402215680224, + "loss": 2.6234, + "step": 19130 + }, + { + "epoch": 0.5672982830709011, + "grad_norm": 0.1068120226264, + "learning_rate": 0.0004018578860418806, + "loss": 2.6407, + "step": 19131 + }, + { + "epoch": 0.5673279364232127, + "grad_norm": 0.09653767198324203, + "learning_rate": 0.00040181175079602417, + "loss": 2.6332, + "step": 19132 + }, + { + "epoch": 0.5673575897755241, + "grad_norm": 0.09686355292797089, + "learning_rate": 0.0004017656164196415, + "loss": 2.6611, + "step": 19133 + }, + { + "epoch": 0.5673872431278356, + "grad_norm": 0.10798921436071396, + "learning_rate": 0.0004017194829131412, + "loss": 2.6209, + "step": 19134 + }, + { + "epoch": 0.567416896480147, + "grad_norm": 0.1054951399564743, + "learning_rate": 0.0004016733502769318, + "loss": 2.6385, + "step": 19135 + }, + { + "epoch": 0.5674465498324586, + "grad_norm": 0.1186225414276123, + "learning_rate": 0.0004016272185114217, + "loss": 2.6325, + "step": 19136 + }, + { + "epoch": 0.56747620318477, + "grad_norm": 0.10507047176361084, + "learning_rate": 0.00040158108761701957, + "loss": 2.6295, + "step": 19137 + }, + { + "epoch": 0.5675058565370815, + "grad_norm": 0.0954604521393776, + "learning_rate": 0.0004015349575941337, + "loss": 2.6293, + "step": 19138 + }, + { + "epoch": 0.5675355098893929, + "grad_norm": 0.10309747606515884, + "learning_rate": 0.00040148882844317287, + "loss": 2.6312, + "step": 19139 + }, + { + "epoch": 0.5675651632417045, + "grad_norm": 0.10731406509876251, + "learning_rate": 0.0004014427001645452, + "loss": 2.5975, + "step": 19140 + }, + { + "epoch": 0.567594816594016, + "grad_norm": 0.11019600927829742, + "learning_rate": 0.0004013965727586594, + "loss": 2.6552, + "step": 19141 + }, + { + "epoch": 0.5676244699463274, + "grad_norm": 0.10667390376329422, + "learning_rate": 0.0004013504462259238, + "loss": 2.6049, + "step": 19142 + }, + { + "epoch": 0.567654123298639, + "grad_norm": 0.10832806676626205, + "learning_rate": 0.000401304320566747, + "loss": 2.6118, + "step": 19143 + }, + { + "epoch": 0.5676837766509504, + "grad_norm": 0.09414197504520416, + "learning_rate": 0.00040125819578153734, + "loss": 2.5922, + "step": 19144 + }, + { + "epoch": 0.5677134300032619, + "grad_norm": 0.10634932667016983, + "learning_rate": 0.00040121207187070337, + "loss": 2.6326, + "step": 19145 + }, + { + "epoch": 0.5677430833555733, + "grad_norm": 0.10834582149982452, + "learning_rate": 0.0004011659488346533, + "loss": 2.6211, + "step": 19146 + }, + { + "epoch": 0.5677727367078849, + "grad_norm": 0.1013936921954155, + "learning_rate": 0.00040111982667379584, + "loss": 2.6493, + "step": 19147 + }, + { + "epoch": 0.5678023900601963, + "grad_norm": 0.0988227128982544, + "learning_rate": 0.00040107370538853925, + "loss": 2.607, + "step": 19148 + }, + { + "epoch": 0.5678320434125078, + "grad_norm": 0.10402440279722214, + "learning_rate": 0.0004010275849792921, + "loss": 2.6321, + "step": 19149 + }, + { + "epoch": 0.5678616967648192, + "grad_norm": 0.12242157757282257, + "learning_rate": 0.0004009814654464626, + "loss": 2.5781, + "step": 19150 + }, + { + "epoch": 0.5678913501171308, + "grad_norm": 0.1368897259235382, + "learning_rate": 0.0004009353467904592, + "loss": 2.6652, + "step": 19151 + }, + { + "epoch": 0.5679210034694422, + "grad_norm": 0.1283375322818756, + "learning_rate": 0.0004008892290116903, + "loss": 2.6431, + "step": 19152 + }, + { + "epoch": 0.5679506568217537, + "grad_norm": 0.10948458313941956, + "learning_rate": 0.0004008431121105643, + "loss": 2.5819, + "step": 19153 + }, + { + "epoch": 0.5679803101740651, + "grad_norm": 0.11513222754001617, + "learning_rate": 0.00040079699608748954, + "loss": 2.6372, + "step": 19154 + }, + { + "epoch": 0.5680099635263767, + "grad_norm": 0.1337464153766632, + "learning_rate": 0.00040075088094287443, + "loss": 2.6668, + "step": 19155 + }, + { + "epoch": 0.5680396168786881, + "grad_norm": 0.1390410214662552, + "learning_rate": 0.00040070476667712743, + "loss": 2.6111, + "step": 19156 + }, + { + "epoch": 0.5680692702309996, + "grad_norm": 0.13934944570064545, + "learning_rate": 0.0004006586532906566, + "loss": 2.6521, + "step": 19157 + }, + { + "epoch": 0.568098923583311, + "grad_norm": 0.1412244439125061, + "learning_rate": 0.0004006125407838705, + "loss": 2.6161, + "step": 19158 + }, + { + "epoch": 0.5681285769356226, + "grad_norm": 0.1241099089384079, + "learning_rate": 0.0004005664291571772, + "loss": 2.6066, + "step": 19159 + }, + { + "epoch": 0.568158230287934, + "grad_norm": 0.11491437256336212, + "learning_rate": 0.00040052031841098553, + "loss": 2.6067, + "step": 19160 + }, + { + "epoch": 0.5681878836402455, + "grad_norm": 0.12158054113388062, + "learning_rate": 0.0004004742085457034, + "loss": 2.6128, + "step": 19161 + }, + { + "epoch": 0.5682175369925571, + "grad_norm": 0.11700670421123505, + "learning_rate": 0.00040042809956173926, + "loss": 2.6231, + "step": 19162 + }, + { + "epoch": 0.5682471903448685, + "grad_norm": 0.11886142194271088, + "learning_rate": 0.00040038199145950136, + "loss": 2.6313, + "step": 19163 + }, + { + "epoch": 0.56827684369718, + "grad_norm": 0.11667871475219727, + "learning_rate": 0.00040033588423939805, + "loss": 2.5961, + "step": 19164 + }, + { + "epoch": 0.5683064970494914, + "grad_norm": 0.12489549815654755, + "learning_rate": 0.00040028977790183763, + "loss": 2.661, + "step": 19165 + }, + { + "epoch": 0.568336150401803, + "grad_norm": 0.12353929877281189, + "learning_rate": 0.0004002436724472284, + "loss": 2.5943, + "step": 19166 + }, + { + "epoch": 0.5683658037541144, + "grad_norm": 0.11165083944797516, + "learning_rate": 0.0004001975678759785, + "loss": 2.6485, + "step": 19167 + }, + { + "epoch": 0.5683954571064259, + "grad_norm": 0.11712945252656937, + "learning_rate": 0.00040015146418849625, + "loss": 2.6557, + "step": 19168 + }, + { + "epoch": 0.5684251104587373, + "grad_norm": 0.12172872573137283, + "learning_rate": 0.0004001053613851899, + "loss": 2.6165, + "step": 19169 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 0.1187310442328453, + "learning_rate": 0.0004000592594664677, + "loss": 2.6061, + "step": 19170 + }, + { + "epoch": 0.5684844171633603, + "grad_norm": 0.09831859916448593, + "learning_rate": 0.000400013158432738, + "loss": 2.6272, + "step": 19171 + }, + { + "epoch": 0.5685140705156718, + "grad_norm": 0.1284891963005066, + "learning_rate": 0.00039996705828440875, + "loss": 2.6151, + "step": 19172 + }, + { + "epoch": 0.5685437238679832, + "grad_norm": 0.13922931253910065, + "learning_rate": 0.0003999209590218885, + "loss": 2.6093, + "step": 19173 + }, + { + "epoch": 0.5685733772202948, + "grad_norm": 0.09818170219659805, + "learning_rate": 0.0003998748606455854, + "loss": 2.64, + "step": 19174 + }, + { + "epoch": 0.5686030305726062, + "grad_norm": 0.12428314238786697, + "learning_rate": 0.00039982876315590747, + "loss": 2.6372, + "step": 19175 + }, + { + "epoch": 0.5686326839249177, + "grad_norm": 0.14198049902915955, + "learning_rate": 0.00039978266655326316, + "loss": 2.6308, + "step": 19176 + }, + { + "epoch": 0.5686623372772291, + "grad_norm": 0.1147952452301979, + "learning_rate": 0.0003997365708380606, + "loss": 2.6333, + "step": 19177 + }, + { + "epoch": 0.5686919906295407, + "grad_norm": 0.11541614681482315, + "learning_rate": 0.0003996904760107078, + "loss": 2.5864, + "step": 19178 + }, + { + "epoch": 0.5687216439818521, + "grad_norm": 0.13755840063095093, + "learning_rate": 0.00039964438207161306, + "loss": 2.6148, + "step": 19179 + }, + { + "epoch": 0.5687512973341636, + "grad_norm": 0.10399861633777618, + "learning_rate": 0.00039959828902118457, + "loss": 2.6559, + "step": 19180 + }, + { + "epoch": 0.568780950686475, + "grad_norm": 0.12613923847675323, + "learning_rate": 0.0003995521968598304, + "loss": 2.6457, + "step": 19181 + }, + { + "epoch": 0.5688106040387866, + "grad_norm": 0.12925098836421967, + "learning_rate": 0.0003995061055879588, + "loss": 2.6232, + "step": 19182 + }, + { + "epoch": 0.5688402573910981, + "grad_norm": 0.12410494685173035, + "learning_rate": 0.0003994600152059779, + "loss": 2.6518, + "step": 19183 + }, + { + "epoch": 0.5688699107434095, + "grad_norm": 0.12583966553211212, + "learning_rate": 0.0003994139257142959, + "loss": 2.6064, + "step": 19184 + }, + { + "epoch": 0.5688995640957211, + "grad_norm": 0.10219016671180725, + "learning_rate": 0.0003993678371133206, + "loss": 2.6476, + "step": 19185 + }, + { + "epoch": 0.5689292174480325, + "grad_norm": 0.12925994396209717, + "learning_rate": 0.0003993217494034605, + "loss": 2.6277, + "step": 19186 + }, + { + "epoch": 0.568958870800344, + "grad_norm": 0.11083568632602692, + "learning_rate": 0.00039927566258512375, + "loss": 2.6347, + "step": 19187 + }, + { + "epoch": 0.5689885241526554, + "grad_norm": 0.13053353130817413, + "learning_rate": 0.0003992295766587182, + "loss": 2.6034, + "step": 19188 + }, + { + "epoch": 0.569018177504967, + "grad_norm": 0.10555212944746017, + "learning_rate": 0.000399183491624652, + "loss": 2.6311, + "step": 19189 + }, + { + "epoch": 0.5690478308572784, + "grad_norm": 0.10956935584545135, + "learning_rate": 0.0003991374074833333, + "loss": 2.6152, + "step": 19190 + }, + { + "epoch": 0.5690774842095899, + "grad_norm": 0.12033160775899887, + "learning_rate": 0.0003990913242351702, + "loss": 2.6267, + "step": 19191 + }, + { + "epoch": 0.5691071375619013, + "grad_norm": 0.09954527020454407, + "learning_rate": 0.00039904524188057075, + "loss": 2.6054, + "step": 19192 + }, + { + "epoch": 0.5691367909142129, + "grad_norm": 0.1172880008816719, + "learning_rate": 0.000398999160419943, + "loss": 2.6495, + "step": 19193 + }, + { + "epoch": 0.5691664442665243, + "grad_norm": 0.11700901389122009, + "learning_rate": 0.0003989530798536951, + "loss": 2.6732, + "step": 19194 + }, + { + "epoch": 0.5691960976188358, + "grad_norm": 0.11111985146999359, + "learning_rate": 0.0003989070001822349, + "loss": 2.5889, + "step": 19195 + }, + { + "epoch": 0.5692257509711472, + "grad_norm": 0.10302642732858658, + "learning_rate": 0.00039886092140597063, + "loss": 2.5923, + "step": 19196 + }, + { + "epoch": 0.5692554043234588, + "grad_norm": 0.10825001448392868, + "learning_rate": 0.0003988148435253102, + "loss": 2.6302, + "step": 19197 + }, + { + "epoch": 0.5692850576757702, + "grad_norm": 0.12243258208036423, + "learning_rate": 0.00039876876654066165, + "loss": 2.6275, + "step": 19198 + }, + { + "epoch": 0.5693147110280817, + "grad_norm": 0.11370716989040375, + "learning_rate": 0.0003987226904524331, + "loss": 2.6382, + "step": 19199 + }, + { + "epoch": 0.5693443643803932, + "grad_norm": 0.11413747817277908, + "learning_rate": 0.00039867661526103254, + "loss": 2.6136, + "step": 19200 + }, + { + "epoch": 0.5693740177327047, + "grad_norm": 0.11903159320354462, + "learning_rate": 0.0003986305409668679, + "loss": 2.6452, + "step": 19201 + }, + { + "epoch": 0.5694036710850162, + "grad_norm": 0.12721984088420868, + "learning_rate": 0.0003985844675703473, + "loss": 2.6316, + "step": 19202 + }, + { + "epoch": 0.5694333244373276, + "grad_norm": 0.1364881545305252, + "learning_rate": 0.0003985383950718786, + "loss": 2.6265, + "step": 19203 + }, + { + "epoch": 0.5694629777896392, + "grad_norm": 0.11626432090997696, + "learning_rate": 0.0003984923234718699, + "loss": 2.6235, + "step": 19204 + }, + { + "epoch": 0.5694926311419506, + "grad_norm": 0.09842794388532639, + "learning_rate": 0.0003984462527707291, + "loss": 2.6135, + "step": 19205 + }, + { + "epoch": 0.5695222844942621, + "grad_norm": 0.10190922021865845, + "learning_rate": 0.00039840018296886405, + "loss": 2.6216, + "step": 19206 + }, + { + "epoch": 0.5695519378465735, + "grad_norm": 0.10374509543180466, + "learning_rate": 0.00039835411406668287, + "loss": 2.5919, + "step": 19207 + }, + { + "epoch": 0.5695815911988851, + "grad_norm": 0.09485223889350891, + "learning_rate": 0.0003983080460645935, + "loss": 2.6112, + "step": 19208 + }, + { + "epoch": 0.5696112445511965, + "grad_norm": 0.09348495304584503, + "learning_rate": 0.0003982619789630038, + "loss": 2.5633, + "step": 19209 + }, + { + "epoch": 0.569640897903508, + "grad_norm": 0.09741092473268509, + "learning_rate": 0.00039821591276232167, + "loss": 2.64, + "step": 19210 + }, + { + "epoch": 0.5696705512558194, + "grad_norm": 0.09084895998239517, + "learning_rate": 0.00039816984746295504, + "loss": 2.6014, + "step": 19211 + }, + { + "epoch": 0.569700204608131, + "grad_norm": 0.08752364665269852, + "learning_rate": 0.00039812378306531204, + "loss": 2.6097, + "step": 19212 + }, + { + "epoch": 0.5697298579604424, + "grad_norm": 0.09642254561185837, + "learning_rate": 0.0003980777195698004, + "loss": 2.6333, + "step": 19213 + }, + { + "epoch": 0.5697595113127539, + "grad_norm": 0.10750467330217361, + "learning_rate": 0.00039803165697682805, + "loss": 2.6259, + "step": 19214 + }, + { + "epoch": 0.5697891646650654, + "grad_norm": 0.12467296421527863, + "learning_rate": 0.000397985595286803, + "loss": 2.6348, + "step": 19215 + }, + { + "epoch": 0.5698188180173769, + "grad_norm": 0.1189974993467331, + "learning_rate": 0.0003979395345001329, + "loss": 2.6182, + "step": 19216 + }, + { + "epoch": 0.5698484713696883, + "grad_norm": 0.10033135861158371, + "learning_rate": 0.0003978934746172257, + "loss": 2.6265, + "step": 19217 + }, + { + "epoch": 0.5698781247219998, + "grad_norm": 0.10935575515031815, + "learning_rate": 0.00039784741563848934, + "loss": 2.6195, + "step": 19218 + }, + { + "epoch": 0.5699077780743113, + "grad_norm": 0.11102046072483063, + "learning_rate": 0.00039780135756433164, + "loss": 2.6254, + "step": 19219 + }, + { + "epoch": 0.5699374314266228, + "grad_norm": 0.11019832640886307, + "learning_rate": 0.0003977553003951605, + "loss": 2.6793, + "step": 19220 + }, + { + "epoch": 0.5699670847789342, + "grad_norm": 0.11005600541830063, + "learning_rate": 0.00039770924413138367, + "loss": 2.6342, + "step": 19221 + }, + { + "epoch": 0.5699967381312457, + "grad_norm": 0.11168976128101349, + "learning_rate": 0.00039766318877340915, + "loss": 2.6398, + "step": 19222 + }, + { + "epoch": 0.5700263914835573, + "grad_norm": 0.10758472979068756, + "learning_rate": 0.0003976171343216446, + "loss": 2.6422, + "step": 19223 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 0.1102948784828186, + "learning_rate": 0.0003975710807764977, + "loss": 2.6062, + "step": 19224 + }, + { + "epoch": 0.5700856981881802, + "grad_norm": 0.09799423068761826, + "learning_rate": 0.0003975250281383767, + "loss": 2.6236, + "step": 19225 + }, + { + "epoch": 0.5701153515404916, + "grad_norm": 0.10586746782064438, + "learning_rate": 0.0003974789764076891, + "loss": 2.6902, + "step": 19226 + }, + { + "epoch": 0.5701450048928032, + "grad_norm": 0.10159764438867569, + "learning_rate": 0.0003974329255848428, + "loss": 2.6183, + "step": 19227 + }, + { + "epoch": 0.5701746582451146, + "grad_norm": 0.1124667376279831, + "learning_rate": 0.0003973868756702455, + "loss": 2.642, + "step": 19228 + }, + { + "epoch": 0.5702043115974261, + "grad_norm": 0.1193556860089302, + "learning_rate": 0.00039734082666430503, + "loss": 2.6139, + "step": 19229 + }, + { + "epoch": 0.5702339649497375, + "grad_norm": 0.10668406635522842, + "learning_rate": 0.0003972947785674292, + "loss": 2.6618, + "step": 19230 + }, + { + "epoch": 0.5702636183020491, + "grad_norm": 0.101804718375206, + "learning_rate": 0.0003972487313800257, + "loss": 2.6378, + "step": 19231 + }, + { + "epoch": 0.5702932716543605, + "grad_norm": 0.11247619241476059, + "learning_rate": 0.00039720268510250244, + "loss": 2.6057, + "step": 19232 + }, + { + "epoch": 0.570322925006672, + "grad_norm": 0.10282745212316513, + "learning_rate": 0.00039715663973526695, + "loss": 2.5918, + "step": 19233 + }, + { + "epoch": 0.5703525783589835, + "grad_norm": 0.11023230850696564, + "learning_rate": 0.0003971105952787271, + "loss": 2.6085, + "step": 19234 + }, + { + "epoch": 0.570382231711295, + "grad_norm": 0.11641757935285568, + "learning_rate": 0.0003970645517332905, + "loss": 2.629, + "step": 19235 + }, + { + "epoch": 0.5704118850636064, + "grad_norm": 0.10790248215198517, + "learning_rate": 0.000397018509099365, + "loss": 2.595, + "step": 19236 + }, + { + "epoch": 0.5704415384159179, + "grad_norm": 0.10212133079767227, + "learning_rate": 0.00039697246737735816, + "loss": 2.6217, + "step": 19237 + }, + { + "epoch": 0.5704711917682294, + "grad_norm": 0.12542441487312317, + "learning_rate": 0.0003969264265676779, + "loss": 2.6054, + "step": 19238 + }, + { + "epoch": 0.5705008451205409, + "grad_norm": 0.13179835677146912, + "learning_rate": 0.0003968803866707319, + "loss": 2.6004, + "step": 19239 + }, + { + "epoch": 0.5705304984728523, + "grad_norm": 0.10979732125997543, + "learning_rate": 0.00039683434768692774, + "loss": 2.6239, + "step": 19240 + }, + { + "epoch": 0.5705601518251638, + "grad_norm": 0.11911260336637497, + "learning_rate": 0.0003967883096166731, + "loss": 2.6432, + "step": 19241 + }, + { + "epoch": 0.5705898051774753, + "grad_norm": 0.11189420521259308, + "learning_rate": 0.0003967422724603759, + "loss": 2.596, + "step": 19242 + }, + { + "epoch": 0.5706194585297868, + "grad_norm": 0.11779848486185074, + "learning_rate": 0.00039669623621844334, + "loss": 2.6349, + "step": 19243 + }, + { + "epoch": 0.5706491118820983, + "grad_norm": 0.11341637372970581, + "learning_rate": 0.00039665020089128345, + "loss": 2.6127, + "step": 19244 + }, + { + "epoch": 0.5706787652344097, + "grad_norm": 0.11071398109197617, + "learning_rate": 0.00039660416647930376, + "loss": 2.6832, + "step": 19245 + }, + { + "epoch": 0.5707084185867213, + "grad_norm": 0.13570980727672577, + "learning_rate": 0.00039655813298291193, + "loss": 2.6272, + "step": 19246 + }, + { + "epoch": 0.5707380719390327, + "grad_norm": 0.14274968206882477, + "learning_rate": 0.0003965121004025156, + "loss": 2.6438, + "step": 19247 + }, + { + "epoch": 0.5707677252913442, + "grad_norm": 0.1560092270374298, + "learning_rate": 0.0003964660687385223, + "loss": 2.6179, + "step": 19248 + }, + { + "epoch": 0.5707973786436557, + "grad_norm": 0.1122499480843544, + "learning_rate": 0.0003964200379913397, + "loss": 2.646, + "step": 19249 + }, + { + "epoch": 0.5708270319959672, + "grad_norm": 0.11699654161930084, + "learning_rate": 0.00039637400816137556, + "loss": 2.6504, + "step": 19250 + }, + { + "epoch": 0.5708566853482786, + "grad_norm": 0.15135599672794342, + "learning_rate": 0.0003963279792490373, + "loss": 2.6153, + "step": 19251 + }, + { + "epoch": 0.5708863387005901, + "grad_norm": 0.12751233577728271, + "learning_rate": 0.0003962819512547326, + "loss": 2.6378, + "step": 19252 + }, + { + "epoch": 0.5709159920529016, + "grad_norm": 0.11840996146202087, + "learning_rate": 0.00039623592417886916, + "loss": 2.6524, + "step": 19253 + }, + { + "epoch": 0.5709456454052131, + "grad_norm": 0.14447425305843353, + "learning_rate": 0.0003961898980218543, + "loss": 2.6229, + "step": 19254 + }, + { + "epoch": 0.5709752987575245, + "grad_norm": 0.1282745748758316, + "learning_rate": 0.0003961438727840957, + "loss": 2.6337, + "step": 19255 + }, + { + "epoch": 0.571004952109836, + "grad_norm": 0.11737528443336487, + "learning_rate": 0.00039609784846600095, + "loss": 2.665, + "step": 19256 + }, + { + "epoch": 0.5710346054621475, + "grad_norm": 0.12044351547956467, + "learning_rate": 0.0003960518250679776, + "loss": 2.6394, + "step": 19257 + }, + { + "epoch": 0.571064258814459, + "grad_norm": 0.10398533195257187, + "learning_rate": 0.0003960058025904332, + "loss": 2.629, + "step": 19258 + }, + { + "epoch": 0.5710939121667704, + "grad_norm": 0.12428804486989975, + "learning_rate": 0.0003959597810337752, + "loss": 2.6003, + "step": 19259 + }, + { + "epoch": 0.5711235655190819, + "grad_norm": 0.11433997005224228, + "learning_rate": 0.0003959137603984113, + "loss": 2.6357, + "step": 19260 + }, + { + "epoch": 0.5711532188713934, + "grad_norm": 0.1189231127500534, + "learning_rate": 0.0003958677406847489, + "loss": 2.6271, + "step": 19261 + }, + { + "epoch": 0.5711828722237049, + "grad_norm": 0.12567955255508423, + "learning_rate": 0.00039582172189319526, + "loss": 2.5861, + "step": 19262 + }, + { + "epoch": 0.5712125255760163, + "grad_norm": 0.11737136542797089, + "learning_rate": 0.0003957757040241585, + "loss": 2.6443, + "step": 19263 + }, + { + "epoch": 0.5712421789283278, + "grad_norm": 0.10242962837219238, + "learning_rate": 0.00039572968707804565, + "loss": 2.6518, + "step": 19264 + }, + { + "epoch": 0.5712718322806394, + "grad_norm": 0.12190496921539307, + "learning_rate": 0.0003956836710552643, + "loss": 2.6235, + "step": 19265 + }, + { + "epoch": 0.5713014856329508, + "grad_norm": 0.1345912367105484, + "learning_rate": 0.0003956376559562219, + "loss": 2.5804, + "step": 19266 + }, + { + "epoch": 0.5713311389852623, + "grad_norm": 0.12070897221565247, + "learning_rate": 0.00039559164178132604, + "loss": 2.6246, + "step": 19267 + }, + { + "epoch": 0.5713607923375738, + "grad_norm": 0.12542201578617096, + "learning_rate": 0.0003955456285309841, + "loss": 2.6567, + "step": 19268 + }, + { + "epoch": 0.5713904456898853, + "grad_norm": 0.11720346659421921, + "learning_rate": 0.0003954996162056036, + "loss": 2.6313, + "step": 19269 + }, + { + "epoch": 0.5714200990421967, + "grad_norm": 0.12390769273042679, + "learning_rate": 0.000395453604805592, + "loss": 2.6507, + "step": 19270 + }, + { + "epoch": 0.5714497523945082, + "grad_norm": 0.10651259124279022, + "learning_rate": 0.00039540759433135655, + "loss": 2.6181, + "step": 19271 + }, + { + "epoch": 0.5714794057468197, + "grad_norm": 0.12685415148735046, + "learning_rate": 0.0003953615847833048, + "loss": 2.6421, + "step": 19272 + }, + { + "epoch": 0.5715090590991312, + "grad_norm": 0.12855415046215057, + "learning_rate": 0.00039531557616184423, + "loss": 2.6452, + "step": 19273 + }, + { + "epoch": 0.5715387124514426, + "grad_norm": 0.11122529208660126, + "learning_rate": 0.0003952695684673822, + "loss": 2.6437, + "step": 19274 + }, + { + "epoch": 0.5715683658037541, + "grad_norm": 0.10016093403100967, + "learning_rate": 0.0003952235617003259, + "loss": 2.639, + "step": 19275 + }, + { + "epoch": 0.5715980191560656, + "grad_norm": 0.11339963972568512, + "learning_rate": 0.00039517755586108316, + "loss": 2.6025, + "step": 19276 + }, + { + "epoch": 0.5716276725083771, + "grad_norm": 0.11117758601903915, + "learning_rate": 0.00039513155095006114, + "loss": 2.6382, + "step": 19277 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 0.10457390546798706, + "learning_rate": 0.00039508554696766717, + "loss": 2.6036, + "step": 19278 + }, + { + "epoch": 0.571686979213, + "grad_norm": 0.10291232168674469, + "learning_rate": 0.0003950395439143088, + "loss": 2.6061, + "step": 19279 + }, + { + "epoch": 0.5717166325653115, + "grad_norm": 0.12378178536891937, + "learning_rate": 0.0003949935417903932, + "loss": 2.5763, + "step": 19280 + }, + { + "epoch": 0.571746285917623, + "grad_norm": 0.10746121406555176, + "learning_rate": 0.000394947540596328, + "loss": 2.6163, + "step": 19281 + }, + { + "epoch": 0.5717759392699344, + "grad_norm": 0.1137571632862091, + "learning_rate": 0.0003949015403325202, + "loss": 2.6311, + "step": 19282 + }, + { + "epoch": 0.571805592622246, + "grad_norm": 0.12073831260204315, + "learning_rate": 0.00039485554099937724, + "loss": 2.6004, + "step": 19283 + }, + { + "epoch": 0.5718352459745574, + "grad_norm": 0.10528159886598587, + "learning_rate": 0.00039480954259730664, + "loss": 2.6213, + "step": 19284 + }, + { + "epoch": 0.5718648993268689, + "grad_norm": 0.12304676324129105, + "learning_rate": 0.0003947635451267155, + "loss": 2.6211, + "step": 19285 + }, + { + "epoch": 0.5718945526791804, + "grad_norm": 0.11085020005702972, + "learning_rate": 0.0003947175485880112, + "loss": 2.6234, + "step": 19286 + }, + { + "epoch": 0.5719242060314919, + "grad_norm": 0.1184779480099678, + "learning_rate": 0.0003946715529816013, + "loss": 2.6359, + "step": 19287 + }, + { + "epoch": 0.5719538593838034, + "grad_norm": 0.12403887510299683, + "learning_rate": 0.0003946255583078925, + "loss": 2.634, + "step": 19288 + }, + { + "epoch": 0.5719835127361148, + "grad_norm": 0.11218523979187012, + "learning_rate": 0.00039457956456729265, + "loss": 2.6208, + "step": 19289 + }, + { + "epoch": 0.5720131660884263, + "grad_norm": 0.10563432425260544, + "learning_rate": 0.0003945335717602089, + "loss": 2.6178, + "step": 19290 + }, + { + "epoch": 0.5720428194407378, + "grad_norm": 0.11900386959314346, + "learning_rate": 0.0003944875798870486, + "loss": 2.6365, + "step": 19291 + }, + { + "epoch": 0.5720724727930493, + "grad_norm": 0.11121536791324615, + "learning_rate": 0.00039444158894821874, + "loss": 2.6245, + "step": 19292 + }, + { + "epoch": 0.5721021261453607, + "grad_norm": 0.11304187029600143, + "learning_rate": 0.00039439559894412674, + "loss": 2.6637, + "step": 19293 + }, + { + "epoch": 0.5721317794976722, + "grad_norm": 0.12282392382621765, + "learning_rate": 0.0003943496098751799, + "loss": 2.6195, + "step": 19294 + }, + { + "epoch": 0.5721614328499837, + "grad_norm": 0.10501158237457275, + "learning_rate": 0.00039430362174178526, + "loss": 2.6198, + "step": 19295 + }, + { + "epoch": 0.5721910862022952, + "grad_norm": 0.12618178129196167, + "learning_rate": 0.00039425763454435024, + "loss": 2.6783, + "step": 19296 + }, + { + "epoch": 0.5722207395546066, + "grad_norm": 0.11391826719045639, + "learning_rate": 0.0003942116482832821, + "loss": 2.6091, + "step": 19297 + }, + { + "epoch": 0.5722503929069181, + "grad_norm": 0.11163931339979172, + "learning_rate": 0.00039416566295898803, + "loss": 2.6178, + "step": 19298 + }, + { + "epoch": 0.5722800462592296, + "grad_norm": 0.13327506184577942, + "learning_rate": 0.0003941196785718751, + "loss": 2.5917, + "step": 19299 + }, + { + "epoch": 0.5723096996115411, + "grad_norm": 0.12550388276576996, + "learning_rate": 0.0003940736951223505, + "loss": 2.6239, + "step": 19300 + }, + { + "epoch": 0.5723393529638525, + "grad_norm": 0.10278768092393875, + "learning_rate": 0.0003940277126108215, + "loss": 2.6094, + "step": 19301 + }, + { + "epoch": 0.572369006316164, + "grad_norm": 0.11948718875646591, + "learning_rate": 0.00039398173103769534, + "loss": 2.6368, + "step": 19302 + }, + { + "epoch": 0.5723986596684755, + "grad_norm": 0.12116576731204987, + "learning_rate": 0.0003939357504033792, + "loss": 2.6526, + "step": 19303 + }, + { + "epoch": 0.572428313020787, + "grad_norm": 0.1073525920510292, + "learning_rate": 0.0003938897707082802, + "loss": 2.601, + "step": 19304 + }, + { + "epoch": 0.5724579663730984, + "grad_norm": 0.12122806906700134, + "learning_rate": 0.0003938437919528055, + "loss": 2.6274, + "step": 19305 + }, + { + "epoch": 0.57248761972541, + "grad_norm": 0.11700212210416794, + "learning_rate": 0.00039379781413736216, + "loss": 2.6248, + "step": 19306 + }, + { + "epoch": 0.5725172730777215, + "grad_norm": 0.0981343612074852, + "learning_rate": 0.0003937518372623574, + "loss": 2.6045, + "step": 19307 + }, + { + "epoch": 0.5725469264300329, + "grad_norm": 0.09913898259401321, + "learning_rate": 0.0003937058613281986, + "loss": 2.5696, + "step": 19308 + }, + { + "epoch": 0.5725765797823444, + "grad_norm": 0.11111032217741013, + "learning_rate": 0.0003936598863352924, + "loss": 2.6384, + "step": 19309 + }, + { + "epoch": 0.5726062331346559, + "grad_norm": 0.133505180478096, + "learning_rate": 0.0003936139122840462, + "loss": 2.628, + "step": 19310 + }, + { + "epoch": 0.5726358864869674, + "grad_norm": 0.11488871276378632, + "learning_rate": 0.0003935679391748671, + "loss": 2.5993, + "step": 19311 + }, + { + "epoch": 0.5726655398392788, + "grad_norm": 0.11059815436601639, + "learning_rate": 0.00039352196700816216, + "loss": 2.6165, + "step": 19312 + }, + { + "epoch": 0.5726951931915903, + "grad_norm": 0.11304686963558197, + "learning_rate": 0.00039347599578433846, + "loss": 2.596, + "step": 19313 + }, + { + "epoch": 0.5727248465439018, + "grad_norm": 0.10008765012025833, + "learning_rate": 0.000393430025503803, + "loss": 2.6322, + "step": 19314 + }, + { + "epoch": 0.5727544998962133, + "grad_norm": 0.11586028337478638, + "learning_rate": 0.0003933840561669631, + "loss": 2.6198, + "step": 19315 + }, + { + "epoch": 0.5727841532485247, + "grad_norm": 0.10874731838703156, + "learning_rate": 0.0003933380877742256, + "loss": 2.5988, + "step": 19316 + }, + { + "epoch": 0.5728138066008363, + "grad_norm": 0.09621918946504593, + "learning_rate": 0.00039329212032599776, + "loss": 2.6207, + "step": 19317 + }, + { + "epoch": 0.5728434599531477, + "grad_norm": 0.11998101323843002, + "learning_rate": 0.0003932461538226864, + "loss": 2.5948, + "step": 19318 + }, + { + "epoch": 0.5728731133054592, + "grad_norm": 0.12588222324848175, + "learning_rate": 0.0003932001882646988, + "loss": 2.6576, + "step": 19319 + }, + { + "epoch": 0.5729027666577706, + "grad_norm": 0.12550154328346252, + "learning_rate": 0.00039315422365244183, + "loss": 2.6716, + "step": 19320 + }, + { + "epoch": 0.5729324200100822, + "grad_norm": 0.13849984109401703, + "learning_rate": 0.0003931082599863225, + "loss": 2.6088, + "step": 19321 + }, + { + "epoch": 0.5729620733623936, + "grad_norm": 0.1268347203731537, + "learning_rate": 0.0003930622972667479, + "loss": 2.6468, + "step": 19322 + }, + { + "epoch": 0.5729917267147051, + "grad_norm": 0.10134457796812057, + "learning_rate": 0.000393016335494125, + "loss": 2.6158, + "step": 19323 + }, + { + "epoch": 0.5730213800670165, + "grad_norm": 0.12177126109600067, + "learning_rate": 0.00039297037466886074, + "loss": 2.614, + "step": 19324 + }, + { + "epoch": 0.5730510334193281, + "grad_norm": 0.10189349949359894, + "learning_rate": 0.0003929244147913624, + "loss": 2.6216, + "step": 19325 + }, + { + "epoch": 0.5730806867716395, + "grad_norm": 0.10302730649709702, + "learning_rate": 0.0003928784558620366, + "loss": 2.6045, + "step": 19326 + }, + { + "epoch": 0.573110340123951, + "grad_norm": 0.10413552075624466, + "learning_rate": 0.0003928324978812902, + "loss": 2.5983, + "step": 19327 + }, + { + "epoch": 0.5731399934762625, + "grad_norm": 0.10190519690513611, + "learning_rate": 0.0003927865408495307, + "loss": 2.6057, + "step": 19328 + }, + { + "epoch": 0.573169646828574, + "grad_norm": 0.12048228830099106, + "learning_rate": 0.0003927405847671649, + "loss": 2.6056, + "step": 19329 + }, + { + "epoch": 0.5731993001808855, + "grad_norm": 0.12862440943717957, + "learning_rate": 0.00039269462963459947, + "loss": 2.5874, + "step": 19330 + }, + { + "epoch": 0.5732289535331969, + "grad_norm": 0.11778555065393448, + "learning_rate": 0.00039264867545224155, + "loss": 2.5825, + "step": 19331 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 0.11428018659353256, + "learning_rate": 0.00039260272222049796, + "loss": 2.6352, + "step": 19332 + }, + { + "epoch": 0.5732882602378199, + "grad_norm": 0.11445630341768265, + "learning_rate": 0.0003925567699397758, + "loss": 2.6304, + "step": 19333 + }, + { + "epoch": 0.5733179135901314, + "grad_norm": 0.13516393303871155, + "learning_rate": 0.00039251081861048186, + "loss": 2.637, + "step": 19334 + }, + { + "epoch": 0.5733475669424428, + "grad_norm": 0.13371092081069946, + "learning_rate": 0.00039246486823302305, + "loss": 2.6415, + "step": 19335 + }, + { + "epoch": 0.5733772202947544, + "grad_norm": 0.09919149428606033, + "learning_rate": 0.00039241891880780643, + "loss": 2.5545, + "step": 19336 + }, + { + "epoch": 0.5734068736470658, + "grad_norm": 0.10670538991689682, + "learning_rate": 0.00039237297033523864, + "loss": 2.6144, + "step": 19337 + }, + { + "epoch": 0.5734365269993773, + "grad_norm": 0.09981394559144974, + "learning_rate": 0.0003923270228157266, + "loss": 2.6137, + "step": 19338 + }, + { + "epoch": 0.5734661803516887, + "grad_norm": 0.10477206110954285, + "learning_rate": 0.00039228107624967733, + "loss": 2.5915, + "step": 19339 + }, + { + "epoch": 0.5734958337040003, + "grad_norm": 0.1105671301484108, + "learning_rate": 0.0003922351306374975, + "loss": 2.6042, + "step": 19340 + }, + { + "epoch": 0.5735254870563117, + "grad_norm": 0.11490492522716522, + "learning_rate": 0.0003921891859795942, + "loss": 2.6297, + "step": 19341 + }, + { + "epoch": 0.5735551404086232, + "grad_norm": 0.10904115438461304, + "learning_rate": 0.0003921432422763741, + "loss": 2.6571, + "step": 19342 + }, + { + "epoch": 0.5735847937609346, + "grad_norm": 0.10877513885498047, + "learning_rate": 0.0003920972995282441, + "loss": 2.6363, + "step": 19343 + }, + { + "epoch": 0.5736144471132462, + "grad_norm": 0.09680324792861938, + "learning_rate": 0.00039205135773561113, + "loss": 2.5998, + "step": 19344 + }, + { + "epoch": 0.5736441004655576, + "grad_norm": 0.09741900116205215, + "learning_rate": 0.00039200541689888186, + "loss": 2.6541, + "step": 19345 + }, + { + "epoch": 0.5736737538178691, + "grad_norm": 0.11248882859945297, + "learning_rate": 0.0003919594770184633, + "loss": 2.6508, + "step": 19346 + }, + { + "epoch": 0.5737034071701805, + "grad_norm": 0.11491207033395767, + "learning_rate": 0.00039191353809476197, + "loss": 2.6053, + "step": 19347 + }, + { + "epoch": 0.5737330605224921, + "grad_norm": 0.11196376383304596, + "learning_rate": 0.00039186760012818484, + "loss": 2.626, + "step": 19348 + }, + { + "epoch": 0.5737627138748036, + "grad_norm": 0.10232427716255188, + "learning_rate": 0.00039182166311913863, + "loss": 2.6242, + "step": 19349 + }, + { + "epoch": 0.573792367227115, + "grad_norm": 0.12449511140584946, + "learning_rate": 0.0003917757270680302, + "loss": 2.6144, + "step": 19350 + }, + { + "epoch": 0.5738220205794266, + "grad_norm": 0.12760663032531738, + "learning_rate": 0.00039172979197526624, + "loss": 2.6301, + "step": 19351 + }, + { + "epoch": 0.573851673931738, + "grad_norm": 0.09755353629589081, + "learning_rate": 0.00039168385784125364, + "loss": 2.6189, + "step": 19352 + }, + { + "epoch": 0.5738813272840495, + "grad_norm": 0.1070813313126564, + "learning_rate": 0.0003916379246663989, + "loss": 2.6152, + "step": 19353 + }, + { + "epoch": 0.5739109806363609, + "grad_norm": 0.11792381852865219, + "learning_rate": 0.00039159199245110903, + "loss": 2.6354, + "step": 19354 + }, + { + "epoch": 0.5739406339886725, + "grad_norm": 0.12220537662506104, + "learning_rate": 0.0003915460611957907, + "loss": 2.601, + "step": 19355 + }, + { + "epoch": 0.5739702873409839, + "grad_norm": 0.10108285397291183, + "learning_rate": 0.0003915001309008507, + "loss": 2.6151, + "step": 19356 + }, + { + "epoch": 0.5739999406932954, + "grad_norm": 0.11420036852359772, + "learning_rate": 0.00039145420156669564, + "loss": 2.6271, + "step": 19357 + }, + { + "epoch": 0.5740295940456068, + "grad_norm": 0.11849723011255264, + "learning_rate": 0.00039140827319373217, + "loss": 2.6132, + "step": 19358 + }, + { + "epoch": 0.5740592473979184, + "grad_norm": 0.12108342349529266, + "learning_rate": 0.0003913623457823672, + "loss": 2.6017, + "step": 19359 + }, + { + "epoch": 0.5740889007502298, + "grad_norm": 0.10296264290809631, + "learning_rate": 0.0003913164193330072, + "loss": 2.6274, + "step": 19360 + }, + { + "epoch": 0.5741185541025413, + "grad_norm": 0.12410878390073776, + "learning_rate": 0.000391270493846059, + "loss": 2.5841, + "step": 19361 + }, + { + "epoch": 0.5741482074548527, + "grad_norm": 0.12105830013751984, + "learning_rate": 0.0003912245693219292, + "loss": 2.6553, + "step": 19362 + }, + { + "epoch": 0.5741778608071643, + "grad_norm": 0.11572560667991638, + "learning_rate": 0.00039117864576102457, + "loss": 2.6296, + "step": 19363 + }, + { + "epoch": 0.5742075141594757, + "grad_norm": 0.10708245635032654, + "learning_rate": 0.0003911327231637517, + "loss": 2.646, + "step": 19364 + }, + { + "epoch": 0.5742371675117872, + "grad_norm": 0.11186578124761581, + "learning_rate": 0.0003910868015305172, + "loss": 2.6569, + "step": 19365 + }, + { + "epoch": 0.5742668208640986, + "grad_norm": 0.1092817559838295, + "learning_rate": 0.0003910408808617276, + "loss": 2.6407, + "step": 19366 + }, + { + "epoch": 0.5742964742164102, + "grad_norm": 0.10848737508058548, + "learning_rate": 0.00039099496115779, + "loss": 2.5866, + "step": 19367 + }, + { + "epoch": 0.5743261275687216, + "grad_norm": 0.1205306425690651, + "learning_rate": 0.0003909490424191106, + "loss": 2.6263, + "step": 19368 + }, + { + "epoch": 0.5743557809210331, + "grad_norm": 0.10108073055744171, + "learning_rate": 0.0003909031246460962, + "loss": 2.6351, + "step": 19369 + }, + { + "epoch": 0.5743854342733447, + "grad_norm": 0.11530553549528122, + "learning_rate": 0.0003908572078391533, + "loss": 2.614, + "step": 19370 + }, + { + "epoch": 0.5744150876256561, + "grad_norm": 0.10675088316202164, + "learning_rate": 0.0003908112919986886, + "loss": 2.615, + "step": 19371 + }, + { + "epoch": 0.5744447409779676, + "grad_norm": 0.11500523239374161, + "learning_rate": 0.0003907653771251086, + "loss": 2.6472, + "step": 19372 + }, + { + "epoch": 0.574474394330279, + "grad_norm": 0.12286406755447388, + "learning_rate": 0.00039071946321881995, + "loss": 2.629, + "step": 19373 + }, + { + "epoch": 0.5745040476825906, + "grad_norm": 0.13349594175815582, + "learning_rate": 0.0003906735502802294, + "loss": 2.6284, + "step": 19374 + }, + { + "epoch": 0.574533701034902, + "grad_norm": 0.13226604461669922, + "learning_rate": 0.0003906276383097431, + "loss": 2.6102, + "step": 19375 + }, + { + "epoch": 0.5745633543872135, + "grad_norm": 0.10043612122535706, + "learning_rate": 0.0003905817273077679, + "loss": 2.6206, + "step": 19376 + }, + { + "epoch": 0.5745930077395249, + "grad_norm": 0.11313603073358536, + "learning_rate": 0.0003905358172747103, + "loss": 2.627, + "step": 19377 + }, + { + "epoch": 0.5746226610918365, + "grad_norm": 0.12397442013025284, + "learning_rate": 0.0003904899082109768, + "loss": 2.593, + "step": 19378 + }, + { + "epoch": 0.5746523144441479, + "grad_norm": 0.11034538596868515, + "learning_rate": 0.0003904440001169739, + "loss": 2.5943, + "step": 19379 + }, + { + "epoch": 0.5746819677964594, + "grad_norm": 0.10121675580739975, + "learning_rate": 0.00039039809299310825, + "loss": 2.6238, + "step": 19380 + }, + { + "epoch": 0.5747116211487708, + "grad_norm": 0.10229269415140152, + "learning_rate": 0.00039035218683978627, + "loss": 2.6193, + "step": 19381 + }, + { + "epoch": 0.5747412745010824, + "grad_norm": 0.10296595841646194, + "learning_rate": 0.00039030628165741455, + "loss": 2.6088, + "step": 19382 + }, + { + "epoch": 0.5747709278533938, + "grad_norm": 0.11237309128046036, + "learning_rate": 0.0003902603774463995, + "loss": 2.6567, + "step": 19383 + }, + { + "epoch": 0.5748005812057053, + "grad_norm": 0.11763237416744232, + "learning_rate": 0.0003902144742071478, + "loss": 2.6181, + "step": 19384 + }, + { + "epoch": 0.5748302345580167, + "grad_norm": 0.10876400023698807, + "learning_rate": 0.0003901685719400656, + "loss": 2.6221, + "step": 19385 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 0.10508742183446884, + "learning_rate": 0.00039012267064555956, + "loss": 2.6319, + "step": 19386 + }, + { + "epoch": 0.5748895412626397, + "grad_norm": 0.115725576877594, + "learning_rate": 0.00039007677032403616, + "loss": 2.5928, + "step": 19387 + }, + { + "epoch": 0.5749191946149512, + "grad_norm": 0.10354042053222656, + "learning_rate": 0.00039003087097590184, + "loss": 2.6135, + "step": 19388 + }, + { + "epoch": 0.5749488479672626, + "grad_norm": 0.10305332392454147, + "learning_rate": 0.000389984972601563, + "loss": 2.614, + "step": 19389 + }, + { + "epoch": 0.5749785013195742, + "grad_norm": 0.11960306018590927, + "learning_rate": 0.00038993907520142615, + "loss": 2.6229, + "step": 19390 + }, + { + "epoch": 0.5750081546718857, + "grad_norm": 0.1323547065258026, + "learning_rate": 0.0003898931787758978, + "loss": 2.6299, + "step": 19391 + }, + { + "epoch": 0.5750378080241971, + "grad_norm": 0.1397051364183426, + "learning_rate": 0.00038984728332538385, + "loss": 2.6134, + "step": 19392 + }, + { + "epoch": 0.5750674613765087, + "grad_norm": 0.12189527601003647, + "learning_rate": 0.0003898013888502914, + "loss": 2.6108, + "step": 19393 + }, + { + "epoch": 0.5750971147288201, + "grad_norm": 0.09712681174278259, + "learning_rate": 0.0003897554953510265, + "loss": 2.6157, + "step": 19394 + }, + { + "epoch": 0.5751267680811316, + "grad_norm": 0.10573618859052658, + "learning_rate": 0.0003897096028279957, + "loss": 2.6278, + "step": 19395 + }, + { + "epoch": 0.575156421433443, + "grad_norm": 0.13026215136051178, + "learning_rate": 0.0003896637112816053, + "loss": 2.5657, + "step": 19396 + }, + { + "epoch": 0.5751860747857546, + "grad_norm": 0.11155857890844345, + "learning_rate": 0.0003896178207122616, + "loss": 2.6397, + "step": 19397 + }, + { + "epoch": 0.575215728138066, + "grad_norm": 0.1291263997554779, + "learning_rate": 0.0003895719311203711, + "loss": 2.6002, + "step": 19398 + }, + { + "epoch": 0.5752453814903775, + "grad_norm": 0.12954063713550568, + "learning_rate": 0.00038952604250634004, + "loss": 2.6176, + "step": 19399 + }, + { + "epoch": 0.5752750348426889, + "grad_norm": 0.11632640659809113, + "learning_rate": 0.00038948015487057485, + "loss": 2.6479, + "step": 19400 + }, + { + "epoch": 0.5753046881950005, + "grad_norm": 0.12315167486667633, + "learning_rate": 0.00038943426821348205, + "loss": 2.6033, + "step": 19401 + }, + { + "epoch": 0.5753343415473119, + "grad_norm": 0.12491634488105774, + "learning_rate": 0.0003893883825354676, + "loss": 2.6011, + "step": 19402 + }, + { + "epoch": 0.5753639948996234, + "grad_norm": 0.13885769248008728, + "learning_rate": 0.000389342497836938, + "loss": 2.6309, + "step": 19403 + }, + { + "epoch": 0.5753936482519348, + "grad_norm": 0.13527318835258484, + "learning_rate": 0.0003892966141182996, + "loss": 2.6282, + "step": 19404 + }, + { + "epoch": 0.5754233016042464, + "grad_norm": 0.12962116301059723, + "learning_rate": 0.0003892507313799586, + "loss": 2.6305, + "step": 19405 + }, + { + "epoch": 0.5754529549565578, + "grad_norm": 0.11959324032068253, + "learning_rate": 0.0003892048496223214, + "loss": 2.5899, + "step": 19406 + }, + { + "epoch": 0.5754826083088693, + "grad_norm": 0.12452425807714462, + "learning_rate": 0.0003891589688457944, + "loss": 2.6345, + "step": 19407 + }, + { + "epoch": 0.5755122616611807, + "grad_norm": 0.12755230069160461, + "learning_rate": 0.0003891130890507837, + "loss": 2.6314, + "step": 19408 + }, + { + "epoch": 0.5755419150134923, + "grad_norm": 0.11184457689523697, + "learning_rate": 0.0003890672102376956, + "loss": 2.6257, + "step": 19409 + }, + { + "epoch": 0.5755715683658038, + "grad_norm": 0.11542608588933945, + "learning_rate": 0.00038902133240693637, + "loss": 2.6116, + "step": 19410 + }, + { + "epoch": 0.5756012217181152, + "grad_norm": 0.10513437539339066, + "learning_rate": 0.0003889754555589123, + "loss": 2.6171, + "step": 19411 + }, + { + "epoch": 0.5756308750704268, + "grad_norm": 0.11358968168497086, + "learning_rate": 0.00038892957969402986, + "loss": 2.5906, + "step": 19412 + }, + { + "epoch": 0.5756605284227382, + "grad_norm": 0.11831512302160263, + "learning_rate": 0.0003888837048126949, + "loss": 2.6084, + "step": 19413 + }, + { + "epoch": 0.5756901817750497, + "grad_norm": 0.11176133155822754, + "learning_rate": 0.0003888378309153137, + "loss": 2.6134, + "step": 19414 + }, + { + "epoch": 0.5757198351273611, + "grad_norm": 0.09369713813066483, + "learning_rate": 0.00038879195800229266, + "loss": 2.6148, + "step": 19415 + }, + { + "epoch": 0.5757494884796727, + "grad_norm": 0.12136875838041306, + "learning_rate": 0.00038874608607403796, + "loss": 2.6284, + "step": 19416 + }, + { + "epoch": 0.5757791418319841, + "grad_norm": 0.10379855334758759, + "learning_rate": 0.0003887002151309557, + "loss": 2.6082, + "step": 19417 + }, + { + "epoch": 0.5758087951842956, + "grad_norm": 0.10349574685096741, + "learning_rate": 0.00038865434517345205, + "loss": 2.6604, + "step": 19418 + }, + { + "epoch": 0.575838448536607, + "grad_norm": 0.10810961574316025, + "learning_rate": 0.0003886084762019334, + "loss": 2.6325, + "step": 19419 + }, + { + "epoch": 0.5758681018889186, + "grad_norm": 0.10857115685939789, + "learning_rate": 0.00038856260821680576, + "loss": 2.6112, + "step": 19420 + }, + { + "epoch": 0.57589775524123, + "grad_norm": 0.1154516339302063, + "learning_rate": 0.0003885167412184755, + "loss": 2.6456, + "step": 19421 + }, + { + "epoch": 0.5759274085935415, + "grad_norm": 0.11758022010326385, + "learning_rate": 0.00038847087520734857, + "loss": 2.6256, + "step": 19422 + }, + { + "epoch": 0.5759570619458529, + "grad_norm": 0.12062735110521317, + "learning_rate": 0.0003884250101838312, + "loss": 2.6323, + "step": 19423 + }, + { + "epoch": 0.5759867152981645, + "grad_norm": 0.10184989124536514, + "learning_rate": 0.0003883791461483295, + "loss": 2.594, + "step": 19424 + }, + { + "epoch": 0.5760163686504759, + "grad_norm": 0.10505332052707672, + "learning_rate": 0.00038833328310124957, + "loss": 2.5964, + "step": 19425 + }, + { + "epoch": 0.5760460220027874, + "grad_norm": 0.10211070626974106, + "learning_rate": 0.0003882874210429976, + "loss": 2.6429, + "step": 19426 + }, + { + "epoch": 0.5760756753550988, + "grad_norm": 0.108667753636837, + "learning_rate": 0.0003882415599739796, + "loss": 2.6185, + "step": 19427 + }, + { + "epoch": 0.5761053287074104, + "grad_norm": 0.10660134255886078, + "learning_rate": 0.0003881956998946019, + "loss": 2.6182, + "step": 19428 + }, + { + "epoch": 0.5761349820597218, + "grad_norm": 0.09705930203199387, + "learning_rate": 0.00038814984080527053, + "loss": 2.6361, + "step": 19429 + }, + { + "epoch": 0.5761646354120333, + "grad_norm": 0.1118517741560936, + "learning_rate": 0.00038810398270639124, + "loss": 2.6025, + "step": 19430 + }, + { + "epoch": 0.5761942887643449, + "grad_norm": 0.11246132850646973, + "learning_rate": 0.00038805812559837056, + "loss": 2.6084, + "step": 19431 + }, + { + "epoch": 0.5762239421166563, + "grad_norm": 0.11629398912191391, + "learning_rate": 0.0003880122694816144, + "loss": 2.6455, + "step": 19432 + }, + { + "epoch": 0.5762535954689678, + "grad_norm": 0.10051243007183075, + "learning_rate": 0.0003879664143565289, + "loss": 2.5806, + "step": 19433 + }, + { + "epoch": 0.5762832488212792, + "grad_norm": 0.11501317471265793, + "learning_rate": 0.00038792056022351996, + "loss": 2.6642, + "step": 19434 + }, + { + "epoch": 0.5763129021735908, + "grad_norm": 0.1151466965675354, + "learning_rate": 0.00038787470708299374, + "loss": 2.608, + "step": 19435 + }, + { + "epoch": 0.5763425555259022, + "grad_norm": 0.1275455504655838, + "learning_rate": 0.0003878288549353561, + "loss": 2.6029, + "step": 19436 + }, + { + "epoch": 0.5763722088782137, + "grad_norm": 0.11726083606481552, + "learning_rate": 0.0003877830037810133, + "loss": 2.6134, + "step": 19437 + }, + { + "epoch": 0.5764018622305251, + "grad_norm": 0.10163024067878723, + "learning_rate": 0.00038773715362037134, + "loss": 2.6059, + "step": 19438 + }, + { + "epoch": 0.5764315155828367, + "grad_norm": 0.1048317477107048, + "learning_rate": 0.0003876913044538362, + "loss": 2.6103, + "step": 19439 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 0.09942316263914108, + "learning_rate": 0.00038764545628181365, + "loss": 2.6508, + "step": 19440 + }, + { + "epoch": 0.5764908222874596, + "grad_norm": 0.10570527613162994, + "learning_rate": 0.00038759960910470995, + "loss": 2.644, + "step": 19441 + }, + { + "epoch": 0.576520475639771, + "grad_norm": 0.12170713394880295, + "learning_rate": 0.00038755376292293095, + "loss": 2.6098, + "step": 19442 + }, + { + "epoch": 0.5765501289920826, + "grad_norm": 0.11019584536552429, + "learning_rate": 0.0003875079177368825, + "loss": 2.6131, + "step": 19443 + }, + { + "epoch": 0.576579782344394, + "grad_norm": 0.11831928044557571, + "learning_rate": 0.00038746207354697107, + "loss": 2.5714, + "step": 19444 + }, + { + "epoch": 0.5766094356967055, + "grad_norm": 0.11194216459989548, + "learning_rate": 0.00038741623035360215, + "loss": 2.6511, + "step": 19445 + }, + { + "epoch": 0.576639089049017, + "grad_norm": 0.11430924385786057, + "learning_rate": 0.00038737038815718185, + "loss": 2.6539, + "step": 19446 + }, + { + "epoch": 0.5766687424013285, + "grad_norm": 0.13482047617435455, + "learning_rate": 0.0003873245469581161, + "loss": 2.6348, + "step": 19447 + }, + { + "epoch": 0.5766983957536399, + "grad_norm": 0.12106887996196747, + "learning_rate": 0.00038727870675681086, + "loss": 2.6181, + "step": 19448 + }, + { + "epoch": 0.5767280491059514, + "grad_norm": 0.10094718635082245, + "learning_rate": 0.00038723286755367196, + "loss": 2.6147, + "step": 19449 + }, + { + "epoch": 0.5767577024582629, + "grad_norm": 0.12711112201213837, + "learning_rate": 0.00038718702934910555, + "loss": 2.6082, + "step": 19450 + }, + { + "epoch": 0.5767873558105744, + "grad_norm": 0.11707275360822678, + "learning_rate": 0.00038714119214351716, + "loss": 2.6101, + "step": 19451 + }, + { + "epoch": 0.5768170091628859, + "grad_norm": 0.12381308525800705, + "learning_rate": 0.000387095355937313, + "loss": 2.6338, + "step": 19452 + }, + { + "epoch": 0.5768466625151973, + "grad_norm": 0.11711954325437546, + "learning_rate": 0.00038704952073089876, + "loss": 2.6398, + "step": 19453 + }, + { + "epoch": 0.5768763158675089, + "grad_norm": 0.12081684172153473, + "learning_rate": 0.00038700368652468044, + "loss": 2.6561, + "step": 19454 + }, + { + "epoch": 0.5769059692198203, + "grad_norm": 0.13778983056545258, + "learning_rate": 0.00038695785331906385, + "loss": 2.6415, + "step": 19455 + }, + { + "epoch": 0.5769356225721318, + "grad_norm": 0.12518292665481567, + "learning_rate": 0.0003869120211144548, + "loss": 2.6324, + "step": 19456 + }, + { + "epoch": 0.5769652759244432, + "grad_norm": 0.10819607973098755, + "learning_rate": 0.0003868661899112593, + "loss": 2.6272, + "step": 19457 + }, + { + "epoch": 0.5769949292767548, + "grad_norm": 0.11059878766536713, + "learning_rate": 0.0003868203597098831, + "loss": 2.6005, + "step": 19458 + }, + { + "epoch": 0.5770245826290662, + "grad_norm": 0.11461133509874344, + "learning_rate": 0.000386774530510732, + "loss": 2.6106, + "step": 19459 + }, + { + "epoch": 0.5770542359813777, + "grad_norm": 0.10721281170845032, + "learning_rate": 0.0003867287023142121, + "loss": 2.6134, + "step": 19460 + }, + { + "epoch": 0.5770838893336891, + "grad_norm": 0.1033601239323616, + "learning_rate": 0.00038668287512072877, + "loss": 2.5768, + "step": 19461 + }, + { + "epoch": 0.5771135426860007, + "grad_norm": 0.10679268091917038, + "learning_rate": 0.00038663704893068807, + "loss": 2.6442, + "step": 19462 + }, + { + "epoch": 0.5771431960383121, + "grad_norm": 0.09677253663539886, + "learning_rate": 0.00038659122374449574, + "loss": 2.6026, + "step": 19463 + }, + { + "epoch": 0.5771728493906236, + "grad_norm": 0.10814906656742096, + "learning_rate": 0.0003865453995625576, + "loss": 2.5949, + "step": 19464 + }, + { + "epoch": 0.577202502742935, + "grad_norm": 0.12066248804330826, + "learning_rate": 0.00038649957638527946, + "loss": 2.6033, + "step": 19465 + }, + { + "epoch": 0.5772321560952466, + "grad_norm": 0.11760389804840088, + "learning_rate": 0.0003864537542130669, + "loss": 2.6524, + "step": 19466 + }, + { + "epoch": 0.577261809447558, + "grad_norm": 0.10398240387439728, + "learning_rate": 0.00038640793304632614, + "loss": 2.658, + "step": 19467 + }, + { + "epoch": 0.5772914627998695, + "grad_norm": 0.10062402486801147, + "learning_rate": 0.00038636211288546244, + "loss": 2.6284, + "step": 19468 + }, + { + "epoch": 0.577321116152181, + "grad_norm": 0.09383851289749146, + "learning_rate": 0.0003863162937308815, + "loss": 2.6308, + "step": 19469 + }, + { + "epoch": 0.5773507695044925, + "grad_norm": 0.10744385421276093, + "learning_rate": 0.0003862704755829896, + "loss": 2.622, + "step": 19470 + }, + { + "epoch": 0.5773804228568039, + "grad_norm": 0.11337410658597946, + "learning_rate": 0.0003862246584421921, + "loss": 2.6522, + "step": 19471 + }, + { + "epoch": 0.5774100762091154, + "grad_norm": 0.10254767537117004, + "learning_rate": 0.00038617884230889477, + "loss": 2.6073, + "step": 19472 + }, + { + "epoch": 0.577439729561427, + "grad_norm": 0.12266828119754791, + "learning_rate": 0.00038613302718350337, + "loss": 2.6064, + "step": 19473 + }, + { + "epoch": 0.5774693829137384, + "grad_norm": 0.11884815990924835, + "learning_rate": 0.00038608721306642346, + "loss": 2.6082, + "step": 19474 + }, + { + "epoch": 0.5774990362660499, + "grad_norm": 0.10741620510816574, + "learning_rate": 0.0003860413999580609, + "loss": 2.5867, + "step": 19475 + }, + { + "epoch": 0.5775286896183613, + "grad_norm": 0.11085575819015503, + "learning_rate": 0.00038599558785882133, + "loss": 2.6338, + "step": 19476 + }, + { + "epoch": 0.5775583429706729, + "grad_norm": 0.11861258745193481, + "learning_rate": 0.0003859497767691105, + "loss": 2.6463, + "step": 19477 + }, + { + "epoch": 0.5775879963229843, + "grad_norm": 0.10230562835931778, + "learning_rate": 0.00038590396668933383, + "loss": 2.6184, + "step": 19478 + }, + { + "epoch": 0.5776176496752958, + "grad_norm": 0.1159718781709671, + "learning_rate": 0.00038585815761989705, + "loss": 2.6191, + "step": 19479 + }, + { + "epoch": 0.5776473030276073, + "grad_norm": 0.1214844286441803, + "learning_rate": 0.00038581234956120596, + "loss": 2.6084, + "step": 19480 + }, + { + "epoch": 0.5776769563799188, + "grad_norm": 0.11813505738973618, + "learning_rate": 0.0003857665425136661, + "loss": 2.6249, + "step": 19481 + }, + { + "epoch": 0.5777066097322302, + "grad_norm": 0.116146519780159, + "learning_rate": 0.000385720736477683, + "loss": 2.6179, + "step": 19482 + }, + { + "epoch": 0.5777362630845417, + "grad_norm": 0.11452890932559967, + "learning_rate": 0.00038567493145366254, + "loss": 2.6501, + "step": 19483 + }, + { + "epoch": 0.5777659164368532, + "grad_norm": 0.12978728115558624, + "learning_rate": 0.0003856291274420102, + "loss": 2.6423, + "step": 19484 + }, + { + "epoch": 0.5777955697891647, + "grad_norm": 0.11283359676599503, + "learning_rate": 0.0003855833244431315, + "loss": 2.6352, + "step": 19485 + }, + { + "epoch": 0.5778252231414761, + "grad_norm": 0.09516558796167374, + "learning_rate": 0.0003855375224574321, + "loss": 2.5741, + "step": 19486 + }, + { + "epoch": 0.5778548764937876, + "grad_norm": 0.11770191043615341, + "learning_rate": 0.00038549172148531763, + "loss": 2.6428, + "step": 19487 + }, + { + "epoch": 0.5778845298460991, + "grad_norm": 0.13265587389469147, + "learning_rate": 0.0003854459215271937, + "loss": 2.6105, + "step": 19488 + }, + { + "epoch": 0.5779141831984106, + "grad_norm": 0.11815285682678223, + "learning_rate": 0.00038540012258346567, + "loss": 2.6642, + "step": 19489 + }, + { + "epoch": 0.577943836550722, + "grad_norm": 0.11019423604011536, + "learning_rate": 0.0003853543246545393, + "loss": 2.6457, + "step": 19490 + }, + { + "epoch": 0.5779734899030335, + "grad_norm": 0.10458475351333618, + "learning_rate": 0.00038530852774082, + "loss": 2.6226, + "step": 19491 + }, + { + "epoch": 0.578003143255345, + "grad_norm": 0.09806614369153976, + "learning_rate": 0.0003852627318427134, + "loss": 2.6425, + "step": 19492 + }, + { + "epoch": 0.5780327966076565, + "grad_norm": 0.11327517032623291, + "learning_rate": 0.000385216936960625, + "loss": 2.6427, + "step": 19493 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 0.11443669348955154, + "learning_rate": 0.00038517114309496043, + "loss": 2.6609, + "step": 19494 + }, + { + "epoch": 0.5780921033122794, + "grad_norm": 0.12028659135103226, + "learning_rate": 0.0003851253502461248, + "loss": 2.6336, + "step": 19495 + }, + { + "epoch": 0.578121756664591, + "grad_norm": 0.12296260893344879, + "learning_rate": 0.00038507955841452407, + "loss": 2.6146, + "step": 19496 + }, + { + "epoch": 0.5781514100169024, + "grad_norm": 0.10047470778226852, + "learning_rate": 0.00038503376760056364, + "loss": 2.6283, + "step": 19497 + }, + { + "epoch": 0.5781810633692139, + "grad_norm": 0.11931286752223969, + "learning_rate": 0.000384987977804649, + "loss": 2.577, + "step": 19498 + }, + { + "epoch": 0.5782107167215254, + "grad_norm": 0.13074661791324615, + "learning_rate": 0.0003849421890271855, + "loss": 2.5957, + "step": 19499 + }, + { + "epoch": 0.5782403700738369, + "grad_norm": 0.10389526188373566, + "learning_rate": 0.0003848964012685786, + "loss": 2.6351, + "step": 19500 + }, + { + "epoch": 0.5782700234261483, + "grad_norm": 0.09994790703058243, + "learning_rate": 0.00038485061452923387, + "loss": 2.6238, + "step": 19501 + }, + { + "epoch": 0.5782996767784598, + "grad_norm": 0.1268969029188156, + "learning_rate": 0.0003848048288095566, + "loss": 2.6596, + "step": 19502 + }, + { + "epoch": 0.5783293301307713, + "grad_norm": 0.13330306112766266, + "learning_rate": 0.0003847590441099525, + "loss": 2.6027, + "step": 19503 + }, + { + "epoch": 0.5783589834830828, + "grad_norm": 0.11748994141817093, + "learning_rate": 0.00038471326043082677, + "loss": 2.6383, + "step": 19504 + }, + { + "epoch": 0.5783886368353942, + "grad_norm": 0.09491327404975891, + "learning_rate": 0.00038466747777258505, + "loss": 2.5873, + "step": 19505 + }, + { + "epoch": 0.5784182901877057, + "grad_norm": 0.10893680900335312, + "learning_rate": 0.00038462169613563246, + "loss": 2.5847, + "step": 19506 + }, + { + "epoch": 0.5784479435400172, + "grad_norm": 0.11581473797559738, + "learning_rate": 0.0003845759155203746, + "loss": 2.6361, + "step": 19507 + }, + { + "epoch": 0.5784775968923287, + "grad_norm": 0.10033237934112549, + "learning_rate": 0.00038453013592721654, + "loss": 2.5907, + "step": 19508 + }, + { + "epoch": 0.5785072502446401, + "grad_norm": 0.09965863823890686, + "learning_rate": 0.00038448435735656436, + "loss": 2.6002, + "step": 19509 + }, + { + "epoch": 0.5785369035969516, + "grad_norm": 0.10806844383478165, + "learning_rate": 0.0003844385798088228, + "loss": 2.6309, + "step": 19510 + }, + { + "epoch": 0.5785665569492631, + "grad_norm": 0.10374782979488373, + "learning_rate": 0.0003843928032843975, + "loss": 2.5797, + "step": 19511 + }, + { + "epoch": 0.5785962103015746, + "grad_norm": 0.10356233268976212, + "learning_rate": 0.00038434702778369385, + "loss": 2.5988, + "step": 19512 + }, + { + "epoch": 0.578625863653886, + "grad_norm": 0.10328196734189987, + "learning_rate": 0.00038430125330711707, + "loss": 2.6454, + "step": 19513 + }, + { + "epoch": 0.5786555170061976, + "grad_norm": 0.1106514036655426, + "learning_rate": 0.0003842554798550725, + "loss": 2.6265, + "step": 19514 + }, + { + "epoch": 0.5786851703585091, + "grad_norm": 0.11194399744272232, + "learning_rate": 0.0003842097074279657, + "loss": 2.6304, + "step": 19515 + }, + { + "epoch": 0.5787148237108205, + "grad_norm": 0.10505073517560959, + "learning_rate": 0.0003841639360262016, + "loss": 2.6319, + "step": 19516 + }, + { + "epoch": 0.578744477063132, + "grad_norm": 0.1143801137804985, + "learning_rate": 0.00038411816565018587, + "loss": 2.6467, + "step": 19517 + }, + { + "epoch": 0.5787741304154435, + "grad_norm": 0.09577188640832901, + "learning_rate": 0.00038407239630032367, + "loss": 2.6153, + "step": 19518 + }, + { + "epoch": 0.578803783767755, + "grad_norm": 0.12179195135831833, + "learning_rate": 0.0003840266279770202, + "loss": 2.6, + "step": 19519 + }, + { + "epoch": 0.5788334371200664, + "grad_norm": 0.1252746731042862, + "learning_rate": 0.0003839808606806809, + "loss": 2.6064, + "step": 19520 + }, + { + "epoch": 0.5788630904723779, + "grad_norm": 0.11934351176023483, + "learning_rate": 0.00038393509441171083, + "loss": 2.6549, + "step": 19521 + }, + { + "epoch": 0.5788927438246894, + "grad_norm": 0.11914750188589096, + "learning_rate": 0.00038388932917051555, + "loss": 2.622, + "step": 19522 + }, + { + "epoch": 0.5789223971770009, + "grad_norm": 0.12786667048931122, + "learning_rate": 0.0003838435649575003, + "loss": 2.6143, + "step": 19523 + }, + { + "epoch": 0.5789520505293123, + "grad_norm": 0.1312321126461029, + "learning_rate": 0.0003837978017730701, + "loss": 2.6034, + "step": 19524 + }, + { + "epoch": 0.5789817038816238, + "grad_norm": 0.10534733533859253, + "learning_rate": 0.0003837520396176303, + "loss": 2.6357, + "step": 19525 + }, + { + "epoch": 0.5790113572339353, + "grad_norm": 0.11259118467569351, + "learning_rate": 0.00038370627849158633, + "loss": 2.6262, + "step": 19526 + }, + { + "epoch": 0.5790410105862468, + "grad_norm": 0.1086629331111908, + "learning_rate": 0.0003836605183953431, + "loss": 2.5807, + "step": 19527 + }, + { + "epoch": 0.5790706639385582, + "grad_norm": 0.1173647791147232, + "learning_rate": 0.00038361475932930587, + "loss": 2.6372, + "step": 19528 + }, + { + "epoch": 0.5791003172908697, + "grad_norm": 0.10662641376256943, + "learning_rate": 0.00038356900129388, + "loss": 2.6254, + "step": 19529 + }, + { + "epoch": 0.5791299706431812, + "grad_norm": 0.1145360916852951, + "learning_rate": 0.00038352324428947063, + "loss": 2.6195, + "step": 19530 + }, + { + "epoch": 0.5791596239954927, + "grad_norm": 0.11281795054674149, + "learning_rate": 0.00038347748831648286, + "loss": 2.6191, + "step": 19531 + }, + { + "epoch": 0.5791892773478041, + "grad_norm": 0.10527602583169937, + "learning_rate": 0.00038343173337532205, + "loss": 2.5725, + "step": 19532 + }, + { + "epoch": 0.5792189307001157, + "grad_norm": 0.10779573023319244, + "learning_rate": 0.0003833859794663932, + "loss": 2.6368, + "step": 19533 + }, + { + "epoch": 0.5792485840524271, + "grad_norm": 0.11914241313934326, + "learning_rate": 0.00038334022659010125, + "loss": 2.6093, + "step": 19534 + }, + { + "epoch": 0.5792782374047386, + "grad_norm": 0.11024759709835052, + "learning_rate": 0.00038329447474685186, + "loss": 2.6318, + "step": 19535 + }, + { + "epoch": 0.5793078907570501, + "grad_norm": 0.11963372677564621, + "learning_rate": 0.00038324872393705, + "loss": 2.6321, + "step": 19536 + }, + { + "epoch": 0.5793375441093616, + "grad_norm": 0.10846927762031555, + "learning_rate": 0.0003832029741611006, + "loss": 2.6065, + "step": 19537 + }, + { + "epoch": 0.5793671974616731, + "grad_norm": 0.11273974925279617, + "learning_rate": 0.000383157225419409, + "loss": 2.6342, + "step": 19538 + }, + { + "epoch": 0.5793968508139845, + "grad_norm": 0.1069992333650589, + "learning_rate": 0.0003831114777123802, + "loss": 2.6404, + "step": 19539 + }, + { + "epoch": 0.579426504166296, + "grad_norm": 0.10573861747980118, + "learning_rate": 0.0003830657310404193, + "loss": 2.6412, + "step": 19540 + }, + { + "epoch": 0.5794561575186075, + "grad_norm": 0.10931301862001419, + "learning_rate": 0.0003830199854039314, + "loss": 2.6325, + "step": 19541 + }, + { + "epoch": 0.579485810870919, + "grad_norm": 0.10583941638469696, + "learning_rate": 0.0003829742408033217, + "loss": 2.6183, + "step": 19542 + }, + { + "epoch": 0.5795154642232304, + "grad_norm": 0.11915997415781021, + "learning_rate": 0.0003829284972389952, + "loss": 2.642, + "step": 19543 + }, + { + "epoch": 0.579545117575542, + "grad_norm": 0.11803659796714783, + "learning_rate": 0.00038288275471135686, + "loss": 2.6087, + "step": 19544 + }, + { + "epoch": 0.5795747709278534, + "grad_norm": 0.11974383145570755, + "learning_rate": 0.0003828370132208119, + "loss": 2.6232, + "step": 19545 + }, + { + "epoch": 0.5796044242801649, + "grad_norm": 0.10878237336874008, + "learning_rate": 0.0003827912727677652, + "loss": 2.6246, + "step": 19546 + }, + { + "epoch": 0.5796340776324763, + "grad_norm": 0.10938943177461624, + "learning_rate": 0.00038274553335262187, + "loss": 2.6454, + "step": 19547 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 0.1283363401889801, + "learning_rate": 0.000382699794975787, + "loss": 2.629, + "step": 19548 + }, + { + "epoch": 0.5796933843370993, + "grad_norm": 0.13529933989048004, + "learning_rate": 0.00038265405763766565, + "loss": 2.6378, + "step": 19549 + }, + { + "epoch": 0.5797230376894108, + "grad_norm": 0.12671321630477905, + "learning_rate": 0.00038260832133866276, + "loss": 2.6011, + "step": 19550 + }, + { + "epoch": 0.5797526910417222, + "grad_norm": 0.11353299021720886, + "learning_rate": 0.00038256258607918326, + "loss": 2.6328, + "step": 19551 + }, + { + "epoch": 0.5797823443940338, + "grad_norm": 0.09561482816934586, + "learning_rate": 0.00038251685185963227, + "loss": 2.6115, + "step": 19552 + }, + { + "epoch": 0.5798119977463452, + "grad_norm": 0.10715802013874054, + "learning_rate": 0.00038247111868041484, + "loss": 2.6256, + "step": 19553 + }, + { + "epoch": 0.5798416510986567, + "grad_norm": 0.10652712732553482, + "learning_rate": 0.0003824253865419357, + "loss": 2.6042, + "step": 19554 + }, + { + "epoch": 0.5798713044509681, + "grad_norm": 0.10610487312078476, + "learning_rate": 0.0003823796554446, + "loss": 2.6296, + "step": 19555 + }, + { + "epoch": 0.5799009578032797, + "grad_norm": 0.10046112537384033, + "learning_rate": 0.00038233392538881256, + "loss": 2.6151, + "step": 19556 + }, + { + "epoch": 0.5799306111555912, + "grad_norm": 0.12770499289035797, + "learning_rate": 0.0003822881963749784, + "loss": 2.6182, + "step": 19557 + }, + { + "epoch": 0.5799602645079026, + "grad_norm": 0.10154201835393906, + "learning_rate": 0.00038224246840350254, + "loss": 2.6411, + "step": 19558 + }, + { + "epoch": 0.5799899178602141, + "grad_norm": 0.11451907455921173, + "learning_rate": 0.0003821967414747898, + "loss": 2.602, + "step": 19559 + }, + { + "epoch": 0.5800195712125256, + "grad_norm": 0.10211323201656342, + "learning_rate": 0.000382151015589245, + "loss": 2.5779, + "step": 19560 + }, + { + "epoch": 0.5800492245648371, + "grad_norm": 0.10297682136297226, + "learning_rate": 0.00038210529074727336, + "loss": 2.6122, + "step": 19561 + }, + { + "epoch": 0.5800788779171485, + "grad_norm": 0.10574131458997726, + "learning_rate": 0.0003820595669492796, + "loss": 2.6128, + "step": 19562 + }, + { + "epoch": 0.58010853126946, + "grad_norm": 0.10184874385595322, + "learning_rate": 0.00038201384419566856, + "loss": 2.593, + "step": 19563 + }, + { + "epoch": 0.5801381846217715, + "grad_norm": 0.10746170580387115, + "learning_rate": 0.0003819681224868453, + "loss": 2.6148, + "step": 19564 + }, + { + "epoch": 0.580167837974083, + "grad_norm": 0.11347801983356476, + "learning_rate": 0.0003819224018232145, + "loss": 2.6063, + "step": 19565 + }, + { + "epoch": 0.5801974913263944, + "grad_norm": 0.14629647135734558, + "learning_rate": 0.0003818766822051811, + "loss": 2.6624, + "step": 19566 + }, + { + "epoch": 0.580227144678706, + "grad_norm": 0.09895071387290955, + "learning_rate": 0.0003818309636331499, + "loss": 2.6422, + "step": 19567 + }, + { + "epoch": 0.5802567980310174, + "grad_norm": 0.12838684022426605, + "learning_rate": 0.00038178524610752584, + "loss": 2.6177, + "step": 19568 + }, + { + "epoch": 0.5802864513833289, + "grad_norm": 0.12539254128932953, + "learning_rate": 0.00038173952962871374, + "loss": 2.622, + "step": 19569 + }, + { + "epoch": 0.5803161047356403, + "grad_norm": 0.11138597130775452, + "learning_rate": 0.0003816938141971185, + "loss": 2.6591, + "step": 19570 + }, + { + "epoch": 0.5803457580879519, + "grad_norm": 0.11143188178539276, + "learning_rate": 0.0003816480998131447, + "loss": 2.6193, + "step": 19571 + }, + { + "epoch": 0.5803754114402633, + "grad_norm": 0.10315918922424316, + "learning_rate": 0.00038160238647719723, + "loss": 2.6068, + "step": 19572 + }, + { + "epoch": 0.5804050647925748, + "grad_norm": 0.10245412588119507, + "learning_rate": 0.0003815566741896809, + "loss": 2.6319, + "step": 19573 + }, + { + "epoch": 0.5804347181448862, + "grad_norm": 0.1051187515258789, + "learning_rate": 0.00038151096295100073, + "loss": 2.6139, + "step": 19574 + }, + { + "epoch": 0.5804643714971978, + "grad_norm": 0.11094290018081665, + "learning_rate": 0.0003814652527615613, + "loss": 2.6339, + "step": 19575 + }, + { + "epoch": 0.5804940248495092, + "grad_norm": 0.09645486623048782, + "learning_rate": 0.0003814195436217674, + "loss": 2.6267, + "step": 19576 + }, + { + "epoch": 0.5805236782018207, + "grad_norm": 0.10448162257671356, + "learning_rate": 0.0003813738355320238, + "loss": 2.6343, + "step": 19577 + }, + { + "epoch": 0.5805533315541322, + "grad_norm": 0.10344001650810242, + "learning_rate": 0.0003813281284927352, + "loss": 2.6327, + "step": 19578 + }, + { + "epoch": 0.5805829849064437, + "grad_norm": 0.09971731901168823, + "learning_rate": 0.0003812824225043064, + "loss": 2.6178, + "step": 19579 + }, + { + "epoch": 0.5806126382587552, + "grad_norm": 0.10344640910625458, + "learning_rate": 0.0003812367175671421, + "loss": 2.6092, + "step": 19580 + }, + { + "epoch": 0.5806422916110666, + "grad_norm": 0.11037985980510712, + "learning_rate": 0.0003811910136816472, + "loss": 2.6203, + "step": 19581 + }, + { + "epoch": 0.5806719449633782, + "grad_norm": 0.09863407164812088, + "learning_rate": 0.00038114531084822617, + "loss": 2.6077, + "step": 19582 + }, + { + "epoch": 0.5807015983156896, + "grad_norm": 0.10792435705661774, + "learning_rate": 0.0003810996090672838, + "loss": 2.5996, + "step": 19583 + }, + { + "epoch": 0.5807312516680011, + "grad_norm": 0.12247307598590851, + "learning_rate": 0.0003810539083392248, + "loss": 2.6187, + "step": 19584 + }, + { + "epoch": 0.5807609050203125, + "grad_norm": 0.10682334005832672, + "learning_rate": 0.0003810082086644539, + "loss": 2.6444, + "step": 19585 + }, + { + "epoch": 0.580790558372624, + "grad_norm": 0.12794339656829834, + "learning_rate": 0.0003809625100433756, + "loss": 2.692, + "step": 19586 + }, + { + "epoch": 0.5808202117249355, + "grad_norm": 0.12245313078165054, + "learning_rate": 0.0003809168124763948, + "loss": 2.613, + "step": 19587 + }, + { + "epoch": 0.580849865077247, + "grad_norm": 0.1283215880393982, + "learning_rate": 0.00038087111596391606, + "loss": 2.635, + "step": 19588 + }, + { + "epoch": 0.5808795184295584, + "grad_norm": 0.10454963147640228, + "learning_rate": 0.00038082542050634405, + "loss": 2.6169, + "step": 19589 + }, + { + "epoch": 0.58090917178187, + "grad_norm": 0.13216987252235413, + "learning_rate": 0.0003807797261040834, + "loss": 2.6375, + "step": 19590 + }, + { + "epoch": 0.5809388251341814, + "grad_norm": 0.1328718513250351, + "learning_rate": 0.0003807340327575388, + "loss": 2.6507, + "step": 19591 + }, + { + "epoch": 0.5809684784864929, + "grad_norm": 0.1376573145389557, + "learning_rate": 0.00038068834046711474, + "loss": 2.576, + "step": 19592 + }, + { + "epoch": 0.5809981318388043, + "grad_norm": 0.103233702480793, + "learning_rate": 0.00038064264923321595, + "loss": 2.6392, + "step": 19593 + }, + { + "epoch": 0.5810277851911159, + "grad_norm": 0.1366187483072281, + "learning_rate": 0.00038059695905624693, + "loss": 2.6173, + "step": 19594 + }, + { + "epoch": 0.5810574385434273, + "grad_norm": 0.1119602844119072, + "learning_rate": 0.00038055126993661237, + "loss": 2.6, + "step": 19595 + }, + { + "epoch": 0.5810870918957388, + "grad_norm": 0.12821993231773376, + "learning_rate": 0.00038050558187471676, + "loss": 2.6526, + "step": 19596 + }, + { + "epoch": 0.5811167452480502, + "grad_norm": 0.12511339783668518, + "learning_rate": 0.00038045989487096475, + "loss": 2.6383, + "step": 19597 + }, + { + "epoch": 0.5811463986003618, + "grad_norm": 0.12825840711593628, + "learning_rate": 0.00038041420892576106, + "loss": 2.6442, + "step": 19598 + }, + { + "epoch": 0.5811760519526733, + "grad_norm": 0.11187182366847992, + "learning_rate": 0.0003803685240395097, + "loss": 2.6405, + "step": 19599 + }, + { + "epoch": 0.5812057053049847, + "grad_norm": 0.11863116174936295, + "learning_rate": 0.0003803228402126159, + "loss": 2.5963, + "step": 19600 + }, + { + "epoch": 0.5812353586572963, + "grad_norm": 0.10966359823942184, + "learning_rate": 0.0003802771574454837, + "loss": 2.6236, + "step": 19601 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 0.11234992742538452, + "learning_rate": 0.0003802314757385181, + "loss": 2.6003, + "step": 19602 + }, + { + "epoch": 0.5812946653619192, + "grad_norm": 0.0994279757142067, + "learning_rate": 0.00038018579509212317, + "loss": 2.6084, + "step": 19603 + }, + { + "epoch": 0.5813243187142306, + "grad_norm": 0.12069304287433624, + "learning_rate": 0.0003801401155067036, + "loss": 2.6243, + "step": 19604 + }, + { + "epoch": 0.5813539720665422, + "grad_norm": 0.12397880852222443, + "learning_rate": 0.0003800944369826639, + "loss": 2.6211, + "step": 19605 + }, + { + "epoch": 0.5813836254188536, + "grad_norm": 0.1027478277683258, + "learning_rate": 0.0003800487595204085, + "loss": 2.6628, + "step": 19606 + }, + { + "epoch": 0.5814132787711651, + "grad_norm": 0.13497619330883026, + "learning_rate": 0.00038000308312034195, + "loss": 2.6035, + "step": 19607 + }, + { + "epoch": 0.5814429321234765, + "grad_norm": 0.12462517619132996, + "learning_rate": 0.0003799574077828688, + "loss": 2.6153, + "step": 19608 + }, + { + "epoch": 0.5814725854757881, + "grad_norm": 0.11199846118688583, + "learning_rate": 0.00037991173350839327, + "loss": 2.6463, + "step": 19609 + }, + { + "epoch": 0.5815022388280995, + "grad_norm": 0.13546055555343628, + "learning_rate": 0.00037986606029732, + "loss": 2.6455, + "step": 19610 + }, + { + "epoch": 0.581531892180411, + "grad_norm": 0.14187465608119965, + "learning_rate": 0.0003798203881500534, + "loss": 2.5951, + "step": 19611 + }, + { + "epoch": 0.5815615455327224, + "grad_norm": 0.1037490963935852, + "learning_rate": 0.0003797747170669977, + "loss": 2.6176, + "step": 19612 + }, + { + "epoch": 0.581591198885034, + "grad_norm": 0.1201704889535904, + "learning_rate": 0.0003797290470485577, + "loss": 2.6049, + "step": 19613 + }, + { + "epoch": 0.5816208522373454, + "grad_norm": 0.1137208566069603, + "learning_rate": 0.00037968337809513757, + "loss": 2.618, + "step": 19614 + }, + { + "epoch": 0.5816505055896569, + "grad_norm": 0.10216612368822098, + "learning_rate": 0.0003796377102071419, + "loss": 2.6193, + "step": 19615 + }, + { + "epoch": 0.5816801589419683, + "grad_norm": 0.112614706158638, + "learning_rate": 0.0003795920433849748, + "loss": 2.6131, + "step": 19616 + }, + { + "epoch": 0.5817098122942799, + "grad_norm": 0.11582639813423157, + "learning_rate": 0.0003795463776290409, + "loss": 2.6365, + "step": 19617 + }, + { + "epoch": 0.5817394656465913, + "grad_norm": 0.11369970440864563, + "learning_rate": 0.0003795007129397445, + "loss": 2.6294, + "step": 19618 + }, + { + "epoch": 0.5817691189989028, + "grad_norm": 0.10414184629917145, + "learning_rate": 0.00037945504931749015, + "loss": 2.6184, + "step": 19619 + }, + { + "epoch": 0.5817987723512144, + "grad_norm": 0.11690597981214523, + "learning_rate": 0.0003794093867626818, + "loss": 2.6051, + "step": 19620 + }, + { + "epoch": 0.5818284257035258, + "grad_norm": 0.1159219816327095, + "learning_rate": 0.00037936372527572416, + "loss": 2.5806, + "step": 19621 + }, + { + "epoch": 0.5818580790558373, + "grad_norm": 0.12938182055950165, + "learning_rate": 0.00037931806485702135, + "loss": 2.6412, + "step": 19622 + }, + { + "epoch": 0.5818877324081487, + "grad_norm": 0.10929937660694122, + "learning_rate": 0.0003792724055069778, + "loss": 2.5832, + "step": 19623 + }, + { + "epoch": 0.5819173857604603, + "grad_norm": 0.11460955440998077, + "learning_rate": 0.0003792267472259977, + "loss": 2.5881, + "step": 19624 + }, + { + "epoch": 0.5819470391127717, + "grad_norm": 0.1166992336511612, + "learning_rate": 0.00037918109001448564, + "loss": 2.6103, + "step": 19625 + }, + { + "epoch": 0.5819766924650832, + "grad_norm": 0.10461033880710602, + "learning_rate": 0.00037913543387284575, + "loss": 2.635, + "step": 19626 + }, + { + "epoch": 0.5820063458173946, + "grad_norm": 0.10450802743434906, + "learning_rate": 0.00037908977880148233, + "loss": 2.6292, + "step": 19627 + }, + { + "epoch": 0.5820359991697062, + "grad_norm": 0.10689760744571686, + "learning_rate": 0.0003790441248007996, + "loss": 2.6065, + "step": 19628 + }, + { + "epoch": 0.5820656525220176, + "grad_norm": 0.10060250759124756, + "learning_rate": 0.0003789984718712022, + "loss": 2.6006, + "step": 19629 + }, + { + "epoch": 0.5820953058743291, + "grad_norm": 0.10653413832187653, + "learning_rate": 0.00037895282001309383, + "loss": 2.5921, + "step": 19630 + }, + { + "epoch": 0.5821249592266405, + "grad_norm": 0.12109265476465225, + "learning_rate": 0.0003789071692268791, + "loss": 2.6224, + "step": 19631 + }, + { + "epoch": 0.5821546125789521, + "grad_norm": 0.11356959491968155, + "learning_rate": 0.0003788615195129621, + "loss": 2.6632, + "step": 19632 + }, + { + "epoch": 0.5821842659312635, + "grad_norm": 0.1041068583726883, + "learning_rate": 0.0003788158708717472, + "loss": 2.6087, + "step": 19633 + }, + { + "epoch": 0.582213919283575, + "grad_norm": 0.10620566457509995, + "learning_rate": 0.00037877022330363855, + "loss": 2.613, + "step": 19634 + }, + { + "epoch": 0.5822435726358864, + "grad_norm": 0.1093512699007988, + "learning_rate": 0.0003787245768090403, + "loss": 2.6333, + "step": 19635 + }, + { + "epoch": 0.582273225988198, + "grad_norm": 0.11749674379825592, + "learning_rate": 0.000378678931388357, + "loss": 2.6263, + "step": 19636 + }, + { + "epoch": 0.5823028793405094, + "grad_norm": 0.12370947003364563, + "learning_rate": 0.00037863328704199214, + "loss": 2.6304, + "step": 19637 + }, + { + "epoch": 0.5823325326928209, + "grad_norm": 0.12695030868053436, + "learning_rate": 0.0003785876437703506, + "loss": 2.6232, + "step": 19638 + }, + { + "epoch": 0.5823621860451325, + "grad_norm": 0.1134914681315422, + "learning_rate": 0.0003785420015738363, + "loss": 2.6362, + "step": 19639 + }, + { + "epoch": 0.5823918393974439, + "grad_norm": 0.10328105837106705, + "learning_rate": 0.00037849636045285363, + "loss": 2.6098, + "step": 19640 + }, + { + "epoch": 0.5824214927497554, + "grad_norm": 0.1259835809469223, + "learning_rate": 0.0003784507204078064, + "loss": 2.6251, + "step": 19641 + }, + { + "epoch": 0.5824511461020668, + "grad_norm": 0.12349015474319458, + "learning_rate": 0.0003784050814390988, + "loss": 2.6208, + "step": 19642 + }, + { + "epoch": 0.5824807994543784, + "grad_norm": 0.10893246531486511, + "learning_rate": 0.00037835944354713515, + "loss": 2.6137, + "step": 19643 + }, + { + "epoch": 0.5825104528066898, + "grad_norm": 0.10524982959032059, + "learning_rate": 0.00037831380673231953, + "loss": 2.6182, + "step": 19644 + }, + { + "epoch": 0.5825401061590013, + "grad_norm": 0.10002373903989792, + "learning_rate": 0.000378268170995056, + "loss": 2.6187, + "step": 19645 + }, + { + "epoch": 0.5825697595113127, + "grad_norm": 0.09786572307348251, + "learning_rate": 0.0003782225363357488, + "loss": 2.5754, + "step": 19646 + }, + { + "epoch": 0.5825994128636243, + "grad_norm": 0.10172870755195618, + "learning_rate": 0.0003781769027548019, + "loss": 2.6412, + "step": 19647 + }, + { + "epoch": 0.5826290662159357, + "grad_norm": 0.08468491584062576, + "learning_rate": 0.0003781312702526194, + "loss": 2.6226, + "step": 19648 + }, + { + "epoch": 0.5826587195682472, + "grad_norm": 0.0954810082912445, + "learning_rate": 0.0003780856388296054, + "loss": 2.642, + "step": 19649 + }, + { + "epoch": 0.5826883729205586, + "grad_norm": 0.10938455909490585, + "learning_rate": 0.0003780400084861639, + "loss": 2.6204, + "step": 19650 + }, + { + "epoch": 0.5827180262728702, + "grad_norm": 0.11545753479003906, + "learning_rate": 0.0003779943792226992, + "loss": 2.624, + "step": 19651 + }, + { + "epoch": 0.5827476796251816, + "grad_norm": 0.13008758425712585, + "learning_rate": 0.0003779487510396152, + "loss": 2.654, + "step": 19652 + }, + { + "epoch": 0.5827773329774931, + "grad_norm": 0.11408296972513199, + "learning_rate": 0.00037790312393731594, + "loss": 2.6042, + "step": 19653 + }, + { + "epoch": 0.5828069863298045, + "grad_norm": 0.1015150174498558, + "learning_rate": 0.0003778574979162055, + "loss": 2.6146, + "step": 19654 + }, + { + "epoch": 0.5828366396821161, + "grad_norm": 0.12109958380460739, + "learning_rate": 0.0003778118729766878, + "loss": 2.6308, + "step": 19655 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 0.11131621897220612, + "learning_rate": 0.00037776624911916706, + "loss": 2.5992, + "step": 19656 + }, + { + "epoch": 0.582895946386739, + "grad_norm": 0.1119115799665451, + "learning_rate": 0.00037772062634404717, + "loss": 2.6049, + "step": 19657 + }, + { + "epoch": 0.5829255997390504, + "grad_norm": 0.10228213667869568, + "learning_rate": 0.0003776750046517321, + "loss": 2.6428, + "step": 19658 + }, + { + "epoch": 0.582955253091362, + "grad_norm": 0.10169865190982819, + "learning_rate": 0.0003776293840426258, + "loss": 2.6164, + "step": 19659 + }, + { + "epoch": 0.5829849064436735, + "grad_norm": 0.11751924455165863, + "learning_rate": 0.0003775837645171324, + "loss": 2.6221, + "step": 19660 + }, + { + "epoch": 0.5830145597959849, + "grad_norm": 0.11716467887163162, + "learning_rate": 0.00037753814607565566, + "loss": 2.6254, + "step": 19661 + }, + { + "epoch": 0.5830442131482965, + "grad_norm": 0.10144820064306259, + "learning_rate": 0.0003774925287185997, + "loss": 2.6766, + "step": 19662 + }, + { + "epoch": 0.5830738665006079, + "grad_norm": 0.12372741103172302, + "learning_rate": 0.00037744691244636833, + "loss": 2.6136, + "step": 19663 + }, + { + "epoch": 0.5831035198529194, + "grad_norm": 0.11518151313066483, + "learning_rate": 0.0003774012972593657, + "loss": 2.6168, + "step": 19664 + }, + { + "epoch": 0.5831331732052308, + "grad_norm": 0.12856115400791168, + "learning_rate": 0.0003773556831579956, + "loss": 2.6235, + "step": 19665 + }, + { + "epoch": 0.5831628265575424, + "grad_norm": 0.12453188002109528, + "learning_rate": 0.000377310070142662, + "loss": 2.6278, + "step": 19666 + }, + { + "epoch": 0.5831924799098538, + "grad_norm": 0.10553420335054398, + "learning_rate": 0.0003772644582137689, + "loss": 2.6292, + "step": 19667 + }, + { + "epoch": 0.5832221332621653, + "grad_norm": 0.1598614752292633, + "learning_rate": 0.00037721884737171996, + "loss": 2.6492, + "step": 19668 + }, + { + "epoch": 0.5832517866144767, + "grad_norm": 0.12412344664335251, + "learning_rate": 0.00037717323761691927, + "loss": 2.6025, + "step": 19669 + }, + { + "epoch": 0.5832814399667883, + "grad_norm": 0.13766704499721527, + "learning_rate": 0.0003771276289497705, + "loss": 2.6385, + "step": 19670 + }, + { + "epoch": 0.5833110933190997, + "grad_norm": 0.12099071592092514, + "learning_rate": 0.00037708202137067784, + "loss": 2.62, + "step": 19671 + }, + { + "epoch": 0.5833407466714112, + "grad_norm": 0.12485270947217941, + "learning_rate": 0.0003770364148800448, + "loss": 2.6179, + "step": 19672 + }, + { + "epoch": 0.5833704000237226, + "grad_norm": 0.11805766075849533, + "learning_rate": 0.0003769908094782756, + "loss": 2.6135, + "step": 19673 + }, + { + "epoch": 0.5834000533760342, + "grad_norm": 0.12126009911298752, + "learning_rate": 0.00037694520516577386, + "loss": 2.6106, + "step": 19674 + }, + { + "epoch": 0.5834297067283456, + "grad_norm": 0.12733237445354462, + "learning_rate": 0.0003768996019429434, + "loss": 2.6106, + "step": 19675 + }, + { + "epoch": 0.5834593600806571, + "grad_norm": 0.1224438026547432, + "learning_rate": 0.00037685399981018784, + "loss": 2.6259, + "step": 19676 + }, + { + "epoch": 0.5834890134329686, + "grad_norm": 0.11590249836444855, + "learning_rate": 0.0003768083987679115, + "loss": 2.6565, + "step": 19677 + }, + { + "epoch": 0.5835186667852801, + "grad_norm": 0.1150546446442604, + "learning_rate": 0.00037676279881651803, + "loss": 2.6111, + "step": 19678 + }, + { + "epoch": 0.5835483201375915, + "grad_norm": 0.10470826923847198, + "learning_rate": 0.00037671719995641107, + "loss": 2.5912, + "step": 19679 + }, + { + "epoch": 0.583577973489903, + "grad_norm": 0.12264962494373322, + "learning_rate": 0.0003766716021879944, + "loss": 2.6156, + "step": 19680 + }, + { + "epoch": 0.5836076268422146, + "grad_norm": 0.11906595528125763, + "learning_rate": 0.0003766260055116719, + "loss": 2.6388, + "step": 19681 + }, + { + "epoch": 0.583637280194526, + "grad_norm": 0.12765347957611084, + "learning_rate": 0.00037658040992784726, + "loss": 2.6505, + "step": 19682 + }, + { + "epoch": 0.5836669335468375, + "grad_norm": 0.12138129770755768, + "learning_rate": 0.0003765348154369243, + "loss": 2.6294, + "step": 19683 + }, + { + "epoch": 0.5836965868991489, + "grad_norm": 0.10726138949394226, + "learning_rate": 0.00037648922203930684, + "loss": 2.6492, + "step": 19684 + }, + { + "epoch": 0.5837262402514605, + "grad_norm": 0.12188988924026489, + "learning_rate": 0.0003764436297353985, + "loss": 2.6151, + "step": 19685 + }, + { + "epoch": 0.5837558936037719, + "grad_norm": 0.10127317905426025, + "learning_rate": 0.0003763980385256029, + "loss": 2.6082, + "step": 19686 + }, + { + "epoch": 0.5837855469560834, + "grad_norm": 0.11162301898002625, + "learning_rate": 0.0003763524484103239, + "loss": 2.6026, + "step": 19687 + }, + { + "epoch": 0.5838152003083948, + "grad_norm": 0.1166323870420456, + "learning_rate": 0.00037630685938996525, + "loss": 2.5985, + "step": 19688 + }, + { + "epoch": 0.5838448536607064, + "grad_norm": 0.1154288649559021, + "learning_rate": 0.0003762612714649304, + "loss": 2.5978, + "step": 19689 + }, + { + "epoch": 0.5838745070130178, + "grad_norm": 0.1073836237192154, + "learning_rate": 0.0003762156846356234, + "loss": 2.6461, + "step": 19690 + }, + { + "epoch": 0.5839041603653293, + "grad_norm": 0.12351708859205246, + "learning_rate": 0.0003761700989024478, + "loss": 2.6507, + "step": 19691 + }, + { + "epoch": 0.5839338137176407, + "grad_norm": 0.10447857528924942, + "learning_rate": 0.00037612451426580716, + "loss": 2.6569, + "step": 19692 + }, + { + "epoch": 0.5839634670699523, + "grad_norm": 0.10245554149150848, + "learning_rate": 0.00037607893072610525, + "loss": 2.6409, + "step": 19693 + }, + { + "epoch": 0.5839931204222637, + "grad_norm": 0.10043098032474518, + "learning_rate": 0.0003760333482837457, + "loss": 2.6275, + "step": 19694 + }, + { + "epoch": 0.5840227737745752, + "grad_norm": 0.12198066711425781, + "learning_rate": 0.00037598776693913215, + "loss": 2.6587, + "step": 19695 + }, + { + "epoch": 0.5840524271268867, + "grad_norm": 0.1139836236834526, + "learning_rate": 0.0003759421866926682, + "loss": 2.5867, + "step": 19696 + }, + { + "epoch": 0.5840820804791982, + "grad_norm": 0.10164112597703934, + "learning_rate": 0.00037589660754475747, + "loss": 2.6259, + "step": 19697 + }, + { + "epoch": 0.5841117338315096, + "grad_norm": 0.11553016304969788, + "learning_rate": 0.0003758510294958035, + "loss": 2.608, + "step": 19698 + }, + { + "epoch": 0.5841413871838211, + "grad_norm": 0.1384359896183014, + "learning_rate": 0.00037580545254621003, + "loss": 2.6323, + "step": 19699 + }, + { + "epoch": 0.5841710405361326, + "grad_norm": 0.11633098870515823, + "learning_rate": 0.0003757598766963806, + "loss": 2.6155, + "step": 19700 + }, + { + "epoch": 0.5842006938884441, + "grad_norm": 0.10945913940668106, + "learning_rate": 0.0003757143019467188, + "loss": 2.6156, + "step": 19701 + }, + { + "epoch": 0.5842303472407556, + "grad_norm": 0.140568345785141, + "learning_rate": 0.00037566872829762805, + "loss": 2.631, + "step": 19702 + }, + { + "epoch": 0.584260000593067, + "grad_norm": 0.12215237319469452, + "learning_rate": 0.00037562315574951214, + "loss": 2.6054, + "step": 19703 + }, + { + "epoch": 0.5842896539453786, + "grad_norm": 0.11932844668626785, + "learning_rate": 0.00037557758430277455, + "loss": 2.6086, + "step": 19704 + }, + { + "epoch": 0.58431930729769, + "grad_norm": 0.12669385969638824, + "learning_rate": 0.00037553201395781893, + "loss": 2.6077, + "step": 19705 + }, + { + "epoch": 0.5843489606500015, + "grad_norm": 0.1220778375864029, + "learning_rate": 0.0003754864447150486, + "loss": 2.6344, + "step": 19706 + }, + { + "epoch": 0.584378614002313, + "grad_norm": 0.12084627896547318, + "learning_rate": 0.00037544087657486716, + "loss": 2.5912, + "step": 19707 + }, + { + "epoch": 0.5844082673546245, + "grad_norm": 0.11974363774061203, + "learning_rate": 0.00037539530953767814, + "loss": 2.6098, + "step": 19708 + }, + { + "epoch": 0.5844379207069359, + "grad_norm": 0.11409604549407959, + "learning_rate": 0.00037534974360388504, + "loss": 2.6439, + "step": 19709 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 0.1238236203789711, + "learning_rate": 0.00037530417877389133, + "loss": 2.6034, + "step": 19710 + }, + { + "epoch": 0.5844972274115589, + "grad_norm": 0.12302479147911072, + "learning_rate": 0.00037525861504810056, + "loss": 2.6215, + "step": 19711 + }, + { + "epoch": 0.5845268807638704, + "grad_norm": 0.1192072182893753, + "learning_rate": 0.00037521305242691627, + "loss": 2.6583, + "step": 19712 + }, + { + "epoch": 0.5845565341161818, + "grad_norm": 0.12031984329223633, + "learning_rate": 0.00037516749091074167, + "loss": 2.6297, + "step": 19713 + }, + { + "epoch": 0.5845861874684933, + "grad_norm": 0.10784972459077835, + "learning_rate": 0.0003751219304999804, + "loss": 2.63, + "step": 19714 + }, + { + "epoch": 0.5846158408208048, + "grad_norm": 0.11030741780996323, + "learning_rate": 0.00037507637119503566, + "loss": 2.6007, + "step": 19715 + }, + { + "epoch": 0.5846454941731163, + "grad_norm": 0.12446343898773193, + "learning_rate": 0.0003750308129963114, + "loss": 2.6117, + "step": 19716 + }, + { + "epoch": 0.5846751475254277, + "grad_norm": 0.09841091185808182, + "learning_rate": 0.0003749852559042106, + "loss": 2.5502, + "step": 19717 + }, + { + "epoch": 0.5847048008777392, + "grad_norm": 0.10912682116031647, + "learning_rate": 0.0003749396999191369, + "loss": 2.6167, + "step": 19718 + }, + { + "epoch": 0.5847344542300507, + "grad_norm": 0.10673220455646515, + "learning_rate": 0.00037489414504149354, + "loss": 2.5982, + "step": 19719 + }, + { + "epoch": 0.5847641075823622, + "grad_norm": 0.11670567840337753, + "learning_rate": 0.00037484859127168407, + "loss": 2.5936, + "step": 19720 + }, + { + "epoch": 0.5847937609346736, + "grad_norm": 0.1025143563747406, + "learning_rate": 0.0003748030386101118, + "loss": 2.6307, + "step": 19721 + }, + { + "epoch": 0.5848234142869851, + "grad_norm": 0.10067551583051682, + "learning_rate": 0.00037475748705718005, + "loss": 2.6219, + "step": 19722 + }, + { + "epoch": 0.5848530676392967, + "grad_norm": 0.10772683471441269, + "learning_rate": 0.00037471193661329247, + "loss": 2.6297, + "step": 19723 + }, + { + "epoch": 0.5848827209916081, + "grad_norm": 0.10336384922266006, + "learning_rate": 0.000374666387278852, + "loss": 2.6207, + "step": 19724 + }, + { + "epoch": 0.5849123743439196, + "grad_norm": 0.11112148314714432, + "learning_rate": 0.0003746208390542622, + "loss": 2.6119, + "step": 19725 + }, + { + "epoch": 0.584942027696231, + "grad_norm": 0.11314833164215088, + "learning_rate": 0.0003745752919399263, + "loss": 2.5975, + "step": 19726 + }, + { + "epoch": 0.5849716810485426, + "grad_norm": 0.1222623661160469, + "learning_rate": 0.0003745297459362479, + "loss": 2.6225, + "step": 19727 + }, + { + "epoch": 0.585001334400854, + "grad_norm": 0.12792782485485077, + "learning_rate": 0.0003744842010436299, + "loss": 2.6133, + "step": 19728 + }, + { + "epoch": 0.5850309877531655, + "grad_norm": 0.10246014595031738, + "learning_rate": 0.00037443865726247605, + "loss": 2.6189, + "step": 19729 + }, + { + "epoch": 0.585060641105477, + "grad_norm": 0.13333256542682648, + "learning_rate": 0.00037439311459318937, + "loss": 2.5974, + "step": 19730 + }, + { + "epoch": 0.5850902944577885, + "grad_norm": 0.11302705109119415, + "learning_rate": 0.0003743475730361732, + "loss": 2.6226, + "step": 19731 + }, + { + "epoch": 0.5851199478100999, + "grad_norm": 0.11123672872781754, + "learning_rate": 0.00037430203259183095, + "loss": 2.5895, + "step": 19732 + }, + { + "epoch": 0.5851496011624114, + "grad_norm": 0.10979767888784409, + "learning_rate": 0.00037425649326056575, + "loss": 2.6304, + "step": 19733 + }, + { + "epoch": 0.5851792545147229, + "grad_norm": 0.12471473217010498, + "learning_rate": 0.00037421095504278084, + "loss": 2.5898, + "step": 19734 + }, + { + "epoch": 0.5852089078670344, + "grad_norm": 0.09942011535167694, + "learning_rate": 0.0003741654179388795, + "loss": 2.6087, + "step": 19735 + }, + { + "epoch": 0.5852385612193458, + "grad_norm": 0.10498851537704468, + "learning_rate": 0.000374119881949265, + "loss": 2.6205, + "step": 19736 + }, + { + "epoch": 0.5852682145716573, + "grad_norm": 0.11626861989498138, + "learning_rate": 0.0003740743470743405, + "loss": 2.6167, + "step": 19737 + }, + { + "epoch": 0.5852978679239688, + "grad_norm": 0.10649634152650833, + "learning_rate": 0.00037402881331450937, + "loss": 2.6045, + "step": 19738 + }, + { + "epoch": 0.5853275212762803, + "grad_norm": 0.11583013087511063, + "learning_rate": 0.0003739832806701747, + "loss": 2.5973, + "step": 19739 + }, + { + "epoch": 0.5853571746285917, + "grad_norm": 0.12171144038438797, + "learning_rate": 0.0003739377491417397, + "loss": 2.6252, + "step": 19740 + }, + { + "epoch": 0.5853868279809032, + "grad_norm": 0.11770143359899521, + "learning_rate": 0.00037389221872960736, + "loss": 2.617, + "step": 19741 + }, + { + "epoch": 0.5854164813332147, + "grad_norm": 0.12153944373130798, + "learning_rate": 0.00037384668943418135, + "loss": 2.6276, + "step": 19742 + }, + { + "epoch": 0.5854461346855262, + "grad_norm": 0.12469569593667984, + "learning_rate": 0.0003738011612558645, + "loss": 2.6209, + "step": 19743 + }, + { + "epoch": 0.5854757880378377, + "grad_norm": 0.11071228981018066, + "learning_rate": 0.00037375563419506, + "loss": 2.6339, + "step": 19744 + }, + { + "epoch": 0.5855054413901492, + "grad_norm": 0.1058502048254013, + "learning_rate": 0.000373710108252171, + "loss": 2.6043, + "step": 19745 + }, + { + "epoch": 0.5855350947424607, + "grad_norm": 0.10596965998411179, + "learning_rate": 0.0003736645834276007, + "loss": 2.5925, + "step": 19746 + }, + { + "epoch": 0.5855647480947721, + "grad_norm": 0.10818569362163544, + "learning_rate": 0.00037361905972175223, + "loss": 2.6142, + "step": 19747 + }, + { + "epoch": 0.5855944014470836, + "grad_norm": 0.13028599321842194, + "learning_rate": 0.0003735735371350286, + "loss": 2.5922, + "step": 19748 + }, + { + "epoch": 0.585624054799395, + "grad_norm": 0.11830071359872818, + "learning_rate": 0.00037352801566783306, + "loss": 2.5344, + "step": 19749 + }, + { + "epoch": 0.5856537081517066, + "grad_norm": 0.11029282212257385, + "learning_rate": 0.00037348249532056875, + "loss": 2.6081, + "step": 19750 + }, + { + "epoch": 0.585683361504018, + "grad_norm": 0.11899524927139282, + "learning_rate": 0.0003734369760936386, + "loss": 2.6312, + "step": 19751 + }, + { + "epoch": 0.5857130148563295, + "grad_norm": 0.11293402314186096, + "learning_rate": 0.00037339145798744565, + "loss": 2.6211, + "step": 19752 + }, + { + "epoch": 0.585742668208641, + "grad_norm": 0.1302531510591507, + "learning_rate": 0.0003733459410023931, + "loss": 2.5971, + "step": 19753 + }, + { + "epoch": 0.5857723215609525, + "grad_norm": 0.14881998300552368, + "learning_rate": 0.0003733004251388839, + "loss": 2.6147, + "step": 19754 + }, + { + "epoch": 0.5858019749132639, + "grad_norm": 0.11706177145242691, + "learning_rate": 0.0003732549103973213, + "loss": 2.6628, + "step": 19755 + }, + { + "epoch": 0.5858316282655754, + "grad_norm": 0.1058792695403099, + "learning_rate": 0.00037320939677810814, + "loss": 2.5962, + "step": 19756 + }, + { + "epoch": 0.5858612816178869, + "grad_norm": 0.14179585874080658, + "learning_rate": 0.0003731638842816476, + "loss": 2.6035, + "step": 19757 + }, + { + "epoch": 0.5858909349701984, + "grad_norm": 0.10792532563209534, + "learning_rate": 0.0003731183729083427, + "loss": 2.6488, + "step": 19758 + }, + { + "epoch": 0.5859205883225098, + "grad_norm": 0.1107901930809021, + "learning_rate": 0.0003730728626585963, + "loss": 2.5948, + "step": 19759 + }, + { + "epoch": 0.5859502416748213, + "grad_norm": 0.10872361809015274, + "learning_rate": 0.0003730273535328115, + "loss": 2.6004, + "step": 19760 + }, + { + "epoch": 0.5859798950271328, + "grad_norm": 0.1052408516407013, + "learning_rate": 0.00037298184553139136, + "loss": 2.632, + "step": 19761 + }, + { + "epoch": 0.5860095483794443, + "grad_norm": 0.10857710242271423, + "learning_rate": 0.00037293633865473873, + "loss": 2.637, + "step": 19762 + }, + { + "epoch": 0.5860392017317557, + "grad_norm": 0.10410823673009872, + "learning_rate": 0.00037289083290325663, + "loss": 2.6176, + "step": 19763 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 0.10940369218587875, + "learning_rate": 0.00037284532827734797, + "loss": 2.6242, + "step": 19764 + }, + { + "epoch": 0.5860985084363788, + "grad_norm": 0.11610133200883865, + "learning_rate": 0.0003727998247774158, + "loss": 2.605, + "step": 19765 + }, + { + "epoch": 0.5861281617886902, + "grad_norm": 0.09674711525440216, + "learning_rate": 0.000372754322403863, + "loss": 2.6363, + "step": 19766 + }, + { + "epoch": 0.5861578151410017, + "grad_norm": 0.10976441949605942, + "learning_rate": 0.00037270882115709243, + "loss": 2.6414, + "step": 19767 + }, + { + "epoch": 0.5861874684933132, + "grad_norm": 0.10103577375411987, + "learning_rate": 0.0003726633210375072, + "loss": 2.6368, + "step": 19768 + }, + { + "epoch": 0.5862171218456247, + "grad_norm": 0.09465610980987549, + "learning_rate": 0.0003726178220455101, + "loss": 2.5975, + "step": 19769 + }, + { + "epoch": 0.5862467751979361, + "grad_norm": 0.10184239596128464, + "learning_rate": 0.00037257232418150407, + "loss": 2.6064, + "step": 19770 + }, + { + "epoch": 0.5862764285502476, + "grad_norm": 0.1254548579454422, + "learning_rate": 0.00037252682744589205, + "loss": 2.6311, + "step": 19771 + }, + { + "epoch": 0.5863060819025591, + "grad_norm": 0.10253483802080154, + "learning_rate": 0.00037248133183907675, + "loss": 2.6456, + "step": 19772 + }, + { + "epoch": 0.5863357352548706, + "grad_norm": 0.1209452822804451, + "learning_rate": 0.0003724358373614612, + "loss": 2.6268, + "step": 19773 + }, + { + "epoch": 0.586365388607182, + "grad_norm": 0.09899991005659103, + "learning_rate": 0.00037239034401344816, + "loss": 2.6155, + "step": 19774 + }, + { + "epoch": 0.5863950419594935, + "grad_norm": 0.11037509143352509, + "learning_rate": 0.00037234485179544054, + "loss": 2.628, + "step": 19775 + }, + { + "epoch": 0.586424695311805, + "grad_norm": 0.12097255885601044, + "learning_rate": 0.0003722993607078412, + "loss": 2.6319, + "step": 19776 + }, + { + "epoch": 0.5864543486641165, + "grad_norm": 0.10382941365242004, + "learning_rate": 0.0003722538707510529, + "loss": 2.5833, + "step": 19777 + }, + { + "epoch": 0.5864840020164279, + "grad_norm": 0.11847708374261856, + "learning_rate": 0.00037220838192547856, + "loss": 2.6401, + "step": 19778 + }, + { + "epoch": 0.5865136553687395, + "grad_norm": 0.10956640541553497, + "learning_rate": 0.00037216289423152096, + "loss": 2.5912, + "step": 19779 + }, + { + "epoch": 0.5865433087210509, + "grad_norm": 0.10815062373876572, + "learning_rate": 0.0003721174076695826, + "loss": 2.5857, + "step": 19780 + }, + { + "epoch": 0.5865729620733624, + "grad_norm": 0.11856778711080551, + "learning_rate": 0.0003720719222400668, + "loss": 2.6311, + "step": 19781 + }, + { + "epoch": 0.5866026154256738, + "grad_norm": 0.10534457117319107, + "learning_rate": 0.00037202643794337603, + "loss": 2.5997, + "step": 19782 + }, + { + "epoch": 0.5866322687779854, + "grad_norm": 0.11535745859146118, + "learning_rate": 0.0003719809547799131, + "loss": 2.6657, + "step": 19783 + }, + { + "epoch": 0.5866619221302968, + "grad_norm": 0.11223830282688141, + "learning_rate": 0.00037193547275008083, + "loss": 2.6251, + "step": 19784 + }, + { + "epoch": 0.5866915754826083, + "grad_norm": 0.11856801807880402, + "learning_rate": 0.0003718899918542819, + "loss": 2.6291, + "step": 19785 + }, + { + "epoch": 0.5867212288349198, + "grad_norm": 0.11998598277568817, + "learning_rate": 0.00037184451209291915, + "loss": 2.6096, + "step": 19786 + }, + { + "epoch": 0.5867508821872313, + "grad_norm": 0.11140411347150803, + "learning_rate": 0.00037179903346639515, + "loss": 2.6234, + "step": 19787 + }, + { + "epoch": 0.5867805355395428, + "grad_norm": 0.10724160820245743, + "learning_rate": 0.00037175355597511285, + "loss": 2.6241, + "step": 19788 + }, + { + "epoch": 0.5868101888918542, + "grad_norm": 0.13337497413158417, + "learning_rate": 0.00037170807961947475, + "loss": 2.6386, + "step": 19789 + }, + { + "epoch": 0.5868398422441657, + "grad_norm": 0.1325674057006836, + "learning_rate": 0.0003716626043998836, + "loss": 2.6297, + "step": 19790 + }, + { + "epoch": 0.5868694955964772, + "grad_norm": 0.135331392288208, + "learning_rate": 0.00037161713031674213, + "loss": 2.6112, + "step": 19791 + }, + { + "epoch": 0.5868991489487887, + "grad_norm": 0.12915131449699402, + "learning_rate": 0.00037157165737045295, + "loss": 2.6104, + "step": 19792 + }, + { + "epoch": 0.5869288023011001, + "grad_norm": 0.10377006977796555, + "learning_rate": 0.0003715261855614187, + "loss": 2.6296, + "step": 19793 + }, + { + "epoch": 0.5869584556534116, + "grad_norm": 0.1167314350605011, + "learning_rate": 0.00037148071489004233, + "loss": 2.6401, + "step": 19794 + }, + { + "epoch": 0.5869881090057231, + "grad_norm": 0.09938336163759232, + "learning_rate": 0.0003714352453567262, + "loss": 2.5947, + "step": 19795 + }, + { + "epoch": 0.5870177623580346, + "grad_norm": 0.11771897971630096, + "learning_rate": 0.00037138977696187306, + "loss": 2.6322, + "step": 19796 + }, + { + "epoch": 0.587047415710346, + "grad_norm": 0.11065974086523056, + "learning_rate": 0.0003713443097058855, + "loss": 2.6444, + "step": 19797 + }, + { + "epoch": 0.5870770690626576, + "grad_norm": 0.12113047391176224, + "learning_rate": 0.00037129884358916624, + "loss": 2.6212, + "step": 19798 + }, + { + "epoch": 0.587106722414969, + "grad_norm": 0.1227886825799942, + "learning_rate": 0.00037125337861211783, + "loss": 2.613, + "step": 19799 + }, + { + "epoch": 0.5871363757672805, + "grad_norm": 0.10878603905439377, + "learning_rate": 0.00037120791477514276, + "loss": 2.6431, + "step": 19800 + }, + { + "epoch": 0.5871660291195919, + "grad_norm": 0.10760432481765747, + "learning_rate": 0.00037116245207864375, + "loss": 2.6644, + "step": 19801 + }, + { + "epoch": 0.5871956824719035, + "grad_norm": 0.11282484978437424, + "learning_rate": 0.0003711169905230233, + "loss": 2.6109, + "step": 19802 + }, + { + "epoch": 0.5872253358242149, + "grad_norm": 0.09673545509576797, + "learning_rate": 0.00037107153010868405, + "loss": 2.6094, + "step": 19803 + }, + { + "epoch": 0.5872549891765264, + "grad_norm": 0.11273036152124405, + "learning_rate": 0.0003710260708360285, + "loss": 2.6144, + "step": 19804 + }, + { + "epoch": 0.5872846425288378, + "grad_norm": 0.09998178482055664, + "learning_rate": 0.0003709806127054592, + "loss": 2.6386, + "step": 19805 + }, + { + "epoch": 0.5873142958811494, + "grad_norm": 0.1072288304567337, + "learning_rate": 0.0003709351557173788, + "loss": 2.6311, + "step": 19806 + }, + { + "epoch": 0.5873439492334609, + "grad_norm": 0.0966009795665741, + "learning_rate": 0.00037088969987218967, + "loss": 2.5976, + "step": 19807 + }, + { + "epoch": 0.5873736025857723, + "grad_norm": 0.11161061376333237, + "learning_rate": 0.0003708442451702945, + "loss": 2.6071, + "step": 19808 + }, + { + "epoch": 0.5874032559380838, + "grad_norm": 0.09782697260379791, + "learning_rate": 0.00037079879161209574, + "loss": 2.6483, + "step": 19809 + }, + { + "epoch": 0.5874329092903953, + "grad_norm": 0.09985053539276123, + "learning_rate": 0.0003707533391979958, + "loss": 2.6364, + "step": 19810 + }, + { + "epoch": 0.5874625626427068, + "grad_norm": 0.1214093491435051, + "learning_rate": 0.0003707078879283972, + "loss": 2.6306, + "step": 19811 + }, + { + "epoch": 0.5874922159950182, + "grad_norm": 0.10937633365392685, + "learning_rate": 0.0003706624378037025, + "loss": 2.6093, + "step": 19812 + }, + { + "epoch": 0.5875218693473298, + "grad_norm": 0.10292813181877136, + "learning_rate": 0.00037061698882431403, + "loss": 2.6328, + "step": 19813 + }, + { + "epoch": 0.5875515226996412, + "grad_norm": 0.10255606472492218, + "learning_rate": 0.0003705715409906344, + "loss": 2.635, + "step": 19814 + }, + { + "epoch": 0.5875811760519527, + "grad_norm": 0.1127096563577652, + "learning_rate": 0.00037052609430306594, + "loss": 2.626, + "step": 19815 + }, + { + "epoch": 0.5876108294042641, + "grad_norm": 0.1232176199555397, + "learning_rate": 0.00037048064876201125, + "loss": 2.6351, + "step": 19816 + }, + { + "epoch": 0.5876404827565757, + "grad_norm": 0.13458876311779022, + "learning_rate": 0.0003704352043678726, + "loss": 2.6099, + "step": 19817 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 0.11498504877090454, + "learning_rate": 0.0003703897611210522, + "loss": 2.5948, + "step": 19818 + }, + { + "epoch": 0.5876997894611986, + "grad_norm": 0.1068359911441803, + "learning_rate": 0.00037034431902195305, + "loss": 2.6285, + "step": 19819 + }, + { + "epoch": 0.58772944281351, + "grad_norm": 0.10928574204444885, + "learning_rate": 0.00037029887807097706, + "loss": 2.6101, + "step": 19820 + }, + { + "epoch": 0.5877590961658216, + "grad_norm": 0.09881987422704697, + "learning_rate": 0.00037025343826852685, + "loss": 2.6406, + "step": 19821 + }, + { + "epoch": 0.587788749518133, + "grad_norm": 0.11612476408481598, + "learning_rate": 0.0003702079996150046, + "loss": 2.6335, + "step": 19822 + }, + { + "epoch": 0.5878184028704445, + "grad_norm": 0.12347275018692017, + "learning_rate": 0.0003701625621108129, + "loss": 2.6097, + "step": 19823 + }, + { + "epoch": 0.5878480562227559, + "grad_norm": 0.11164189875125885, + "learning_rate": 0.00037011712575635395, + "loss": 2.6189, + "step": 19824 + }, + { + "epoch": 0.5878777095750675, + "grad_norm": 0.10607687383890152, + "learning_rate": 0.00037007169055203014, + "loss": 2.6348, + "step": 19825 + }, + { + "epoch": 0.5879073629273789, + "grad_norm": 0.10892793536186218, + "learning_rate": 0.00037002625649824395, + "loss": 2.6317, + "step": 19826 + }, + { + "epoch": 0.5879370162796904, + "grad_norm": 0.12009729444980621, + "learning_rate": 0.0003699808235953974, + "loss": 2.6331, + "step": 19827 + }, + { + "epoch": 0.587966669632002, + "grad_norm": 0.11125130951404572, + "learning_rate": 0.000369935391843893, + "loss": 2.5936, + "step": 19828 + }, + { + "epoch": 0.5879963229843134, + "grad_norm": 0.11362642049789429, + "learning_rate": 0.0003698899612441331, + "loss": 2.6526, + "step": 19829 + }, + { + "epoch": 0.5880259763366249, + "grad_norm": 0.11068791896104813, + "learning_rate": 0.0003698445317965199, + "loss": 2.6067, + "step": 19830 + }, + { + "epoch": 0.5880556296889363, + "grad_norm": 0.11052709072828293, + "learning_rate": 0.00036979910350145554, + "loss": 2.6244, + "step": 19831 + }, + { + "epoch": 0.5880852830412479, + "grad_norm": 0.11933861672878265, + "learning_rate": 0.0003697536763593426, + "loss": 2.6214, + "step": 19832 + }, + { + "epoch": 0.5881149363935593, + "grad_norm": 0.14052672684192657, + "learning_rate": 0.0003697082503705832, + "loss": 2.5914, + "step": 19833 + }, + { + "epoch": 0.5881445897458708, + "grad_norm": 0.14306700229644775, + "learning_rate": 0.0003696628255355796, + "loss": 2.6403, + "step": 19834 + }, + { + "epoch": 0.5881742430981822, + "grad_norm": 0.1021324023604393, + "learning_rate": 0.00036961740185473415, + "loss": 2.61, + "step": 19835 + }, + { + "epoch": 0.5882038964504938, + "grad_norm": 0.1208532527089119, + "learning_rate": 0.00036957197932844886, + "loss": 2.6006, + "step": 19836 + }, + { + "epoch": 0.5882335498028052, + "grad_norm": 0.15620867908000946, + "learning_rate": 0.0003695265579571263, + "loss": 2.6222, + "step": 19837 + }, + { + "epoch": 0.5882632031551167, + "grad_norm": 0.13054881989955902, + "learning_rate": 0.0003694811377411683, + "loss": 2.5677, + "step": 19838 + }, + { + "epoch": 0.5882928565074281, + "grad_norm": 0.10359837114810944, + "learning_rate": 0.00036943571868097724, + "loss": 2.6315, + "step": 19839 + }, + { + "epoch": 0.5883225098597397, + "grad_norm": 0.1135363057255745, + "learning_rate": 0.00036939030077695525, + "loss": 2.6033, + "step": 19840 + }, + { + "epoch": 0.5883521632120511, + "grad_norm": 0.11334586888551712, + "learning_rate": 0.0003693448840295046, + "loss": 2.62, + "step": 19841 + }, + { + "epoch": 0.5883818165643626, + "grad_norm": 0.10243292152881622, + "learning_rate": 0.0003692994684390275, + "loss": 2.6086, + "step": 19842 + }, + { + "epoch": 0.588411469916674, + "grad_norm": 0.10688669234514236, + "learning_rate": 0.00036925405400592605, + "loss": 2.5721, + "step": 19843 + }, + { + "epoch": 0.5884411232689856, + "grad_norm": 0.09745711088180542, + "learning_rate": 0.00036920864073060214, + "loss": 2.6257, + "step": 19844 + }, + { + "epoch": 0.588470776621297, + "grad_norm": 0.11064621061086655, + "learning_rate": 0.0003691632286134583, + "loss": 2.6167, + "step": 19845 + }, + { + "epoch": 0.5885004299736085, + "grad_norm": 0.12036305665969849, + "learning_rate": 0.0003691178176548966, + "loss": 2.6279, + "step": 19846 + }, + { + "epoch": 0.58853008332592, + "grad_norm": 0.09945079684257507, + "learning_rate": 0.00036907240785531914, + "loss": 2.6137, + "step": 19847 + }, + { + "epoch": 0.5885597366782315, + "grad_norm": 0.12039827555418015, + "learning_rate": 0.00036902699921512796, + "loss": 2.5949, + "step": 19848 + }, + { + "epoch": 0.588589390030543, + "grad_norm": 0.10773523151874542, + "learning_rate": 0.0003689815917347251, + "loss": 2.6113, + "step": 19849 + }, + { + "epoch": 0.5886190433828544, + "grad_norm": 0.10474622994661331, + "learning_rate": 0.0003689361854145128, + "loss": 2.599, + "step": 19850 + }, + { + "epoch": 0.588648696735166, + "grad_norm": 0.12846417725086212, + "learning_rate": 0.00036889078025489306, + "loss": 2.6066, + "step": 19851 + }, + { + "epoch": 0.5886783500874774, + "grad_norm": 0.11565498262643814, + "learning_rate": 0.0003688453762562679, + "loss": 2.6381, + "step": 19852 + }, + { + "epoch": 0.5887080034397889, + "grad_norm": 0.11373595148324966, + "learning_rate": 0.00036879997341903955, + "loss": 2.606, + "step": 19853 + }, + { + "epoch": 0.5887376567921003, + "grad_norm": 0.10279493033885956, + "learning_rate": 0.00036875457174361, + "loss": 2.623, + "step": 19854 + }, + { + "epoch": 0.5887673101444119, + "grad_norm": 0.11970073729753494, + "learning_rate": 0.0003687091712303811, + "loss": 2.6244, + "step": 19855 + }, + { + "epoch": 0.5887969634967233, + "grad_norm": 0.09997230023145676, + "learning_rate": 0.0003686637718797551, + "loss": 2.6099, + "step": 19856 + }, + { + "epoch": 0.5888266168490348, + "grad_norm": 0.10310416668653488, + "learning_rate": 0.0003686183736921338, + "loss": 2.6279, + "step": 19857 + }, + { + "epoch": 0.5888562702013462, + "grad_norm": 0.09134674072265625, + "learning_rate": 0.00036857297666791945, + "loss": 2.6097, + "step": 19858 + }, + { + "epoch": 0.5888859235536578, + "grad_norm": 0.1099570244550705, + "learning_rate": 0.00036852758080751396, + "loss": 2.609, + "step": 19859 + }, + { + "epoch": 0.5889155769059692, + "grad_norm": 0.10217492282390594, + "learning_rate": 0.00036848218611131934, + "loss": 2.5899, + "step": 19860 + }, + { + "epoch": 0.5889452302582807, + "grad_norm": 0.11160147190093994, + "learning_rate": 0.0003684367925797375, + "loss": 2.6162, + "step": 19861 + }, + { + "epoch": 0.5889748836105921, + "grad_norm": 0.10241082310676575, + "learning_rate": 0.00036839140021317047, + "loss": 2.6424, + "step": 19862 + }, + { + "epoch": 0.5890045369629037, + "grad_norm": 0.12265292555093765, + "learning_rate": 0.0003683460090120202, + "loss": 2.6242, + "step": 19863 + }, + { + "epoch": 0.5890341903152151, + "grad_norm": 0.11785679310560226, + "learning_rate": 0.00036830061897668865, + "loss": 2.6359, + "step": 19864 + }, + { + "epoch": 0.5890638436675266, + "grad_norm": 0.11757393181324005, + "learning_rate": 0.0003682552301075777, + "loss": 2.6379, + "step": 19865 + }, + { + "epoch": 0.589093497019838, + "grad_norm": 0.12792709469795227, + "learning_rate": 0.00036820984240508925, + "loss": 2.6051, + "step": 19866 + }, + { + "epoch": 0.5891231503721496, + "grad_norm": 0.1371435821056366, + "learning_rate": 0.00036816445586962523, + "loss": 2.6421, + "step": 19867 + }, + { + "epoch": 0.5891528037244611, + "grad_norm": 0.10779719799757004, + "learning_rate": 0.00036811907050158767, + "loss": 2.6385, + "step": 19868 + }, + { + "epoch": 0.5891824570767725, + "grad_norm": 0.13023711740970612, + "learning_rate": 0.0003680736863013783, + "loss": 2.5996, + "step": 19869 + }, + { + "epoch": 0.5892121104290841, + "grad_norm": 0.1325693577528, + "learning_rate": 0.0003680283032693991, + "loss": 2.6368, + "step": 19870 + }, + { + "epoch": 0.5892417637813955, + "grad_norm": 0.11348091065883636, + "learning_rate": 0.00036798292140605187, + "loss": 2.6014, + "step": 19871 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 0.1242302730679512, + "learning_rate": 0.0003679375407117386, + "loss": 2.6393, + "step": 19872 + }, + { + "epoch": 0.5893010704860184, + "grad_norm": 0.12740296125411987, + "learning_rate": 0.0003678921611868611, + "loss": 2.614, + "step": 19873 + }, + { + "epoch": 0.58933072383833, + "grad_norm": 0.12241145968437195, + "learning_rate": 0.00036784678283182114, + "loss": 2.6048, + "step": 19874 + }, + { + "epoch": 0.5893603771906414, + "grad_norm": 0.1044728234410286, + "learning_rate": 0.0003678014056470208, + "loss": 2.6462, + "step": 19875 + }, + { + "epoch": 0.5893900305429529, + "grad_norm": 0.13524799048900604, + "learning_rate": 0.00036775602963286155, + "loss": 2.6072, + "step": 19876 + }, + { + "epoch": 0.5894196838952643, + "grad_norm": 0.11011974513530731, + "learning_rate": 0.0003677106547897453, + "loss": 2.6219, + "step": 19877 + }, + { + "epoch": 0.5894493372475759, + "grad_norm": 0.09698716551065445, + "learning_rate": 0.00036766528111807395, + "loss": 2.6339, + "step": 19878 + }, + { + "epoch": 0.5894789905998873, + "grad_norm": 0.11426956951618195, + "learning_rate": 0.0003676199086182493, + "loss": 2.634, + "step": 19879 + }, + { + "epoch": 0.5895086439521988, + "grad_norm": 0.10510832071304321, + "learning_rate": 0.000367574537290673, + "loss": 2.6172, + "step": 19880 + }, + { + "epoch": 0.5895382973045102, + "grad_norm": 0.1029675230383873, + "learning_rate": 0.0003675291671357471, + "loss": 2.5793, + "step": 19881 + }, + { + "epoch": 0.5895679506568218, + "grad_norm": 0.10515942424535751, + "learning_rate": 0.000367483798153873, + "loss": 2.6189, + "step": 19882 + }, + { + "epoch": 0.5895976040091332, + "grad_norm": 0.11191855370998383, + "learning_rate": 0.0003674384303454524, + "loss": 2.6057, + "step": 19883 + }, + { + "epoch": 0.5896272573614447, + "grad_norm": 0.10160309821367264, + "learning_rate": 0.0003673930637108874, + "loss": 2.6224, + "step": 19884 + }, + { + "epoch": 0.5896569107137561, + "grad_norm": 0.10099136829376221, + "learning_rate": 0.00036734769825057977, + "loss": 2.6177, + "step": 19885 + }, + { + "epoch": 0.5896865640660677, + "grad_norm": 0.09144961088895798, + "learning_rate": 0.000367302333964931, + "loss": 2.6093, + "step": 19886 + }, + { + "epoch": 0.5897162174183791, + "grad_norm": 0.10019724816083908, + "learning_rate": 0.0003672569708543427, + "loss": 2.5981, + "step": 19887 + }, + { + "epoch": 0.5897458707706906, + "grad_norm": 0.09527883678674698, + "learning_rate": 0.0003672116089192168, + "loss": 2.6115, + "step": 19888 + }, + { + "epoch": 0.5897755241230022, + "grad_norm": 0.11163464188575745, + "learning_rate": 0.0003671662481599549, + "loss": 2.6302, + "step": 19889 + }, + { + "epoch": 0.5898051774753136, + "grad_norm": 0.11123031377792358, + "learning_rate": 0.0003671208885769586, + "loss": 2.6, + "step": 19890 + }, + { + "epoch": 0.5898348308276251, + "grad_norm": 0.10655556619167328, + "learning_rate": 0.00036707553017062975, + "loss": 2.5942, + "step": 19891 + }, + { + "epoch": 0.5898644841799365, + "grad_norm": 0.11109302937984467, + "learning_rate": 0.00036703017294136984, + "loss": 2.6322, + "step": 19892 + }, + { + "epoch": 0.5898941375322481, + "grad_norm": 0.11255394667387009, + "learning_rate": 0.0003669848168895806, + "loss": 2.6211, + "step": 19893 + }, + { + "epoch": 0.5899237908845595, + "grad_norm": 0.11183320730924606, + "learning_rate": 0.0003669394620156636, + "loss": 2.5998, + "step": 19894 + }, + { + "epoch": 0.589953444236871, + "grad_norm": 0.10076941549777985, + "learning_rate": 0.0003668941083200206, + "loss": 2.5919, + "step": 19895 + }, + { + "epoch": 0.5899830975891824, + "grad_norm": 0.10218561440706253, + "learning_rate": 0.00036684875580305287, + "loss": 2.5991, + "step": 19896 + }, + { + "epoch": 0.590012750941494, + "grad_norm": 0.10473727434873581, + "learning_rate": 0.00036680340446516234, + "loss": 2.6393, + "step": 19897 + }, + { + "epoch": 0.5900424042938054, + "grad_norm": 0.10678178817033768, + "learning_rate": 0.0003667580543067507, + "loss": 2.632, + "step": 19898 + }, + { + "epoch": 0.5900720576461169, + "grad_norm": 0.10527342557907104, + "learning_rate": 0.0003667127053282192, + "loss": 2.6053, + "step": 19899 + }, + { + "epoch": 0.5901017109984283, + "grad_norm": 0.10291585326194763, + "learning_rate": 0.00036666735752996965, + "loss": 2.6105, + "step": 19900 + }, + { + "epoch": 0.5901313643507399, + "grad_norm": 0.11337046325206757, + "learning_rate": 0.00036662201091240356, + "loss": 2.6045, + "step": 19901 + }, + { + "epoch": 0.5901610177030513, + "grad_norm": 0.10140549391508102, + "learning_rate": 0.0003665766654759225, + "loss": 2.627, + "step": 19902 + }, + { + "epoch": 0.5901906710553628, + "grad_norm": 0.1096145510673523, + "learning_rate": 0.00036653132122092786, + "loss": 2.6359, + "step": 19903 + }, + { + "epoch": 0.5902203244076742, + "grad_norm": 0.12839214503765106, + "learning_rate": 0.0003664859781478213, + "loss": 2.5766, + "step": 19904 + }, + { + "epoch": 0.5902499777599858, + "grad_norm": 0.12100555747747421, + "learning_rate": 0.0003664406362570043, + "loss": 2.607, + "step": 19905 + }, + { + "epoch": 0.5902796311122972, + "grad_norm": 0.10797625780105591, + "learning_rate": 0.00036639529554887844, + "loss": 2.6375, + "step": 19906 + }, + { + "epoch": 0.5903092844646087, + "grad_norm": 0.09929971396923065, + "learning_rate": 0.00036634995602384513, + "loss": 2.6125, + "step": 19907 + }, + { + "epoch": 0.5903389378169202, + "grad_norm": 0.11114735901355743, + "learning_rate": 0.00036630461768230593, + "loss": 2.6037, + "step": 19908 + }, + { + "epoch": 0.5903685911692317, + "grad_norm": 0.11163192242383957, + "learning_rate": 0.00036625928052466217, + "loss": 2.6258, + "step": 19909 + }, + { + "epoch": 0.5903982445215432, + "grad_norm": 0.11413944512605667, + "learning_rate": 0.0003662139445513156, + "loss": 2.5977, + "step": 19910 + }, + { + "epoch": 0.5904278978738546, + "grad_norm": 0.11566907167434692, + "learning_rate": 0.00036616860976266744, + "loss": 2.602, + "step": 19911 + }, + { + "epoch": 0.5904575512261662, + "grad_norm": 0.10509707033634186, + "learning_rate": 0.0003661232761591192, + "loss": 2.6206, + "step": 19912 + }, + { + "epoch": 0.5904872045784776, + "grad_norm": 0.1268722116947174, + "learning_rate": 0.0003660779437410725, + "loss": 2.5842, + "step": 19913 + }, + { + "epoch": 0.5905168579307891, + "grad_norm": 0.11838100850582123, + "learning_rate": 0.0003660326125089284, + "loss": 2.6204, + "step": 19914 + }, + { + "epoch": 0.5905465112831005, + "grad_norm": 0.1185702309012413, + "learning_rate": 0.0003659872824630886, + "loss": 2.6149, + "step": 19915 + }, + { + "epoch": 0.5905761646354121, + "grad_norm": 0.10438406467437744, + "learning_rate": 0.00036594195360395437, + "loss": 2.6502, + "step": 19916 + }, + { + "epoch": 0.5906058179877235, + "grad_norm": 0.10461214184761047, + "learning_rate": 0.00036589662593192716, + "loss": 2.6629, + "step": 19917 + }, + { + "epoch": 0.590635471340035, + "grad_norm": 0.11863759160041809, + "learning_rate": 0.0003658512994474084, + "loss": 2.633, + "step": 19918 + }, + { + "epoch": 0.5906651246923464, + "grad_norm": 0.10639037936925888, + "learning_rate": 0.00036580597415079944, + "loss": 2.6138, + "step": 19919 + }, + { + "epoch": 0.590694778044658, + "grad_norm": 0.09467655420303345, + "learning_rate": 0.00036576065004250156, + "loss": 2.6677, + "step": 19920 + }, + { + "epoch": 0.5907244313969694, + "grad_norm": 0.09850417822599411, + "learning_rate": 0.0003657153271229161, + "loss": 2.6223, + "step": 19921 + }, + { + "epoch": 0.5907540847492809, + "grad_norm": 0.10285031795501709, + "learning_rate": 0.0003656700053924443, + "loss": 2.6192, + "step": 19922 + }, + { + "epoch": 0.5907837381015923, + "grad_norm": 0.10084105283021927, + "learning_rate": 0.000365624684851488, + "loss": 2.5821, + "step": 19923 + }, + { + "epoch": 0.5908133914539039, + "grad_norm": 0.11128321290016174, + "learning_rate": 0.00036557936550044804, + "loss": 2.612, + "step": 19924 + }, + { + "epoch": 0.5908430448062153, + "grad_norm": 0.10595883429050446, + "learning_rate": 0.0003655340473397259, + "loss": 2.6035, + "step": 19925 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 0.11567274481058121, + "learning_rate": 0.00036548873036972284, + "loss": 2.5974, + "step": 19926 + }, + { + "epoch": 0.5909023515108383, + "grad_norm": 0.12079409509897232, + "learning_rate": 0.00036544341459084014, + "loss": 2.6153, + "step": 19927 + }, + { + "epoch": 0.5909320048631498, + "grad_norm": 0.11611691117286682, + "learning_rate": 0.0003653981000034791, + "loss": 2.6107, + "step": 19928 + }, + { + "epoch": 0.5909616582154612, + "grad_norm": 0.10448495298624039, + "learning_rate": 0.00036535278660804107, + "loss": 2.6208, + "step": 19929 + }, + { + "epoch": 0.5909913115677727, + "grad_norm": 0.11240218579769135, + "learning_rate": 0.00036530747440492717, + "loss": 2.6236, + "step": 19930 + }, + { + "epoch": 0.5910209649200843, + "grad_norm": 0.12053590267896652, + "learning_rate": 0.00036526216339453877, + "loss": 2.6034, + "step": 19931 + }, + { + "epoch": 0.5910506182723957, + "grad_norm": 0.10520169138908386, + "learning_rate": 0.00036521685357727697, + "loss": 2.6218, + "step": 19932 + }, + { + "epoch": 0.5910802716247072, + "grad_norm": 0.1115560382604599, + "learning_rate": 0.000365171544953543, + "loss": 2.6505, + "step": 19933 + }, + { + "epoch": 0.5911099249770186, + "grad_norm": 0.12203402072191238, + "learning_rate": 0.0003651262375237382, + "loss": 2.6113, + "step": 19934 + }, + { + "epoch": 0.5911395783293302, + "grad_norm": 0.1214444562792778, + "learning_rate": 0.0003650809312882636, + "loss": 2.6566, + "step": 19935 + }, + { + "epoch": 0.5911692316816416, + "grad_norm": 0.11674392968416214, + "learning_rate": 0.00036503562624752063, + "loss": 2.6516, + "step": 19936 + }, + { + "epoch": 0.5911988850339531, + "grad_norm": 0.09843660145998001, + "learning_rate": 0.0003649903224019104, + "loss": 2.6127, + "step": 19937 + }, + { + "epoch": 0.5912285383862645, + "grad_norm": 0.11128780245780945, + "learning_rate": 0.00036494501975183405, + "loss": 2.5714, + "step": 19938 + }, + { + "epoch": 0.5912581917385761, + "grad_norm": 0.1059461161494255, + "learning_rate": 0.00036489971829769266, + "loss": 2.6119, + "step": 19939 + }, + { + "epoch": 0.5912878450908875, + "grad_norm": 0.0979057103395462, + "learning_rate": 0.0003648544180398875, + "loss": 2.6116, + "step": 19940 + }, + { + "epoch": 0.591317498443199, + "grad_norm": 0.0975956991314888, + "learning_rate": 0.0003648091189788197, + "loss": 2.6141, + "step": 19941 + }, + { + "epoch": 0.5913471517955105, + "grad_norm": 0.11247634887695312, + "learning_rate": 0.00036476382111489026, + "loss": 2.6347, + "step": 19942 + }, + { + "epoch": 0.591376805147822, + "grad_norm": 0.10376597940921783, + "learning_rate": 0.00036471852444850046, + "loss": 2.5813, + "step": 19943 + }, + { + "epoch": 0.5914064585001334, + "grad_norm": 0.10092601180076599, + "learning_rate": 0.00036467322898005127, + "loss": 2.5924, + "step": 19944 + }, + { + "epoch": 0.5914361118524449, + "grad_norm": 0.10282884538173676, + "learning_rate": 0.00036462793470994396, + "loss": 2.6238, + "step": 19945 + }, + { + "epoch": 0.5914657652047564, + "grad_norm": 0.11662239581346512, + "learning_rate": 0.00036458264163857947, + "loss": 2.6351, + "step": 19946 + }, + { + "epoch": 0.5914954185570679, + "grad_norm": 0.11053793132305145, + "learning_rate": 0.00036453734976635906, + "loss": 2.6489, + "step": 19947 + }, + { + "epoch": 0.5915250719093793, + "grad_norm": 0.10832126438617706, + "learning_rate": 0.00036449205909368335, + "loss": 2.6024, + "step": 19948 + }, + { + "epoch": 0.5915547252616908, + "grad_norm": 0.09597140550613403, + "learning_rate": 0.0003644467696209539, + "loss": 2.6627, + "step": 19949 + }, + { + "epoch": 0.5915843786140023, + "grad_norm": 0.11135484278202057, + "learning_rate": 0.0003644014813485716, + "loss": 2.6302, + "step": 19950 + }, + { + "epoch": 0.5916140319663138, + "grad_norm": 0.1112963929772377, + "learning_rate": 0.00036435619427693756, + "loss": 2.6285, + "step": 19951 + }, + { + "epoch": 0.5916436853186253, + "grad_norm": 0.09801704436540604, + "learning_rate": 0.0003643109084064526, + "loss": 2.6098, + "step": 19952 + }, + { + "epoch": 0.5916733386709367, + "grad_norm": 0.09903930127620697, + "learning_rate": 0.0003642656237375178, + "loss": 2.6626, + "step": 19953 + }, + { + "epoch": 0.5917029920232483, + "grad_norm": 0.10427249222993851, + "learning_rate": 0.00036422034027053425, + "loss": 2.6261, + "step": 19954 + }, + { + "epoch": 0.5917326453755597, + "grad_norm": 0.10501739382743835, + "learning_rate": 0.00036417505800590287, + "loss": 2.6101, + "step": 19955 + }, + { + "epoch": 0.5917622987278712, + "grad_norm": 0.115354984998703, + "learning_rate": 0.00036412977694402467, + "loss": 2.6068, + "step": 19956 + }, + { + "epoch": 0.5917919520801826, + "grad_norm": 0.1004161387681961, + "learning_rate": 0.0003640844970853007, + "loss": 2.5912, + "step": 19957 + }, + { + "epoch": 0.5918216054324942, + "grad_norm": 0.11333966255187988, + "learning_rate": 0.00036403921843013176, + "loss": 2.6277, + "step": 19958 + }, + { + "epoch": 0.5918512587848056, + "grad_norm": 0.13230189681053162, + "learning_rate": 0.00036399394097891887, + "loss": 2.6507, + "step": 19959 + }, + { + "epoch": 0.5918809121371171, + "grad_norm": 0.13591140508651733, + "learning_rate": 0.000363948664732063, + "loss": 2.6201, + "step": 19960 + }, + { + "epoch": 0.5919105654894286, + "grad_norm": 0.1149546429514885, + "learning_rate": 0.00036390338968996487, + "loss": 2.6402, + "step": 19961 + }, + { + "epoch": 0.5919402188417401, + "grad_norm": 0.11532159894704819, + "learning_rate": 0.00036385811585302574, + "loss": 2.6344, + "step": 19962 + }, + { + "epoch": 0.5919698721940515, + "grad_norm": 0.12600652873516083, + "learning_rate": 0.0003638128432216464, + "loss": 2.6193, + "step": 19963 + }, + { + "epoch": 0.591999525546363, + "grad_norm": 0.13636860251426697, + "learning_rate": 0.00036376757179622764, + "loss": 2.6018, + "step": 19964 + }, + { + "epoch": 0.5920291788986745, + "grad_norm": 0.11109580099582672, + "learning_rate": 0.0003637223015771705, + "loss": 2.6068, + "step": 19965 + }, + { + "epoch": 0.592058832250986, + "grad_norm": 0.13950085639953613, + "learning_rate": 0.00036367703256487573, + "loss": 2.594, + "step": 19966 + }, + { + "epoch": 0.5920884856032974, + "grad_norm": 0.14930589497089386, + "learning_rate": 0.00036363176475974425, + "loss": 2.5945, + "step": 19967 + }, + { + "epoch": 0.5921181389556089, + "grad_norm": 0.130042165517807, + "learning_rate": 0.000363586498162177, + "loss": 2.6005, + "step": 19968 + }, + { + "epoch": 0.5921477923079204, + "grad_norm": 0.10109405219554901, + "learning_rate": 0.00036354123277257454, + "loss": 2.6037, + "step": 19969 + }, + { + "epoch": 0.5921774456602319, + "grad_norm": 0.14209714531898499, + "learning_rate": 0.000363495968591338, + "loss": 2.6382, + "step": 19970 + }, + { + "epoch": 0.5922070990125433, + "grad_norm": 0.14315980672836304, + "learning_rate": 0.00036345070561886805, + "loss": 2.6031, + "step": 19971 + }, + { + "epoch": 0.5922367523648548, + "grad_norm": 0.11256355792284012, + "learning_rate": 0.0003634054438555655, + "loss": 2.6048, + "step": 19972 + }, + { + "epoch": 0.5922664057171664, + "grad_norm": 0.14446206390857697, + "learning_rate": 0.0003633601833018313, + "loss": 2.6051, + "step": 19973 + }, + { + "epoch": 0.5922960590694778, + "grad_norm": 0.1302586942911148, + "learning_rate": 0.0003633149239580659, + "loss": 2.5736, + "step": 19974 + }, + { + "epoch": 0.5923257124217893, + "grad_norm": 0.12681911885738373, + "learning_rate": 0.00036326966582467046, + "loss": 2.6025, + "step": 19975 + }, + { + "epoch": 0.5923553657741008, + "grad_norm": 0.14482006430625916, + "learning_rate": 0.0003632244089020457, + "loss": 2.5927, + "step": 19976 + }, + { + "epoch": 0.5923850191264123, + "grad_norm": 0.12446438521146774, + "learning_rate": 0.00036317915319059214, + "loss": 2.6071, + "step": 19977 + }, + { + "epoch": 0.5924146724787237, + "grad_norm": 0.12675629556179047, + "learning_rate": 0.0003631338986907108, + "loss": 2.5974, + "step": 19978 + }, + { + "epoch": 0.5924443258310352, + "grad_norm": 0.1363259255886078, + "learning_rate": 0.0003630886454028022, + "loss": 2.6284, + "step": 19979 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 0.11146333813667297, + "learning_rate": 0.0003630433933272671, + "loss": 2.61, + "step": 19980 + }, + { + "epoch": 0.5925036325356582, + "grad_norm": 0.12261951714754105, + "learning_rate": 0.00036299814246450624, + "loss": 2.6333, + "step": 19981 + }, + { + "epoch": 0.5925332858879696, + "grad_norm": 0.11500851064920425, + "learning_rate": 0.00036295289281492045, + "loss": 2.5909, + "step": 19982 + }, + { + "epoch": 0.5925629392402811, + "grad_norm": 0.11643687635660172, + "learning_rate": 0.00036290764437891024, + "loss": 2.5714, + "step": 19983 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.12634141743183136, + "learning_rate": 0.00036286239715687643, + "loss": 2.5929, + "step": 19984 + }, + { + "epoch": 0.5926222459449041, + "grad_norm": 0.10282941907644272, + "learning_rate": 0.00036281715114921964, + "loss": 2.5606, + "step": 19985 + }, + { + "epoch": 0.5926518992972155, + "grad_norm": 0.10302963852882385, + "learning_rate": 0.00036277190635634025, + "loss": 2.5983, + "step": 19986 + }, + { + "epoch": 0.592681552649527, + "grad_norm": 0.10475943237543106, + "learning_rate": 0.0003627266627786395, + "loss": 2.632, + "step": 19987 + }, + { + "epoch": 0.5927112060018385, + "grad_norm": 0.09630683064460754, + "learning_rate": 0.0003626814204165176, + "loss": 2.6097, + "step": 19988 + }, + { + "epoch": 0.59274085935415, + "grad_norm": 0.10402341932058334, + "learning_rate": 0.0003626361792703754, + "loss": 2.6517, + "step": 19989 + }, + { + "epoch": 0.5927705127064614, + "grad_norm": 0.11110363900661469, + "learning_rate": 0.00036259093934061336, + "loss": 2.6553, + "step": 19990 + }, + { + "epoch": 0.592800166058773, + "grad_norm": 0.10972168296575546, + "learning_rate": 0.0003625457006276321, + "loss": 2.6442, + "step": 19991 + }, + { + "epoch": 0.5928298194110844, + "grad_norm": 0.11094903200864792, + "learning_rate": 0.0003625004631318323, + "loss": 2.63, + "step": 19992 + }, + { + "epoch": 0.5928594727633959, + "grad_norm": 0.1075839102268219, + "learning_rate": 0.0003624552268536144, + "loss": 2.5936, + "step": 19993 + }, + { + "epoch": 0.5928891261157074, + "grad_norm": 0.10728536546230316, + "learning_rate": 0.0003624099917933792, + "loss": 2.6376, + "step": 19994 + }, + { + "epoch": 0.5929187794680189, + "grad_norm": 0.11849202960729599, + "learning_rate": 0.0003623647579515271, + "loss": 2.633, + "step": 19995 + }, + { + "epoch": 0.5929484328203304, + "grad_norm": 0.10982780158519745, + "learning_rate": 0.0003623195253284587, + "loss": 2.641, + "step": 19996 + }, + { + "epoch": 0.5929780861726418, + "grad_norm": 0.0998528003692627, + "learning_rate": 0.00036227429392457456, + "loss": 2.6452, + "step": 19997 + }, + { + "epoch": 0.5930077395249533, + "grad_norm": 0.11089254170656204, + "learning_rate": 0.0003622290637402751, + "loss": 2.6222, + "step": 19998 + }, + { + "epoch": 0.5930373928772648, + "grad_norm": 0.10828027874231339, + "learning_rate": 0.00036218383477596084, + "loss": 2.5983, + "step": 19999 + }, + { + "epoch": 0.5930670462295763, + "grad_norm": 0.11283642798662186, + "learning_rate": 0.0003621386070320325, + "loss": 2.621, + "step": 20000 + }, + { + "epoch": 0.5930966995818877, + "grad_norm": 0.10148791968822479, + "learning_rate": 0.00036209338050889053, + "loss": 2.6057, + "step": 20001 + }, + { + "epoch": 0.5931263529341992, + "grad_norm": 0.10183215886354446, + "learning_rate": 0.00036204815520693526, + "loss": 2.5704, + "step": 20002 + }, + { + "epoch": 0.5931560062865107, + "grad_norm": 0.09759315103292465, + "learning_rate": 0.00036200293112656723, + "loss": 2.621, + "step": 20003 + }, + { + "epoch": 0.5931856596388222, + "grad_norm": 0.11791057884693146, + "learning_rate": 0.000361957708268187, + "loss": 2.6037, + "step": 20004 + }, + { + "epoch": 0.5932153129911336, + "grad_norm": 0.09244978427886963, + "learning_rate": 0.00036191248663219487, + "loss": 2.5977, + "step": 20005 + }, + { + "epoch": 0.5932449663434451, + "grad_norm": 0.11002001166343689, + "learning_rate": 0.00036186726621899155, + "loss": 2.6722, + "step": 20006 + }, + { + "epoch": 0.5932746196957566, + "grad_norm": 0.11295267939567566, + "learning_rate": 0.00036182204702897726, + "loss": 2.6231, + "step": 20007 + }, + { + "epoch": 0.5933042730480681, + "grad_norm": 0.10718488693237305, + "learning_rate": 0.0003617768290625524, + "loss": 2.6466, + "step": 20008 + }, + { + "epoch": 0.5933339264003795, + "grad_norm": 0.11926662176847458, + "learning_rate": 0.0003617316123201174, + "loss": 2.6007, + "step": 20009 + }, + { + "epoch": 0.593363579752691, + "grad_norm": 0.10332730412483215, + "learning_rate": 0.0003616863968020727, + "loss": 2.6464, + "step": 20010 + }, + { + "epoch": 0.5933932331050025, + "grad_norm": 0.1063508540391922, + "learning_rate": 0.00036164118250881877, + "loss": 2.6128, + "step": 20011 + }, + { + "epoch": 0.593422886457314, + "grad_norm": 0.12712830305099487, + "learning_rate": 0.0003615959694407558, + "loss": 2.6552, + "step": 20012 + }, + { + "epoch": 0.5934525398096254, + "grad_norm": 0.11610905826091766, + "learning_rate": 0.0003615507575982843, + "loss": 2.5989, + "step": 20013 + }, + { + "epoch": 0.593482193161937, + "grad_norm": 0.09862079471349716, + "learning_rate": 0.0003615055469818047, + "loss": 2.6036, + "step": 20014 + }, + { + "epoch": 0.5935118465142485, + "grad_norm": 0.11092562973499298, + "learning_rate": 0.0003614603375917172, + "loss": 2.6, + "step": 20015 + }, + { + "epoch": 0.5935414998665599, + "grad_norm": 0.09399966150522232, + "learning_rate": 0.0003614151294284224, + "loss": 2.6114, + "step": 20016 + }, + { + "epoch": 0.5935711532188714, + "grad_norm": 0.110240139067173, + "learning_rate": 0.00036136992249232016, + "loss": 2.6339, + "step": 20017 + }, + { + "epoch": 0.5936008065711829, + "grad_norm": 0.09733723104000092, + "learning_rate": 0.0003613247167838111, + "loss": 2.6032, + "step": 20018 + }, + { + "epoch": 0.5936304599234944, + "grad_norm": 0.10639970004558563, + "learning_rate": 0.0003612795123032955, + "loss": 2.5962, + "step": 20019 + }, + { + "epoch": 0.5936601132758058, + "grad_norm": 0.10937804728746414, + "learning_rate": 0.0003612343090511736, + "loss": 2.6436, + "step": 20020 + }, + { + "epoch": 0.5936897666281173, + "grad_norm": 0.10891273617744446, + "learning_rate": 0.0003611891070278458, + "loss": 2.627, + "step": 20021 + }, + { + "epoch": 0.5937194199804288, + "grad_norm": 0.10606015473604202, + "learning_rate": 0.00036114390623371217, + "loss": 2.5928, + "step": 20022 + }, + { + "epoch": 0.5937490733327403, + "grad_norm": 0.10796914994716644, + "learning_rate": 0.0003610987066691733, + "loss": 2.611, + "step": 20023 + }, + { + "epoch": 0.5937787266850517, + "grad_norm": 0.09352808445692062, + "learning_rate": 0.000361053508334629, + "loss": 2.6746, + "step": 20024 + }, + { + "epoch": 0.5938083800373632, + "grad_norm": 0.1076599583029747, + "learning_rate": 0.0003610083112304796, + "loss": 2.6383, + "step": 20025 + }, + { + "epoch": 0.5938380333896747, + "grad_norm": 0.12031245976686478, + "learning_rate": 0.00036096311535712566, + "loss": 2.6284, + "step": 20026 + }, + { + "epoch": 0.5938676867419862, + "grad_norm": 0.11477815359830856, + "learning_rate": 0.0003609179207149673, + "loss": 2.6119, + "step": 20027 + }, + { + "epoch": 0.5938973400942976, + "grad_norm": 0.10350130498409271, + "learning_rate": 0.0003608727273044045, + "loss": 2.5769, + "step": 20028 + }, + { + "epoch": 0.5939269934466092, + "grad_norm": 0.10430917888879776, + "learning_rate": 0.0003608275351258376, + "loss": 2.577, + "step": 20029 + }, + { + "epoch": 0.5939566467989206, + "grad_norm": 0.109938845038414, + "learning_rate": 0.0003607823441796668, + "loss": 2.5899, + "step": 20030 + }, + { + "epoch": 0.5939863001512321, + "grad_norm": 0.10367769002914429, + "learning_rate": 0.00036073715446629216, + "loss": 2.5596, + "step": 20031 + }, + { + "epoch": 0.5940159535035435, + "grad_norm": 0.10236241668462753, + "learning_rate": 0.00036069196598611397, + "loss": 2.6265, + "step": 20032 + }, + { + "epoch": 0.5940456068558551, + "grad_norm": 0.09629673510789871, + "learning_rate": 0.00036064677873953244, + "loss": 2.6089, + "step": 20033 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 0.10360430926084518, + "learning_rate": 0.0003606015927269475, + "loss": 2.6178, + "step": 20034 + }, + { + "epoch": 0.594104913560478, + "grad_norm": 0.0964890718460083, + "learning_rate": 0.0003605564079487594, + "loss": 2.631, + "step": 20035 + }, + { + "epoch": 0.5941345669127895, + "grad_norm": 0.11703410744667053, + "learning_rate": 0.0003605112244053682, + "loss": 2.6225, + "step": 20036 + }, + { + "epoch": 0.594164220265101, + "grad_norm": 0.10871231555938721, + "learning_rate": 0.00036046604209717404, + "loss": 2.6454, + "step": 20037 + }, + { + "epoch": 0.5941938736174125, + "grad_norm": 0.10119886696338654, + "learning_rate": 0.00036042086102457696, + "loss": 2.6292, + "step": 20038 + }, + { + "epoch": 0.5942235269697239, + "grad_norm": 0.12350741773843765, + "learning_rate": 0.00036037568118797725, + "loss": 2.6466, + "step": 20039 + }, + { + "epoch": 0.5942531803220354, + "grad_norm": 0.11010244488716125, + "learning_rate": 0.0003603305025877748, + "loss": 2.6006, + "step": 20040 + }, + { + "epoch": 0.5942828336743469, + "grad_norm": 0.12162510305643082, + "learning_rate": 0.00036028532522436974, + "loss": 2.6424, + "step": 20041 + }, + { + "epoch": 0.5943124870266584, + "grad_norm": 0.13271452486515045, + "learning_rate": 0.00036024014909816205, + "loss": 2.6225, + "step": 20042 + }, + { + "epoch": 0.5943421403789698, + "grad_norm": 0.12296295166015625, + "learning_rate": 0.00036019497420955194, + "loss": 2.6675, + "step": 20043 + }, + { + "epoch": 0.5943717937312814, + "grad_norm": 0.1160174086689949, + "learning_rate": 0.00036014980055893933, + "loss": 2.6343, + "step": 20044 + }, + { + "epoch": 0.5944014470835928, + "grad_norm": 0.11068824678659439, + "learning_rate": 0.0003601046281467242, + "loss": 2.6085, + "step": 20045 + }, + { + "epoch": 0.5944311004359043, + "grad_norm": 0.11866990476846695, + "learning_rate": 0.00036005945697330656, + "loss": 2.6073, + "step": 20046 + }, + { + "epoch": 0.5944607537882157, + "grad_norm": 0.12058387696743011, + "learning_rate": 0.00036001428703908643, + "loss": 2.6326, + "step": 20047 + }, + { + "epoch": 0.5944904071405273, + "grad_norm": 0.11127527803182602, + "learning_rate": 0.00035996911834446386, + "loss": 2.607, + "step": 20048 + }, + { + "epoch": 0.5945200604928387, + "grad_norm": 0.10504256933927536, + "learning_rate": 0.0003599239508898389, + "loss": 2.6272, + "step": 20049 + }, + { + "epoch": 0.5945497138451502, + "grad_norm": 0.12123250216245651, + "learning_rate": 0.0003598787846756113, + "loss": 2.5944, + "step": 20050 + }, + { + "epoch": 0.5945793671974616, + "grad_norm": 0.10733252018690109, + "learning_rate": 0.0003598336197021809, + "loss": 2.6274, + "step": 20051 + }, + { + "epoch": 0.5946090205497732, + "grad_norm": 0.09089724719524384, + "learning_rate": 0.00035978845596994804, + "loss": 2.5996, + "step": 20052 + }, + { + "epoch": 0.5946386739020846, + "grad_norm": 0.10390035808086395, + "learning_rate": 0.00035974329347931245, + "loss": 2.6111, + "step": 20053 + }, + { + "epoch": 0.5946683272543961, + "grad_norm": 0.10837104171514511, + "learning_rate": 0.00035969813223067426, + "loss": 2.623, + "step": 20054 + }, + { + "epoch": 0.5946979806067076, + "grad_norm": 0.11454355716705322, + "learning_rate": 0.000359652972224433, + "loss": 2.6038, + "step": 20055 + }, + { + "epoch": 0.5947276339590191, + "grad_norm": 0.10572551190853119, + "learning_rate": 0.0003596078134609888, + "loss": 2.6319, + "step": 20056 + }, + { + "epoch": 0.5947572873113306, + "grad_norm": 0.09830957651138306, + "learning_rate": 0.0003595626559407415, + "loss": 2.611, + "step": 20057 + }, + { + "epoch": 0.594786940663642, + "grad_norm": 0.10507969558238983, + "learning_rate": 0.000359517499664091, + "loss": 2.6096, + "step": 20058 + }, + { + "epoch": 0.5948165940159535, + "grad_norm": 0.0944993868470192, + "learning_rate": 0.0003594723446314371, + "loss": 2.6541, + "step": 20059 + }, + { + "epoch": 0.594846247368265, + "grad_norm": 0.10562368482351303, + "learning_rate": 0.00035942719084317975, + "loss": 2.6544, + "step": 20060 + }, + { + "epoch": 0.5948759007205765, + "grad_norm": 0.09411341696977615, + "learning_rate": 0.0003593820382997189, + "loss": 2.6214, + "step": 20061 + }, + { + "epoch": 0.5949055540728879, + "grad_norm": 0.10386276245117188, + "learning_rate": 0.00035933688700145403, + "loss": 2.6056, + "step": 20062 + }, + { + "epoch": 0.5949352074251995, + "grad_norm": 0.10195671021938324, + "learning_rate": 0.00035929173694878533, + "loss": 2.6551, + "step": 20063 + }, + { + "epoch": 0.5949648607775109, + "grad_norm": 0.09794962406158447, + "learning_rate": 0.0003592465881421121, + "loss": 2.5679, + "step": 20064 + }, + { + "epoch": 0.5949945141298224, + "grad_norm": 0.09870800375938416, + "learning_rate": 0.0003592014405818349, + "loss": 2.6093, + "step": 20065 + }, + { + "epoch": 0.5950241674821338, + "grad_norm": 0.09902434051036835, + "learning_rate": 0.0003591562942683529, + "loss": 2.6066, + "step": 20066 + }, + { + "epoch": 0.5950538208344454, + "grad_norm": 0.11137599498033524, + "learning_rate": 0.0003591111492020662, + "loss": 2.6129, + "step": 20067 + }, + { + "epoch": 0.5950834741867568, + "grad_norm": 0.11131121218204498, + "learning_rate": 0.0003590660053833744, + "loss": 2.6125, + "step": 20068 + }, + { + "epoch": 0.5951131275390683, + "grad_norm": 0.10605242848396301, + "learning_rate": 0.0003590208628126773, + "loss": 2.6063, + "step": 20069 + }, + { + "epoch": 0.5951427808913797, + "grad_norm": 0.11285588145256042, + "learning_rate": 0.0003589757214903747, + "loss": 2.6155, + "step": 20070 + }, + { + "epoch": 0.5951724342436913, + "grad_norm": 0.11798655241727829, + "learning_rate": 0.00035893058141686635, + "loss": 2.6323, + "step": 20071 + }, + { + "epoch": 0.5952020875960027, + "grad_norm": 0.12331399321556091, + "learning_rate": 0.00035888544259255183, + "loss": 2.6167, + "step": 20072 + }, + { + "epoch": 0.5952317409483142, + "grad_norm": 0.11264367401599884, + "learning_rate": 0.00035884030501783095, + "loss": 2.6274, + "step": 20073 + }, + { + "epoch": 0.5952613943006256, + "grad_norm": 0.10887392610311508, + "learning_rate": 0.0003587951686931034, + "loss": 2.6153, + "step": 20074 + }, + { + "epoch": 0.5952910476529372, + "grad_norm": 0.10979923605918884, + "learning_rate": 0.0003587500336187689, + "loss": 2.6256, + "step": 20075 + }, + { + "epoch": 0.5953207010052487, + "grad_norm": 0.11377105116844177, + "learning_rate": 0.00035870489979522704, + "loss": 2.6206, + "step": 20076 + }, + { + "epoch": 0.5953503543575601, + "grad_norm": 0.10283311456441879, + "learning_rate": 0.0003586597672228774, + "loss": 2.6089, + "step": 20077 + }, + { + "epoch": 0.5953800077098717, + "grad_norm": 0.11902156472206116, + "learning_rate": 0.00035861463590211996, + "loss": 2.6338, + "step": 20078 + }, + { + "epoch": 0.5954096610621831, + "grad_norm": 0.09704890102148056, + "learning_rate": 0.00035856950583335425, + "loss": 2.5937, + "step": 20079 + }, + { + "epoch": 0.5954393144144946, + "grad_norm": 0.0992179661989212, + "learning_rate": 0.0003585243770169797, + "loss": 2.6164, + "step": 20080 + }, + { + "epoch": 0.595468967766806, + "grad_norm": 0.11895658075809479, + "learning_rate": 0.0003584792494533962, + "loss": 2.6262, + "step": 20081 + }, + { + "epoch": 0.5954986211191176, + "grad_norm": 0.11848364770412445, + "learning_rate": 0.00035843412314300326, + "loss": 2.5707, + "step": 20082 + }, + { + "epoch": 0.595528274471429, + "grad_norm": 0.12362619489431381, + "learning_rate": 0.0003583889980862004, + "loss": 2.5754, + "step": 20083 + }, + { + "epoch": 0.5955579278237405, + "grad_norm": 0.11671378463506699, + "learning_rate": 0.00035834387428338723, + "loss": 2.5986, + "step": 20084 + }, + { + "epoch": 0.5955875811760519, + "grad_norm": 0.10159695893526077, + "learning_rate": 0.0003582987517349634, + "loss": 2.6159, + "step": 20085 + }, + { + "epoch": 0.5956172345283635, + "grad_norm": 0.10914461314678192, + "learning_rate": 0.00035825363044132843, + "loss": 2.6334, + "step": 20086 + }, + { + "epoch": 0.5956468878806749, + "grad_norm": 0.11368834972381592, + "learning_rate": 0.00035820851040288185, + "loss": 2.6567, + "step": 20087 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 0.11641035228967667, + "learning_rate": 0.0003581633916200234, + "loss": 2.6472, + "step": 20088 + }, + { + "epoch": 0.5957061945852978, + "grad_norm": 0.10604216903448105, + "learning_rate": 0.00035811827409315235, + "loss": 2.6223, + "step": 20089 + }, + { + "epoch": 0.5957358479376094, + "grad_norm": 0.11784099787473679, + "learning_rate": 0.0003580731578226681, + "loss": 2.5893, + "step": 20090 + }, + { + "epoch": 0.5957655012899208, + "grad_norm": 0.12294670194387436, + "learning_rate": 0.0003580280428089707, + "loss": 2.6227, + "step": 20091 + }, + { + "epoch": 0.5957951546422323, + "grad_norm": 0.10433758795261383, + "learning_rate": 0.0003579829290524594, + "loss": 2.6314, + "step": 20092 + }, + { + "epoch": 0.5958248079945437, + "grad_norm": 0.10751684755086899, + "learning_rate": 0.0003579378165535335, + "loss": 2.6073, + "step": 20093 + }, + { + "epoch": 0.5958544613468553, + "grad_norm": 0.13184675574302673, + "learning_rate": 0.0003578927053125927, + "loss": 2.5924, + "step": 20094 + }, + { + "epoch": 0.5958841146991667, + "grad_norm": 0.13095711171627045, + "learning_rate": 0.0003578475953300363, + "loss": 2.6022, + "step": 20095 + }, + { + "epoch": 0.5959137680514782, + "grad_norm": 0.10423921793699265, + "learning_rate": 0.0003578024866062639, + "loss": 2.574, + "step": 20096 + }, + { + "epoch": 0.5959434214037898, + "grad_norm": 0.12329193204641342, + "learning_rate": 0.0003577573791416748, + "loss": 2.6192, + "step": 20097 + }, + { + "epoch": 0.5959730747561012, + "grad_norm": 0.13010451197624207, + "learning_rate": 0.00035771227293666865, + "loss": 2.5741, + "step": 20098 + }, + { + "epoch": 0.5960027281084127, + "grad_norm": 0.12193018943071365, + "learning_rate": 0.0003576671679916448, + "loss": 2.6108, + "step": 20099 + }, + { + "epoch": 0.5960323814607241, + "grad_norm": 0.12085767090320587, + "learning_rate": 0.0003576220643070025, + "loss": 2.5857, + "step": 20100 + }, + { + "epoch": 0.5960620348130357, + "grad_norm": 0.11807404458522797, + "learning_rate": 0.00035757696188314125, + "loss": 2.622, + "step": 20101 + }, + { + "epoch": 0.5960916881653471, + "grad_norm": 0.1303907334804535, + "learning_rate": 0.0003575318607204605, + "loss": 2.6808, + "step": 20102 + }, + { + "epoch": 0.5961213415176586, + "grad_norm": 0.11265245825052261, + "learning_rate": 0.0003574867608193594, + "loss": 2.6144, + "step": 20103 + }, + { + "epoch": 0.59615099486997, + "grad_norm": 0.12429724633693695, + "learning_rate": 0.0003574416621802377, + "loss": 2.5768, + "step": 20104 + }, + { + "epoch": 0.5961806482222816, + "grad_norm": 0.12152960896492004, + "learning_rate": 0.0003573965648034944, + "loss": 2.6199, + "step": 20105 + }, + { + "epoch": 0.596210301574593, + "grad_norm": 0.1243218258023262, + "learning_rate": 0.00035735146868952914, + "loss": 2.6188, + "step": 20106 + }, + { + "epoch": 0.5962399549269045, + "grad_norm": 0.1259390264749527, + "learning_rate": 0.000357306373838741, + "loss": 2.6178, + "step": 20107 + }, + { + "epoch": 0.5962696082792159, + "grad_norm": 0.11970643699169159, + "learning_rate": 0.0003572612802515295, + "loss": 2.6357, + "step": 20108 + }, + { + "epoch": 0.5962992616315275, + "grad_norm": 0.1022610142827034, + "learning_rate": 0.0003572161879282939, + "loss": 2.6063, + "step": 20109 + }, + { + "epoch": 0.5963289149838389, + "grad_norm": 0.11936508119106293, + "learning_rate": 0.0003571710968694334, + "loss": 2.5911, + "step": 20110 + }, + { + "epoch": 0.5963585683361504, + "grad_norm": 0.09587250649929047, + "learning_rate": 0.00035712600707534734, + "loss": 2.5908, + "step": 20111 + }, + { + "epoch": 0.5963882216884618, + "grad_norm": 0.1171109527349472, + "learning_rate": 0.000357080918546435, + "loss": 2.6486, + "step": 20112 + }, + { + "epoch": 0.5964178750407734, + "grad_norm": 0.10374563187360764, + "learning_rate": 0.0003570358312830957, + "loss": 2.6083, + "step": 20113 + }, + { + "epoch": 0.5964475283930848, + "grad_norm": 0.10315801203250885, + "learning_rate": 0.0003569907452857286, + "loss": 2.6122, + "step": 20114 + }, + { + "epoch": 0.5964771817453963, + "grad_norm": 0.1142592653632164, + "learning_rate": 0.000356945660554733, + "loss": 2.623, + "step": 20115 + }, + { + "epoch": 0.5965068350977077, + "grad_norm": 0.10782502591609955, + "learning_rate": 0.00035690057709050803, + "loss": 2.6308, + "step": 20116 + }, + { + "epoch": 0.5965364884500193, + "grad_norm": 0.11246603727340698, + "learning_rate": 0.00035685549489345315, + "loss": 2.6348, + "step": 20117 + }, + { + "epoch": 0.5965661418023308, + "grad_norm": 0.11183210462331772, + "learning_rate": 0.0003568104139639675, + "loss": 2.6369, + "step": 20118 + }, + { + "epoch": 0.5965957951546422, + "grad_norm": 0.09851936995983124, + "learning_rate": 0.0003567653343024501, + "loss": 2.6286, + "step": 20119 + }, + { + "epoch": 0.5966254485069538, + "grad_norm": 0.09611805528402328, + "learning_rate": 0.00035672025590930036, + "loss": 2.6374, + "step": 20120 + }, + { + "epoch": 0.5966551018592652, + "grad_norm": 0.10485507547855377, + "learning_rate": 0.0003566751787849173, + "loss": 2.6156, + "step": 20121 + }, + { + "epoch": 0.5966847552115767, + "grad_norm": 0.12211541086435318, + "learning_rate": 0.0003566301029297001, + "loss": 2.5802, + "step": 20122 + }, + { + "epoch": 0.5967144085638881, + "grad_norm": 0.1220850870013237, + "learning_rate": 0.00035658502834404795, + "loss": 2.601, + "step": 20123 + }, + { + "epoch": 0.5967440619161997, + "grad_norm": 0.11518660187721252, + "learning_rate": 0.00035653995502836, + "loss": 2.5782, + "step": 20124 + }, + { + "epoch": 0.5967737152685111, + "grad_norm": 0.10818634182214737, + "learning_rate": 0.0003564948829830353, + "loss": 2.6074, + "step": 20125 + }, + { + "epoch": 0.5968033686208226, + "grad_norm": 0.12179990857839584, + "learning_rate": 0.0003564498122084733, + "loss": 2.603, + "step": 20126 + }, + { + "epoch": 0.596833021973134, + "grad_norm": 0.09644751995801926, + "learning_rate": 0.0003564047427050726, + "loss": 2.6357, + "step": 20127 + }, + { + "epoch": 0.5968626753254456, + "grad_norm": 0.11647548526525497, + "learning_rate": 0.00035635967447323263, + "loss": 2.6359, + "step": 20128 + }, + { + "epoch": 0.596892328677757, + "grad_norm": 0.11502403020858765, + "learning_rate": 0.0003563146075133522, + "loss": 2.5703, + "step": 20129 + }, + { + "epoch": 0.5969219820300685, + "grad_norm": 0.09506838768720627, + "learning_rate": 0.0003562695418258308, + "loss": 2.6266, + "step": 20130 + }, + { + "epoch": 0.5969516353823799, + "grad_norm": 0.11263855546712875, + "learning_rate": 0.00035622447741106726, + "loss": 2.6453, + "step": 20131 + }, + { + "epoch": 0.5969812887346915, + "grad_norm": 0.10881289839744568, + "learning_rate": 0.0003561794142694607, + "loss": 2.6282, + "step": 20132 + }, + { + "epoch": 0.5970109420870029, + "grad_norm": 0.09967051446437836, + "learning_rate": 0.00035613435240141, + "loss": 2.61, + "step": 20133 + }, + { + "epoch": 0.5970405954393144, + "grad_norm": 0.10530588030815125, + "learning_rate": 0.00035608929180731434, + "loss": 2.6194, + "step": 20134 + }, + { + "epoch": 0.5970702487916258, + "grad_norm": 0.11209813505411148, + "learning_rate": 0.0003560442324875727, + "loss": 2.6275, + "step": 20135 + }, + { + "epoch": 0.5970999021439374, + "grad_norm": 0.12338187545537949, + "learning_rate": 0.0003559991744425841, + "loss": 2.6343, + "step": 20136 + }, + { + "epoch": 0.5971295554962488, + "grad_norm": 0.10317981988191605, + "learning_rate": 0.00035595411767274765, + "loss": 2.6542, + "step": 20137 + }, + { + "epoch": 0.5971592088485603, + "grad_norm": 0.09958690404891968, + "learning_rate": 0.00035590906217846215, + "loss": 2.6227, + "step": 20138 + }, + { + "epoch": 0.5971888622008719, + "grad_norm": 0.11919762194156647, + "learning_rate": 0.00035586400796012654, + "loss": 2.6148, + "step": 20139 + }, + { + "epoch": 0.5972185155531833, + "grad_norm": 0.10687132179737091, + "learning_rate": 0.0003558189550181399, + "loss": 2.6444, + "step": 20140 + }, + { + "epoch": 0.5972481689054948, + "grad_norm": 0.10867094248533249, + "learning_rate": 0.0003557739033529012, + "loss": 2.6162, + "step": 20141 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 0.10005553811788559, + "learning_rate": 0.0003557288529648093, + "loss": 2.586, + "step": 20142 + }, + { + "epoch": 0.5973074756101178, + "grad_norm": 0.09827270358800888, + "learning_rate": 0.00035568380385426313, + "loss": 2.6187, + "step": 20143 + }, + { + "epoch": 0.5973371289624292, + "grad_norm": 0.09520002454519272, + "learning_rate": 0.00035563875602166175, + "loss": 2.6235, + "step": 20144 + }, + { + "epoch": 0.5973667823147407, + "grad_norm": 0.09470299631357193, + "learning_rate": 0.0003555937094674039, + "loss": 2.6351, + "step": 20145 + }, + { + "epoch": 0.5973964356670521, + "grad_norm": 0.10427989810705185, + "learning_rate": 0.0003555486641918886, + "loss": 2.6151, + "step": 20146 + }, + { + "epoch": 0.5974260890193637, + "grad_norm": 0.10012033581733704, + "learning_rate": 0.00035550362019551475, + "loss": 2.617, + "step": 20147 + }, + { + "epoch": 0.5974557423716751, + "grad_norm": 0.10739334672689438, + "learning_rate": 0.00035545857747868104, + "loss": 2.6536, + "step": 20148 + }, + { + "epoch": 0.5974853957239866, + "grad_norm": 0.10935820639133453, + "learning_rate": 0.0003554135360417864, + "loss": 2.6654, + "step": 20149 + }, + { + "epoch": 0.597515049076298, + "grad_norm": 0.10852481424808502, + "learning_rate": 0.0003553684958852298, + "loss": 2.6218, + "step": 20150 + }, + { + "epoch": 0.5975447024286096, + "grad_norm": 0.10926651954650879, + "learning_rate": 0.0003553234570094099, + "loss": 2.623, + "step": 20151 + }, + { + "epoch": 0.597574355780921, + "grad_norm": 0.10326581448316574, + "learning_rate": 0.0003552784194147257, + "loss": 2.5906, + "step": 20152 + }, + { + "epoch": 0.5976040091332325, + "grad_norm": 0.10854507982730865, + "learning_rate": 0.00035523338310157595, + "loss": 2.6318, + "step": 20153 + }, + { + "epoch": 0.597633662485544, + "grad_norm": 0.1300334930419922, + "learning_rate": 0.00035518834807035947, + "loss": 2.5872, + "step": 20154 + }, + { + "epoch": 0.5976633158378555, + "grad_norm": 0.13533705472946167, + "learning_rate": 0.00035514331432147476, + "loss": 2.6131, + "step": 20155 + }, + { + "epoch": 0.5976929691901669, + "grad_norm": 0.11556396633386612, + "learning_rate": 0.000355098281855321, + "loss": 2.6131, + "step": 20156 + }, + { + "epoch": 0.5977226225424784, + "grad_norm": 0.10762897878885269, + "learning_rate": 0.00035505325067229686, + "loss": 2.5935, + "step": 20157 + }, + { + "epoch": 0.5977522758947899, + "grad_norm": 0.135523721575737, + "learning_rate": 0.00035500822077280127, + "loss": 2.6036, + "step": 20158 + }, + { + "epoch": 0.5977819292471014, + "grad_norm": 0.13100466132164001, + "learning_rate": 0.00035496319215723253, + "loss": 2.6533, + "step": 20159 + }, + { + "epoch": 0.5978115825994129, + "grad_norm": 0.11558376252651215, + "learning_rate": 0.0003549181648259897, + "loss": 2.6292, + "step": 20160 + }, + { + "epoch": 0.5978412359517243, + "grad_norm": 0.10265033692121506, + "learning_rate": 0.00035487313877947144, + "loss": 2.6185, + "step": 20161 + }, + { + "epoch": 0.5978708893040359, + "grad_norm": 0.12186834216117859, + "learning_rate": 0.00035482811401807635, + "loss": 2.6252, + "step": 20162 + }, + { + "epoch": 0.5979005426563473, + "grad_norm": 0.10703834146261215, + "learning_rate": 0.0003547830905422033, + "loss": 2.5901, + "step": 20163 + }, + { + "epoch": 0.5979301960086588, + "grad_norm": 0.10469973832368851, + "learning_rate": 0.00035473806835225095, + "loss": 2.6008, + "step": 20164 + }, + { + "epoch": 0.5979598493609702, + "grad_norm": 0.10784696042537689, + "learning_rate": 0.00035469304744861795, + "loss": 2.5874, + "step": 20165 + }, + { + "epoch": 0.5979895027132818, + "grad_norm": 0.10601923614740372, + "learning_rate": 0.0003546480278317029, + "loss": 2.6076, + "step": 20166 + }, + { + "epoch": 0.5980191560655932, + "grad_norm": 0.10507474839687347, + "learning_rate": 0.0003546030095019045, + "loss": 2.6012, + "step": 20167 + }, + { + "epoch": 0.5980488094179047, + "grad_norm": 0.09778887778520584, + "learning_rate": 0.00035455799245962135, + "loss": 2.6194, + "step": 20168 + }, + { + "epoch": 0.5980784627702161, + "grad_norm": 0.10474013537168503, + "learning_rate": 0.0003545129767052522, + "loss": 2.5992, + "step": 20169 + }, + { + "epoch": 0.5981081161225277, + "grad_norm": 0.0994996502995491, + "learning_rate": 0.0003544679622391956, + "loss": 2.5877, + "step": 20170 + }, + { + "epoch": 0.5981377694748391, + "grad_norm": 0.09291068464517593, + "learning_rate": 0.0003544229490618502, + "loss": 2.581, + "step": 20171 + }, + { + "epoch": 0.5981674228271506, + "grad_norm": 0.09647359699010849, + "learning_rate": 0.00035437793717361455, + "loss": 2.602, + "step": 20172 + }, + { + "epoch": 0.598197076179462, + "grad_norm": 0.09369836002588272, + "learning_rate": 0.0003543329265748873, + "loss": 2.6121, + "step": 20173 + }, + { + "epoch": 0.5982267295317736, + "grad_norm": 0.08485598117113113, + "learning_rate": 0.000354287917266067, + "loss": 2.6056, + "step": 20174 + }, + { + "epoch": 0.598256382884085, + "grad_norm": 0.0901358425617218, + "learning_rate": 0.0003542429092475522, + "loss": 2.5963, + "step": 20175 + }, + { + "epoch": 0.5982860362363965, + "grad_norm": 0.09648735821247101, + "learning_rate": 0.0003541979025197415, + "loss": 2.5745, + "step": 20176 + }, + { + "epoch": 0.598315689588708, + "grad_norm": 0.10185132175683975, + "learning_rate": 0.00035415289708303334, + "loss": 2.5693, + "step": 20177 + }, + { + "epoch": 0.5983453429410195, + "grad_norm": 0.11433599889278412, + "learning_rate": 0.0003541078929378263, + "loss": 2.5886, + "step": 20178 + }, + { + "epoch": 0.5983749962933309, + "grad_norm": 0.11070392280817032, + "learning_rate": 0.0003540628900845189, + "loss": 2.601, + "step": 20179 + }, + { + "epoch": 0.5984046496456424, + "grad_norm": 0.08509615063667297, + "learning_rate": 0.0003540178885235096, + "loss": 2.6063, + "step": 20180 + }, + { + "epoch": 0.598434302997954, + "grad_norm": 0.10670991986989975, + "learning_rate": 0.00035397288825519697, + "loss": 2.6132, + "step": 20181 + }, + { + "epoch": 0.5984639563502654, + "grad_norm": 0.1312514841556549, + "learning_rate": 0.00035392788927997954, + "loss": 2.6033, + "step": 20182 + }, + { + "epoch": 0.5984936097025769, + "grad_norm": 0.13834717869758606, + "learning_rate": 0.0003538828915982557, + "loss": 2.6288, + "step": 20183 + }, + { + "epoch": 0.5985232630548883, + "grad_norm": 0.10206519067287445, + "learning_rate": 0.000353837895210424, + "loss": 2.6299, + "step": 20184 + }, + { + "epoch": 0.5985529164071999, + "grad_norm": 0.09454138576984406, + "learning_rate": 0.0003537929001168828, + "loss": 2.5837, + "step": 20185 + }, + { + "epoch": 0.5985825697595113, + "grad_norm": 0.11049602925777435, + "learning_rate": 0.00035374790631803057, + "loss": 2.606, + "step": 20186 + }, + { + "epoch": 0.5986122231118228, + "grad_norm": 0.12265413999557495, + "learning_rate": 0.00035370291381426575, + "loss": 2.6181, + "step": 20187 + }, + { + "epoch": 0.5986418764641342, + "grad_norm": 0.10703551024198532, + "learning_rate": 0.0003536579226059867, + "loss": 2.6165, + "step": 20188 + }, + { + "epoch": 0.5986715298164458, + "grad_norm": 0.10909620672464371, + "learning_rate": 0.00035361293269359185, + "loss": 2.5887, + "step": 20189 + }, + { + "epoch": 0.5987011831687572, + "grad_norm": 0.10038311034440994, + "learning_rate": 0.0003535679440774796, + "loss": 2.5997, + "step": 20190 + }, + { + "epoch": 0.5987308365210687, + "grad_norm": 0.11643095314502716, + "learning_rate": 0.0003535229567580484, + "loss": 2.6059, + "step": 20191 + }, + { + "epoch": 0.5987604898733802, + "grad_norm": 0.10427995026111603, + "learning_rate": 0.0003534779707356966, + "loss": 2.6225, + "step": 20192 + }, + { + "epoch": 0.5987901432256917, + "grad_norm": 0.11876916140317917, + "learning_rate": 0.0003534329860108222, + "loss": 2.6444, + "step": 20193 + }, + { + "epoch": 0.5988197965780031, + "grad_norm": 0.12336497753858566, + "learning_rate": 0.0003533880025838241, + "loss": 2.6272, + "step": 20194 + }, + { + "epoch": 0.5988494499303146, + "grad_norm": 0.12876130640506744, + "learning_rate": 0.0003533430204551005, + "loss": 2.5957, + "step": 20195 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 0.11307782679796219, + "learning_rate": 0.0003532980396250496, + "loss": 2.6131, + "step": 20196 + }, + { + "epoch": 0.5989087566349376, + "grad_norm": 0.13294439017772675, + "learning_rate": 0.0003532530600940697, + "loss": 2.6201, + "step": 20197 + }, + { + "epoch": 0.598938409987249, + "grad_norm": 0.11699973791837692, + "learning_rate": 0.00035320808186255903, + "loss": 2.6082, + "step": 20198 + }, + { + "epoch": 0.5989680633395605, + "grad_norm": 0.09037574380636215, + "learning_rate": 0.00035316310493091615, + "loss": 2.6399, + "step": 20199 + }, + { + "epoch": 0.598997716691872, + "grad_norm": 0.12273003160953522, + "learning_rate": 0.0003531181292995391, + "loss": 2.5789, + "step": 20200 + }, + { + "epoch": 0.5990273700441835, + "grad_norm": 0.14512652158737183, + "learning_rate": 0.0003530731549688262, + "loss": 2.5879, + "step": 20201 + }, + { + "epoch": 0.599057023396495, + "grad_norm": 0.10795216262340546, + "learning_rate": 0.00035302818193917577, + "loss": 2.5919, + "step": 20202 + }, + { + "epoch": 0.5990866767488064, + "grad_norm": 0.13888120651245117, + "learning_rate": 0.0003529832102109861, + "loss": 2.5961, + "step": 20203 + }, + { + "epoch": 0.599116330101118, + "grad_norm": 0.1250876635313034, + "learning_rate": 0.0003529382397846553, + "loss": 2.6236, + "step": 20204 + }, + { + "epoch": 0.5991459834534294, + "grad_norm": 0.10802479833364487, + "learning_rate": 0.0003528932706605816, + "loss": 2.6313, + "step": 20205 + }, + { + "epoch": 0.5991756368057409, + "grad_norm": 0.14366991817951202, + "learning_rate": 0.00035284830283916315, + "loss": 2.6242, + "step": 20206 + }, + { + "epoch": 0.5992052901580524, + "grad_norm": 0.12015221267938614, + "learning_rate": 0.00035280333632079825, + "loss": 2.6103, + "step": 20207 + }, + { + "epoch": 0.5992349435103639, + "grad_norm": 0.10892553627490997, + "learning_rate": 0.00035275837110588517, + "loss": 2.6402, + "step": 20208 + }, + { + "epoch": 0.5992645968626753, + "grad_norm": 0.12740741670131683, + "learning_rate": 0.0003527134071948219, + "loss": 2.5782, + "step": 20209 + }, + { + "epoch": 0.5992942502149868, + "grad_norm": 0.11581986397504807, + "learning_rate": 0.00035266844458800676, + "loss": 2.6081, + "step": 20210 + }, + { + "epoch": 0.5993239035672983, + "grad_norm": 0.11163539439439774, + "learning_rate": 0.0003526234832858378, + "loss": 2.6279, + "step": 20211 + }, + { + "epoch": 0.5993535569196098, + "grad_norm": 0.10873960703611374, + "learning_rate": 0.0003525785232887132, + "loss": 2.5993, + "step": 20212 + }, + { + "epoch": 0.5993832102719212, + "grad_norm": 0.135240375995636, + "learning_rate": 0.0003525335645970312, + "loss": 2.6173, + "step": 20213 + }, + { + "epoch": 0.5994128636242327, + "grad_norm": 0.11156444251537323, + "learning_rate": 0.00035248860721118967, + "loss": 2.6365, + "step": 20214 + }, + { + "epoch": 0.5994425169765442, + "grad_norm": 0.11388211697340012, + "learning_rate": 0.0003524436511315869, + "loss": 2.65, + "step": 20215 + }, + { + "epoch": 0.5994721703288557, + "grad_norm": 0.12176944315433502, + "learning_rate": 0.00035239869635862085, + "loss": 2.6306, + "step": 20216 + }, + { + "epoch": 0.5995018236811671, + "grad_norm": 0.11728179454803467, + "learning_rate": 0.0003523537428926897, + "loss": 2.6094, + "step": 20217 + }, + { + "epoch": 0.5995314770334786, + "grad_norm": 0.09449131041765213, + "learning_rate": 0.0003523087907341915, + "loss": 2.5966, + "step": 20218 + }, + { + "epoch": 0.5995611303857901, + "grad_norm": 0.11436052620410919, + "learning_rate": 0.0003522638398835243, + "loss": 2.6146, + "step": 20219 + }, + { + "epoch": 0.5995907837381016, + "grad_norm": 0.1064179465174675, + "learning_rate": 0.00035221889034108613, + "loss": 2.6074, + "step": 20220 + }, + { + "epoch": 0.599620437090413, + "grad_norm": 0.10801651328802109, + "learning_rate": 0.0003521739421072751, + "loss": 2.6422, + "step": 20221 + }, + { + "epoch": 0.5996500904427245, + "grad_norm": 0.10581718385219574, + "learning_rate": 0.0003521289951824892, + "loss": 2.6493, + "step": 20222 + }, + { + "epoch": 0.5996797437950361, + "grad_norm": 0.10793454945087433, + "learning_rate": 0.0003520840495671265, + "loss": 2.5911, + "step": 20223 + }, + { + "epoch": 0.5997093971473475, + "grad_norm": 0.09693058580160141, + "learning_rate": 0.0003520391052615849, + "loss": 2.6753, + "step": 20224 + }, + { + "epoch": 0.599739050499659, + "grad_norm": 0.11919767409563065, + "learning_rate": 0.00035199416226626233, + "loss": 2.6202, + "step": 20225 + }, + { + "epoch": 0.5997687038519705, + "grad_norm": 0.11505173146724701, + "learning_rate": 0.0003519492205815569, + "loss": 2.5971, + "step": 20226 + }, + { + "epoch": 0.599798357204282, + "grad_norm": 0.10462617129087448, + "learning_rate": 0.0003519042802078665, + "loss": 2.6021, + "step": 20227 + }, + { + "epoch": 0.5998280105565934, + "grad_norm": 0.13595347106456757, + "learning_rate": 0.00035185934114558915, + "loss": 2.6057, + "step": 20228 + }, + { + "epoch": 0.5998576639089049, + "grad_norm": 0.1516309678554535, + "learning_rate": 0.0003518144033951228, + "loss": 2.5825, + "step": 20229 + }, + { + "epoch": 0.5998873172612164, + "grad_norm": 0.11485744267702103, + "learning_rate": 0.0003517694669568654, + "loss": 2.6126, + "step": 20230 + }, + { + "epoch": 0.5999169706135279, + "grad_norm": 0.12354573607444763, + "learning_rate": 0.00035172453183121474, + "loss": 2.6051, + "step": 20231 + }, + { + "epoch": 0.5999466239658393, + "grad_norm": 0.13889144361019135, + "learning_rate": 0.0003516795980185685, + "loss": 2.6211, + "step": 20232 + }, + { + "epoch": 0.5999762773181508, + "grad_norm": 0.10967982560396194, + "learning_rate": 0.0003516346655193252, + "loss": 2.6096, + "step": 20233 + }, + { + "epoch": 0.6000059306704623, + "grad_norm": 0.12000523507595062, + "learning_rate": 0.00035158973433388246, + "loss": 2.6181, + "step": 20234 + }, + { + "epoch": 0.6000355840227738, + "grad_norm": 0.13139550387859344, + "learning_rate": 0.000351544804462638, + "loss": 2.6216, + "step": 20235 + }, + { + "epoch": 0.6000652373750852, + "grad_norm": 0.09915763884782791, + "learning_rate": 0.00035149987590598974, + "loss": 2.6318, + "step": 20236 + }, + { + "epoch": 0.6000948907273967, + "grad_norm": 0.12090111523866653, + "learning_rate": 0.00035145494866433563, + "loss": 2.63, + "step": 20237 + }, + { + "epoch": 0.6001245440797082, + "grad_norm": 0.12353414297103882, + "learning_rate": 0.00035141002273807344, + "loss": 2.6183, + "step": 20238 + }, + { + "epoch": 0.6001541974320197, + "grad_norm": 0.10995863378047943, + "learning_rate": 0.00035136509812760096, + "loss": 2.5984, + "step": 20239 + }, + { + "epoch": 0.6001838507843311, + "grad_norm": 0.1159956306219101, + "learning_rate": 0.00035132017483331614, + "loss": 2.5991, + "step": 20240 + }, + { + "epoch": 0.6002135041366427, + "grad_norm": 0.11219210177659988, + "learning_rate": 0.00035127525285561667, + "loss": 2.646, + "step": 20241 + }, + { + "epoch": 0.6002431574889541, + "grad_norm": 0.10972143709659576, + "learning_rate": 0.00035123033219490034, + "loss": 2.612, + "step": 20242 + }, + { + "epoch": 0.6002728108412656, + "grad_norm": 0.10857058316469193, + "learning_rate": 0.0003511854128515649, + "loss": 2.6017, + "step": 20243 + }, + { + "epoch": 0.6003024641935771, + "grad_norm": 0.1033538281917572, + "learning_rate": 0.0003511404948260083, + "loss": 2.6149, + "step": 20244 + }, + { + "epoch": 0.6003321175458886, + "grad_norm": 0.10366629809141159, + "learning_rate": 0.0003510955781186279, + "loss": 2.5955, + "step": 20245 + }, + { + "epoch": 0.6003617708982001, + "grad_norm": 0.09530576318502426, + "learning_rate": 0.0003510506627298219, + "loss": 2.6314, + "step": 20246 + }, + { + "epoch": 0.6003914242505115, + "grad_norm": 0.09579911082983017, + "learning_rate": 0.00035100574865998784, + "loss": 2.6449, + "step": 20247 + }, + { + "epoch": 0.600421077602823, + "grad_norm": 0.10790137201547623, + "learning_rate": 0.00035096083590952344, + "loss": 2.6209, + "step": 20248 + }, + { + "epoch": 0.6004507309551345, + "grad_norm": 0.10090849548578262, + "learning_rate": 0.0003509159244788265, + "loss": 2.6219, + "step": 20249 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 0.10431984812021255, + "learning_rate": 0.0003508710143682945, + "loss": 2.5739, + "step": 20250 + }, + { + "epoch": 0.6005100376597574, + "grad_norm": 0.09827612340450287, + "learning_rate": 0.0003508261055783255, + "loss": 2.5929, + "step": 20251 + }, + { + "epoch": 0.6005396910120689, + "grad_norm": 0.09646953642368317, + "learning_rate": 0.00035078119810931675, + "loss": 2.6217, + "step": 20252 + }, + { + "epoch": 0.6005693443643804, + "grad_norm": 0.10177993774414062, + "learning_rate": 0.00035073629196166614, + "loss": 2.6025, + "step": 20253 + }, + { + "epoch": 0.6005989977166919, + "grad_norm": 0.10148099064826965, + "learning_rate": 0.00035069138713577134, + "loss": 2.65, + "step": 20254 + }, + { + "epoch": 0.6006286510690033, + "grad_norm": 0.09620246291160583, + "learning_rate": 0.0003506464836320298, + "loss": 2.6418, + "step": 20255 + }, + { + "epoch": 0.6006583044213148, + "grad_norm": 0.10614058375358582, + "learning_rate": 0.0003506015814508394, + "loss": 2.6241, + "step": 20256 + }, + { + "epoch": 0.6006879577736263, + "grad_norm": 0.10373654961585999, + "learning_rate": 0.0003505566805925976, + "loss": 2.6188, + "step": 20257 + }, + { + "epoch": 0.6007176111259378, + "grad_norm": 0.09794706851243973, + "learning_rate": 0.0003505117810577019, + "loss": 2.5914, + "step": 20258 + }, + { + "epoch": 0.6007472644782492, + "grad_norm": 0.11097036302089691, + "learning_rate": 0.00035046688284655017, + "loss": 2.5898, + "step": 20259 + }, + { + "epoch": 0.6007769178305608, + "grad_norm": 0.10318734496831894, + "learning_rate": 0.0003504219859595399, + "loss": 2.5704, + "step": 20260 + }, + { + "epoch": 0.6008065711828722, + "grad_norm": 0.12728090584278107, + "learning_rate": 0.00035037709039706865, + "loss": 2.5887, + "step": 20261 + }, + { + "epoch": 0.6008362245351837, + "grad_norm": 0.11769766360521317, + "learning_rate": 0.00035033219615953395, + "loss": 2.616, + "step": 20262 + }, + { + "epoch": 0.6008658778874952, + "grad_norm": 0.1012873649597168, + "learning_rate": 0.0003502873032473333, + "loss": 2.6465, + "step": 20263 + }, + { + "epoch": 0.6008955312398067, + "grad_norm": 0.11366354674100876, + "learning_rate": 0.00035024241166086415, + "loss": 2.6125, + "step": 20264 + }, + { + "epoch": 0.6009251845921182, + "grad_norm": 0.13780680298805237, + "learning_rate": 0.00035019752140052427, + "loss": 2.5887, + "step": 20265 + }, + { + "epoch": 0.6009548379444296, + "grad_norm": 0.11663759499788284, + "learning_rate": 0.000350152632466711, + "loss": 2.6144, + "step": 20266 + }, + { + "epoch": 0.6009844912967411, + "grad_norm": 0.11360689997673035, + "learning_rate": 0.000350107744859822, + "loss": 2.6183, + "step": 20267 + }, + { + "epoch": 0.6010141446490526, + "grad_norm": 0.14128686487674713, + "learning_rate": 0.0003500628585802547, + "loss": 2.6517, + "step": 20268 + }, + { + "epoch": 0.6010437980013641, + "grad_norm": 0.11218798160552979, + "learning_rate": 0.00035001797362840635, + "loss": 2.6033, + "step": 20269 + }, + { + "epoch": 0.6010734513536755, + "grad_norm": 0.11020482331514359, + "learning_rate": 0.0003499730900046746, + "loss": 2.596, + "step": 20270 + }, + { + "epoch": 0.601103104705987, + "grad_norm": 0.11978389322757721, + "learning_rate": 0.0003499282077094568, + "loss": 2.6229, + "step": 20271 + }, + { + "epoch": 0.6011327580582985, + "grad_norm": 0.1171792671084404, + "learning_rate": 0.0003498833267431507, + "loss": 2.6296, + "step": 20272 + }, + { + "epoch": 0.60116241141061, + "grad_norm": 0.1132250726222992, + "learning_rate": 0.0003498384471061534, + "loss": 2.6171, + "step": 20273 + }, + { + "epoch": 0.6011920647629214, + "grad_norm": 0.19057650864124298, + "learning_rate": 0.00034979356879886244, + "loss": 2.6239, + "step": 20274 + }, + { + "epoch": 0.601221718115233, + "grad_norm": 0.12720145285129547, + "learning_rate": 0.00034974869182167524, + "loss": 2.6219, + "step": 20275 + }, + { + "epoch": 0.6012513714675444, + "grad_norm": 0.11188673228025436, + "learning_rate": 0.00034970381617498907, + "loss": 2.5946, + "step": 20276 + }, + { + "epoch": 0.6012810248198559, + "grad_norm": 0.12178125977516174, + "learning_rate": 0.0003496589418592015, + "loss": 2.6283, + "step": 20277 + }, + { + "epoch": 0.6013106781721673, + "grad_norm": 0.11671115458011627, + "learning_rate": 0.0003496140688747098, + "loss": 2.6032, + "step": 20278 + }, + { + "epoch": 0.6013403315244789, + "grad_norm": 0.09950052946805954, + "learning_rate": 0.00034956919722191137, + "loss": 2.6442, + "step": 20279 + }, + { + "epoch": 0.6013699848767903, + "grad_norm": 0.1112944483757019, + "learning_rate": 0.0003495243269012035, + "loss": 2.6349, + "step": 20280 + }, + { + "epoch": 0.6013996382291018, + "grad_norm": 0.10477429628372192, + "learning_rate": 0.0003494794579129835, + "loss": 2.5871, + "step": 20281 + }, + { + "epoch": 0.6014292915814132, + "grad_norm": 0.09681684523820877, + "learning_rate": 0.0003494345902576487, + "loss": 2.6341, + "step": 20282 + }, + { + "epoch": 0.6014589449337248, + "grad_norm": 0.09743959456682205, + "learning_rate": 0.00034938972393559655, + "loss": 2.6415, + "step": 20283 + }, + { + "epoch": 0.6014885982860363, + "grad_norm": 0.09546410292387009, + "learning_rate": 0.000349344858947224, + "loss": 2.6361, + "step": 20284 + }, + { + "epoch": 0.6015182516383477, + "grad_norm": 0.10259556770324707, + "learning_rate": 0.00034929999529292877, + "loss": 2.5818, + "step": 20285 + }, + { + "epoch": 0.6015479049906592, + "grad_norm": 0.10804303735494614, + "learning_rate": 0.000349255132973108, + "loss": 2.5981, + "step": 20286 + }, + { + "epoch": 0.6015775583429707, + "grad_norm": 0.09425946325063705, + "learning_rate": 0.0003492102719881588, + "loss": 2.6435, + "step": 20287 + }, + { + "epoch": 0.6016072116952822, + "grad_norm": 0.1016889363527298, + "learning_rate": 0.00034916541233847865, + "loss": 2.5849, + "step": 20288 + }, + { + "epoch": 0.6016368650475936, + "grad_norm": 0.1053905263543129, + "learning_rate": 0.0003491205540244646, + "loss": 2.6113, + "step": 20289 + }, + { + "epoch": 0.6016665183999051, + "grad_norm": 0.09554164856672287, + "learning_rate": 0.000349075697046514, + "loss": 2.5895, + "step": 20290 + }, + { + "epoch": 0.6016961717522166, + "grad_norm": 0.10815158486366272, + "learning_rate": 0.00034903084140502395, + "loss": 2.6654, + "step": 20291 + }, + { + "epoch": 0.6017258251045281, + "grad_norm": 0.10098441690206528, + "learning_rate": 0.00034898598710039167, + "loss": 2.6249, + "step": 20292 + }, + { + "epoch": 0.6017554784568395, + "grad_norm": 0.10054071992635727, + "learning_rate": 0.00034894113413301445, + "loss": 2.5854, + "step": 20293 + }, + { + "epoch": 0.601785131809151, + "grad_norm": 0.09484143555164337, + "learning_rate": 0.00034889628250328944, + "loss": 2.5987, + "step": 20294 + }, + { + "epoch": 0.6018147851614625, + "grad_norm": 0.10686725378036499, + "learning_rate": 0.0003488514322116136, + "loss": 2.6295, + "step": 20295 + }, + { + "epoch": 0.601844438513774, + "grad_norm": 0.09276457875967026, + "learning_rate": 0.0003488065832583846, + "loss": 2.5936, + "step": 20296 + }, + { + "epoch": 0.6018740918660854, + "grad_norm": 0.12644869089126587, + "learning_rate": 0.00034876173564399885, + "loss": 2.6637, + "step": 20297 + }, + { + "epoch": 0.601903745218397, + "grad_norm": 0.1063159927725792, + "learning_rate": 0.00034871688936885417, + "loss": 2.6099, + "step": 20298 + }, + { + "epoch": 0.6019333985707084, + "grad_norm": 0.11042091250419617, + "learning_rate": 0.00034867204443334737, + "loss": 2.6167, + "step": 20299 + }, + { + "epoch": 0.6019630519230199, + "grad_norm": 0.12423372268676758, + "learning_rate": 0.0003486272008378756, + "loss": 2.6007, + "step": 20300 + }, + { + "epoch": 0.6019927052753313, + "grad_norm": 0.11207308620214462, + "learning_rate": 0.00034858235858283595, + "loss": 2.5916, + "step": 20301 + }, + { + "epoch": 0.6020223586276429, + "grad_norm": 0.1249309703707695, + "learning_rate": 0.0003485375176686254, + "loss": 2.5978, + "step": 20302 + }, + { + "epoch": 0.6020520119799543, + "grad_norm": 0.12329883128404617, + "learning_rate": 0.0003484926780956412, + "loss": 2.5963, + "step": 20303 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 0.11472277343273163, + "learning_rate": 0.0003484478398642804, + "loss": 2.6095, + "step": 20304 + }, + { + "epoch": 0.6021113186845773, + "grad_norm": 0.1410958170890808, + "learning_rate": 0.00034840300297493985, + "loss": 2.6414, + "step": 20305 + }, + { + "epoch": 0.6021409720368888, + "grad_norm": 0.12275893241167068, + "learning_rate": 0.000348358167428017, + "loss": 2.6332, + "step": 20306 + }, + { + "epoch": 0.6021706253892003, + "grad_norm": 0.11842195689678192, + "learning_rate": 0.00034831333322390837, + "loss": 2.6083, + "step": 20307 + }, + { + "epoch": 0.6022002787415117, + "grad_norm": 0.13456980884075165, + "learning_rate": 0.00034826850036301127, + "loss": 2.6246, + "step": 20308 + }, + { + "epoch": 0.6022299320938233, + "grad_norm": 0.10788429528474808, + "learning_rate": 0.0003482236688457226, + "loss": 2.6194, + "step": 20309 + }, + { + "epoch": 0.6022595854461347, + "grad_norm": 0.10881754010915756, + "learning_rate": 0.0003481788386724393, + "loss": 2.6167, + "step": 20310 + }, + { + "epoch": 0.6022892387984462, + "grad_norm": 0.11468455195426941, + "learning_rate": 0.0003481340098435586, + "loss": 2.622, + "step": 20311 + }, + { + "epoch": 0.6023188921507576, + "grad_norm": 0.1045311987400055, + "learning_rate": 0.0003480891823594773, + "loss": 2.5713, + "step": 20312 + }, + { + "epoch": 0.6023485455030692, + "grad_norm": 0.0988740399479866, + "learning_rate": 0.0003480443562205923, + "loss": 2.6174, + "step": 20313 + }, + { + "epoch": 0.6023781988553806, + "grad_norm": 0.10871566087007523, + "learning_rate": 0.0003479995314273007, + "loss": 2.6201, + "step": 20314 + }, + { + "epoch": 0.6024078522076921, + "grad_norm": 0.10738267004489899, + "learning_rate": 0.00034795470797999926, + "loss": 2.6313, + "step": 20315 + }, + { + "epoch": 0.6024375055600035, + "grad_norm": 0.10399787873029709, + "learning_rate": 0.000347909885879085, + "loss": 2.6252, + "step": 20316 + }, + { + "epoch": 0.6024671589123151, + "grad_norm": 0.1042238399386406, + "learning_rate": 0.00034786506512495495, + "loss": 2.6307, + "step": 20317 + }, + { + "epoch": 0.6024968122646265, + "grad_norm": 0.09387271851301193, + "learning_rate": 0.00034782024571800577, + "loss": 2.6003, + "step": 20318 + }, + { + "epoch": 0.602526465616938, + "grad_norm": 0.10678383708000183, + "learning_rate": 0.00034777542765863435, + "loss": 2.6074, + "step": 20319 + }, + { + "epoch": 0.6025561189692494, + "grad_norm": 0.11184462904930115, + "learning_rate": 0.00034773061094723766, + "loss": 2.6183, + "step": 20320 + }, + { + "epoch": 0.602585772321561, + "grad_norm": 0.11121697723865509, + "learning_rate": 0.0003476857955842126, + "loss": 2.6144, + "step": 20321 + }, + { + "epoch": 0.6026154256738724, + "grad_norm": 0.11879517138004303, + "learning_rate": 0.00034764098156995593, + "loss": 2.5808, + "step": 20322 + }, + { + "epoch": 0.6026450790261839, + "grad_norm": 0.10985088348388672, + "learning_rate": 0.00034759616890486447, + "loss": 2.6263, + "step": 20323 + }, + { + "epoch": 0.6026747323784953, + "grad_norm": 0.11155322939157486, + "learning_rate": 0.0003475513575893351, + "loss": 2.6279, + "step": 20324 + }, + { + "epoch": 0.6027043857308069, + "grad_norm": 0.09883131831884384, + "learning_rate": 0.0003475065476237647, + "loss": 2.6535, + "step": 20325 + }, + { + "epoch": 0.6027340390831184, + "grad_norm": 0.11188948899507523, + "learning_rate": 0.00034746173900855, + "loss": 2.583, + "step": 20326 + }, + { + "epoch": 0.6027636924354298, + "grad_norm": 0.10324396938085556, + "learning_rate": 0.00034741693174408783, + "loss": 2.5654, + "step": 20327 + }, + { + "epoch": 0.6027933457877414, + "grad_norm": 0.09549839049577713, + "learning_rate": 0.00034737212583077487, + "loss": 2.6381, + "step": 20328 + }, + { + "epoch": 0.6028229991400528, + "grad_norm": 0.11146280914545059, + "learning_rate": 0.0003473273212690079, + "loss": 2.6343, + "step": 20329 + }, + { + "epoch": 0.6028526524923643, + "grad_norm": 0.10091942548751831, + "learning_rate": 0.00034728251805918374, + "loss": 2.6232, + "step": 20330 + }, + { + "epoch": 0.6028823058446757, + "grad_norm": 0.09862954914569855, + "learning_rate": 0.0003472377162016991, + "loss": 2.5985, + "step": 20331 + }, + { + "epoch": 0.6029119591969873, + "grad_norm": 0.09934709966182709, + "learning_rate": 0.0003471929156969507, + "loss": 2.6241, + "step": 20332 + }, + { + "epoch": 0.6029416125492987, + "grad_norm": 0.09901444613933563, + "learning_rate": 0.0003471481165453352, + "loss": 2.6348, + "step": 20333 + }, + { + "epoch": 0.6029712659016102, + "grad_norm": 0.11109607666730881, + "learning_rate": 0.00034710331874724954, + "loss": 2.6235, + "step": 20334 + }, + { + "epoch": 0.6030009192539216, + "grad_norm": 0.12156133353710175, + "learning_rate": 0.00034705852230309007, + "loss": 2.5886, + "step": 20335 + }, + { + "epoch": 0.6030305726062332, + "grad_norm": 0.12040116637945175, + "learning_rate": 0.0003470137272132535, + "loss": 2.6328, + "step": 20336 + }, + { + "epoch": 0.6030602259585446, + "grad_norm": 0.11442217975854874, + "learning_rate": 0.0003469689334781368, + "loss": 2.5966, + "step": 20337 + }, + { + "epoch": 0.6030898793108561, + "grad_norm": 0.12168759107589722, + "learning_rate": 0.00034692414109813657, + "loss": 2.6149, + "step": 20338 + }, + { + "epoch": 0.6031195326631675, + "grad_norm": 0.12799502909183502, + "learning_rate": 0.00034687935007364924, + "loss": 2.6552, + "step": 20339 + }, + { + "epoch": 0.6031491860154791, + "grad_norm": 0.11717235296964645, + "learning_rate": 0.00034683456040507154, + "loss": 2.6198, + "step": 20340 + }, + { + "epoch": 0.6031788393677905, + "grad_norm": 0.11158689856529236, + "learning_rate": 0.00034678977209280014, + "loss": 2.6066, + "step": 20341 + }, + { + "epoch": 0.603208492720102, + "grad_norm": 0.11199187487363815, + "learning_rate": 0.00034674498513723157, + "loss": 2.6078, + "step": 20342 + }, + { + "epoch": 0.6032381460724134, + "grad_norm": 0.10143785178661346, + "learning_rate": 0.0003467001995387625, + "loss": 2.6014, + "step": 20343 + }, + { + "epoch": 0.603267799424725, + "grad_norm": 0.10434799641370773, + "learning_rate": 0.00034665541529778956, + "loss": 2.5806, + "step": 20344 + }, + { + "epoch": 0.6032974527770364, + "grad_norm": 0.1040254607796669, + "learning_rate": 0.00034661063241470916, + "loss": 2.6174, + "step": 20345 + }, + { + "epoch": 0.6033271061293479, + "grad_norm": 0.10481037944555283, + "learning_rate": 0.0003465658508899179, + "loss": 2.6136, + "step": 20346 + }, + { + "epoch": 0.6033567594816595, + "grad_norm": 0.10058503597974777, + "learning_rate": 0.00034652107072381236, + "loss": 2.6577, + "step": 20347 + }, + { + "epoch": 0.6033864128339709, + "grad_norm": 0.09818948805332184, + "learning_rate": 0.0003464762919167891, + "loss": 2.616, + "step": 20348 + }, + { + "epoch": 0.6034160661862824, + "grad_norm": 0.11254100501537323, + "learning_rate": 0.00034643151446924446, + "loss": 2.6033, + "step": 20349 + }, + { + "epoch": 0.6034457195385938, + "grad_norm": 0.10684751719236374, + "learning_rate": 0.0003463867383815753, + "loss": 2.5954, + "step": 20350 + }, + { + "epoch": 0.6034753728909054, + "grad_norm": 0.09645869582891464, + "learning_rate": 0.0003463419636541779, + "loss": 2.5964, + "step": 20351 + }, + { + "epoch": 0.6035050262432168, + "grad_norm": 0.09905219823122025, + "learning_rate": 0.00034629719028744885, + "loss": 2.5976, + "step": 20352 + }, + { + "epoch": 0.6035346795955283, + "grad_norm": 0.10557051748037338, + "learning_rate": 0.0003462524182817845, + "loss": 2.646, + "step": 20353 + }, + { + "epoch": 0.6035643329478397, + "grad_norm": 0.09265300631523132, + "learning_rate": 0.0003462076476375814, + "loss": 2.6245, + "step": 20354 + }, + { + "epoch": 0.6035939863001513, + "grad_norm": 0.10523059964179993, + "learning_rate": 0.0003461628783552361, + "loss": 2.6085, + "step": 20355 + }, + { + "epoch": 0.6036236396524627, + "grad_norm": 0.1255665272474289, + "learning_rate": 0.0003461181104351447, + "loss": 2.6015, + "step": 20356 + }, + { + "epoch": 0.6036532930047742, + "grad_norm": 0.11688537895679474, + "learning_rate": 0.000346073343877704, + "loss": 2.6267, + "step": 20357 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 0.11666446179151535, + "learning_rate": 0.00034602857868331016, + "loss": 2.6089, + "step": 20358 + }, + { + "epoch": 0.6037125997093972, + "grad_norm": 0.12057847529649734, + "learning_rate": 0.00034598381485235975, + "loss": 2.6328, + "step": 20359 + }, + { + "epoch": 0.6037422530617086, + "grad_norm": 0.10107514262199402, + "learning_rate": 0.0003459390523852491, + "loss": 2.6066, + "step": 20360 + }, + { + "epoch": 0.6037719064140201, + "grad_norm": 0.1039697602391243, + "learning_rate": 0.00034589429128237443, + "loss": 2.6189, + "step": 20361 + }, + { + "epoch": 0.6038015597663315, + "grad_norm": 0.11013306677341461, + "learning_rate": 0.00034584953154413235, + "loss": 2.6174, + "step": 20362 + }, + { + "epoch": 0.6038312131186431, + "grad_norm": 0.12407152354717255, + "learning_rate": 0.00034580477317091917, + "loss": 2.6086, + "step": 20363 + }, + { + "epoch": 0.6038608664709545, + "grad_norm": 0.11291922628879547, + "learning_rate": 0.00034576001616313124, + "loss": 2.624, + "step": 20364 + }, + { + "epoch": 0.603890519823266, + "grad_norm": 0.10501565039157867, + "learning_rate": 0.0003457152605211649, + "loss": 2.5951, + "step": 20365 + }, + { + "epoch": 0.6039201731755774, + "grad_norm": 0.09621772915124893, + "learning_rate": 0.0003456705062454163, + "loss": 2.6524, + "step": 20366 + }, + { + "epoch": 0.603949826527889, + "grad_norm": 0.09877629578113556, + "learning_rate": 0.0003456257533362818, + "loss": 2.6294, + "step": 20367 + }, + { + "epoch": 0.6039794798802005, + "grad_norm": 0.09869863837957382, + "learning_rate": 0.00034558100179415784, + "loss": 2.6036, + "step": 20368 + }, + { + "epoch": 0.6040091332325119, + "grad_norm": 0.09179995208978653, + "learning_rate": 0.0003455362516194406, + "loss": 2.6064, + "step": 20369 + }, + { + "epoch": 0.6040387865848235, + "grad_norm": 0.10296376049518585, + "learning_rate": 0.00034549150281252633, + "loss": 2.6246, + "step": 20370 + }, + { + "epoch": 0.6040684399371349, + "grad_norm": 0.1003066673874855, + "learning_rate": 0.0003454467553738113, + "loss": 2.593, + "step": 20371 + }, + { + "epoch": 0.6040980932894464, + "grad_norm": 0.10207316279411316, + "learning_rate": 0.00034540200930369196, + "loss": 2.6277, + "step": 20372 + }, + { + "epoch": 0.6041277466417578, + "grad_norm": 0.11964598298072815, + "learning_rate": 0.0003453572646025642, + "loss": 2.6012, + "step": 20373 + }, + { + "epoch": 0.6041573999940694, + "grad_norm": 0.10317081958055496, + "learning_rate": 0.00034531252127082427, + "loss": 2.6032, + "step": 20374 + }, + { + "epoch": 0.6041870533463808, + "grad_norm": 0.0977298840880394, + "learning_rate": 0.00034526777930886864, + "loss": 2.5809, + "step": 20375 + }, + { + "epoch": 0.6042167066986923, + "grad_norm": 0.10254064202308655, + "learning_rate": 0.0003452230387170935, + "loss": 2.616, + "step": 20376 + }, + { + "epoch": 0.6042463600510037, + "grad_norm": 0.10498911142349243, + "learning_rate": 0.00034517829949589485, + "loss": 2.5807, + "step": 20377 + }, + { + "epoch": 0.6042760134033153, + "grad_norm": 0.11756011098623276, + "learning_rate": 0.000345133561645669, + "loss": 2.5973, + "step": 20378 + }, + { + "epoch": 0.6043056667556267, + "grad_norm": 0.1183713749051094, + "learning_rate": 0.00034508882516681195, + "loss": 2.5994, + "step": 20379 + }, + { + "epoch": 0.6043353201079382, + "grad_norm": 0.0965833067893982, + "learning_rate": 0.0003450440900597199, + "loss": 2.6361, + "step": 20380 + }, + { + "epoch": 0.6043649734602496, + "grad_norm": 0.12965019047260284, + "learning_rate": 0.0003449993563247892, + "loss": 2.6417, + "step": 20381 + }, + { + "epoch": 0.6043946268125612, + "grad_norm": 0.1332501322031021, + "learning_rate": 0.0003449546239624157, + "loss": 2.5738, + "step": 20382 + }, + { + "epoch": 0.6044242801648726, + "grad_norm": 0.11248515546321869, + "learning_rate": 0.0003449098929729957, + "loss": 2.5986, + "step": 20383 + }, + { + "epoch": 0.6044539335171841, + "grad_norm": 0.12193486094474792, + "learning_rate": 0.00034486516335692507, + "loss": 2.6139, + "step": 20384 + }, + { + "epoch": 0.6044835868694955, + "grad_norm": 0.11238446831703186, + "learning_rate": 0.00034482043511460013, + "loss": 2.5995, + "step": 20385 + }, + { + "epoch": 0.6045132402218071, + "grad_norm": 0.11943041533231735, + "learning_rate": 0.00034477570824641683, + "loss": 2.6225, + "step": 20386 + }, + { + "epoch": 0.6045428935741185, + "grad_norm": 0.10056324303150177, + "learning_rate": 0.00034473098275277127, + "loss": 2.6208, + "step": 20387 + }, + { + "epoch": 0.60457254692643, + "grad_norm": 0.11405530571937561, + "learning_rate": 0.0003446862586340595, + "loss": 2.6093, + "step": 20388 + }, + { + "epoch": 0.6046022002787416, + "grad_norm": 0.11407329887151718, + "learning_rate": 0.0003446415358906776, + "loss": 2.6174, + "step": 20389 + }, + { + "epoch": 0.604631853631053, + "grad_norm": 0.10688691586256027, + "learning_rate": 0.0003445968145230216, + "loss": 2.5882, + "step": 20390 + }, + { + "epoch": 0.6046615069833645, + "grad_norm": 0.10365072637796402, + "learning_rate": 0.00034455209453148743, + "loss": 2.614, + "step": 20391 + }, + { + "epoch": 0.6046911603356759, + "grad_norm": 0.11933411657810211, + "learning_rate": 0.0003445073759164712, + "loss": 2.6048, + "step": 20392 + }, + { + "epoch": 0.6047208136879875, + "grad_norm": 0.11525624245405197, + "learning_rate": 0.0003444626586783689, + "loss": 2.6091, + "step": 20393 + }, + { + "epoch": 0.6047504670402989, + "grad_norm": 0.109777070581913, + "learning_rate": 0.00034441794281757636, + "loss": 2.6336, + "step": 20394 + }, + { + "epoch": 0.6047801203926104, + "grad_norm": 0.10782384872436523, + "learning_rate": 0.0003443732283344897, + "loss": 2.565, + "step": 20395 + }, + { + "epoch": 0.6048097737449218, + "grad_norm": 0.10079864412546158, + "learning_rate": 0.00034432851522950476, + "loss": 2.599, + "step": 20396 + }, + { + "epoch": 0.6048394270972334, + "grad_norm": 0.12160186469554901, + "learning_rate": 0.00034428380350301756, + "loss": 2.6115, + "step": 20397 + }, + { + "epoch": 0.6048690804495448, + "grad_norm": 0.10411813110113144, + "learning_rate": 0.000344239093155424, + "loss": 2.5865, + "step": 20398 + }, + { + "epoch": 0.6048987338018563, + "grad_norm": 0.10951577872037888, + "learning_rate": 0.00034419438418712014, + "loss": 2.5906, + "step": 20399 + }, + { + "epoch": 0.6049283871541677, + "grad_norm": 0.10924142599105835, + "learning_rate": 0.0003441496765985014, + "loss": 2.6215, + "step": 20400 + }, + { + "epoch": 0.6049580405064793, + "grad_norm": 0.101175457239151, + "learning_rate": 0.0003441049703899642, + "loss": 2.6204, + "step": 20401 + }, + { + "epoch": 0.6049876938587907, + "grad_norm": 0.09788631647825241, + "learning_rate": 0.00034406026556190426, + "loss": 2.5834, + "step": 20402 + }, + { + "epoch": 0.6050173472111022, + "grad_norm": 0.10502421110868454, + "learning_rate": 0.00034401556211471765, + "loss": 2.5918, + "step": 20403 + }, + { + "epoch": 0.6050470005634137, + "grad_norm": 0.10090138018131256, + "learning_rate": 0.00034397086004879973, + "loss": 2.6206, + "step": 20404 + }, + { + "epoch": 0.6050766539157252, + "grad_norm": 0.09208739548921585, + "learning_rate": 0.0003439261593645467, + "loss": 2.6018, + "step": 20405 + }, + { + "epoch": 0.6051063072680366, + "grad_norm": 0.09887150675058365, + "learning_rate": 0.00034388146006235446, + "loss": 2.6295, + "step": 20406 + }, + { + "epoch": 0.6051359606203481, + "grad_norm": 0.10381097346544266, + "learning_rate": 0.00034383676214261847, + "loss": 2.6077, + "step": 20407 + }, + { + "epoch": 0.6051656139726596, + "grad_norm": 0.10371439903974533, + "learning_rate": 0.0003437920656057349, + "loss": 2.6078, + "step": 20408 + }, + { + "epoch": 0.6051952673249711, + "grad_norm": 0.11805593222379684, + "learning_rate": 0.0003437473704520994, + "loss": 2.609, + "step": 20409 + }, + { + "epoch": 0.6052249206772826, + "grad_norm": 0.12971097230911255, + "learning_rate": 0.00034370267668210774, + "loss": 2.563, + "step": 20410 + }, + { + "epoch": 0.605254574029594, + "grad_norm": 0.12213675677776337, + "learning_rate": 0.0003436579842961557, + "loss": 2.6327, + "step": 20411 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 0.12421268224716187, + "learning_rate": 0.00034361329329463906, + "loss": 2.6174, + "step": 20412 + }, + { + "epoch": 0.605313880734217, + "grad_norm": 0.10726729780435562, + "learning_rate": 0.00034356860367795326, + "loss": 2.6323, + "step": 20413 + }, + { + "epoch": 0.6053435340865285, + "grad_norm": 0.11141443997621536, + "learning_rate": 0.0003435239154464947, + "loss": 2.621, + "step": 20414 + }, + { + "epoch": 0.6053731874388399, + "grad_norm": 0.12341099977493286, + "learning_rate": 0.00034347922860065863, + "loss": 2.6046, + "step": 20415 + }, + { + "epoch": 0.6054028407911515, + "grad_norm": 0.13127869367599487, + "learning_rate": 0.0003434345431408408, + "loss": 2.6065, + "step": 20416 + }, + { + "epoch": 0.6054324941434629, + "grad_norm": 0.11213172972202301, + "learning_rate": 0.000343389859067437, + "loss": 2.5999, + "step": 20417 + }, + { + "epoch": 0.6054621474957744, + "grad_norm": 0.10312554240226746, + "learning_rate": 0.0003433451763808429, + "loss": 2.5853, + "step": 20418 + }, + { + "epoch": 0.6054918008480858, + "grad_norm": 0.11659561842679977, + "learning_rate": 0.00034330049508145413, + "loss": 2.627, + "step": 20419 + }, + { + "epoch": 0.6055214542003974, + "grad_norm": 0.0894637182354927, + "learning_rate": 0.00034325581516966646, + "loss": 2.6102, + "step": 20420 + }, + { + "epoch": 0.6055511075527088, + "grad_norm": 0.10794838517904282, + "learning_rate": 0.0003432111366458754, + "loss": 2.6314, + "step": 20421 + }, + { + "epoch": 0.6055807609050203, + "grad_norm": 0.11321977525949478, + "learning_rate": 0.0003431664595104766, + "loss": 2.6068, + "step": 20422 + }, + { + "epoch": 0.6056104142573318, + "grad_norm": 0.10735411942005157, + "learning_rate": 0.0003431217837638657, + "loss": 2.6323, + "step": 20423 + }, + { + "epoch": 0.6056400676096433, + "grad_norm": 0.10974033176898956, + "learning_rate": 0.00034307710940643834, + "loss": 2.6093, + "step": 20424 + }, + { + "epoch": 0.6056697209619547, + "grad_norm": 0.09753860533237457, + "learning_rate": 0.0003430324364385902, + "loss": 2.5796, + "step": 20425 + }, + { + "epoch": 0.6056993743142662, + "grad_norm": 0.11013751477003098, + "learning_rate": 0.0003429877648607166, + "loss": 2.5991, + "step": 20426 + }, + { + "epoch": 0.6057290276665777, + "grad_norm": 0.10311577469110489, + "learning_rate": 0.0003429430946732134, + "loss": 2.624, + "step": 20427 + }, + { + "epoch": 0.6057586810188892, + "grad_norm": 0.10036325454711914, + "learning_rate": 0.000342898425876476, + "loss": 2.5996, + "step": 20428 + }, + { + "epoch": 0.6057883343712006, + "grad_norm": 0.10626605153083801, + "learning_rate": 0.0003428537584709001, + "loss": 2.6261, + "step": 20429 + }, + { + "epoch": 0.6058179877235121, + "grad_norm": 0.10608013719320297, + "learning_rate": 0.0003428090924568811, + "loss": 2.5952, + "step": 20430 + }, + { + "epoch": 0.6058476410758237, + "grad_norm": 0.11305376887321472, + "learning_rate": 0.00034276442783481463, + "loss": 2.613, + "step": 20431 + }, + { + "epoch": 0.6058772944281351, + "grad_norm": 0.10979102551937103, + "learning_rate": 0.00034271976460509615, + "loss": 2.6038, + "step": 20432 + }, + { + "epoch": 0.6059069477804466, + "grad_norm": 0.09833132475614548, + "learning_rate": 0.000342675102768121, + "loss": 2.619, + "step": 20433 + }, + { + "epoch": 0.605936601132758, + "grad_norm": 0.10592056065797806, + "learning_rate": 0.00034263044232428496, + "loss": 2.6215, + "step": 20434 + }, + { + "epoch": 0.6059662544850696, + "grad_norm": 0.093113973736763, + "learning_rate": 0.0003425857832739833, + "loss": 2.5925, + "step": 20435 + }, + { + "epoch": 0.605995907837381, + "grad_norm": 0.10029449313879013, + "learning_rate": 0.0003425411256176115, + "loss": 2.5969, + "step": 20436 + }, + { + "epoch": 0.6060255611896925, + "grad_norm": 0.09980779141187668, + "learning_rate": 0.00034249646935556523, + "loss": 2.6164, + "step": 20437 + }, + { + "epoch": 0.606055214542004, + "grad_norm": 0.09896275401115417, + "learning_rate": 0.0003424518144882397, + "loss": 2.6155, + "step": 20438 + }, + { + "epoch": 0.6060848678943155, + "grad_norm": 0.09209048002958298, + "learning_rate": 0.0003424071610160301, + "loss": 2.6383, + "step": 20439 + }, + { + "epoch": 0.6061145212466269, + "grad_norm": 0.11402907222509384, + "learning_rate": 0.0003423625089393324, + "loss": 2.6257, + "step": 20440 + }, + { + "epoch": 0.6061441745989384, + "grad_norm": 0.10676980018615723, + "learning_rate": 0.0003423178582585418, + "loss": 2.5919, + "step": 20441 + }, + { + "epoch": 0.6061738279512499, + "grad_norm": 0.10714655369520187, + "learning_rate": 0.00034227320897405354, + "loss": 2.6212, + "step": 20442 + }, + { + "epoch": 0.6062034813035614, + "grad_norm": 0.09339240938425064, + "learning_rate": 0.0003422285610862631, + "loss": 2.6298, + "step": 20443 + }, + { + "epoch": 0.6062331346558728, + "grad_norm": 0.11049138754606247, + "learning_rate": 0.00034218391459556587, + "loss": 2.6249, + "step": 20444 + }, + { + "epoch": 0.6062627880081843, + "grad_norm": 0.09986880421638489, + "learning_rate": 0.00034213926950235716, + "loss": 2.6252, + "step": 20445 + }, + { + "epoch": 0.6062924413604958, + "grad_norm": 0.09067562222480774, + "learning_rate": 0.0003420946258070322, + "loss": 2.5984, + "step": 20446 + }, + { + "epoch": 0.6063220947128073, + "grad_norm": 0.10283113270998001, + "learning_rate": 0.00034204998350998654, + "loss": 2.6444, + "step": 20447 + }, + { + "epoch": 0.6063517480651187, + "grad_norm": 0.11010164022445679, + "learning_rate": 0.0003420053426116155, + "loss": 2.5953, + "step": 20448 + }, + { + "epoch": 0.6063814014174302, + "grad_norm": 0.11077459901571274, + "learning_rate": 0.00034196070311231407, + "loss": 2.6091, + "step": 20449 + }, + { + "epoch": 0.6064110547697417, + "grad_norm": 0.09835412353277206, + "learning_rate": 0.0003419160650124779, + "loss": 2.5717, + "step": 20450 + }, + { + "epoch": 0.6064407081220532, + "grad_norm": 0.12350023537874222, + "learning_rate": 0.00034187142831250194, + "loss": 2.6047, + "step": 20451 + }, + { + "epoch": 0.6064703614743647, + "grad_norm": 0.11582870036363602, + "learning_rate": 0.0003418267930127816, + "loss": 2.6062, + "step": 20452 + }, + { + "epoch": 0.6065000148266761, + "grad_norm": 0.09131032228469849, + "learning_rate": 0.0003417821591137122, + "loss": 2.6265, + "step": 20453 + }, + { + "epoch": 0.6065296681789877, + "grad_norm": 0.12488705664873123, + "learning_rate": 0.0003417375266156891, + "loss": 2.6277, + "step": 20454 + }, + { + "epoch": 0.6065593215312991, + "grad_norm": 0.13269542157649994, + "learning_rate": 0.0003416928955191072, + "loss": 2.6058, + "step": 20455 + }, + { + "epoch": 0.6065889748836106, + "grad_norm": 0.10310646146535873, + "learning_rate": 0.000341648265824362, + "loss": 2.6227, + "step": 20456 + }, + { + "epoch": 0.606618628235922, + "grad_norm": 0.10935075581073761, + "learning_rate": 0.00034160363753184855, + "loss": 2.6001, + "step": 20457 + }, + { + "epoch": 0.6066482815882336, + "grad_norm": 0.12226791679859161, + "learning_rate": 0.00034155901064196215, + "loss": 2.6168, + "step": 20458 + }, + { + "epoch": 0.606677934940545, + "grad_norm": 0.10305503010749817, + "learning_rate": 0.0003415143851550978, + "loss": 2.5888, + "step": 20459 + }, + { + "epoch": 0.6067075882928565, + "grad_norm": 0.10901983827352524, + "learning_rate": 0.0003414697610716508, + "loss": 2.6034, + "step": 20460 + }, + { + "epoch": 0.606737241645168, + "grad_norm": 0.10782444477081299, + "learning_rate": 0.0003414251383920163, + "loss": 2.6091, + "step": 20461 + }, + { + "epoch": 0.6067668949974795, + "grad_norm": 0.09264705330133438, + "learning_rate": 0.00034138051711658936, + "loss": 2.596, + "step": 20462 + }, + { + "epoch": 0.6067965483497909, + "grad_norm": 0.11315155774354935, + "learning_rate": 0.0003413358972457652, + "loss": 2.5755, + "step": 20463 + }, + { + "epoch": 0.6068262017021024, + "grad_norm": 0.09750255942344666, + "learning_rate": 0.00034129127877993894, + "loss": 2.6412, + "step": 20464 + }, + { + "epoch": 0.6068558550544139, + "grad_norm": 0.10286519676446915, + "learning_rate": 0.0003412466617195055, + "loss": 2.5741, + "step": 20465 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 0.10589536279439926, + "learning_rate": 0.0003412020460648602, + "loss": 2.6071, + "step": 20466 + }, + { + "epoch": 0.6069151617590368, + "grad_norm": 0.1058942973613739, + "learning_rate": 0.00034115743181639804, + "loss": 2.5996, + "step": 20467 + }, + { + "epoch": 0.6069448151113483, + "grad_norm": 0.1055915504693985, + "learning_rate": 0.00034111281897451415, + "loss": 2.5901, + "step": 20468 + }, + { + "epoch": 0.6069744684636598, + "grad_norm": 0.1084320917725563, + "learning_rate": 0.00034106820753960354, + "loss": 2.5862, + "step": 20469 + }, + { + "epoch": 0.6070041218159713, + "grad_norm": 0.10964890569448471, + "learning_rate": 0.0003410235975120612, + "loss": 2.6181, + "step": 20470 + }, + { + "epoch": 0.6070337751682828, + "grad_norm": 0.11388735473155975, + "learning_rate": 0.00034097898889228216, + "loss": 2.6061, + "step": 20471 + }, + { + "epoch": 0.6070634285205943, + "grad_norm": 0.10278426110744476, + "learning_rate": 0.0003409343816806615, + "loss": 2.6225, + "step": 20472 + }, + { + "epoch": 0.6070930818729058, + "grad_norm": 0.10661105066537857, + "learning_rate": 0.0003408897758775942, + "loss": 2.6312, + "step": 20473 + }, + { + "epoch": 0.6071227352252172, + "grad_norm": 0.0950712114572525, + "learning_rate": 0.00034084517148347515, + "loss": 2.6098, + "step": 20474 + }, + { + "epoch": 0.6071523885775287, + "grad_norm": 0.11652321368455887, + "learning_rate": 0.0003408005684986996, + "loss": 2.6242, + "step": 20475 + }, + { + "epoch": 0.6071820419298402, + "grad_norm": 0.13010510802268982, + "learning_rate": 0.00034075596692366227, + "loss": 2.6073, + "step": 20476 + }, + { + "epoch": 0.6072116952821517, + "grad_norm": 0.12590105831623077, + "learning_rate": 0.0003407113667587581, + "loss": 2.6238, + "step": 20477 + }, + { + "epoch": 0.6072413486344631, + "grad_norm": 0.1055927500128746, + "learning_rate": 0.00034066676800438204, + "loss": 2.6191, + "step": 20478 + }, + { + "epoch": 0.6072710019867746, + "grad_norm": 0.1174456998705864, + "learning_rate": 0.00034062217066092924, + "loss": 2.5682, + "step": 20479 + }, + { + "epoch": 0.6073006553390861, + "grad_norm": 0.10615599155426025, + "learning_rate": 0.00034057757472879445, + "loss": 2.6464, + "step": 20480 + }, + { + "epoch": 0.6073303086913976, + "grad_norm": 0.12152259051799774, + "learning_rate": 0.00034053298020837264, + "loss": 2.6534, + "step": 20481 + }, + { + "epoch": 0.607359962043709, + "grad_norm": 0.1169363483786583, + "learning_rate": 0.00034048838710005867, + "loss": 2.6297, + "step": 20482 + }, + { + "epoch": 0.6073896153960205, + "grad_norm": 0.09772181510925293, + "learning_rate": 0.00034044379540424736, + "loss": 2.5885, + "step": 20483 + }, + { + "epoch": 0.607419268748332, + "grad_norm": 0.10396851599216461, + "learning_rate": 0.0003403992051213337, + "loss": 2.5865, + "step": 20484 + }, + { + "epoch": 0.6074489221006435, + "grad_norm": 0.1095205545425415, + "learning_rate": 0.0003403546162517124, + "loss": 2.5937, + "step": 20485 + }, + { + "epoch": 0.6074785754529549, + "grad_norm": 0.10653872042894363, + "learning_rate": 0.00034031002879577845, + "loss": 2.6123, + "step": 20486 + }, + { + "epoch": 0.6075082288052664, + "grad_norm": 0.10405904054641724, + "learning_rate": 0.0003402654427539266, + "loss": 2.5708, + "step": 20487 + }, + { + "epoch": 0.6075378821575779, + "grad_norm": 0.09640531241893768, + "learning_rate": 0.00034022085812655155, + "loss": 2.6236, + "step": 20488 + }, + { + "epoch": 0.6075675355098894, + "grad_norm": 0.12149131298065186, + "learning_rate": 0.00034017627491404837, + "loss": 2.6158, + "step": 20489 + }, + { + "epoch": 0.6075971888622008, + "grad_norm": 0.11687720566987991, + "learning_rate": 0.00034013169311681164, + "loss": 2.5877, + "step": 20490 + }, + { + "epoch": 0.6076268422145124, + "grad_norm": 0.106251560151577, + "learning_rate": 0.00034008711273523606, + "loss": 2.5993, + "step": 20491 + }, + { + "epoch": 0.6076564955668239, + "grad_norm": 0.09902927279472351, + "learning_rate": 0.0003400425337697167, + "loss": 2.6054, + "step": 20492 + }, + { + "epoch": 0.6076861489191353, + "grad_norm": 0.10121534764766693, + "learning_rate": 0.0003399979562206482, + "loss": 2.6075, + "step": 20493 + }, + { + "epoch": 0.6077158022714468, + "grad_norm": 0.09009470790624619, + "learning_rate": 0.0003399533800884252, + "loss": 2.5862, + "step": 20494 + }, + { + "epoch": 0.6077454556237583, + "grad_norm": 0.09563979506492615, + "learning_rate": 0.0003399088053734425, + "loss": 2.6133, + "step": 20495 + }, + { + "epoch": 0.6077751089760698, + "grad_norm": 0.10742530226707458, + "learning_rate": 0.00033986423207609494, + "loss": 2.6276, + "step": 20496 + }, + { + "epoch": 0.6078047623283812, + "grad_norm": 0.10002885013818741, + "learning_rate": 0.00033981966019677696, + "loss": 2.6193, + "step": 20497 + }, + { + "epoch": 0.6078344156806927, + "grad_norm": 0.10905785858631134, + "learning_rate": 0.0003397750897358834, + "loss": 2.6353, + "step": 20498 + }, + { + "epoch": 0.6078640690330042, + "grad_norm": 0.11152739077806473, + "learning_rate": 0.00033973052069380887, + "loss": 2.6193, + "step": 20499 + }, + { + "epoch": 0.6078937223853157, + "grad_norm": 0.09990277141332626, + "learning_rate": 0.00033968595307094817, + "loss": 2.5932, + "step": 20500 + }, + { + "epoch": 0.6079233757376271, + "grad_norm": 0.09174777567386627, + "learning_rate": 0.0003396413868676959, + "loss": 2.6274, + "step": 20501 + }, + { + "epoch": 0.6079530290899386, + "grad_norm": 0.11141382902860641, + "learning_rate": 0.00033959682208444653, + "loss": 2.6131, + "step": 20502 + }, + { + "epoch": 0.6079826824422501, + "grad_norm": 0.12495098263025284, + "learning_rate": 0.000339552258721595, + "loss": 2.612, + "step": 20503 + }, + { + "epoch": 0.6080123357945616, + "grad_norm": 0.1113346517086029, + "learning_rate": 0.00033950769677953545, + "loss": 2.6266, + "step": 20504 + }, + { + "epoch": 0.608041989146873, + "grad_norm": 0.10096737742424011, + "learning_rate": 0.000339463136258663, + "loss": 2.6169, + "step": 20505 + }, + { + "epoch": 0.6080716424991846, + "grad_norm": 0.111186683177948, + "learning_rate": 0.0003394185771593721, + "loss": 2.6563, + "step": 20506 + }, + { + "epoch": 0.608101295851496, + "grad_norm": 0.12447655946016312, + "learning_rate": 0.00033937401948205733, + "loss": 2.6173, + "step": 20507 + }, + { + "epoch": 0.6081309492038075, + "grad_norm": 0.11506713926792145, + "learning_rate": 0.00033932946322711306, + "loss": 2.5991, + "step": 20508 + }, + { + "epoch": 0.6081606025561189, + "grad_norm": 0.11248080432415009, + "learning_rate": 0.00033928490839493395, + "loss": 2.6298, + "step": 20509 + }, + { + "epoch": 0.6081902559084305, + "grad_norm": 0.10131311416625977, + "learning_rate": 0.00033924035498591467, + "loss": 2.6088, + "step": 20510 + }, + { + "epoch": 0.6082199092607419, + "grad_norm": 0.1082373559474945, + "learning_rate": 0.0003391958030004495, + "loss": 2.6116, + "step": 20511 + }, + { + "epoch": 0.6082495626130534, + "grad_norm": 0.12294284254312515, + "learning_rate": 0.0003391512524389332, + "loss": 2.6202, + "step": 20512 + }, + { + "epoch": 0.6082792159653649, + "grad_norm": 0.10788799822330475, + "learning_rate": 0.0003391067033017602, + "loss": 2.588, + "step": 20513 + }, + { + "epoch": 0.6083088693176764, + "grad_norm": 0.11756019294261932, + "learning_rate": 0.0003390621555893249, + "loss": 2.6145, + "step": 20514 + }, + { + "epoch": 0.6083385226699879, + "grad_norm": 0.12157323211431503, + "learning_rate": 0.00033901760930202187, + "loss": 2.6224, + "step": 20515 + }, + { + "epoch": 0.6083681760222993, + "grad_norm": 0.11159786581993103, + "learning_rate": 0.00033897306444024547, + "loss": 2.6031, + "step": 20516 + }, + { + "epoch": 0.6083978293746108, + "grad_norm": 0.11596444994211197, + "learning_rate": 0.00033892852100439013, + "loss": 2.6091, + "step": 20517 + }, + { + "epoch": 0.6084274827269223, + "grad_norm": 0.1088188961148262, + "learning_rate": 0.0003388839789948505, + "loss": 2.6182, + "step": 20518 + }, + { + "epoch": 0.6084571360792338, + "grad_norm": 0.1008448526263237, + "learning_rate": 0.0003388394384120209, + "loss": 2.611, + "step": 20519 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 0.11350277066230774, + "learning_rate": 0.00033879489925629574, + "loss": 2.6103, + "step": 20520 + }, + { + "epoch": 0.6085164427838567, + "grad_norm": 0.11431547999382019, + "learning_rate": 0.00033875036152806944, + "loss": 2.6233, + "step": 20521 + }, + { + "epoch": 0.6085460961361682, + "grad_norm": 0.10797489434480667, + "learning_rate": 0.0003387058252277363, + "loss": 2.5734, + "step": 20522 + }, + { + "epoch": 0.6085757494884797, + "grad_norm": 0.11037296801805496, + "learning_rate": 0.0003386612903556908, + "loss": 2.5855, + "step": 20523 + }, + { + "epoch": 0.6086054028407911, + "grad_norm": 0.11282667517662048, + "learning_rate": 0.0003386167569123274, + "loss": 2.6067, + "step": 20524 + }, + { + "epoch": 0.6086350561931027, + "grad_norm": 0.10132303833961487, + "learning_rate": 0.00033857222489804017, + "loss": 2.606, + "step": 20525 + }, + { + "epoch": 0.6086647095454141, + "grad_norm": 0.10774676501750946, + "learning_rate": 0.0003385276943132236, + "loss": 2.6549, + "step": 20526 + }, + { + "epoch": 0.6086943628977256, + "grad_norm": 0.12746401131153107, + "learning_rate": 0.00033848316515827203, + "loss": 2.6288, + "step": 20527 + }, + { + "epoch": 0.608724016250037, + "grad_norm": 0.11229826509952545, + "learning_rate": 0.00033843863743357976, + "loss": 2.6442, + "step": 20528 + }, + { + "epoch": 0.6087536696023486, + "grad_norm": 0.10633332282304764, + "learning_rate": 0.0003383941111395411, + "loss": 2.6368, + "step": 20529 + }, + { + "epoch": 0.60878332295466, + "grad_norm": 0.1141342893242836, + "learning_rate": 0.0003383495862765502, + "loss": 2.5861, + "step": 20530 + }, + { + "epoch": 0.6088129763069715, + "grad_norm": 0.11674220114946365, + "learning_rate": 0.0003383050628450016, + "loss": 2.5858, + "step": 20531 + }, + { + "epoch": 0.6088426296592829, + "grad_norm": 0.10739497095346451, + "learning_rate": 0.00033826054084528935, + "loss": 2.5752, + "step": 20532 + }, + { + "epoch": 0.6088722830115945, + "grad_norm": 0.11493103951215744, + "learning_rate": 0.0003382160202778078, + "loss": 2.5959, + "step": 20533 + }, + { + "epoch": 0.608901936363906, + "grad_norm": 0.10976205766201019, + "learning_rate": 0.0003381715011429513, + "loss": 2.6005, + "step": 20534 + }, + { + "epoch": 0.6089315897162174, + "grad_norm": 0.11901258677244186, + "learning_rate": 0.0003381269834411138, + "loss": 2.5923, + "step": 20535 + }, + { + "epoch": 0.608961243068529, + "grad_norm": 0.10959887504577637, + "learning_rate": 0.00033808246717268966, + "loss": 2.5879, + "step": 20536 + }, + { + "epoch": 0.6089908964208404, + "grad_norm": 0.10794834047555923, + "learning_rate": 0.0003380379523380731, + "loss": 2.5606, + "step": 20537 + }, + { + "epoch": 0.6090205497731519, + "grad_norm": 0.12877897918224335, + "learning_rate": 0.0003379934389376582, + "loss": 2.5993, + "step": 20538 + }, + { + "epoch": 0.6090502031254633, + "grad_norm": 0.12871398031711578, + "learning_rate": 0.00033794892697183926, + "loss": 2.6125, + "step": 20539 + }, + { + "epoch": 0.6090798564777749, + "grad_norm": 0.10307502001523972, + "learning_rate": 0.00033790441644101035, + "loss": 2.6233, + "step": 20540 + }, + { + "epoch": 0.6091095098300863, + "grad_norm": 0.10799740999937057, + "learning_rate": 0.0003378599073455658, + "loss": 2.6137, + "step": 20541 + }, + { + "epoch": 0.6091391631823978, + "grad_norm": 0.1279037743806839, + "learning_rate": 0.0003378153996858993, + "loss": 2.61, + "step": 20542 + }, + { + "epoch": 0.6091688165347092, + "grad_norm": 0.11068914830684662, + "learning_rate": 0.0003377708934624054, + "loss": 2.6508, + "step": 20543 + }, + { + "epoch": 0.6091984698870208, + "grad_norm": 0.09879777580499649, + "learning_rate": 0.00033772638867547814, + "loss": 2.6243, + "step": 20544 + }, + { + "epoch": 0.6092281232393322, + "grad_norm": 0.10578307509422302, + "learning_rate": 0.0003376818853255117, + "loss": 2.6213, + "step": 20545 + }, + { + "epoch": 0.6092577765916437, + "grad_norm": 0.10697290301322937, + "learning_rate": 0.00033763738341289985, + "loss": 2.5568, + "step": 20546 + }, + { + "epoch": 0.6092874299439551, + "grad_norm": 0.097693532705307, + "learning_rate": 0.00033759288293803687, + "loss": 2.5936, + "step": 20547 + }, + { + "epoch": 0.6093170832962667, + "grad_norm": 0.13040058314800262, + "learning_rate": 0.00033754838390131684, + "loss": 2.6177, + "step": 20548 + }, + { + "epoch": 0.6093467366485781, + "grad_norm": 0.1393156796693802, + "learning_rate": 0.00033750388630313365, + "loss": 2.6083, + "step": 20549 + }, + { + "epoch": 0.6093763900008896, + "grad_norm": 0.10754049569368362, + "learning_rate": 0.00033745939014388147, + "loss": 2.6435, + "step": 20550 + }, + { + "epoch": 0.609406043353201, + "grad_norm": 0.1088392585515976, + "learning_rate": 0.0003374148954239544, + "loss": 2.5875, + "step": 20551 + }, + { + "epoch": 0.6094356967055126, + "grad_norm": 0.10647879540920258, + "learning_rate": 0.00033737040214374634, + "loss": 2.5878, + "step": 20552 + }, + { + "epoch": 0.609465350057824, + "grad_norm": 0.10156144946813583, + "learning_rate": 0.00033732591030365117, + "loss": 2.6007, + "step": 20553 + }, + { + "epoch": 0.6094950034101355, + "grad_norm": 0.10644717514514923, + "learning_rate": 0.000337281419904063, + "loss": 2.633, + "step": 20554 + }, + { + "epoch": 0.609524656762447, + "grad_norm": 0.10814491659402847, + "learning_rate": 0.00033723693094537573, + "loss": 2.5632, + "step": 20555 + }, + { + "epoch": 0.6095543101147585, + "grad_norm": 0.09066683799028397, + "learning_rate": 0.0003371924434279834, + "loss": 2.5936, + "step": 20556 + }, + { + "epoch": 0.60958396346707, + "grad_norm": 0.10495632886886597, + "learning_rate": 0.00033714795735228, + "loss": 2.5766, + "step": 20557 + }, + { + "epoch": 0.6096136168193814, + "grad_norm": 0.11736714839935303, + "learning_rate": 0.00033710347271865936, + "loss": 2.618, + "step": 20558 + }, + { + "epoch": 0.609643270171693, + "grad_norm": 0.11233770102262497, + "learning_rate": 0.0003370589895275155, + "loss": 2.5843, + "step": 20559 + }, + { + "epoch": 0.6096729235240044, + "grad_norm": 0.11398052424192429, + "learning_rate": 0.00033701450777924217, + "loss": 2.6592, + "step": 20560 + }, + { + "epoch": 0.6097025768763159, + "grad_norm": 0.12528640031814575, + "learning_rate": 0.0003369700274742333, + "loss": 2.6121, + "step": 20561 + }, + { + "epoch": 0.6097322302286273, + "grad_norm": 0.1342727690935135, + "learning_rate": 0.00033692554861288306, + "loss": 2.6295, + "step": 20562 + }, + { + "epoch": 0.6097618835809389, + "grad_norm": 0.10733520239591599, + "learning_rate": 0.0003368810711955849, + "loss": 2.5904, + "step": 20563 + }, + { + "epoch": 0.6097915369332503, + "grad_norm": 0.11952045559883118, + "learning_rate": 0.00033683659522273284, + "loss": 2.6176, + "step": 20564 + }, + { + "epoch": 0.6098211902855618, + "grad_norm": 0.1164437010884285, + "learning_rate": 0.00033679212069472075, + "loss": 2.5969, + "step": 20565 + }, + { + "epoch": 0.6098508436378732, + "grad_norm": 0.10687059164047241, + "learning_rate": 0.0003367476476119424, + "loss": 2.6067, + "step": 20566 + }, + { + "epoch": 0.6098804969901848, + "grad_norm": 0.0934063121676445, + "learning_rate": 0.00033670317597479175, + "loss": 2.5717, + "step": 20567 + }, + { + "epoch": 0.6099101503424962, + "grad_norm": 0.10028364509344101, + "learning_rate": 0.0003366587057836624, + "loss": 2.6172, + "step": 20568 + }, + { + "epoch": 0.6099398036948077, + "grad_norm": 0.0906461849808693, + "learning_rate": 0.0003366142370389483, + "loss": 2.6034, + "step": 20569 + }, + { + "epoch": 0.6099694570471191, + "grad_norm": 0.1068282425403595, + "learning_rate": 0.00033656976974104314, + "loss": 2.5959, + "step": 20570 + }, + { + "epoch": 0.6099991103994307, + "grad_norm": 0.10041928291320801, + "learning_rate": 0.0003365253038903408, + "loss": 2.5899, + "step": 20571 + }, + { + "epoch": 0.6100287637517421, + "grad_norm": 0.1112886592745781, + "learning_rate": 0.00033648083948723505, + "loss": 2.595, + "step": 20572 + }, + { + "epoch": 0.6100584171040536, + "grad_norm": 0.10154904425144196, + "learning_rate": 0.0003364363765321194, + "loss": 2.6254, + "step": 20573 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 0.10327228903770447, + "learning_rate": 0.00033639191502538767, + "loss": 2.6318, + "step": 20574 + }, + { + "epoch": 0.6101177238086766, + "grad_norm": 0.09460695832967758, + "learning_rate": 0.0003363474549674337, + "loss": 2.5936, + "step": 20575 + }, + { + "epoch": 0.6101473771609881, + "grad_norm": 0.10260185599327087, + "learning_rate": 0.0003363029963586511, + "loss": 2.6054, + "step": 20576 + }, + { + "epoch": 0.6101770305132995, + "grad_norm": 0.12144384533166885, + "learning_rate": 0.0003362585391994335, + "loss": 2.5719, + "step": 20577 + }, + { + "epoch": 0.610206683865611, + "grad_norm": 0.08837208896875381, + "learning_rate": 0.00033621408349017467, + "loss": 2.5938, + "step": 20578 + }, + { + "epoch": 0.6102363372179225, + "grad_norm": 0.11673494428396225, + "learning_rate": 0.00033616962923126837, + "loss": 2.6257, + "step": 20579 + }, + { + "epoch": 0.610265990570234, + "grad_norm": 0.12487217038869858, + "learning_rate": 0.000336125176423108, + "loss": 2.612, + "step": 20580 + }, + { + "epoch": 0.6102956439225454, + "grad_norm": 0.09202443063259125, + "learning_rate": 0.00033608072506608713, + "loss": 2.5598, + "step": 20581 + }, + { + "epoch": 0.610325297274857, + "grad_norm": 0.10307905822992325, + "learning_rate": 0.00033603627516059977, + "loss": 2.6492, + "step": 20582 + }, + { + "epoch": 0.6103549506271684, + "grad_norm": 0.102450892329216, + "learning_rate": 0.0003359918267070394, + "loss": 2.6429, + "step": 20583 + }, + { + "epoch": 0.6103846039794799, + "grad_norm": 0.1035616472363472, + "learning_rate": 0.00033594737970579953, + "loss": 2.6281, + "step": 20584 + }, + { + "epoch": 0.6104142573317913, + "grad_norm": 0.12023666501045227, + "learning_rate": 0.00033590293415727374, + "loss": 2.5965, + "step": 20585 + }, + { + "epoch": 0.6104439106841029, + "grad_norm": 0.11918508261442184, + "learning_rate": 0.00033585849006185565, + "loss": 2.6105, + "step": 20586 + }, + { + "epoch": 0.6104735640364143, + "grad_norm": 0.09555492550134659, + "learning_rate": 0.00033581404741993883, + "loss": 2.6259, + "step": 20587 + }, + { + "epoch": 0.6105032173887258, + "grad_norm": 0.1057562381029129, + "learning_rate": 0.0003357696062319167, + "loss": 2.6281, + "step": 20588 + }, + { + "epoch": 0.6105328707410372, + "grad_norm": 0.10506504029035568, + "learning_rate": 0.00033572516649818307, + "loss": 2.6318, + "step": 20589 + }, + { + "epoch": 0.6105625240933488, + "grad_norm": 0.10717809200286865, + "learning_rate": 0.0003356807282191312, + "loss": 2.5662, + "step": 20590 + }, + { + "epoch": 0.6105921774456602, + "grad_norm": 0.09803242236375809, + "learning_rate": 0.00033563629139515464, + "loss": 2.5769, + "step": 20591 + }, + { + "epoch": 0.6106218307979717, + "grad_norm": 0.09644570201635361, + "learning_rate": 0.00033559185602664697, + "loss": 2.6101, + "step": 20592 + }, + { + "epoch": 0.6106514841502831, + "grad_norm": 0.10390613228082657, + "learning_rate": 0.0003355474221140016, + "loss": 2.6042, + "step": 20593 + }, + { + "epoch": 0.6106811375025947, + "grad_norm": 0.10155394673347473, + "learning_rate": 0.00033550298965761194, + "loss": 2.5899, + "step": 20594 + }, + { + "epoch": 0.6107107908549061, + "grad_norm": 0.1266787052154541, + "learning_rate": 0.00033545855865787166, + "loss": 2.6096, + "step": 20595 + }, + { + "epoch": 0.6107404442072176, + "grad_norm": 0.1246945932507515, + "learning_rate": 0.0003354141291151741, + "loss": 2.6423, + "step": 20596 + }, + { + "epoch": 0.6107700975595292, + "grad_norm": 0.1125112995505333, + "learning_rate": 0.00033536970102991263, + "loss": 2.6073, + "step": 20597 + }, + { + "epoch": 0.6107997509118406, + "grad_norm": 0.10930461436510086, + "learning_rate": 0.00033532527440248075, + "loss": 2.5954, + "step": 20598 + }, + { + "epoch": 0.6108294042641521, + "grad_norm": 0.14090080559253693, + "learning_rate": 0.00033528084923327184, + "loss": 2.6201, + "step": 20599 + }, + { + "epoch": 0.6108590576164635, + "grad_norm": 0.11567109078168869, + "learning_rate": 0.0003352364255226794, + "loss": 2.6188, + "step": 20600 + }, + { + "epoch": 0.6108887109687751, + "grad_norm": 0.10901284962892532, + "learning_rate": 0.00033519200327109655, + "loss": 2.6008, + "step": 20601 + }, + { + "epoch": 0.6109183643210865, + "grad_norm": 0.1361062377691269, + "learning_rate": 0.0003351475824789168, + "loss": 2.5983, + "step": 20602 + }, + { + "epoch": 0.610948017673398, + "grad_norm": 0.12442880123853683, + "learning_rate": 0.00033510316314653356, + "loss": 2.61, + "step": 20603 + }, + { + "epoch": 0.6109776710257094, + "grad_norm": 0.09058783948421478, + "learning_rate": 0.0003350587452743401, + "loss": 2.6034, + "step": 20604 + }, + { + "epoch": 0.611007324378021, + "grad_norm": 0.12732061743736267, + "learning_rate": 0.00033501432886272965, + "loss": 2.6332, + "step": 20605 + }, + { + "epoch": 0.6110369777303324, + "grad_norm": 0.1449098289012909, + "learning_rate": 0.00033496991391209575, + "loss": 2.6339, + "step": 20606 + }, + { + "epoch": 0.6110666310826439, + "grad_norm": 0.11124102771282196, + "learning_rate": 0.00033492550042283145, + "loss": 2.6316, + "step": 20607 + }, + { + "epoch": 0.6110962844349553, + "grad_norm": 0.12668900191783905, + "learning_rate": 0.00033488108839533026, + "loss": 2.6361, + "step": 20608 + }, + { + "epoch": 0.6111259377872669, + "grad_norm": 0.14239628612995148, + "learning_rate": 0.0003348366778299854, + "loss": 2.63, + "step": 20609 + }, + { + "epoch": 0.6111555911395783, + "grad_norm": 0.10401998460292816, + "learning_rate": 0.0003347922687271903, + "loss": 2.5829, + "step": 20610 + }, + { + "epoch": 0.6111852444918898, + "grad_norm": 0.12309958040714264, + "learning_rate": 0.0003347478610873378, + "loss": 2.5691, + "step": 20611 + }, + { + "epoch": 0.6112148978442012, + "grad_norm": 0.1209036335349083, + "learning_rate": 0.00033470345491082136, + "loss": 2.6023, + "step": 20612 + }, + { + "epoch": 0.6112445511965128, + "grad_norm": 0.11425431817770004, + "learning_rate": 0.0003346590501980342, + "loss": 2.6035, + "step": 20613 + }, + { + "epoch": 0.6112742045488242, + "grad_norm": 0.11872580647468567, + "learning_rate": 0.0003346146469493696, + "loss": 2.5866, + "step": 20614 + }, + { + "epoch": 0.6113038579011357, + "grad_norm": 0.12135182321071625, + "learning_rate": 0.0003345702451652206, + "loss": 2.6293, + "step": 20615 + }, + { + "epoch": 0.6113335112534471, + "grad_norm": 0.13729585707187653, + "learning_rate": 0.00033452584484598057, + "loss": 2.5913, + "step": 20616 + }, + { + "epoch": 0.6113631646057587, + "grad_norm": 0.12968842685222626, + "learning_rate": 0.00033448144599204264, + "loss": 2.608, + "step": 20617 + }, + { + "epoch": 0.6113928179580702, + "grad_norm": 0.11694909632205963, + "learning_rate": 0.0003344370486037998, + "loss": 2.6039, + "step": 20618 + }, + { + "epoch": 0.6114224713103816, + "grad_norm": 0.11453748494386673, + "learning_rate": 0.0003343926526816453, + "loss": 2.6114, + "step": 20619 + }, + { + "epoch": 0.6114521246626932, + "grad_norm": 0.11523646861314774, + "learning_rate": 0.00033434825822597217, + "loss": 2.5975, + "step": 20620 + }, + { + "epoch": 0.6114817780150046, + "grad_norm": 0.13430221378803253, + "learning_rate": 0.00033430386523717383, + "loss": 2.6391, + "step": 20621 + }, + { + "epoch": 0.6115114313673161, + "grad_norm": 0.11839480698108673, + "learning_rate": 0.0003342594737156432, + "loss": 2.5976, + "step": 20622 + }, + { + "epoch": 0.6115410847196275, + "grad_norm": 0.10946639627218246, + "learning_rate": 0.0003342150836617733, + "loss": 2.6338, + "step": 20623 + }, + { + "epoch": 0.6115707380719391, + "grad_norm": 0.11238078027963638, + "learning_rate": 0.00033417069507595736, + "loss": 2.6236, + "step": 20624 + }, + { + "epoch": 0.6116003914242505, + "grad_norm": 0.10283619910478592, + "learning_rate": 0.00033412630795858834, + "loss": 2.599, + "step": 20625 + }, + { + "epoch": 0.611630044776562, + "grad_norm": 0.11019126325845718, + "learning_rate": 0.0003340819223100594, + "loss": 2.6072, + "step": 20626 + }, + { + "epoch": 0.6116596981288734, + "grad_norm": 0.0989314466714859, + "learning_rate": 0.00033403753813076356, + "loss": 2.6384, + "step": 20627 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 0.10533825308084488, + "learning_rate": 0.00033399315542109373, + "loss": 2.5914, + "step": 20628 + }, + { + "epoch": 0.6117190048334964, + "grad_norm": 0.10679072886705399, + "learning_rate": 0.000333948774181443, + "loss": 2.6039, + "step": 20629 + }, + { + "epoch": 0.6117486581858079, + "grad_norm": 0.09961411356925964, + "learning_rate": 0.00033390439441220433, + "loss": 2.612, + "step": 20630 + }, + { + "epoch": 0.6117783115381193, + "grad_norm": 0.09992088377475739, + "learning_rate": 0.0003338600161137708, + "loss": 2.6135, + "step": 20631 + }, + { + "epoch": 0.6118079648904309, + "grad_norm": 0.11335621774196625, + "learning_rate": 0.00033381563928653533, + "loss": 2.6312, + "step": 20632 + }, + { + "epoch": 0.6118376182427423, + "grad_norm": 0.11412326246500015, + "learning_rate": 0.0003337712639308908, + "loss": 2.6213, + "step": 20633 + }, + { + "epoch": 0.6118672715950538, + "grad_norm": 0.10423161834478378, + "learning_rate": 0.0003337268900472303, + "loss": 2.6214, + "step": 20634 + }, + { + "epoch": 0.6118969249473653, + "grad_norm": 0.09622431546449661, + "learning_rate": 0.0003336825176359468, + "loss": 2.6165, + "step": 20635 + }, + { + "epoch": 0.6119265782996768, + "grad_norm": 0.09103468060493469, + "learning_rate": 0.0003336381466974332, + "loss": 2.6159, + "step": 20636 + }, + { + "epoch": 0.6119562316519882, + "grad_norm": 0.09548568725585938, + "learning_rate": 0.00033359377723208227, + "loss": 2.6131, + "step": 20637 + }, + { + "epoch": 0.6119858850042997, + "grad_norm": 0.09678435325622559, + "learning_rate": 0.0003335494092402872, + "loss": 2.5891, + "step": 20638 + }, + { + "epoch": 0.6120155383566113, + "grad_norm": 0.10402414947748184, + "learning_rate": 0.00033350504272244055, + "loss": 2.599, + "step": 20639 + }, + { + "epoch": 0.6120451917089227, + "grad_norm": 0.10092644393444061, + "learning_rate": 0.00033346067767893526, + "loss": 2.6057, + "step": 20640 + }, + { + "epoch": 0.6120748450612342, + "grad_norm": 0.11511864513158798, + "learning_rate": 0.0003334163141101644, + "loss": 2.612, + "step": 20641 + }, + { + "epoch": 0.6121044984135456, + "grad_norm": 0.10023095458745956, + "learning_rate": 0.00033337195201652047, + "loss": 2.602, + "step": 20642 + }, + { + "epoch": 0.6121341517658572, + "grad_norm": 0.10094193369150162, + "learning_rate": 0.00033332759139839663, + "loss": 2.6209, + "step": 20643 + }, + { + "epoch": 0.6121638051181686, + "grad_norm": 0.09780539572238922, + "learning_rate": 0.0003332832322561856, + "loss": 2.6208, + "step": 20644 + }, + { + "epoch": 0.6121934584704801, + "grad_norm": 0.09855041652917862, + "learning_rate": 0.0003332388745902802, + "loss": 2.6118, + "step": 20645 + }, + { + "epoch": 0.6122231118227915, + "grad_norm": 0.09522207081317902, + "learning_rate": 0.00033319451840107297, + "loss": 2.588, + "step": 20646 + }, + { + "epoch": 0.6122527651751031, + "grad_norm": 0.10246367007493973, + "learning_rate": 0.000333150163688957, + "loss": 2.6415, + "step": 20647 + }, + { + "epoch": 0.6122824185274145, + "grad_norm": 0.10184195637702942, + "learning_rate": 0.00033310581045432517, + "loss": 2.6185, + "step": 20648 + }, + { + "epoch": 0.612312071879726, + "grad_norm": 0.0947413444519043, + "learning_rate": 0.0003330614586975699, + "loss": 2.6092, + "step": 20649 + }, + { + "epoch": 0.6123417252320374, + "grad_norm": 0.09885315597057343, + "learning_rate": 0.00033301710841908405, + "loss": 2.6201, + "step": 20650 + }, + { + "epoch": 0.612371378584349, + "grad_norm": 0.09241917729377747, + "learning_rate": 0.0003329727596192603, + "loss": 2.6236, + "step": 20651 + }, + { + "epoch": 0.6124010319366604, + "grad_norm": 0.1042308434844017, + "learning_rate": 0.0003329284122984916, + "loss": 2.588, + "step": 20652 + }, + { + "epoch": 0.6124306852889719, + "grad_norm": 0.1064455509185791, + "learning_rate": 0.0003328840664571704, + "loss": 2.6132, + "step": 20653 + }, + { + "epoch": 0.6124603386412834, + "grad_norm": 0.09307414293289185, + "learning_rate": 0.0003328397220956895, + "loss": 2.6132, + "step": 20654 + }, + { + "epoch": 0.6124899919935949, + "grad_norm": 0.10320716351270676, + "learning_rate": 0.0003327953792144416, + "loss": 2.5808, + "step": 20655 + }, + { + "epoch": 0.6125196453459063, + "grad_norm": 0.10585510730743408, + "learning_rate": 0.0003327510378138192, + "loss": 2.5745, + "step": 20656 + }, + { + "epoch": 0.6125492986982178, + "grad_norm": 0.10239384323358536, + "learning_rate": 0.0003327066978942151, + "loss": 2.6028, + "step": 20657 + }, + { + "epoch": 0.6125789520505293, + "grad_norm": 0.09405391663312912, + "learning_rate": 0.000332662359456022, + "loss": 2.5806, + "step": 20658 + }, + { + "epoch": 0.6126086054028408, + "grad_norm": 0.10437461733818054, + "learning_rate": 0.00033261802249963217, + "loss": 2.6209, + "step": 20659 + }, + { + "epoch": 0.6126382587551523, + "grad_norm": 0.10346313565969467, + "learning_rate": 0.00033257368702543867, + "loss": 2.6317, + "step": 20660 + }, + { + "epoch": 0.6126679121074637, + "grad_norm": 0.10006166994571686, + "learning_rate": 0.0003325293530338339, + "loss": 2.6315, + "step": 20661 + }, + { + "epoch": 0.6126975654597753, + "grad_norm": 0.10877925157546997, + "learning_rate": 0.0003324850205252105, + "loss": 2.6095, + "step": 20662 + }, + { + "epoch": 0.6127272188120867, + "grad_norm": 0.1052861362695694, + "learning_rate": 0.0003324406894999609, + "loss": 2.6116, + "step": 20663 + }, + { + "epoch": 0.6127568721643982, + "grad_norm": 0.1184566468000412, + "learning_rate": 0.00033239635995847784, + "loss": 2.6, + "step": 20664 + }, + { + "epoch": 0.6127865255167096, + "grad_norm": 0.11502593755722046, + "learning_rate": 0.0003323520319011538, + "loss": 2.635, + "step": 20665 + }, + { + "epoch": 0.6128161788690212, + "grad_norm": 0.1165100559592247, + "learning_rate": 0.00033230770532838126, + "loss": 2.6422, + "step": 20666 + }, + { + "epoch": 0.6128458322213326, + "grad_norm": 0.12764833867549896, + "learning_rate": 0.0003322633802405528, + "loss": 2.6059, + "step": 20667 + }, + { + "epoch": 0.6128754855736441, + "grad_norm": 0.1101921945810318, + "learning_rate": 0.00033221905663806086, + "loss": 2.6254, + "step": 20668 + }, + { + "epoch": 0.6129051389259556, + "grad_norm": 0.1318945437669754, + "learning_rate": 0.0003321747345212979, + "loss": 2.6319, + "step": 20669 + }, + { + "epoch": 0.6129347922782671, + "grad_norm": 0.13453803956508636, + "learning_rate": 0.0003321304138906566, + "loss": 2.5869, + "step": 20670 + }, + { + "epoch": 0.6129644456305785, + "grad_norm": 0.11159805208444595, + "learning_rate": 0.0003320860947465292, + "loss": 2.6189, + "step": 20671 + }, + { + "epoch": 0.61299409898289, + "grad_norm": 0.11518630385398865, + "learning_rate": 0.0003320417770893082, + "loss": 2.6635, + "step": 20672 + }, + { + "epoch": 0.6130237523352015, + "grad_norm": 0.11934390664100647, + "learning_rate": 0.0003319974609193862, + "loss": 2.5746, + "step": 20673 + }, + { + "epoch": 0.613053405687513, + "grad_norm": 0.11721906810998917, + "learning_rate": 0.0003319531462371555, + "loss": 2.5995, + "step": 20674 + }, + { + "epoch": 0.6130830590398244, + "grad_norm": 0.10843106359243393, + "learning_rate": 0.00033190883304300855, + "loss": 2.5912, + "step": 20675 + }, + { + "epoch": 0.6131127123921359, + "grad_norm": 0.12435004860162735, + "learning_rate": 0.0003318645213373378, + "loss": 2.6436, + "step": 20676 + }, + { + "epoch": 0.6131423657444474, + "grad_norm": 0.1274164915084839, + "learning_rate": 0.00033182021112053553, + "loss": 2.5974, + "step": 20677 + }, + { + "epoch": 0.6131720190967589, + "grad_norm": 0.13246583938598633, + "learning_rate": 0.0003317759023929942, + "loss": 2.6415, + "step": 20678 + }, + { + "epoch": 0.6132016724490703, + "grad_norm": 0.11361460387706757, + "learning_rate": 0.00033173159515510596, + "loss": 2.5803, + "step": 20679 + }, + { + "epoch": 0.6132313258013818, + "grad_norm": 0.10492558777332306, + "learning_rate": 0.00033168728940726345, + "loss": 2.6211, + "step": 20680 + }, + { + "epoch": 0.6132609791536934, + "grad_norm": 0.1279471516609192, + "learning_rate": 0.0003316429851498589, + "loss": 2.601, + "step": 20681 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 0.11531593650579453, + "learning_rate": 0.0003315986823832845, + "loss": 2.562, + "step": 20682 + }, + { + "epoch": 0.6133202858583163, + "grad_norm": 0.12291119992733002, + "learning_rate": 0.0003315543811079329, + "loss": 2.5903, + "step": 20683 + }, + { + "epoch": 0.6133499392106277, + "grad_norm": 0.12245992571115494, + "learning_rate": 0.000331510081324196, + "loss": 2.611, + "step": 20684 + }, + { + "epoch": 0.6133795925629393, + "grad_norm": 0.11760576069355011, + "learning_rate": 0.00033146578303246604, + "loss": 2.6268, + "step": 20685 + }, + { + "epoch": 0.6134092459152507, + "grad_norm": 0.10795976966619492, + "learning_rate": 0.0003314214862331358, + "loss": 2.6334, + "step": 20686 + }, + { + "epoch": 0.6134388992675622, + "grad_norm": 0.11144540458917618, + "learning_rate": 0.00033137719092659723, + "loss": 2.5824, + "step": 20687 + }, + { + "epoch": 0.6134685526198737, + "grad_norm": 0.10622863471508026, + "learning_rate": 0.0003313328971132425, + "loss": 2.599, + "step": 20688 + }, + { + "epoch": 0.6134982059721852, + "grad_norm": 0.1127200648188591, + "learning_rate": 0.0003312886047934639, + "loss": 2.5913, + "step": 20689 + }, + { + "epoch": 0.6135278593244966, + "grad_norm": 0.10822804272174835, + "learning_rate": 0.00033124431396765376, + "loss": 2.6407, + "step": 20690 + }, + { + "epoch": 0.6135575126768081, + "grad_norm": 0.12089667469263077, + "learning_rate": 0.0003312000246362041, + "loss": 2.62, + "step": 20691 + }, + { + "epoch": 0.6135871660291196, + "grad_norm": 0.09852170199155807, + "learning_rate": 0.00033115573679950717, + "loss": 2.6214, + "step": 20692 + }, + { + "epoch": 0.6136168193814311, + "grad_norm": 0.11253222078084946, + "learning_rate": 0.0003311114504579553, + "loss": 2.575, + "step": 20693 + }, + { + "epoch": 0.6136464727337425, + "grad_norm": 0.12794111669063568, + "learning_rate": 0.0003310671656119405, + "loss": 2.6324, + "step": 20694 + }, + { + "epoch": 0.613676126086054, + "grad_norm": 0.12404555082321167, + "learning_rate": 0.0003310228822618548, + "loss": 2.6085, + "step": 20695 + }, + { + "epoch": 0.6137057794383655, + "grad_norm": 0.10789240151643753, + "learning_rate": 0.00033097860040809065, + "loss": 2.6298, + "step": 20696 + }, + { + "epoch": 0.613735432790677, + "grad_norm": 0.12678244709968567, + "learning_rate": 0.0003309343200510399, + "loss": 2.6011, + "step": 20697 + }, + { + "epoch": 0.6137650861429884, + "grad_norm": 0.11019980907440186, + "learning_rate": 0.00033089004119109467, + "loss": 2.6407, + "step": 20698 + }, + { + "epoch": 0.6137947394953, + "grad_norm": 0.10998537391424179, + "learning_rate": 0.00033084576382864727, + "loss": 2.614, + "step": 20699 + }, + { + "epoch": 0.6138243928476115, + "grad_norm": 0.12321391701698303, + "learning_rate": 0.0003308014879640897, + "loss": 2.5906, + "step": 20700 + }, + { + "epoch": 0.6138540461999229, + "grad_norm": 0.10161688923835754, + "learning_rate": 0.00033075721359781396, + "loss": 2.5879, + "step": 20701 + }, + { + "epoch": 0.6138836995522344, + "grad_norm": 0.11310608685016632, + "learning_rate": 0.0003307129407302122, + "loss": 2.5925, + "step": 20702 + }, + { + "epoch": 0.6139133529045459, + "grad_norm": 0.11679691821336746, + "learning_rate": 0.0003306686693616764, + "loss": 2.6394, + "step": 20703 + }, + { + "epoch": 0.6139430062568574, + "grad_norm": 0.10546064376831055, + "learning_rate": 0.00033062439949259857, + "loss": 2.6195, + "step": 20704 + }, + { + "epoch": 0.6139726596091688, + "grad_norm": 0.11609448492527008, + "learning_rate": 0.0003305801311233707, + "loss": 2.5833, + "step": 20705 + }, + { + "epoch": 0.6140023129614803, + "grad_norm": 0.12131612747907639, + "learning_rate": 0.000330535864254385, + "loss": 2.6194, + "step": 20706 + }, + { + "epoch": 0.6140319663137918, + "grad_norm": 0.13506431877613068, + "learning_rate": 0.00033049159888603314, + "loss": 2.6152, + "step": 20707 + }, + { + "epoch": 0.6140616196661033, + "grad_norm": 0.12633821368217468, + "learning_rate": 0.0003304473350187073, + "loss": 2.6083, + "step": 20708 + }, + { + "epoch": 0.6140912730184147, + "grad_norm": 0.11423169821500778, + "learning_rate": 0.00033040307265279947, + "loss": 2.5782, + "step": 20709 + }, + { + "epoch": 0.6141209263707262, + "grad_norm": 0.10570164769887924, + "learning_rate": 0.00033035881178870163, + "loss": 2.6062, + "step": 20710 + }, + { + "epoch": 0.6141505797230377, + "grad_norm": 0.10264626890420914, + "learning_rate": 0.0003303145524268053, + "loss": 2.5751, + "step": 20711 + }, + { + "epoch": 0.6141802330753492, + "grad_norm": 0.11405100673437119, + "learning_rate": 0.0003302702945675029, + "loss": 2.6241, + "step": 20712 + }, + { + "epoch": 0.6142098864276606, + "grad_norm": 0.10587847977876663, + "learning_rate": 0.00033022603821118626, + "loss": 2.6192, + "step": 20713 + }, + { + "epoch": 0.6142395397799721, + "grad_norm": 0.1144430935382843, + "learning_rate": 0.0003301817833582472, + "loss": 2.6191, + "step": 20714 + }, + { + "epoch": 0.6142691931322836, + "grad_norm": 0.11223017424345016, + "learning_rate": 0.00033013753000907763, + "loss": 2.616, + "step": 20715 + }, + { + "epoch": 0.6142988464845951, + "grad_norm": 0.11907900869846344, + "learning_rate": 0.0003300932781640693, + "loss": 2.6137, + "step": 20716 + }, + { + "epoch": 0.6143284998369065, + "grad_norm": 0.11291848123073578, + "learning_rate": 0.00033004902782361414, + "loss": 2.6297, + "step": 20717 + }, + { + "epoch": 0.614358153189218, + "grad_norm": 0.11320997029542923, + "learning_rate": 0.00033000477898810406, + "loss": 2.6042, + "step": 20718 + }, + { + "epoch": 0.6143878065415295, + "grad_norm": 0.11422714591026306, + "learning_rate": 0.0003299605316579308, + "loss": 2.6245, + "step": 20719 + }, + { + "epoch": 0.614417459893841, + "grad_norm": 0.10259491205215454, + "learning_rate": 0.00032991628583348625, + "loss": 2.6164, + "step": 20720 + }, + { + "epoch": 0.6144471132461525, + "grad_norm": 0.10144418478012085, + "learning_rate": 0.0003298720415151623, + "loss": 2.6213, + "step": 20721 + }, + { + "epoch": 0.614476766598464, + "grad_norm": 0.09966909885406494, + "learning_rate": 0.00032982779870335046, + "loss": 2.6332, + "step": 20722 + }, + { + "epoch": 0.6145064199507755, + "grad_norm": 0.10214665532112122, + "learning_rate": 0.0003297835573984425, + "loss": 2.5902, + "step": 20723 + }, + { + "epoch": 0.6145360733030869, + "grad_norm": 0.10665606707334518, + "learning_rate": 0.0003297393176008306, + "loss": 2.6293, + "step": 20724 + }, + { + "epoch": 0.6145657266553984, + "grad_norm": 0.11772618442773819, + "learning_rate": 0.00032969507931090633, + "loss": 2.6363, + "step": 20725 + }, + { + "epoch": 0.6145953800077099, + "grad_norm": 0.11963322758674622, + "learning_rate": 0.00032965084252906124, + "loss": 2.5879, + "step": 20726 + }, + { + "epoch": 0.6146250333600214, + "grad_norm": 0.10179591923952103, + "learning_rate": 0.0003296066072556873, + "loss": 2.6176, + "step": 20727 + }, + { + "epoch": 0.6146546867123328, + "grad_norm": 0.11814671754837036, + "learning_rate": 0.0003295623734911759, + "loss": 2.6174, + "step": 20728 + }, + { + "epoch": 0.6146843400646443, + "grad_norm": 0.10598350316286087, + "learning_rate": 0.00032951814123591906, + "loss": 2.6407, + "step": 20729 + }, + { + "epoch": 0.6147139934169558, + "grad_norm": 0.1194690614938736, + "learning_rate": 0.00032947391049030836, + "loss": 2.6094, + "step": 20730 + }, + { + "epoch": 0.6147436467692673, + "grad_norm": 0.10098825395107269, + "learning_rate": 0.00032942968125473546, + "loss": 2.6089, + "step": 20731 + }, + { + "epoch": 0.6147733001215787, + "grad_norm": 0.1102745532989502, + "learning_rate": 0.00032938545352959205, + "loss": 2.6069, + "step": 20732 + }, + { + "epoch": 0.6148029534738902, + "grad_norm": 0.11168418079614639, + "learning_rate": 0.0003293412273152696, + "loss": 2.5869, + "step": 20733 + }, + { + "epoch": 0.6148326068262017, + "grad_norm": 0.10521538555622101, + "learning_rate": 0.00032929700261215993, + "loss": 2.5899, + "step": 20734 + }, + { + "epoch": 0.6148622601785132, + "grad_norm": 0.11184961348772049, + "learning_rate": 0.00032925277942065463, + "loss": 2.5892, + "step": 20735 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 0.10219326615333557, + "learning_rate": 0.0003292085577411451, + "loss": 2.5863, + "step": 20736 + }, + { + "epoch": 0.6149215668831362, + "grad_norm": 0.11168182641267776, + "learning_rate": 0.0003291643375740233, + "loss": 2.5933, + "step": 20737 + }, + { + "epoch": 0.6149512202354476, + "grad_norm": 0.12250879406929016, + "learning_rate": 0.00032912011891968055, + "loss": 2.5905, + "step": 20738 + }, + { + "epoch": 0.6149808735877591, + "grad_norm": 0.10866556316614151, + "learning_rate": 0.00032907590177850855, + "loss": 2.5975, + "step": 20739 + }, + { + "epoch": 0.6150105269400705, + "grad_norm": 0.11173301190137863, + "learning_rate": 0.00032903168615089866, + "loss": 2.6093, + "step": 20740 + }, + { + "epoch": 0.615040180292382, + "grad_norm": 0.1202617809176445, + "learning_rate": 0.0003289874720372427, + "loss": 2.5976, + "step": 20741 + }, + { + "epoch": 0.6150698336446936, + "grad_norm": 0.1256745308637619, + "learning_rate": 0.00032894325943793196, + "loss": 2.6197, + "step": 20742 + }, + { + "epoch": 0.615099486997005, + "grad_norm": 0.10227984189987183, + "learning_rate": 0.000328899048353358, + "loss": 2.6364, + "step": 20743 + }, + { + "epoch": 0.6151291403493165, + "grad_norm": 0.1093905121088028, + "learning_rate": 0.0003288548387839123, + "loss": 2.5788, + "step": 20744 + }, + { + "epoch": 0.615158793701628, + "grad_norm": 0.10283976048231125, + "learning_rate": 0.0003288106307299864, + "loss": 2.5781, + "step": 20745 + }, + { + "epoch": 0.6151884470539395, + "grad_norm": 0.10313121974468231, + "learning_rate": 0.0003287664241919718, + "loss": 2.593, + "step": 20746 + }, + { + "epoch": 0.6152181004062509, + "grad_norm": 0.11261321604251862, + "learning_rate": 0.00032872221917025984, + "loss": 2.6289, + "step": 20747 + }, + { + "epoch": 0.6152477537585624, + "grad_norm": 0.10640418529510498, + "learning_rate": 0.00032867801566524216, + "loss": 2.6228, + "step": 20748 + }, + { + "epoch": 0.6152774071108739, + "grad_norm": 0.11389388889074326, + "learning_rate": 0.0003286338136773097, + "loss": 2.6021, + "step": 20749 + }, + { + "epoch": 0.6153070604631854, + "grad_norm": 0.10689719766378403, + "learning_rate": 0.00032858961320685455, + "loss": 2.6004, + "step": 20750 + }, + { + "epoch": 0.6153367138154968, + "grad_norm": 0.11799430847167969, + "learning_rate": 0.0003285454142542677, + "loss": 2.6183, + "step": 20751 + }, + { + "epoch": 0.6153663671678083, + "grad_norm": 0.13517266511917114, + "learning_rate": 0.0003285012168199408, + "loss": 2.6005, + "step": 20752 + }, + { + "epoch": 0.6153960205201198, + "grad_norm": 0.12414511293172836, + "learning_rate": 0.00032845702090426485, + "loss": 2.6254, + "step": 20753 + }, + { + "epoch": 0.6154256738724313, + "grad_norm": 0.09328696876764297, + "learning_rate": 0.0003284128265076315, + "loss": 2.6394, + "step": 20754 + }, + { + "epoch": 0.6154553272247427, + "grad_norm": 0.12243243306875229, + "learning_rate": 0.00032836863363043214, + "loss": 2.6183, + "step": 20755 + }, + { + "epoch": 0.6154849805770543, + "grad_norm": 0.1193508431315422, + "learning_rate": 0.0003283244422730578, + "loss": 2.6184, + "step": 20756 + }, + { + "epoch": 0.6155146339293657, + "grad_norm": 0.10817377269268036, + "learning_rate": 0.0003282802524359001, + "loss": 2.6051, + "step": 20757 + }, + { + "epoch": 0.6155442872816772, + "grad_norm": 0.12168626487255096, + "learning_rate": 0.0003282360641193502, + "loss": 2.6067, + "step": 20758 + }, + { + "epoch": 0.6155739406339886, + "grad_norm": 0.11404477059841156, + "learning_rate": 0.0003281918773237995, + "loss": 2.6343, + "step": 20759 + }, + { + "epoch": 0.6156035939863002, + "grad_norm": 0.09343995898962021, + "learning_rate": 0.00032814769204963926, + "loss": 2.6142, + "step": 20760 + }, + { + "epoch": 0.6156332473386116, + "grad_norm": 0.11838523298501968, + "learning_rate": 0.00032810350829726056, + "loss": 2.6109, + "step": 20761 + }, + { + "epoch": 0.6156629006909231, + "grad_norm": 0.09888146817684174, + "learning_rate": 0.0003280593260670547, + "loss": 2.6131, + "step": 20762 + }, + { + "epoch": 0.6156925540432346, + "grad_norm": 0.10874814540147781, + "learning_rate": 0.0003280151453594132, + "loss": 2.6029, + "step": 20763 + }, + { + "epoch": 0.6157222073955461, + "grad_norm": 0.11286452412605286, + "learning_rate": 0.0003279709661747271, + "loss": 2.5911, + "step": 20764 + }, + { + "epoch": 0.6157518607478576, + "grad_norm": 0.09810317307710648, + "learning_rate": 0.0003279267885133876, + "loss": 2.6413, + "step": 20765 + }, + { + "epoch": 0.615781514100169, + "grad_norm": 0.1132964938879013, + "learning_rate": 0.00032788261237578587, + "loss": 2.6482, + "step": 20766 + }, + { + "epoch": 0.6158111674524805, + "grad_norm": 0.10374262928962708, + "learning_rate": 0.00032783843776231327, + "loss": 2.6047, + "step": 20767 + }, + { + "epoch": 0.615840820804792, + "grad_norm": 0.12489567697048187, + "learning_rate": 0.0003277942646733607, + "loss": 2.612, + "step": 20768 + }, + { + "epoch": 0.6158704741571035, + "grad_norm": 0.11267473548650742, + "learning_rate": 0.0003277500931093197, + "loss": 2.5757, + "step": 20769 + }, + { + "epoch": 0.6159001275094149, + "grad_norm": 0.10159529000520706, + "learning_rate": 0.00032770592307058104, + "loss": 2.616, + "step": 20770 + }, + { + "epoch": 0.6159297808617265, + "grad_norm": 0.12839221954345703, + "learning_rate": 0.00032766175455753606, + "loss": 2.6358, + "step": 20771 + }, + { + "epoch": 0.6159594342140379, + "grad_norm": 0.13043731451034546, + "learning_rate": 0.0003276175875705758, + "loss": 2.5919, + "step": 20772 + }, + { + "epoch": 0.6159890875663494, + "grad_norm": 0.08813600242137909, + "learning_rate": 0.00032757342211009135, + "loss": 2.5979, + "step": 20773 + }, + { + "epoch": 0.6160187409186608, + "grad_norm": 0.1451796144247055, + "learning_rate": 0.0003275292581764738, + "loss": 2.5802, + "step": 20774 + }, + { + "epoch": 0.6160483942709724, + "grad_norm": 0.12118782848119736, + "learning_rate": 0.0003274850957701143, + "loss": 2.5888, + "step": 20775 + }, + { + "epoch": 0.6160780476232838, + "grad_norm": 0.10723966360092163, + "learning_rate": 0.0003274409348914039, + "loss": 2.5976, + "step": 20776 + }, + { + "epoch": 0.6161077009755953, + "grad_norm": 0.11799325793981552, + "learning_rate": 0.0003273967755407337, + "loss": 2.5819, + "step": 20777 + }, + { + "epoch": 0.6161373543279067, + "grad_norm": 0.11198990792036057, + "learning_rate": 0.0003273526177184947, + "loss": 2.5961, + "step": 20778 + }, + { + "epoch": 0.6161670076802183, + "grad_norm": 0.10801854729652405, + "learning_rate": 0.0003273084614250778, + "loss": 2.5973, + "step": 20779 + }, + { + "epoch": 0.6161966610325297, + "grad_norm": 0.11063922941684723, + "learning_rate": 0.0003272643066608743, + "loss": 2.6284, + "step": 20780 + }, + { + "epoch": 0.6162263143848412, + "grad_norm": 0.11788584291934967, + "learning_rate": 0.00032722015342627486, + "loss": 2.6101, + "step": 20781 + }, + { + "epoch": 0.6162559677371526, + "grad_norm": 0.10244495421648026, + "learning_rate": 0.00032717600172167064, + "loss": 2.6396, + "step": 20782 + }, + { + "epoch": 0.6162856210894642, + "grad_norm": 0.10304337739944458, + "learning_rate": 0.0003271318515474526, + "loss": 2.6064, + "step": 20783 + }, + { + "epoch": 0.6163152744417757, + "grad_norm": 0.1140899509191513, + "learning_rate": 0.0003270877029040117, + "loss": 2.6182, + "step": 20784 + }, + { + "epoch": 0.6163449277940871, + "grad_norm": 0.09857302159070969, + "learning_rate": 0.0003270435557917388, + "loss": 2.6085, + "step": 20785 + }, + { + "epoch": 0.6163745811463986, + "grad_norm": 0.11149170994758606, + "learning_rate": 0.0003269994102110251, + "loss": 2.6177, + "step": 20786 + }, + { + "epoch": 0.6164042344987101, + "grad_norm": 0.11193352192640305, + "learning_rate": 0.00032695526616226115, + "loss": 2.5929, + "step": 20787 + }, + { + "epoch": 0.6164338878510216, + "grad_norm": 0.09704335778951645, + "learning_rate": 0.00032691112364583786, + "loss": 2.6186, + "step": 20788 + }, + { + "epoch": 0.616463541203333, + "grad_norm": 0.10225620865821838, + "learning_rate": 0.0003268669826621464, + "loss": 2.5996, + "step": 20789 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 0.10977782309055328, + "learning_rate": 0.00032682284321157776, + "loss": 2.6274, + "step": 20790 + }, + { + "epoch": 0.616522847907956, + "grad_norm": 0.09619088470935822, + "learning_rate": 0.00032677870529452234, + "loss": 2.6159, + "step": 20791 + }, + { + "epoch": 0.6165525012602675, + "grad_norm": 0.1046924889087677, + "learning_rate": 0.0003267345689113713, + "loss": 2.6147, + "step": 20792 + }, + { + "epoch": 0.6165821546125789, + "grad_norm": 0.10362294316291809, + "learning_rate": 0.0003266904340625154, + "loss": 2.6243, + "step": 20793 + }, + { + "epoch": 0.6166118079648905, + "grad_norm": 0.1034378856420517, + "learning_rate": 0.00032664630074834543, + "loss": 2.575, + "step": 20794 + }, + { + "epoch": 0.6166414613172019, + "grad_norm": 0.09193303436040878, + "learning_rate": 0.0003266021689692522, + "loss": 2.6053, + "step": 20795 + }, + { + "epoch": 0.6166711146695134, + "grad_norm": 0.10399793088436127, + "learning_rate": 0.0003265580387256265, + "loss": 2.5956, + "step": 20796 + }, + { + "epoch": 0.6167007680218248, + "grad_norm": 0.10583402216434479, + "learning_rate": 0.00032651391001785936, + "loss": 2.6474, + "step": 20797 + }, + { + "epoch": 0.6167304213741364, + "grad_norm": 0.10472378879785538, + "learning_rate": 0.0003264697828463411, + "loss": 2.6263, + "step": 20798 + }, + { + "epoch": 0.6167600747264478, + "grad_norm": 0.11137641221284866, + "learning_rate": 0.0003264256572114628, + "loss": 2.6344, + "step": 20799 + }, + { + "epoch": 0.6167897280787593, + "grad_norm": 0.09992468357086182, + "learning_rate": 0.00032638153311361504, + "loss": 2.6002, + "step": 20800 + }, + { + "epoch": 0.6168193814310707, + "grad_norm": 0.10159587860107422, + "learning_rate": 0.00032633741055318845, + "loss": 2.6123, + "step": 20801 + }, + { + "epoch": 0.6168490347833823, + "grad_norm": 0.10718238353729248, + "learning_rate": 0.00032629328953057406, + "loss": 2.602, + "step": 20802 + }, + { + "epoch": 0.6168786881356937, + "grad_norm": 0.10658971220254898, + "learning_rate": 0.0003262491700461624, + "loss": 2.6186, + "step": 20803 + }, + { + "epoch": 0.6169083414880052, + "grad_norm": 0.11901640146970749, + "learning_rate": 0.0003262050521003442, + "loss": 2.6335, + "step": 20804 + }, + { + "epoch": 0.6169379948403168, + "grad_norm": 0.1210685446858406, + "learning_rate": 0.00032616093569351, + "loss": 2.6195, + "step": 20805 + }, + { + "epoch": 0.6169676481926282, + "grad_norm": 0.09853903204202652, + "learning_rate": 0.0003261168208260507, + "loss": 2.6105, + "step": 20806 + }, + { + "epoch": 0.6169973015449397, + "grad_norm": 0.10912472009658813, + "learning_rate": 0.00032607270749835683, + "loss": 2.5744, + "step": 20807 + }, + { + "epoch": 0.6170269548972511, + "grad_norm": 0.12459944188594818, + "learning_rate": 0.00032602859571081885, + "loss": 2.5778, + "step": 20808 + }, + { + "epoch": 0.6170566082495627, + "grad_norm": 0.1017136499285698, + "learning_rate": 0.00032598448546382753, + "loss": 2.6202, + "step": 20809 + }, + { + "epoch": 0.6170862616018741, + "grad_norm": 0.1320478320121765, + "learning_rate": 0.00032594037675777346, + "loss": 2.5983, + "step": 20810 + }, + { + "epoch": 0.6171159149541856, + "grad_norm": 0.12157471477985382, + "learning_rate": 0.00032589626959304727, + "loss": 2.5823, + "step": 20811 + }, + { + "epoch": 0.617145568306497, + "grad_norm": 0.10397016257047653, + "learning_rate": 0.0003258521639700395, + "loss": 2.5772, + "step": 20812 + }, + { + "epoch": 0.6171752216588086, + "grad_norm": 0.12255680561065674, + "learning_rate": 0.00032580805988914065, + "loss": 2.5893, + "step": 20813 + }, + { + "epoch": 0.61720487501112, + "grad_norm": 0.11655028909444809, + "learning_rate": 0.0003257639573507413, + "loss": 2.619, + "step": 20814 + }, + { + "epoch": 0.6172345283634315, + "grad_norm": 0.11021624505519867, + "learning_rate": 0.00032571985635523204, + "loss": 2.6303, + "step": 20815 + }, + { + "epoch": 0.6172641817157429, + "grad_norm": 0.13206380605697632, + "learning_rate": 0.00032567575690300337, + "loss": 2.5664, + "step": 20816 + }, + { + "epoch": 0.6172938350680545, + "grad_norm": 0.09976381808519363, + "learning_rate": 0.0003256316589944458, + "loss": 2.5567, + "step": 20817 + }, + { + "epoch": 0.6173234884203659, + "grad_norm": 0.11315739899873734, + "learning_rate": 0.00032558756262994997, + "loss": 2.5742, + "step": 20818 + }, + { + "epoch": 0.6173531417726774, + "grad_norm": 0.10808864235877991, + "learning_rate": 0.00032554346780990607, + "loss": 2.6265, + "step": 20819 + }, + { + "epoch": 0.6173827951249888, + "grad_norm": 0.10758164525032043, + "learning_rate": 0.0003254993745347047, + "loss": 2.6042, + "step": 20820 + }, + { + "epoch": 0.6174124484773004, + "grad_norm": 0.1095416322350502, + "learning_rate": 0.0003254552828047364, + "loss": 2.6419, + "step": 20821 + }, + { + "epoch": 0.6174421018296118, + "grad_norm": 0.10170076787471771, + "learning_rate": 0.0003254111926203915, + "loss": 2.587, + "step": 20822 + }, + { + "epoch": 0.6174717551819233, + "grad_norm": 0.0924917683005333, + "learning_rate": 0.0003253671039820604, + "loss": 2.583, + "step": 20823 + }, + { + "epoch": 0.6175014085342347, + "grad_norm": 0.10821533203125, + "learning_rate": 0.0003253230168901337, + "loss": 2.6057, + "step": 20824 + }, + { + "epoch": 0.6175310618865463, + "grad_norm": 0.10208281129598618, + "learning_rate": 0.0003252789313450016, + "loss": 2.6079, + "step": 20825 + }, + { + "epoch": 0.6175607152388578, + "grad_norm": 0.08854561299085617, + "learning_rate": 0.0003252348473470545, + "loss": 2.5903, + "step": 20826 + }, + { + "epoch": 0.6175903685911692, + "grad_norm": 0.10748866200447083, + "learning_rate": 0.00032519076489668266, + "loss": 2.631, + "step": 20827 + }, + { + "epoch": 0.6176200219434808, + "grad_norm": 0.10009880363941193, + "learning_rate": 0.00032514668399427686, + "loss": 2.5867, + "step": 20828 + }, + { + "epoch": 0.6176496752957922, + "grad_norm": 0.11100814491510391, + "learning_rate": 0.0003251026046402271, + "loss": 2.6084, + "step": 20829 + }, + { + "epoch": 0.6176793286481037, + "grad_norm": 0.10399170964956284, + "learning_rate": 0.00032505852683492386, + "loss": 2.5836, + "step": 20830 + }, + { + "epoch": 0.6177089820004151, + "grad_norm": 0.10894384980201721, + "learning_rate": 0.00032501445057875734, + "loss": 2.5746, + "step": 20831 + }, + { + "epoch": 0.6177386353527267, + "grad_norm": 0.10796184092760086, + "learning_rate": 0.00032497037587211794, + "loss": 2.6128, + "step": 20832 + }, + { + "epoch": 0.6177682887050381, + "grad_norm": 0.10330870747566223, + "learning_rate": 0.0003249263027153958, + "loss": 2.6393, + "step": 20833 + }, + { + "epoch": 0.6177979420573496, + "grad_norm": 0.11282490938901901, + "learning_rate": 0.0003248822311089814, + "loss": 2.5818, + "step": 20834 + }, + { + "epoch": 0.617827595409661, + "grad_norm": 0.09751659631729126, + "learning_rate": 0.000324838161053265, + "loss": 2.6018, + "step": 20835 + }, + { + "epoch": 0.6178572487619726, + "grad_norm": 0.12787921726703644, + "learning_rate": 0.00032479409254863655, + "loss": 2.6298, + "step": 20836 + }, + { + "epoch": 0.617886902114284, + "grad_norm": 0.11324109137058258, + "learning_rate": 0.0003247500255954866, + "loss": 2.6118, + "step": 20837 + }, + { + "epoch": 0.6179165554665955, + "grad_norm": 0.10769350081682205, + "learning_rate": 0.00032470596019420524, + "loss": 2.5983, + "step": 20838 + }, + { + "epoch": 0.6179462088189069, + "grad_norm": 0.10329833626747131, + "learning_rate": 0.0003246618963451826, + "loss": 2.5915, + "step": 20839 + }, + { + "epoch": 0.6179758621712185, + "grad_norm": 0.09894285351037979, + "learning_rate": 0.00032461783404880894, + "loss": 2.5927, + "step": 20840 + }, + { + "epoch": 0.6180055155235299, + "grad_norm": 0.10684449970722198, + "learning_rate": 0.00032457377330547454, + "loss": 2.6081, + "step": 20841 + }, + { + "epoch": 0.6180351688758414, + "grad_norm": 0.1153813898563385, + "learning_rate": 0.0003245297141155695, + "loss": 2.6342, + "step": 20842 + }, + { + "epoch": 0.6180648222281528, + "grad_norm": 0.10516335815191269, + "learning_rate": 0.000324485656479484, + "loss": 2.6042, + "step": 20843 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 0.10075830668210983, + "learning_rate": 0.00032444160039760806, + "loss": 2.6059, + "step": 20844 + }, + { + "epoch": 0.6181241289327758, + "grad_norm": 0.11060170829296112, + "learning_rate": 0.00032439754587033204, + "loss": 2.6118, + "step": 20845 + }, + { + "epoch": 0.6181537822850873, + "grad_norm": 0.11480172723531723, + "learning_rate": 0.0003243534928980458, + "loss": 2.5873, + "step": 20846 + }, + { + "epoch": 0.6181834356373989, + "grad_norm": 0.11898557096719742, + "learning_rate": 0.0003243094414811395, + "loss": 2.5606, + "step": 20847 + }, + { + "epoch": 0.6182130889897103, + "grad_norm": 0.09827800840139389, + "learning_rate": 0.00032426539162000326, + "loss": 2.5979, + "step": 20848 + }, + { + "epoch": 0.6182427423420218, + "grad_norm": 0.10053134709596634, + "learning_rate": 0.0003242213433150271, + "loss": 2.5854, + "step": 20849 + }, + { + "epoch": 0.6182723956943332, + "grad_norm": 0.12156936526298523, + "learning_rate": 0.00032417729656660123, + "loss": 2.6168, + "step": 20850 + }, + { + "epoch": 0.6183020490466448, + "grad_norm": 0.11182773113250732, + "learning_rate": 0.00032413325137511543, + "loss": 2.5918, + "step": 20851 + }, + { + "epoch": 0.6183317023989562, + "grad_norm": 0.10490529984235764, + "learning_rate": 0.0003240892077409601, + "loss": 2.6019, + "step": 20852 + }, + { + "epoch": 0.6183613557512677, + "grad_norm": 0.11034546792507172, + "learning_rate": 0.0003240451656645247, + "loss": 2.604, + "step": 20853 + }, + { + "epoch": 0.6183910091035791, + "grad_norm": 0.11789064854383469, + "learning_rate": 0.00032400112514619975, + "loss": 2.6172, + "step": 20854 + }, + { + "epoch": 0.6184206624558907, + "grad_norm": 0.11170948296785355, + "learning_rate": 0.000323957086186375, + "loss": 2.6353, + "step": 20855 + }, + { + "epoch": 0.6184503158082021, + "grad_norm": 0.10400068759918213, + "learning_rate": 0.0003239130487854406, + "loss": 2.6017, + "step": 20856 + }, + { + "epoch": 0.6184799691605136, + "grad_norm": 0.11022622883319855, + "learning_rate": 0.00032386901294378625, + "loss": 2.6, + "step": 20857 + }, + { + "epoch": 0.618509622512825, + "grad_norm": 0.12610286474227905, + "learning_rate": 0.00032382497866180205, + "loss": 2.5678, + "step": 20858 + }, + { + "epoch": 0.6185392758651366, + "grad_norm": 0.10789431631565094, + "learning_rate": 0.00032378094593987793, + "loss": 2.6247, + "step": 20859 + }, + { + "epoch": 0.618568929217448, + "grad_norm": 0.11474211513996124, + "learning_rate": 0.0003237369147784037, + "loss": 2.5904, + "step": 20860 + }, + { + "epoch": 0.6185985825697595, + "grad_norm": 0.11725078523159027, + "learning_rate": 0.00032369288517776945, + "loss": 2.6044, + "step": 20861 + }, + { + "epoch": 0.618628235922071, + "grad_norm": 0.11319248378276825, + "learning_rate": 0.000323648857138365, + "loss": 2.6007, + "step": 20862 + }, + { + "epoch": 0.6186578892743825, + "grad_norm": 0.10057519376277924, + "learning_rate": 0.00032360483066058016, + "loss": 2.6049, + "step": 20863 + }, + { + "epoch": 0.6186875426266939, + "grad_norm": 0.11056903004646301, + "learning_rate": 0.0003235608057448047, + "loss": 2.6402, + "step": 20864 + }, + { + "epoch": 0.6187171959790054, + "grad_norm": 0.10576466470956802, + "learning_rate": 0.00032351678239142875, + "loss": 2.6176, + "step": 20865 + }, + { + "epoch": 0.6187468493313169, + "grad_norm": 0.10368605703115463, + "learning_rate": 0.00032347276060084175, + "loss": 2.615, + "step": 20866 + }, + { + "epoch": 0.6187765026836284, + "grad_norm": 0.11788540333509445, + "learning_rate": 0.00032342874037343395, + "loss": 2.6163, + "step": 20867 + }, + { + "epoch": 0.6188061560359399, + "grad_norm": 0.09931927919387817, + "learning_rate": 0.0003233847217095949, + "loss": 2.576, + "step": 20868 + }, + { + "epoch": 0.6188358093882513, + "grad_norm": 0.10174181312322617, + "learning_rate": 0.0003233407046097144, + "loss": 2.5988, + "step": 20869 + }, + { + "epoch": 0.6188654627405629, + "grad_norm": 0.09991702437400818, + "learning_rate": 0.0003232966890741824, + "loss": 2.5995, + "step": 20870 + }, + { + "epoch": 0.6188951160928743, + "grad_norm": 0.10240640491247177, + "learning_rate": 0.0003232526751033885, + "loss": 2.6345, + "step": 20871 + }, + { + "epoch": 0.6189247694451858, + "grad_norm": 0.10982649028301239, + "learning_rate": 0.00032320866269772256, + "loss": 2.6668, + "step": 20872 + }, + { + "epoch": 0.6189544227974972, + "grad_norm": 0.10389775037765503, + "learning_rate": 0.0003231646518575743, + "loss": 2.6227, + "step": 20873 + }, + { + "epoch": 0.6189840761498088, + "grad_norm": 0.09954838454723358, + "learning_rate": 0.0003231206425833333, + "loss": 2.553, + "step": 20874 + }, + { + "epoch": 0.6190137295021202, + "grad_norm": 0.09732398390769958, + "learning_rate": 0.00032307663487538934, + "loss": 2.6084, + "step": 20875 + }, + { + "epoch": 0.6190433828544317, + "grad_norm": 0.11047700047492981, + "learning_rate": 0.0003230326287341322, + "loss": 2.6105, + "step": 20876 + }, + { + "epoch": 0.6190730362067431, + "grad_norm": 0.09372556954622269, + "learning_rate": 0.00032298862415995144, + "loss": 2.6208, + "step": 20877 + }, + { + "epoch": 0.6191026895590547, + "grad_norm": 0.11590205132961273, + "learning_rate": 0.00032294462115323684, + "loss": 2.6317, + "step": 20878 + }, + { + "epoch": 0.6191323429113661, + "grad_norm": 0.10436961054801941, + "learning_rate": 0.00032290061971437795, + "loss": 2.6154, + "step": 20879 + }, + { + "epoch": 0.6191619962636776, + "grad_norm": 0.11347221583127975, + "learning_rate": 0.0003228566198437646, + "loss": 2.6087, + "step": 20880 + }, + { + "epoch": 0.619191649615989, + "grad_norm": 0.10529428720474243, + "learning_rate": 0.00032281262154178615, + "loss": 2.6228, + "step": 20881 + }, + { + "epoch": 0.6192213029683006, + "grad_norm": 0.11744453012943268, + "learning_rate": 0.00032276862480883246, + "loss": 2.5852, + "step": 20882 + }, + { + "epoch": 0.619250956320612, + "grad_norm": 0.11843980103731155, + "learning_rate": 0.0003227246296452931, + "loss": 2.614, + "step": 20883 + }, + { + "epoch": 0.6192806096729235, + "grad_norm": 0.10524041205644608, + "learning_rate": 0.0003226806360515574, + "loss": 2.5916, + "step": 20884 + }, + { + "epoch": 0.619310263025235, + "grad_norm": 0.09979555755853653, + "learning_rate": 0.00032263664402801517, + "loss": 2.6147, + "step": 20885 + }, + { + "epoch": 0.6193399163775465, + "grad_norm": 0.10332904756069183, + "learning_rate": 0.0003225926535750559, + "loss": 2.5923, + "step": 20886 + }, + { + "epoch": 0.6193695697298579, + "grad_norm": 0.09856761246919632, + "learning_rate": 0.00032254866469306917, + "loss": 2.5766, + "step": 20887 + }, + { + "epoch": 0.6193992230821694, + "grad_norm": 0.10622578859329224, + "learning_rate": 0.0003225046773824444, + "loss": 2.6107, + "step": 20888 + }, + { + "epoch": 0.619428876434481, + "grad_norm": 0.11015362292528152, + "learning_rate": 0.00032246069164357125, + "loss": 2.5591, + "step": 20889 + }, + { + "epoch": 0.6194585297867924, + "grad_norm": 0.11088989675045013, + "learning_rate": 0.00032241670747683917, + "loss": 2.6091, + "step": 20890 + }, + { + "epoch": 0.6194881831391039, + "grad_norm": 0.10935025662183762, + "learning_rate": 0.00032237272488263755, + "loss": 2.6221, + "step": 20891 + }, + { + "epoch": 0.6195178364914153, + "grad_norm": 0.10145009309053421, + "learning_rate": 0.0003223287438613558, + "loss": 2.5815, + "step": 20892 + }, + { + "epoch": 0.6195474898437269, + "grad_norm": 0.10684965550899506, + "learning_rate": 0.0003222847644133836, + "loss": 2.5888, + "step": 20893 + }, + { + "epoch": 0.6195771431960383, + "grad_norm": 0.12436027079820633, + "learning_rate": 0.00032224078653911054, + "loss": 2.601, + "step": 20894 + }, + { + "epoch": 0.6196067965483498, + "grad_norm": 0.10765901952981949, + "learning_rate": 0.00032219681023892567, + "loss": 2.6238, + "step": 20895 + }, + { + "epoch": 0.6196364499006612, + "grad_norm": 0.10105854272842407, + "learning_rate": 0.0003221528355132186, + "loss": 2.6, + "step": 20896 + }, + { + "epoch": 0.6196661032529728, + "grad_norm": 0.11007378995418549, + "learning_rate": 0.0003221088623623787, + "loss": 2.5982, + "step": 20897 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 0.11775516718626022, + "learning_rate": 0.0003220648907867953, + "loss": 2.5872, + "step": 20898 + }, + { + "epoch": 0.6197254099575957, + "grad_norm": 0.11694884300231934, + "learning_rate": 0.00032202092078685785, + "loss": 2.5998, + "step": 20899 + }, + { + "epoch": 0.6197550633099072, + "grad_norm": 0.10979586839675903, + "learning_rate": 0.0003219769523629558, + "loss": 2.5822, + "step": 20900 + }, + { + "epoch": 0.6197847166622187, + "grad_norm": 0.10470627248287201, + "learning_rate": 0.00032193298551547834, + "loss": 2.611, + "step": 20901 + }, + { + "epoch": 0.6198143700145301, + "grad_norm": 0.09905745834112167, + "learning_rate": 0.00032188902024481484, + "loss": 2.6318, + "step": 20902 + }, + { + "epoch": 0.6198440233668416, + "grad_norm": 0.10922452062368393, + "learning_rate": 0.00032184505655135455, + "loss": 2.6061, + "step": 20903 + }, + { + "epoch": 0.619873676719153, + "grad_norm": 0.10650620609521866, + "learning_rate": 0.000321801094435487, + "loss": 2.5691, + "step": 20904 + }, + { + "epoch": 0.6199033300714646, + "grad_norm": 0.09077940881252289, + "learning_rate": 0.00032175713389760117, + "loss": 2.6073, + "step": 20905 + }, + { + "epoch": 0.619932983423776, + "grad_norm": 0.11678622663021088, + "learning_rate": 0.0003217131749380866, + "loss": 2.6282, + "step": 20906 + }, + { + "epoch": 0.6199626367760875, + "grad_norm": 0.11340389400720596, + "learning_rate": 0.00032166921755733246, + "loss": 2.5932, + "step": 20907 + }, + { + "epoch": 0.6199922901283991, + "grad_norm": 0.1034872904419899, + "learning_rate": 0.0003216252617557281, + "loss": 2.6019, + "step": 20908 + }, + { + "epoch": 0.6200219434807105, + "grad_norm": 0.09887529164552689, + "learning_rate": 0.0003215813075336625, + "loss": 2.6342, + "step": 20909 + }, + { + "epoch": 0.620051596833022, + "grad_norm": 0.1048821285367012, + "learning_rate": 0.00032153735489152523, + "loss": 2.6097, + "step": 20910 + }, + { + "epoch": 0.6200812501853334, + "grad_norm": 0.09932056814432144, + "learning_rate": 0.00032149340382970536, + "loss": 2.5861, + "step": 20911 + }, + { + "epoch": 0.620110903537645, + "grad_norm": 0.10790344327688217, + "learning_rate": 0.0003214494543485919, + "loss": 2.5952, + "step": 20912 + }, + { + "epoch": 0.6201405568899564, + "grad_norm": 0.1068166196346283, + "learning_rate": 0.0003214055064485742, + "loss": 2.5986, + "step": 20913 + }, + { + "epoch": 0.6201702102422679, + "grad_norm": 0.1225447952747345, + "learning_rate": 0.00032136156013004135, + "loss": 2.6475, + "step": 20914 + }, + { + "epoch": 0.6201998635945793, + "grad_norm": 0.11042553931474686, + "learning_rate": 0.00032131761539338263, + "loss": 2.5999, + "step": 20915 + }, + { + "epoch": 0.6202295169468909, + "grad_norm": 0.11499768495559692, + "learning_rate": 0.0003212736722389871, + "loss": 2.5813, + "step": 20916 + }, + { + "epoch": 0.6202591702992023, + "grad_norm": 0.10020148009061813, + "learning_rate": 0.0003212297306672437, + "loss": 2.5783, + "step": 20917 + }, + { + "epoch": 0.6202888236515138, + "grad_norm": 0.1204451322555542, + "learning_rate": 0.00032118579067854183, + "loss": 2.6061, + "step": 20918 + }, + { + "epoch": 0.6203184770038253, + "grad_norm": 0.11367113888263702, + "learning_rate": 0.00032114185227327045, + "loss": 2.5982, + "step": 20919 + }, + { + "epoch": 0.6203481303561368, + "grad_norm": 0.11389288306236267, + "learning_rate": 0.0003210979154518188, + "loss": 2.6299, + "step": 20920 + }, + { + "epoch": 0.6203777837084482, + "grad_norm": 0.09398487955331802, + "learning_rate": 0.00032105398021457576, + "loss": 2.618, + "step": 20921 + }, + { + "epoch": 0.6204074370607597, + "grad_norm": 0.10920194536447525, + "learning_rate": 0.00032101004656193046, + "loss": 2.5942, + "step": 20922 + }, + { + "epoch": 0.6204370904130712, + "grad_norm": 0.11262165009975433, + "learning_rate": 0.0003209661144942718, + "loss": 2.6169, + "step": 20923 + }, + { + "epoch": 0.6204667437653827, + "grad_norm": 0.10617942363023758, + "learning_rate": 0.00032092218401198904, + "loss": 2.6005, + "step": 20924 + }, + { + "epoch": 0.6204963971176941, + "grad_norm": 0.09731169044971466, + "learning_rate": 0.0003208782551154711, + "loss": 2.5738, + "step": 20925 + }, + { + "epoch": 0.6205260504700056, + "grad_norm": 0.09819551557302475, + "learning_rate": 0.00032083432780510683, + "loss": 2.6012, + "step": 20926 + }, + { + "epoch": 0.6205557038223171, + "grad_norm": 0.09117849916219711, + "learning_rate": 0.0003207904020812854, + "loss": 2.5923, + "step": 20927 + }, + { + "epoch": 0.6205853571746286, + "grad_norm": 0.10676607489585876, + "learning_rate": 0.0003207464779443958, + "loss": 2.5961, + "step": 20928 + }, + { + "epoch": 0.6206150105269401, + "grad_norm": 0.1005452424287796, + "learning_rate": 0.00032070255539482683, + "loss": 2.6082, + "step": 20929 + }, + { + "epoch": 0.6206446638792515, + "grad_norm": 0.11509571969509125, + "learning_rate": 0.0003206586344329674, + "loss": 2.6158, + "step": 20930 + }, + { + "epoch": 0.6206743172315631, + "grad_norm": 0.10830509662628174, + "learning_rate": 0.0003206147150592066, + "loss": 2.5943, + "step": 20931 + }, + { + "epoch": 0.6207039705838745, + "grad_norm": 0.09236449748277664, + "learning_rate": 0.00032057079727393345, + "loss": 2.5647, + "step": 20932 + }, + { + "epoch": 0.620733623936186, + "grad_norm": 0.09820116311311722, + "learning_rate": 0.0003205268810775366, + "loss": 2.6019, + "step": 20933 + }, + { + "epoch": 0.6207632772884975, + "grad_norm": 0.10559224337339401, + "learning_rate": 0.000320482966470405, + "loss": 2.6313, + "step": 20934 + }, + { + "epoch": 0.620792930640809, + "grad_norm": 0.10893336683511734, + "learning_rate": 0.0003204390534529276, + "loss": 2.6206, + "step": 20935 + }, + { + "epoch": 0.6208225839931204, + "grad_norm": 0.11514893174171448, + "learning_rate": 0.00032039514202549316, + "loss": 2.6126, + "step": 20936 + }, + { + "epoch": 0.6208522373454319, + "grad_norm": 0.09710370749235153, + "learning_rate": 0.0003203512321884905, + "loss": 2.6168, + "step": 20937 + }, + { + "epoch": 0.6208818906977434, + "grad_norm": 0.10825230181217194, + "learning_rate": 0.0003203073239423087, + "loss": 2.6364, + "step": 20938 + }, + { + "epoch": 0.6209115440500549, + "grad_norm": 0.11291971802711487, + "learning_rate": 0.00032026341728733624, + "loss": 2.587, + "step": 20939 + }, + { + "epoch": 0.6209411974023663, + "grad_norm": 0.10070832818746567, + "learning_rate": 0.0003202195122239622, + "loss": 2.6299, + "step": 20940 + }, + { + "epoch": 0.6209708507546778, + "grad_norm": 0.09658727049827576, + "learning_rate": 0.00032017560875257506, + "loss": 2.6149, + "step": 20941 + }, + { + "epoch": 0.6210005041069893, + "grad_norm": 0.0997677892446518, + "learning_rate": 0.00032013170687356383, + "loss": 2.6173, + "step": 20942 + }, + { + "epoch": 0.6210301574593008, + "grad_norm": 0.09803411364555359, + "learning_rate": 0.00032008780658731716, + "loss": 2.5697, + "step": 20943 + }, + { + "epoch": 0.6210598108116122, + "grad_norm": 0.10251680761575699, + "learning_rate": 0.0003200439078942239, + "loss": 2.5967, + "step": 20944 + }, + { + "epoch": 0.6210894641639237, + "grad_norm": 0.09116262942552567, + "learning_rate": 0.0003200000107946728, + "loss": 2.5843, + "step": 20945 + }, + { + "epoch": 0.6211191175162352, + "grad_norm": 0.09951741248369217, + "learning_rate": 0.0003199561152890524, + "loss": 2.6267, + "step": 20946 + }, + { + "epoch": 0.6211487708685467, + "grad_norm": 0.10207245498895645, + "learning_rate": 0.0003199122213777516, + "loss": 2.5911, + "step": 20947 + }, + { + "epoch": 0.6211784242208581, + "grad_norm": 0.10962450504302979, + "learning_rate": 0.00031986832906115886, + "loss": 2.5806, + "step": 20948 + }, + { + "epoch": 0.6212080775731696, + "grad_norm": 0.11059675365686417, + "learning_rate": 0.00031982443833966314, + "loss": 2.6297, + "step": 20949 + }, + { + "epoch": 0.6212377309254812, + "grad_norm": 0.12535572052001953, + "learning_rate": 0.0003197805492136529, + "loss": 2.596, + "step": 20950 + }, + { + "epoch": 0.6212673842777926, + "grad_norm": 0.1157931312918663, + "learning_rate": 0.0003197366616835168, + "loss": 2.5876, + "step": 20951 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 0.09540870040655136, + "learning_rate": 0.0003196927757496435, + "loss": 2.6033, + "step": 20952 + }, + { + "epoch": 0.6213266909824156, + "grad_norm": 0.11163820326328278, + "learning_rate": 0.0003196488914124216, + "loss": 2.5831, + "step": 20953 + }, + { + "epoch": 0.6213563443347271, + "grad_norm": 0.11600932478904724, + "learning_rate": 0.0003196050086722397, + "loss": 2.588, + "step": 20954 + }, + { + "epoch": 0.6213859976870385, + "grad_norm": 0.10256004333496094, + "learning_rate": 0.00031956112752948664, + "loss": 2.5826, + "step": 20955 + }, + { + "epoch": 0.62141565103935, + "grad_norm": 0.08916937559843063, + "learning_rate": 0.00031951724798455033, + "loss": 2.63, + "step": 20956 + }, + { + "epoch": 0.6214453043916615, + "grad_norm": 0.1079576388001442, + "learning_rate": 0.00031947337003782007, + "loss": 2.6219, + "step": 20957 + }, + { + "epoch": 0.621474957743973, + "grad_norm": 0.1188497468829155, + "learning_rate": 0.000319429493689684, + "loss": 2.6016, + "step": 20958 + }, + { + "epoch": 0.6215046110962844, + "grad_norm": 0.11608386039733887, + "learning_rate": 0.0003193856189405309, + "loss": 2.6173, + "step": 20959 + }, + { + "epoch": 0.6215342644485959, + "grad_norm": 0.09790266305208206, + "learning_rate": 0.0003193417457907491, + "loss": 2.6279, + "step": 20960 + }, + { + "epoch": 0.6215639178009074, + "grad_norm": 0.10364130139350891, + "learning_rate": 0.0003192978742407271, + "loss": 2.5964, + "step": 20961 + }, + { + "epoch": 0.6215935711532189, + "grad_norm": 0.09024158120155334, + "learning_rate": 0.0003192540042908534, + "loss": 2.6262, + "step": 20962 + }, + { + "epoch": 0.6216232245055303, + "grad_norm": 0.09274707734584808, + "learning_rate": 0.0003192101359415166, + "loss": 2.6033, + "step": 20963 + }, + { + "epoch": 0.6216528778578418, + "grad_norm": 0.10210434347391129, + "learning_rate": 0.0003191662691931051, + "loss": 2.5949, + "step": 20964 + }, + { + "epoch": 0.6216825312101533, + "grad_norm": 0.09420593082904816, + "learning_rate": 0.00031912240404600724, + "loss": 2.6231, + "step": 20965 + }, + { + "epoch": 0.6217121845624648, + "grad_norm": 0.1069844514131546, + "learning_rate": 0.0003190785405006117, + "loss": 2.5701, + "step": 20966 + }, + { + "epoch": 0.6217418379147762, + "grad_norm": 0.1167835220694542, + "learning_rate": 0.00031903467855730664, + "loss": 2.5907, + "step": 20967 + }, + { + "epoch": 0.6217714912670878, + "grad_norm": 0.11157182604074478, + "learning_rate": 0.00031899081821648047, + "loss": 2.6027, + "step": 20968 + }, + { + "epoch": 0.6218011446193992, + "grad_norm": 0.12586112320423126, + "learning_rate": 0.00031894695947852156, + "loss": 2.6161, + "step": 20969 + }, + { + "epoch": 0.6218307979717107, + "grad_norm": 0.12232458591461182, + "learning_rate": 0.0003189031023438187, + "loss": 2.6058, + "step": 20970 + }, + { + "epoch": 0.6218604513240222, + "grad_norm": 0.12306179851293564, + "learning_rate": 0.0003188592468127598, + "loss": 2.6188, + "step": 20971 + }, + { + "epoch": 0.6218901046763337, + "grad_norm": 0.12126635760068893, + "learning_rate": 0.00031881539288573335, + "loss": 2.5994, + "step": 20972 + }, + { + "epoch": 0.6219197580286452, + "grad_norm": 0.1150561273097992, + "learning_rate": 0.0003187715405631278, + "loss": 2.6431, + "step": 20973 + }, + { + "epoch": 0.6219494113809566, + "grad_norm": 0.13112613558769226, + "learning_rate": 0.0003187276898453313, + "loss": 2.6548, + "step": 20974 + }, + { + "epoch": 0.6219790647332681, + "grad_norm": 0.1105111688375473, + "learning_rate": 0.00031868384073273224, + "loss": 2.6155, + "step": 20975 + }, + { + "epoch": 0.6220087180855796, + "grad_norm": 0.11322972178459167, + "learning_rate": 0.00031863999322571897, + "loss": 2.5756, + "step": 20976 + }, + { + "epoch": 0.6220383714378911, + "grad_norm": 0.11500754207372665, + "learning_rate": 0.00031859614732467957, + "loss": 2.6224, + "step": 20977 + }, + { + "epoch": 0.6220680247902025, + "grad_norm": 0.11472047865390778, + "learning_rate": 0.0003185523030300025, + "loss": 2.6058, + "step": 20978 + }, + { + "epoch": 0.622097678142514, + "grad_norm": 0.10092204064130783, + "learning_rate": 0.0003185084603420759, + "loss": 2.6304, + "step": 20979 + }, + { + "epoch": 0.6221273314948255, + "grad_norm": 0.1276225447654724, + "learning_rate": 0.000318464619261288, + "loss": 2.5877, + "step": 20980 + }, + { + "epoch": 0.622156984847137, + "grad_norm": 0.11544987559318542, + "learning_rate": 0.0003184207797880271, + "loss": 2.5989, + "step": 20981 + }, + { + "epoch": 0.6221866381994484, + "grad_norm": 0.11310072243213654, + "learning_rate": 0.00031837694192268117, + "loss": 2.6304, + "step": 20982 + }, + { + "epoch": 0.62221629155176, + "grad_norm": 0.10489057749509811, + "learning_rate": 0.0003183331056656388, + "loss": 2.6046, + "step": 20983 + }, + { + "epoch": 0.6222459449040714, + "grad_norm": 0.12723608314990997, + "learning_rate": 0.00031828927101728796, + "loss": 2.6218, + "step": 20984 + }, + { + "epoch": 0.6222755982563829, + "grad_norm": 0.09769950062036514, + "learning_rate": 0.00031824543797801674, + "loss": 2.5762, + "step": 20985 + }, + { + "epoch": 0.6223052516086943, + "grad_norm": 0.1049213707447052, + "learning_rate": 0.0003182016065482134, + "loss": 2.5794, + "step": 20986 + }, + { + "epoch": 0.6223349049610059, + "grad_norm": 0.11681126058101654, + "learning_rate": 0.0003181577767282662, + "loss": 2.6078, + "step": 20987 + }, + { + "epoch": 0.6223645583133173, + "grad_norm": 0.0922195091843605, + "learning_rate": 0.0003181139485185629, + "loss": 2.5831, + "step": 20988 + }, + { + "epoch": 0.6223942116656288, + "grad_norm": 0.10473507642745972, + "learning_rate": 0.00031807012191949186, + "loss": 2.6365, + "step": 20989 + }, + { + "epoch": 0.6224238650179402, + "grad_norm": 0.11297088116407394, + "learning_rate": 0.00031802629693144114, + "loss": 2.6139, + "step": 20990 + }, + { + "epoch": 0.6224535183702518, + "grad_norm": 0.10716494172811508, + "learning_rate": 0.00031798247355479875, + "loss": 2.6259, + "step": 20991 + }, + { + "epoch": 0.6224831717225633, + "grad_norm": 0.11144150793552399, + "learning_rate": 0.0003179386517899528, + "loss": 2.6321, + "step": 20992 + }, + { + "epoch": 0.6225128250748747, + "grad_norm": 0.09521305561065674, + "learning_rate": 0.0003178948316372915, + "loss": 2.6168, + "step": 20993 + }, + { + "epoch": 0.6225424784271862, + "grad_norm": 0.13166777789592743, + "learning_rate": 0.00031785101309720254, + "loss": 2.6059, + "step": 20994 + }, + { + "epoch": 0.6225721317794977, + "grad_norm": 0.10575827211141586, + "learning_rate": 0.000317807196170074, + "loss": 2.6172, + "step": 20995 + }, + { + "epoch": 0.6226017851318092, + "grad_norm": 0.10808156430721283, + "learning_rate": 0.00031776338085629413, + "loss": 2.5974, + "step": 20996 + }, + { + "epoch": 0.6226314384841206, + "grad_norm": 0.10934525728225708, + "learning_rate": 0.00031771956715625094, + "loss": 2.6118, + "step": 20997 + }, + { + "epoch": 0.6226610918364321, + "grad_norm": 0.1082267314195633, + "learning_rate": 0.00031767575507033217, + "loss": 2.5905, + "step": 20998 + }, + { + "epoch": 0.6226907451887436, + "grad_norm": 0.10897741466760635, + "learning_rate": 0.00031763194459892596, + "loss": 2.5967, + "step": 20999 + }, + { + "epoch": 0.6227203985410551, + "grad_norm": 0.12863372266292572, + "learning_rate": 0.00031758813574242007, + "loss": 2.6313, + "step": 21000 + }, + { + "epoch": 0.6227500518933665, + "grad_norm": 0.10317890346050262, + "learning_rate": 0.00031754432850120265, + "loss": 2.584, + "step": 21001 + }, + { + "epoch": 0.622779705245678, + "grad_norm": 0.11301769316196442, + "learning_rate": 0.00031750052287566146, + "loss": 2.594, + "step": 21002 + }, + { + "epoch": 0.6228093585979895, + "grad_norm": 0.1096099466085434, + "learning_rate": 0.00031745671886618444, + "loss": 2.5827, + "step": 21003 + }, + { + "epoch": 0.622839011950301, + "grad_norm": 0.11614438891410828, + "learning_rate": 0.0003174129164731596, + "loss": 2.6261, + "step": 21004 + }, + { + "epoch": 0.6228686653026124, + "grad_norm": 0.11533229053020477, + "learning_rate": 0.0003173691156969747, + "loss": 2.5773, + "step": 21005 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 0.11956862360239029, + "learning_rate": 0.0003173253165380176, + "loss": 2.6173, + "step": 21006 + }, + { + "epoch": 0.6229279720072354, + "grad_norm": 0.10381200164556503, + "learning_rate": 0.0003172815189966762, + "loss": 2.6233, + "step": 21007 + }, + { + "epoch": 0.6229576253595469, + "grad_norm": 0.1036856472492218, + "learning_rate": 0.00031723772307333813, + "loss": 2.5844, + "step": 21008 + }, + { + "epoch": 0.6229872787118583, + "grad_norm": 0.09317431598901749, + "learning_rate": 0.0003171939287683916, + "loss": 2.5926, + "step": 21009 + }, + { + "epoch": 0.6230169320641699, + "grad_norm": 0.1073327288031578, + "learning_rate": 0.0003171501360822241, + "loss": 2.5818, + "step": 21010 + }, + { + "epoch": 0.6230465854164813, + "grad_norm": 0.10228485614061356, + "learning_rate": 0.00031710634501522363, + "loss": 2.5903, + "step": 21011 + }, + { + "epoch": 0.6230762387687928, + "grad_norm": 0.0901041179895401, + "learning_rate": 0.0003170625555677778, + "loss": 2.5939, + "step": 21012 + }, + { + "epoch": 0.6231058921211043, + "grad_norm": 0.12030243128538132, + "learning_rate": 0.0003170187677402745, + "loss": 2.6069, + "step": 21013 + }, + { + "epoch": 0.6231355454734158, + "grad_norm": 0.11531458050012589, + "learning_rate": 0.00031697498153310146, + "loss": 2.5838, + "step": 21014 + }, + { + "epoch": 0.6231651988257273, + "grad_norm": 0.10161180794239044, + "learning_rate": 0.0003169311969466463, + "loss": 2.5989, + "step": 21015 + }, + { + "epoch": 0.6231948521780387, + "grad_norm": 0.08869906514883041, + "learning_rate": 0.00031688741398129685, + "loss": 2.6102, + "step": 21016 + }, + { + "epoch": 0.6232245055303502, + "grad_norm": 0.09769310057163239, + "learning_rate": 0.00031684363263744067, + "loss": 2.6016, + "step": 21017 + }, + { + "epoch": 0.6232541588826617, + "grad_norm": 0.10160831362009048, + "learning_rate": 0.00031679985291546567, + "loss": 2.6196, + "step": 21018 + }, + { + "epoch": 0.6232838122349732, + "grad_norm": 0.1000814214348793, + "learning_rate": 0.00031675607481575934, + "loss": 2.6195, + "step": 21019 + }, + { + "epoch": 0.6233134655872846, + "grad_norm": 0.09515645354986191, + "learning_rate": 0.0003167122983387095, + "loss": 2.605, + "step": 21020 + }, + { + "epoch": 0.6233431189395962, + "grad_norm": 0.10574769973754883, + "learning_rate": 0.00031666852348470354, + "loss": 2.6446, + "step": 21021 + }, + { + "epoch": 0.6233727722919076, + "grad_norm": 0.10669367015361786, + "learning_rate": 0.00031662475025412943, + "loss": 2.6072, + "step": 21022 + }, + { + "epoch": 0.6234024256442191, + "grad_norm": 0.10148728638887405, + "learning_rate": 0.0003165809786473747, + "loss": 2.6192, + "step": 21023 + }, + { + "epoch": 0.6234320789965305, + "grad_norm": 0.10161206871271133, + "learning_rate": 0.0003165372086648267, + "loss": 2.5821, + "step": 21024 + }, + { + "epoch": 0.6234617323488421, + "grad_norm": 0.1111259013414383, + "learning_rate": 0.0003164934403068734, + "loss": 2.6095, + "step": 21025 + }, + { + "epoch": 0.6234913857011535, + "grad_norm": 0.09682263433933258, + "learning_rate": 0.0003164496735739021, + "loss": 2.5874, + "step": 21026 + }, + { + "epoch": 0.623521039053465, + "grad_norm": 0.10466238111257553, + "learning_rate": 0.00031640590846630047, + "loss": 2.5951, + "step": 21027 + }, + { + "epoch": 0.6235506924057764, + "grad_norm": 0.10769228637218475, + "learning_rate": 0.000316362144984456, + "loss": 2.6062, + "step": 21028 + }, + { + "epoch": 0.623580345758088, + "grad_norm": 0.1131676584482193, + "learning_rate": 0.00031631838312875626, + "loss": 2.6, + "step": 21029 + }, + { + "epoch": 0.6236099991103994, + "grad_norm": 0.12384886294603348, + "learning_rate": 0.00031627462289958886, + "loss": 2.6079, + "step": 21030 + }, + { + "epoch": 0.6236396524627109, + "grad_norm": 0.12004631757736206, + "learning_rate": 0.00031623086429734113, + "loss": 2.6104, + "step": 21031 + }, + { + "epoch": 0.6236693058150223, + "grad_norm": 0.10493919998407364, + "learning_rate": 0.0003161871073224007, + "loss": 2.6027, + "step": 21032 + }, + { + "epoch": 0.6236989591673339, + "grad_norm": 0.12097221612930298, + "learning_rate": 0.00031614335197515487, + "loss": 2.5953, + "step": 21033 + }, + { + "epoch": 0.6237286125196454, + "grad_norm": 0.11547980457544327, + "learning_rate": 0.0003160995982559911, + "loss": 2.5781, + "step": 21034 + }, + { + "epoch": 0.6237582658719568, + "grad_norm": 0.117030568420887, + "learning_rate": 0.00031605584616529726, + "loss": 2.5999, + "step": 21035 + }, + { + "epoch": 0.6237879192242684, + "grad_norm": 0.10901140421628952, + "learning_rate": 0.0003160120957034603, + "loss": 2.6121, + "step": 21036 + }, + { + "epoch": 0.6238175725765798, + "grad_norm": 0.11796113103628159, + "learning_rate": 0.0003159683468708678, + "loss": 2.5949, + "step": 21037 + }, + { + "epoch": 0.6238472259288913, + "grad_norm": 0.09255005419254303, + "learning_rate": 0.00031592459966790727, + "loss": 2.5901, + "step": 21038 + }, + { + "epoch": 0.6238768792812027, + "grad_norm": 0.11735714972019196, + "learning_rate": 0.0003158808540949659, + "loss": 2.5906, + "step": 21039 + }, + { + "epoch": 0.6239065326335143, + "grad_norm": 0.0955934152007103, + "learning_rate": 0.0003158371101524313, + "loss": 2.5943, + "step": 21040 + }, + { + "epoch": 0.6239361859858257, + "grad_norm": 0.10903635621070862, + "learning_rate": 0.0003157933678406906, + "loss": 2.618, + "step": 21041 + }, + { + "epoch": 0.6239658393381372, + "grad_norm": 0.100718654692173, + "learning_rate": 0.00031574962716013143, + "loss": 2.5864, + "step": 21042 + }, + { + "epoch": 0.6239954926904486, + "grad_norm": 0.09747367352247238, + "learning_rate": 0.0003157058881111408, + "loss": 2.6037, + "step": 21043 + }, + { + "epoch": 0.6240251460427602, + "grad_norm": 0.11055534332990646, + "learning_rate": 0.0003156621506941061, + "loss": 2.5936, + "step": 21044 + }, + { + "epoch": 0.6240547993950716, + "grad_norm": 0.10961070656776428, + "learning_rate": 0.0003156184149094148, + "loss": 2.6014, + "step": 21045 + }, + { + "epoch": 0.6240844527473831, + "grad_norm": 0.10214924067258835, + "learning_rate": 0.000315574680757454, + "loss": 2.6015, + "step": 21046 + }, + { + "epoch": 0.6241141060996945, + "grad_norm": 0.10162647813558578, + "learning_rate": 0.00031553094823861096, + "loss": 2.6009, + "step": 21047 + }, + { + "epoch": 0.6241437594520061, + "grad_norm": 0.09752938896417618, + "learning_rate": 0.00031548721735327317, + "loss": 2.6051, + "step": 21048 + }, + { + "epoch": 0.6241734128043175, + "grad_norm": 0.1012478917837143, + "learning_rate": 0.0003154434881018278, + "loss": 2.604, + "step": 21049 + }, + { + "epoch": 0.624203066156629, + "grad_norm": 0.08959776163101196, + "learning_rate": 0.0003153997604846619, + "loss": 2.5833, + "step": 21050 + }, + { + "epoch": 0.6242327195089404, + "grad_norm": 0.09153541177511215, + "learning_rate": 0.0003153560345021629, + "loss": 2.6099, + "step": 21051 + }, + { + "epoch": 0.624262372861252, + "grad_norm": 0.09613898396492004, + "learning_rate": 0.00031531231015471793, + "loss": 2.5613, + "step": 21052 + }, + { + "epoch": 0.6242920262135634, + "grad_norm": 0.10368330031633377, + "learning_rate": 0.00031526858744271416, + "loss": 2.5842, + "step": 21053 + }, + { + "epoch": 0.6243216795658749, + "grad_norm": 0.09094779938459396, + "learning_rate": 0.0003152248663665387, + "loss": 2.5824, + "step": 21054 + }, + { + "epoch": 0.6243513329181865, + "grad_norm": 0.10794412344694138, + "learning_rate": 0.00031518114692657875, + "loss": 2.6289, + "step": 21055 + }, + { + "epoch": 0.6243809862704979, + "grad_norm": 0.0984097570180893, + "learning_rate": 0.00031513742912322144, + "loss": 2.6233, + "step": 21056 + }, + { + "epoch": 0.6244106396228094, + "grad_norm": 0.09045427292585373, + "learning_rate": 0.00031509371295685394, + "loss": 2.6206, + "step": 21057 + }, + { + "epoch": 0.6244402929751208, + "grad_norm": 0.09336361289024353, + "learning_rate": 0.0003150499984278634, + "loss": 2.6031, + "step": 21058 + }, + { + "epoch": 0.6244699463274324, + "grad_norm": 0.09515459090471268, + "learning_rate": 0.00031500628553663703, + "loss": 2.6149, + "step": 21059 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 0.1036868616938591, + "learning_rate": 0.00031496257428356136, + "loss": 2.5813, + "step": 21060 + }, + { + "epoch": 0.6245292530320553, + "grad_norm": 0.10217463970184326, + "learning_rate": 0.000314918864669024, + "loss": 2.6064, + "step": 21061 + }, + { + "epoch": 0.6245589063843667, + "grad_norm": 0.09512868523597717, + "learning_rate": 0.00031487515669341193, + "loss": 2.596, + "step": 21062 + }, + { + "epoch": 0.6245885597366783, + "grad_norm": 0.09570666402578354, + "learning_rate": 0.00031483145035711223, + "loss": 2.6354, + "step": 21063 + }, + { + "epoch": 0.6246182130889897, + "grad_norm": 0.09354959428310394, + "learning_rate": 0.0003147877456605117, + "loss": 2.6339, + "step": 21064 + }, + { + "epoch": 0.6246478664413012, + "grad_norm": 0.10374318808317184, + "learning_rate": 0.0003147440426039975, + "loss": 2.6101, + "step": 21065 + }, + { + "epoch": 0.6246775197936126, + "grad_norm": 0.09110739827156067, + "learning_rate": 0.00031470034118795664, + "loss": 2.6137, + "step": 21066 + }, + { + "epoch": 0.6247071731459242, + "grad_norm": 0.09438943862915039, + "learning_rate": 0.000314656641412776, + "loss": 2.6137, + "step": 21067 + }, + { + "epoch": 0.6247368264982356, + "grad_norm": 0.09959489852190018, + "learning_rate": 0.0003146129432788426, + "loss": 2.5753, + "step": 21068 + }, + { + "epoch": 0.6247664798505471, + "grad_norm": 0.0982198491692543, + "learning_rate": 0.00031456924678654357, + "loss": 2.6335, + "step": 21069 + }, + { + "epoch": 0.6247961332028585, + "grad_norm": 0.0932118222117424, + "learning_rate": 0.00031452555193626553, + "loss": 2.6036, + "step": 21070 + }, + { + "epoch": 0.6248257865551701, + "grad_norm": 0.09856156259775162, + "learning_rate": 0.00031448185872839566, + "loss": 2.6059, + "step": 21071 + }, + { + "epoch": 0.6248554399074815, + "grad_norm": 0.10116085410118103, + "learning_rate": 0.0003144381671633206, + "loss": 2.5936, + "step": 21072 + }, + { + "epoch": 0.624885093259793, + "grad_norm": 0.10525134205818176, + "learning_rate": 0.0003143944772414275, + "loss": 2.6178, + "step": 21073 + }, + { + "epoch": 0.6249147466121044, + "grad_norm": 0.10258839279413223, + "learning_rate": 0.0003143507889631032, + "loss": 2.6029, + "step": 21074 + }, + { + "epoch": 0.624944399964416, + "grad_norm": 0.10279711335897446, + "learning_rate": 0.0003143071023287345, + "loss": 2.6422, + "step": 21075 + }, + { + "epoch": 0.6249740533167275, + "grad_norm": 0.10776089876890182, + "learning_rate": 0.00031426341733870836, + "loss": 2.596, + "step": 21076 + }, + { + "epoch": 0.6250037066690389, + "grad_norm": 0.12085922807455063, + "learning_rate": 0.00031421973399341143, + "loss": 2.6361, + "step": 21077 + }, + { + "epoch": 0.6250333600213505, + "grad_norm": 0.11144194006919861, + "learning_rate": 0.00031417605229323067, + "loss": 2.5955, + "step": 21078 + }, + { + "epoch": 0.6250630133736619, + "grad_norm": 0.11245667189359665, + "learning_rate": 0.0003141323722385529, + "loss": 2.6181, + "step": 21079 + }, + { + "epoch": 0.6250926667259734, + "grad_norm": 0.11803611367940903, + "learning_rate": 0.00031408869382976504, + "loss": 2.6007, + "step": 21080 + }, + { + "epoch": 0.6251223200782848, + "grad_norm": 0.11019732058048248, + "learning_rate": 0.0003140450170672535, + "loss": 2.6022, + "step": 21081 + }, + { + "epoch": 0.6251519734305964, + "grad_norm": 0.14291046559810638, + "learning_rate": 0.0003140013419514053, + "loss": 2.5919, + "step": 21082 + }, + { + "epoch": 0.6251816267829078, + "grad_norm": 0.11389940977096558, + "learning_rate": 0.0003139576684826071, + "loss": 2.5995, + "step": 21083 + }, + { + "epoch": 0.6252112801352193, + "grad_norm": 0.12526842951774597, + "learning_rate": 0.0003139139966612457, + "loss": 2.5849, + "step": 21084 + }, + { + "epoch": 0.6252409334875307, + "grad_norm": 0.12397190183401108, + "learning_rate": 0.00031387032648770786, + "loss": 2.6036, + "step": 21085 + }, + { + "epoch": 0.6252705868398423, + "grad_norm": 0.11342599242925644, + "learning_rate": 0.00031382665796238, + "loss": 2.5815, + "step": 21086 + }, + { + "epoch": 0.6253002401921537, + "grad_norm": 0.10014194250106812, + "learning_rate": 0.00031378299108564923, + "loss": 2.6109, + "step": 21087 + }, + { + "epoch": 0.6253298935444652, + "grad_norm": 0.11030321568250656, + "learning_rate": 0.000313739325857902, + "loss": 2.6427, + "step": 21088 + }, + { + "epoch": 0.6253595468967766, + "grad_norm": 0.10921739041805267, + "learning_rate": 0.00031369566227952507, + "loss": 2.6221, + "step": 21089 + }, + { + "epoch": 0.6253892002490882, + "grad_norm": 0.10830383747816086, + "learning_rate": 0.0003136520003509051, + "loss": 2.5895, + "step": 21090 + }, + { + "epoch": 0.6254188536013996, + "grad_norm": 0.10878551006317139, + "learning_rate": 0.00031360834007242855, + "loss": 2.6154, + "step": 21091 + }, + { + "epoch": 0.6254485069537111, + "grad_norm": 0.09745844453573227, + "learning_rate": 0.0003135646814444821, + "loss": 2.5838, + "step": 21092 + }, + { + "epoch": 0.6254781603060225, + "grad_norm": 0.09459248185157776, + "learning_rate": 0.0003135210244674525, + "loss": 2.5771, + "step": 21093 + }, + { + "epoch": 0.6255078136583341, + "grad_norm": 0.11254919320344925, + "learning_rate": 0.0003134773691417262, + "loss": 2.6369, + "step": 21094 + }, + { + "epoch": 0.6255374670106455, + "grad_norm": 0.1067713275551796, + "learning_rate": 0.0003134337154676897, + "loss": 2.6104, + "step": 21095 + }, + { + "epoch": 0.625567120362957, + "grad_norm": 0.10419749468564987, + "learning_rate": 0.0003133900634457298, + "loss": 2.6129, + "step": 21096 + }, + { + "epoch": 0.6255967737152686, + "grad_norm": 0.10593554377555847, + "learning_rate": 0.00031334641307623304, + "loss": 2.605, + "step": 21097 + }, + { + "epoch": 0.62562642706758, + "grad_norm": 0.10857544094324112, + "learning_rate": 0.00031330276435958545, + "loss": 2.6387, + "step": 21098 + }, + { + "epoch": 0.6256560804198915, + "grad_norm": 0.1116776391863823, + "learning_rate": 0.0003132591172961741, + "loss": 2.6195, + "step": 21099 + }, + { + "epoch": 0.6256857337722029, + "grad_norm": 0.12640531361103058, + "learning_rate": 0.0003132154718863854, + "loss": 2.605, + "step": 21100 + }, + { + "epoch": 0.6257153871245145, + "grad_norm": 0.11098933219909668, + "learning_rate": 0.00031317182813060587, + "loss": 2.5994, + "step": 21101 + }, + { + "epoch": 0.6257450404768259, + "grad_norm": 0.10808798670768738, + "learning_rate": 0.0003131281860292217, + "loss": 2.6116, + "step": 21102 + }, + { + "epoch": 0.6257746938291374, + "grad_norm": 0.14587989449501038, + "learning_rate": 0.00031308454558261954, + "loss": 2.586, + "step": 21103 + }, + { + "epoch": 0.6258043471814488, + "grad_norm": 0.1282120794057846, + "learning_rate": 0.0003130409067911858, + "loss": 2.6236, + "step": 21104 + }, + { + "epoch": 0.6258340005337604, + "grad_norm": 0.11551022529602051, + "learning_rate": 0.00031299726965530695, + "loss": 2.6162, + "step": 21105 + }, + { + "epoch": 0.6258636538860718, + "grad_norm": 0.11055773496627808, + "learning_rate": 0.0003129536341753694, + "loss": 2.6097, + "step": 21106 + }, + { + "epoch": 0.6258933072383833, + "grad_norm": 0.12617608904838562, + "learning_rate": 0.00031291000035175954, + "loss": 2.619, + "step": 21107 + }, + { + "epoch": 0.6259229605906947, + "grad_norm": 0.13534162938594818, + "learning_rate": 0.0003128663681848637, + "loss": 2.6083, + "step": 21108 + }, + { + "epoch": 0.6259526139430063, + "grad_norm": 0.10887549072504044, + "learning_rate": 0.0003128227376750683, + "loss": 2.599, + "step": 21109 + }, + { + "epoch": 0.6259822672953177, + "grad_norm": 0.1037493497133255, + "learning_rate": 0.0003127791088227597, + "loss": 2.6077, + "step": 21110 + }, + { + "epoch": 0.6260119206476292, + "grad_norm": 0.11997620016336441, + "learning_rate": 0.0003127354816283241, + "loss": 2.6116, + "step": 21111 + }, + { + "epoch": 0.6260415739999406, + "grad_norm": 0.11000367999076843, + "learning_rate": 0.000312691856092148, + "loss": 2.6283, + "step": 21112 + }, + { + "epoch": 0.6260712273522522, + "grad_norm": 0.10501664131879807, + "learning_rate": 0.00031264823221461777, + "loss": 2.6182, + "step": 21113 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 0.08956141024827957, + "learning_rate": 0.0003126046099961195, + "loss": 2.5813, + "step": 21114 + }, + { + "epoch": 0.6261305340568751, + "grad_norm": 0.0957857072353363, + "learning_rate": 0.00031256098943703965, + "loss": 2.582, + "step": 21115 + }, + { + "epoch": 0.6261601874091867, + "grad_norm": 0.10110943764448166, + "learning_rate": 0.00031251737053776443, + "loss": 2.5899, + "step": 21116 + }, + { + "epoch": 0.6261898407614981, + "grad_norm": 0.10957684367895126, + "learning_rate": 0.00031247375329868, + "loss": 2.5972, + "step": 21117 + }, + { + "epoch": 0.6262194941138096, + "grad_norm": 0.12023890018463135, + "learning_rate": 0.0003124301377201728, + "loss": 2.6222, + "step": 21118 + }, + { + "epoch": 0.626249147466121, + "grad_norm": 0.11941038072109222, + "learning_rate": 0.00031238652380262877, + "loss": 2.5979, + "step": 21119 + }, + { + "epoch": 0.6262788008184326, + "grad_norm": 0.08293375372886658, + "learning_rate": 0.0003123429115464344, + "loss": 2.5805, + "step": 21120 + }, + { + "epoch": 0.626308454170744, + "grad_norm": 0.1174335852265358, + "learning_rate": 0.0003122993009519757, + "loss": 2.604, + "step": 21121 + }, + { + "epoch": 0.6263381075230555, + "grad_norm": 0.10416939854621887, + "learning_rate": 0.00031225569201963886, + "loss": 2.5915, + "step": 21122 + }, + { + "epoch": 0.6263677608753669, + "grad_norm": 0.12141035497188568, + "learning_rate": 0.0003122120847498101, + "loss": 2.6186, + "step": 21123 + }, + { + "epoch": 0.6263974142276785, + "grad_norm": 0.09941182285547256, + "learning_rate": 0.0003121684791428755, + "loss": 2.5962, + "step": 21124 + }, + { + "epoch": 0.6264270675799899, + "grad_norm": 0.11234818398952484, + "learning_rate": 0.0003121248751992214, + "loss": 2.6091, + "step": 21125 + }, + { + "epoch": 0.6264567209323014, + "grad_norm": 0.10257527977228165, + "learning_rate": 0.00031208127291923373, + "loss": 2.5945, + "step": 21126 + }, + { + "epoch": 0.6264863742846128, + "grad_norm": 0.10257775336503983, + "learning_rate": 0.0003120376723032986, + "loss": 2.6048, + "step": 21127 + }, + { + "epoch": 0.6265160276369244, + "grad_norm": 0.11635144054889679, + "learning_rate": 0.0003119940733518023, + "loss": 2.63, + "step": 21128 + }, + { + "epoch": 0.6265456809892358, + "grad_norm": 0.10564152896404266, + "learning_rate": 0.0003119504760651307, + "loss": 2.6155, + "step": 21129 + }, + { + "epoch": 0.6265753343415473, + "grad_norm": 0.10143067687749863, + "learning_rate": 0.0003119068804436699, + "loss": 2.587, + "step": 21130 + }, + { + "epoch": 0.6266049876938588, + "grad_norm": 0.11304915696382523, + "learning_rate": 0.0003118632864878058, + "loss": 2.6251, + "step": 21131 + }, + { + "epoch": 0.6266346410461703, + "grad_norm": 0.1070118322968483, + "learning_rate": 0.00031181969419792475, + "loss": 2.6307, + "step": 21132 + }, + { + "epoch": 0.6266642943984817, + "grad_norm": 0.11450506001710892, + "learning_rate": 0.00031177610357441255, + "loss": 2.6014, + "step": 21133 + }, + { + "epoch": 0.6266939477507932, + "grad_norm": 0.10246071964502335, + "learning_rate": 0.0003117325146176553, + "loss": 2.6341, + "step": 21134 + }, + { + "epoch": 0.6267236011031047, + "grad_norm": 0.10738112777471542, + "learning_rate": 0.000311688927328039, + "loss": 2.6006, + "step": 21135 + }, + { + "epoch": 0.6267532544554162, + "grad_norm": 0.10173214226961136, + "learning_rate": 0.00031164534170594955, + "loss": 2.5857, + "step": 21136 + }, + { + "epoch": 0.6267829078077277, + "grad_norm": 0.10605482757091522, + "learning_rate": 0.0003116017577517727, + "loss": 2.582, + "step": 21137 + }, + { + "epoch": 0.6268125611600391, + "grad_norm": 0.100710429251194, + "learning_rate": 0.00031155817546589477, + "loss": 2.5885, + "step": 21138 + }, + { + "epoch": 0.6268422145123507, + "grad_norm": 0.0932217538356781, + "learning_rate": 0.0003115145948487017, + "loss": 2.6032, + "step": 21139 + }, + { + "epoch": 0.6268718678646621, + "grad_norm": 0.09784994274377823, + "learning_rate": 0.0003114710159005791, + "loss": 2.5985, + "step": 21140 + }, + { + "epoch": 0.6269015212169736, + "grad_norm": 0.09956356137990952, + "learning_rate": 0.000311427438621913, + "loss": 2.6189, + "step": 21141 + }, + { + "epoch": 0.626931174569285, + "grad_norm": 0.09878759831190109, + "learning_rate": 0.00031138386301308936, + "loss": 2.5973, + "step": 21142 + }, + { + "epoch": 0.6269608279215966, + "grad_norm": 0.09729871153831482, + "learning_rate": 0.00031134028907449395, + "loss": 2.5779, + "step": 21143 + }, + { + "epoch": 0.626990481273908, + "grad_norm": 0.10390303283929825, + "learning_rate": 0.0003112967168065127, + "loss": 2.6241, + "step": 21144 + }, + { + "epoch": 0.6270201346262195, + "grad_norm": 0.11074741929769516, + "learning_rate": 0.0003112531462095315, + "loss": 2.6167, + "step": 21145 + }, + { + "epoch": 0.627049787978531, + "grad_norm": 0.10374542325735092, + "learning_rate": 0.00031120957728393594, + "loss": 2.6018, + "step": 21146 + }, + { + "epoch": 0.6270794413308425, + "grad_norm": 0.093161441385746, + "learning_rate": 0.00031116601003011203, + "loss": 2.6248, + "step": 21147 + }, + { + "epoch": 0.6271090946831539, + "grad_norm": 0.10839658230543137, + "learning_rate": 0.0003111224444484455, + "loss": 2.6143, + "step": 21148 + }, + { + "epoch": 0.6271387480354654, + "grad_norm": 0.13638418912887573, + "learning_rate": 0.0003110788805393221, + "loss": 2.6618, + "step": 21149 + }, + { + "epoch": 0.6271684013877769, + "grad_norm": 0.12368766963481903, + "learning_rate": 0.0003110353183031276, + "loss": 2.6023, + "step": 21150 + }, + { + "epoch": 0.6271980547400884, + "grad_norm": 0.09442204236984253, + "learning_rate": 0.0003109917577402479, + "loss": 2.6157, + "step": 21151 + }, + { + "epoch": 0.6272277080923998, + "grad_norm": 0.1149609386920929, + "learning_rate": 0.0003109481988510686, + "loss": 2.5897, + "step": 21152 + }, + { + "epoch": 0.6272573614447113, + "grad_norm": 0.12154898047447205, + "learning_rate": 0.00031090464163597545, + "loss": 2.596, + "step": 21153 + }, + { + "epoch": 0.6272870147970228, + "grad_norm": 0.09213785082101822, + "learning_rate": 0.0003108610860953541, + "loss": 2.6019, + "step": 21154 + }, + { + "epoch": 0.6273166681493343, + "grad_norm": 0.10337036848068237, + "learning_rate": 0.00031081753222959044, + "loss": 2.5924, + "step": 21155 + }, + { + "epoch": 0.6273463215016457, + "grad_norm": 0.09225275367498398, + "learning_rate": 0.00031077398003907, + "loss": 2.5824, + "step": 21156 + }, + { + "epoch": 0.6273759748539572, + "grad_norm": 0.09429343789815903, + "learning_rate": 0.0003107304295241784, + "loss": 2.6026, + "step": 21157 + }, + { + "epoch": 0.6274056282062688, + "grad_norm": 0.09606001526117325, + "learning_rate": 0.0003106868806853013, + "loss": 2.595, + "step": 21158 + }, + { + "epoch": 0.6274352815585802, + "grad_norm": 0.09758087247610092, + "learning_rate": 0.00031064333352282436, + "loss": 2.5968, + "step": 21159 + }, + { + "epoch": 0.6274649349108917, + "grad_norm": 0.09596866369247437, + "learning_rate": 0.00031059978803713316, + "loss": 2.6207, + "step": 21160 + }, + { + "epoch": 0.6274945882632031, + "grad_norm": 0.1162317618727684, + "learning_rate": 0.00031055624422861343, + "loss": 2.5729, + "step": 21161 + }, + { + "epoch": 0.6275242416155147, + "grad_norm": 0.11442145705223083, + "learning_rate": 0.0003105127020976507, + "loss": 2.6129, + "step": 21162 + }, + { + "epoch": 0.6275538949678261, + "grad_norm": 0.09604822844266891, + "learning_rate": 0.00031046916164463036, + "loss": 2.5741, + "step": 21163 + }, + { + "epoch": 0.6275835483201376, + "grad_norm": 0.10359315574169159, + "learning_rate": 0.0003104256228699382, + "loss": 2.5707, + "step": 21164 + }, + { + "epoch": 0.627613201672449, + "grad_norm": 0.11257020384073257, + "learning_rate": 0.00031038208577395976, + "loss": 2.6294, + "step": 21165 + }, + { + "epoch": 0.6276428550247606, + "grad_norm": 0.12643307447433472, + "learning_rate": 0.00031033855035708056, + "loss": 2.6022, + "step": 21166 + }, + { + "epoch": 0.627672508377072, + "grad_norm": 0.10247719287872314, + "learning_rate": 0.00031029501661968597, + "loss": 2.6084, + "step": 21167 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 0.101008839905262, + "learning_rate": 0.00031025148456216144, + "loss": 2.6389, + "step": 21168 + }, + { + "epoch": 0.627731815081695, + "grad_norm": 0.10459397733211517, + "learning_rate": 0.0003102079541848928, + "loss": 2.6238, + "step": 21169 + }, + { + "epoch": 0.6277614684340065, + "grad_norm": 0.10285633057355881, + "learning_rate": 0.00031016442548826515, + "loss": 2.601, + "step": 21170 + }, + { + "epoch": 0.6277911217863179, + "grad_norm": 0.10475008934736252, + "learning_rate": 0.0003101208984726641, + "loss": 2.6113, + "step": 21171 + }, + { + "epoch": 0.6278207751386294, + "grad_norm": 0.10483904182910919, + "learning_rate": 0.00031007737313847516, + "loss": 2.5587, + "step": 21172 + }, + { + "epoch": 0.6278504284909409, + "grad_norm": 0.09604837000370026, + "learning_rate": 0.0003100338494860838, + "loss": 2.6086, + "step": 21173 + }, + { + "epoch": 0.6278800818432524, + "grad_norm": 0.09851252287626266, + "learning_rate": 0.00030999032751587516, + "loss": 2.582, + "step": 21174 + }, + { + "epoch": 0.6279097351955638, + "grad_norm": 0.10229373723268509, + "learning_rate": 0.00030994680722823474, + "loss": 2.6107, + "step": 21175 + }, + { + "epoch": 0.6279393885478753, + "grad_norm": 0.09818561375141144, + "learning_rate": 0.00030990328862354785, + "loss": 2.6254, + "step": 21176 + }, + { + "epoch": 0.6279690419001868, + "grad_norm": 0.11523298174142838, + "learning_rate": 0.0003098597717022002, + "loss": 2.6059, + "step": 21177 + }, + { + "epoch": 0.6279986952524983, + "grad_norm": 0.11274781823158264, + "learning_rate": 0.0003098162564645768, + "loss": 2.6036, + "step": 21178 + }, + { + "epoch": 0.6280283486048098, + "grad_norm": 0.11098745465278625, + "learning_rate": 0.00030977274291106317, + "loss": 2.6075, + "step": 21179 + }, + { + "epoch": 0.6280580019571212, + "grad_norm": 0.11688202619552612, + "learning_rate": 0.00030972923104204456, + "loss": 2.6019, + "step": 21180 + }, + { + "epoch": 0.6280876553094328, + "grad_norm": 0.11577937006950378, + "learning_rate": 0.0003096857208579062, + "loss": 2.6109, + "step": 21181 + }, + { + "epoch": 0.6281173086617442, + "grad_norm": 0.0963960736989975, + "learning_rate": 0.00030964221235903346, + "loss": 2.599, + "step": 21182 + }, + { + "epoch": 0.6281469620140557, + "grad_norm": 0.10724552720785141, + "learning_rate": 0.0003095987055458116, + "loss": 2.621, + "step": 21183 + }, + { + "epoch": 0.6281766153663672, + "grad_norm": 0.11514066159725189, + "learning_rate": 0.00030955520041862607, + "loss": 2.5803, + "step": 21184 + }, + { + "epoch": 0.6282062687186787, + "grad_norm": 0.10780078917741776, + "learning_rate": 0.00030951169697786176, + "loss": 2.5934, + "step": 21185 + }, + { + "epoch": 0.6282359220709901, + "grad_norm": 0.10607832670211792, + "learning_rate": 0.000309468195223904, + "loss": 2.6047, + "step": 21186 + }, + { + "epoch": 0.6282655754233016, + "grad_norm": 0.12260844558477402, + "learning_rate": 0.00030942469515713816, + "loss": 2.6169, + "step": 21187 + }, + { + "epoch": 0.6282952287756131, + "grad_norm": 0.10761860758066177, + "learning_rate": 0.00030938119677794936, + "loss": 2.5952, + "step": 21188 + }, + { + "epoch": 0.6283248821279246, + "grad_norm": 0.09531956911087036, + "learning_rate": 0.00030933770008672267, + "loss": 2.5794, + "step": 21189 + }, + { + "epoch": 0.628354535480236, + "grad_norm": 0.11434675008058548, + "learning_rate": 0.0003092942050838434, + "loss": 2.6038, + "step": 21190 + }, + { + "epoch": 0.6283841888325475, + "grad_norm": 0.10757564753293991, + "learning_rate": 0.0003092507117696968, + "loss": 2.6198, + "step": 21191 + }, + { + "epoch": 0.628413842184859, + "grad_norm": 0.10849058628082275, + "learning_rate": 0.00030920722014466783, + "loss": 2.6203, + "step": 21192 + }, + { + "epoch": 0.6284434955371705, + "grad_norm": 0.11150891333818436, + "learning_rate": 0.0003091637302091417, + "loss": 2.5998, + "step": 21193 + }, + { + "epoch": 0.6284731488894819, + "grad_norm": 0.12149400264024734, + "learning_rate": 0.0003091202419635035, + "loss": 2.5798, + "step": 21194 + }, + { + "epoch": 0.6285028022417934, + "grad_norm": 0.11852605640888214, + "learning_rate": 0.0003090767554081384, + "loss": 2.5816, + "step": 21195 + }, + { + "epoch": 0.6285324555941049, + "grad_norm": 0.1334637701511383, + "learning_rate": 0.0003090332705434312, + "loss": 2.6234, + "step": 21196 + }, + { + "epoch": 0.6285621089464164, + "grad_norm": 0.10578791797161102, + "learning_rate": 0.00030898978736976733, + "loss": 2.5923, + "step": 21197 + }, + { + "epoch": 0.6285917622987278, + "grad_norm": 0.10793375223875046, + "learning_rate": 0.0003089463058875316, + "loss": 2.5926, + "step": 21198 + }, + { + "epoch": 0.6286214156510394, + "grad_norm": 0.13113248348236084, + "learning_rate": 0.0003089028260971092, + "loss": 2.6125, + "step": 21199 + }, + { + "epoch": 0.6286510690033509, + "grad_norm": 0.11574248224496841, + "learning_rate": 0.00030885934799888495, + "loss": 2.6005, + "step": 21200 + }, + { + "epoch": 0.6286807223556623, + "grad_norm": 0.10908810049295425, + "learning_rate": 0.0003088158715932442, + "loss": 2.5937, + "step": 21201 + }, + { + "epoch": 0.6287103757079738, + "grad_norm": 0.1189211755990982, + "learning_rate": 0.00030877239688057134, + "loss": 2.6037, + "step": 21202 + }, + { + "epoch": 0.6287400290602853, + "grad_norm": 0.12088406831026077, + "learning_rate": 0.00030872892386125196, + "loss": 2.606, + "step": 21203 + }, + { + "epoch": 0.6287696824125968, + "grad_norm": 0.10312552750110626, + "learning_rate": 0.00030868545253567094, + "loss": 2.6029, + "step": 21204 + }, + { + "epoch": 0.6287993357649082, + "grad_norm": 0.11452730745077133, + "learning_rate": 0.00030864198290421284, + "loss": 2.5813, + "step": 21205 + }, + { + "epoch": 0.6288289891172197, + "grad_norm": 0.11770101636648178, + "learning_rate": 0.000308598514967263, + "loss": 2.6265, + "step": 21206 + }, + { + "epoch": 0.6288586424695312, + "grad_norm": 0.11868730187416077, + "learning_rate": 0.00030855504872520607, + "loss": 2.6022, + "step": 21207 + }, + { + "epoch": 0.6288882958218427, + "grad_norm": 0.10215023159980774, + "learning_rate": 0.00030851158417842707, + "loss": 2.6055, + "step": 21208 + }, + { + "epoch": 0.6289179491741541, + "grad_norm": 0.11643772572278976, + "learning_rate": 0.00030846812132731083, + "loss": 2.6053, + "step": 21209 + }, + { + "epoch": 0.6289476025264656, + "grad_norm": 0.10484989732503891, + "learning_rate": 0.00030842466017224224, + "loss": 2.6007, + "step": 21210 + }, + { + "epoch": 0.6289772558787771, + "grad_norm": 0.11022940278053284, + "learning_rate": 0.0003083812007136063, + "loss": 2.6092, + "step": 21211 + }, + { + "epoch": 0.6290069092310886, + "grad_norm": 0.10723400115966797, + "learning_rate": 0.0003083377429517876, + "loss": 2.6083, + "step": 21212 + }, + { + "epoch": 0.6290365625834, + "grad_norm": 0.13168847560882568, + "learning_rate": 0.0003082942868871711, + "loss": 2.6306, + "step": 21213 + }, + { + "epoch": 0.6290662159357115, + "grad_norm": 0.1328379362821579, + "learning_rate": 0.0003082508325201416, + "loss": 2.5644, + "step": 21214 + }, + { + "epoch": 0.629095869288023, + "grad_norm": 0.11332205682992935, + "learning_rate": 0.0003082073798510838, + "loss": 2.5987, + "step": 21215 + }, + { + "epoch": 0.6291255226403345, + "grad_norm": 0.132303386926651, + "learning_rate": 0.0003081639288803827, + "loss": 2.5901, + "step": 21216 + }, + { + "epoch": 0.6291551759926459, + "grad_norm": 0.12327048927545547, + "learning_rate": 0.0003081204796084228, + "loss": 2.5972, + "step": 21217 + }, + { + "epoch": 0.6291848293449575, + "grad_norm": 0.11332091689109802, + "learning_rate": 0.0003080770320355891, + "loss": 2.586, + "step": 21218 + }, + { + "epoch": 0.6292144826972689, + "grad_norm": 0.13692492246627808, + "learning_rate": 0.0003080335861622663, + "loss": 2.6002, + "step": 21219 + }, + { + "epoch": 0.6292441360495804, + "grad_norm": 0.11314800381660461, + "learning_rate": 0.0003079901419888389, + "loss": 2.5501, + "step": 21220 + }, + { + "epoch": 0.6292737894018919, + "grad_norm": 0.12363670021295547, + "learning_rate": 0.0003079466995156918, + "loss": 2.6061, + "step": 21221 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 0.1257789433002472, + "learning_rate": 0.0003079032587432098, + "loss": 2.5981, + "step": 21222 + }, + { + "epoch": 0.6293330961065149, + "grad_norm": 0.10443752259016037, + "learning_rate": 0.00030785981967177724, + "loss": 2.6035, + "step": 21223 + }, + { + "epoch": 0.6293627494588263, + "grad_norm": 0.12246675044298172, + "learning_rate": 0.000307816382301779, + "loss": 2.6067, + "step": 21224 + }, + { + "epoch": 0.6293924028111378, + "grad_norm": 0.10353841632604599, + "learning_rate": 0.0003077729466335997, + "loss": 2.5832, + "step": 21225 + }, + { + "epoch": 0.6294220561634493, + "grad_norm": 0.09622691571712494, + "learning_rate": 0.0003077295126676238, + "loss": 2.5948, + "step": 21226 + }, + { + "epoch": 0.6294517095157608, + "grad_norm": 0.09794025868177414, + "learning_rate": 0.0003076860804042362, + "loss": 2.6146, + "step": 21227 + }, + { + "epoch": 0.6294813628680722, + "grad_norm": 0.1113903596997261, + "learning_rate": 0.0003076426498438213, + "loss": 2.6004, + "step": 21228 + }, + { + "epoch": 0.6295110162203837, + "grad_norm": 0.11248819530010223, + "learning_rate": 0.0003075992209867638, + "loss": 2.6088, + "step": 21229 + }, + { + "epoch": 0.6295406695726952, + "grad_norm": 0.10765155404806137, + "learning_rate": 0.00030755579383344824, + "loss": 2.6257, + "step": 21230 + }, + { + "epoch": 0.6295703229250067, + "grad_norm": 0.10045865923166275, + "learning_rate": 0.0003075123683842591, + "loss": 2.6097, + "step": 21231 + }, + { + "epoch": 0.6295999762773181, + "grad_norm": 0.10538316518068314, + "learning_rate": 0.0003074689446395812, + "loss": 2.6025, + "step": 21232 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.11174775660037994, + "learning_rate": 0.0003074255225997986, + "loss": 2.6158, + "step": 21233 + }, + { + "epoch": 0.6296592829819411, + "grad_norm": 0.10478468239307404, + "learning_rate": 0.00030738210226529614, + "loss": 2.6116, + "step": 21234 + }, + { + "epoch": 0.6296889363342526, + "grad_norm": 0.11506524682044983, + "learning_rate": 0.00030733868363645825, + "loss": 2.6042, + "step": 21235 + }, + { + "epoch": 0.629718589686564, + "grad_norm": 0.09659885615110397, + "learning_rate": 0.0003072952667136693, + "loss": 2.5921, + "step": 21236 + }, + { + "epoch": 0.6297482430388756, + "grad_norm": 0.11243907362222672, + "learning_rate": 0.0003072518514973139, + "loss": 2.6328, + "step": 21237 + }, + { + "epoch": 0.629777896391187, + "grad_norm": 0.10640294849872589, + "learning_rate": 0.0003072084379877764, + "loss": 2.6251, + "step": 21238 + }, + { + "epoch": 0.6298075497434985, + "grad_norm": 0.10043803602457047, + "learning_rate": 0.0003071650261854414, + "loss": 2.6191, + "step": 21239 + }, + { + "epoch": 0.6298372030958099, + "grad_norm": 0.10181369632482529, + "learning_rate": 0.0003071216160906931, + "loss": 2.6081, + "step": 21240 + }, + { + "epoch": 0.6298668564481215, + "grad_norm": 0.10137581080198288, + "learning_rate": 0.0003070782077039158, + "loss": 2.6294, + "step": 21241 + }, + { + "epoch": 0.629896509800433, + "grad_norm": 0.08819151669740677, + "learning_rate": 0.00030703480102549434, + "loss": 2.5987, + "step": 21242 + }, + { + "epoch": 0.6299261631527444, + "grad_norm": 0.09631256759166718, + "learning_rate": 0.0003069913960558128, + "loss": 2.5424, + "step": 21243 + }, + { + "epoch": 0.6299558165050559, + "grad_norm": 0.09290421009063721, + "learning_rate": 0.0003069479927952555, + "loss": 2.5845, + "step": 21244 + }, + { + "epoch": 0.6299854698573674, + "grad_norm": 0.08977875858545303, + "learning_rate": 0.000306904591244207, + "loss": 2.5835, + "step": 21245 + }, + { + "epoch": 0.6300151232096789, + "grad_norm": 0.10143846273422241, + "learning_rate": 0.00030686119140305136, + "loss": 2.5957, + "step": 21246 + }, + { + "epoch": 0.6300447765619903, + "grad_norm": 0.09367190301418304, + "learning_rate": 0.0003068177932721731, + "loss": 2.5676, + "step": 21247 + }, + { + "epoch": 0.6300744299143018, + "grad_norm": 0.10413025319576263, + "learning_rate": 0.00030677439685195635, + "loss": 2.6078, + "step": 21248 + }, + { + "epoch": 0.6301040832666133, + "grad_norm": 0.10323412716388702, + "learning_rate": 0.00030673100214278565, + "loss": 2.5619, + "step": 21249 + }, + { + "epoch": 0.6301337366189248, + "grad_norm": 0.09913024306297302, + "learning_rate": 0.000306687609145045, + "loss": 2.6046, + "step": 21250 + }, + { + "epoch": 0.6301633899712362, + "grad_norm": 0.11060789972543716, + "learning_rate": 0.0003066442178591187, + "loss": 2.6048, + "step": 21251 + }, + { + "epoch": 0.6301930433235478, + "grad_norm": 0.10810375213623047, + "learning_rate": 0.0003066008282853911, + "loss": 2.6081, + "step": 21252 + }, + { + "epoch": 0.6302226966758592, + "grad_norm": 0.11395974457263947, + "learning_rate": 0.00030655744042424633, + "loss": 2.5683, + "step": 21253 + }, + { + "epoch": 0.6302523500281707, + "grad_norm": 0.10742723196744919, + "learning_rate": 0.00030651405427606853, + "loss": 2.5957, + "step": 21254 + }, + { + "epoch": 0.6302820033804821, + "grad_norm": 0.11252746731042862, + "learning_rate": 0.00030647066984124214, + "loss": 2.6297, + "step": 21255 + }, + { + "epoch": 0.6303116567327937, + "grad_norm": 0.10303491353988647, + "learning_rate": 0.0003064272871201511, + "loss": 2.5881, + "step": 21256 + }, + { + "epoch": 0.6303413100851051, + "grad_norm": 0.09142322093248367, + "learning_rate": 0.0003063839061131797, + "loss": 2.5855, + "step": 21257 + }, + { + "epoch": 0.6303709634374166, + "grad_norm": 0.11916147917509079, + "learning_rate": 0.00030634052682071205, + "loss": 2.6014, + "step": 21258 + }, + { + "epoch": 0.630400616789728, + "grad_norm": 0.10722805559635162, + "learning_rate": 0.0003062971492431323, + "loss": 2.6121, + "step": 21259 + }, + { + "epoch": 0.6304302701420396, + "grad_norm": 0.10117275267839432, + "learning_rate": 0.0003062537733808246, + "loss": 2.6136, + "step": 21260 + }, + { + "epoch": 0.630459923494351, + "grad_norm": 0.11125974357128143, + "learning_rate": 0.0003062103992341729, + "loss": 2.6, + "step": 21261 + }, + { + "epoch": 0.6304895768466625, + "grad_norm": 0.11172325909137726, + "learning_rate": 0.0003061670268035614, + "loss": 2.6378, + "step": 21262 + }, + { + "epoch": 0.630519230198974, + "grad_norm": 0.11079415678977966, + "learning_rate": 0.0003061236560893741, + "loss": 2.6106, + "step": 21263 + }, + { + "epoch": 0.6305488835512855, + "grad_norm": 0.10987265408039093, + "learning_rate": 0.0003060802870919952, + "loss": 2.5609, + "step": 21264 + }, + { + "epoch": 0.630578536903597, + "grad_norm": 0.125763937830925, + "learning_rate": 0.00030603691981180857, + "loss": 2.6025, + "step": 21265 + }, + { + "epoch": 0.6306081902559084, + "grad_norm": 0.10861296951770782, + "learning_rate": 0.0003059935542491984, + "loss": 2.5958, + "step": 21266 + }, + { + "epoch": 0.63063784360822, + "grad_norm": 0.0929446667432785, + "learning_rate": 0.00030595019040454834, + "loss": 2.6003, + "step": 21267 + }, + { + "epoch": 0.6306674969605314, + "grad_norm": 0.12276380509138107, + "learning_rate": 0.0003059068282782429, + "loss": 2.6204, + "step": 21268 + }, + { + "epoch": 0.6306971503128429, + "grad_norm": 0.12314148992300034, + "learning_rate": 0.00030586346787066574, + "loss": 2.5944, + "step": 21269 + }, + { + "epoch": 0.6307268036651543, + "grad_norm": 0.10834578424692154, + "learning_rate": 0.000305820109182201, + "loss": 2.6192, + "step": 21270 + }, + { + "epoch": 0.6307564570174659, + "grad_norm": 0.10525836050510406, + "learning_rate": 0.0003057767522132324, + "loss": 2.6274, + "step": 21271 + }, + { + "epoch": 0.6307861103697773, + "grad_norm": 0.11416877061128616, + "learning_rate": 0.00030573339696414405, + "loss": 2.6235, + "step": 21272 + }, + { + "epoch": 0.6308157637220888, + "grad_norm": 0.11047949641942978, + "learning_rate": 0.0003056900434353198, + "loss": 2.5933, + "step": 21273 + }, + { + "epoch": 0.6308454170744002, + "grad_norm": 0.12014061957597733, + "learning_rate": 0.00030564669162714354, + "loss": 2.5992, + "step": 21274 + }, + { + "epoch": 0.6308750704267118, + "grad_norm": 0.10918278247117996, + "learning_rate": 0.00030560334153999924, + "loss": 2.6202, + "step": 21275 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 0.10931004583835602, + "learning_rate": 0.0003055599931742707, + "loss": 2.6268, + "step": 21276 + }, + { + "epoch": 0.6309343771313347, + "grad_norm": 0.11095999926328659, + "learning_rate": 0.00030551664653034183, + "loss": 2.5859, + "step": 21277 + }, + { + "epoch": 0.6309640304836461, + "grad_norm": 0.0999765694141388, + "learning_rate": 0.00030547330160859645, + "loss": 2.5862, + "step": 21278 + }, + { + "epoch": 0.6309936838359577, + "grad_norm": 0.11127030104398727, + "learning_rate": 0.00030542995840941815, + "loss": 2.5927, + "step": 21279 + }, + { + "epoch": 0.6310233371882691, + "grad_norm": 0.09820105135440826, + "learning_rate": 0.00030538661693319116, + "loss": 2.5882, + "step": 21280 + }, + { + "epoch": 0.6310529905405806, + "grad_norm": 0.10501521825790405, + "learning_rate": 0.0003053432771802991, + "loss": 2.6314, + "step": 21281 + }, + { + "epoch": 0.631082643892892, + "grad_norm": 0.10734449326992035, + "learning_rate": 0.00030529993915112566, + "loss": 2.5805, + "step": 21282 + }, + { + "epoch": 0.6311122972452036, + "grad_norm": 0.09208791702985764, + "learning_rate": 0.0003052566028460547, + "loss": 2.5789, + "step": 21283 + }, + { + "epoch": 0.6311419505975151, + "grad_norm": 0.1086391732096672, + "learning_rate": 0.00030521326826547, + "loss": 2.6196, + "step": 21284 + }, + { + "epoch": 0.6311716039498265, + "grad_norm": 0.10839590430259705, + "learning_rate": 0.0003051699354097551, + "loss": 2.5934, + "step": 21285 + }, + { + "epoch": 0.631201257302138, + "grad_norm": 0.10883951187133789, + "learning_rate": 0.00030512660427929405, + "loss": 2.5892, + "step": 21286 + }, + { + "epoch": 0.6312309106544495, + "grad_norm": 0.10505066066980362, + "learning_rate": 0.00030508327487447035, + "loss": 2.6059, + "step": 21287 + }, + { + "epoch": 0.631260564006761, + "grad_norm": 0.10551788657903671, + "learning_rate": 0.0003050399471956676, + "loss": 2.588, + "step": 21288 + }, + { + "epoch": 0.6312902173590724, + "grad_norm": 0.11305975914001465, + "learning_rate": 0.00030499662124326964, + "loss": 2.5722, + "step": 21289 + }, + { + "epoch": 0.631319870711384, + "grad_norm": 0.11341526359319687, + "learning_rate": 0.00030495329701766004, + "loss": 2.6, + "step": 21290 + }, + { + "epoch": 0.6313495240636954, + "grad_norm": 0.10291577130556107, + "learning_rate": 0.00030490997451922255, + "loss": 2.5815, + "step": 21291 + }, + { + "epoch": 0.6313791774160069, + "grad_norm": 0.10915179550647736, + "learning_rate": 0.00030486665374834056, + "loss": 2.5794, + "step": 21292 + }, + { + "epoch": 0.6314088307683183, + "grad_norm": 0.10185981541872025, + "learning_rate": 0.0003048233347053979, + "loss": 2.6164, + "step": 21293 + }, + { + "epoch": 0.6314384841206299, + "grad_norm": 0.09865551441907883, + "learning_rate": 0.0003047800173907782, + "loss": 2.568, + "step": 21294 + }, + { + "epoch": 0.6314681374729413, + "grad_norm": 0.10962489992380142, + "learning_rate": 0.0003047367018048649, + "loss": 2.5824, + "step": 21295 + }, + { + "epoch": 0.6314977908252528, + "grad_norm": 0.10058678686618805, + "learning_rate": 0.0003046933879480416, + "loss": 2.5903, + "step": 21296 + }, + { + "epoch": 0.6315274441775642, + "grad_norm": 0.10577874630689621, + "learning_rate": 0.0003046500758206919, + "loss": 2.5917, + "step": 21297 + }, + { + "epoch": 0.6315570975298758, + "grad_norm": 0.11818163096904755, + "learning_rate": 0.00030460676542319945, + "loss": 2.5755, + "step": 21298 + }, + { + "epoch": 0.6315867508821872, + "grad_norm": 0.09904784709215164, + "learning_rate": 0.00030456345675594756, + "loss": 2.5789, + "step": 21299 + }, + { + "epoch": 0.6316164042344987, + "grad_norm": 0.10694067925214767, + "learning_rate": 0.0003045201498193197, + "loss": 2.6246, + "step": 21300 + }, + { + "epoch": 0.6316460575868101, + "grad_norm": 0.11901607364416122, + "learning_rate": 0.0003044768446136995, + "loss": 2.6071, + "step": 21301 + }, + { + "epoch": 0.6316757109391217, + "grad_norm": 0.1112150326371193, + "learning_rate": 0.0003044335411394704, + "loss": 2.6146, + "step": 21302 + }, + { + "epoch": 0.6317053642914331, + "grad_norm": 0.09854656457901001, + "learning_rate": 0.0003043902393970159, + "loss": 2.5866, + "step": 21303 + }, + { + "epoch": 0.6317350176437446, + "grad_norm": 0.11978248506784439, + "learning_rate": 0.0003043469393867195, + "loss": 2.5886, + "step": 21304 + }, + { + "epoch": 0.6317646709960562, + "grad_norm": 0.11864050477743149, + "learning_rate": 0.00030430364110896417, + "loss": 2.5602, + "step": 21305 + }, + { + "epoch": 0.6317943243483676, + "grad_norm": 0.11637597531080246, + "learning_rate": 0.000304260344564134, + "loss": 2.6107, + "step": 21306 + }, + { + "epoch": 0.6318239777006791, + "grad_norm": 0.1226232498884201, + "learning_rate": 0.000304217049752612, + "loss": 2.6252, + "step": 21307 + }, + { + "epoch": 0.6318536310529905, + "grad_norm": 0.11606680601835251, + "learning_rate": 0.00030417375667478173, + "loss": 2.596, + "step": 21308 + }, + { + "epoch": 0.6318832844053021, + "grad_norm": 0.11659884452819824, + "learning_rate": 0.0003041304653310264, + "loss": 2.6301, + "step": 21309 + }, + { + "epoch": 0.6319129377576135, + "grad_norm": 0.1338086873292923, + "learning_rate": 0.0003040871757217294, + "loss": 2.615, + "step": 21310 + }, + { + "epoch": 0.631942591109925, + "grad_norm": 0.10862580686807632, + "learning_rate": 0.00030404388784727404, + "loss": 2.5971, + "step": 21311 + }, + { + "epoch": 0.6319722444622364, + "grad_norm": 0.12001613527536392, + "learning_rate": 0.0003040006017080437, + "loss": 2.5799, + "step": 21312 + }, + { + "epoch": 0.632001897814548, + "grad_norm": 0.10983385890722275, + "learning_rate": 0.0003039573173044217, + "loss": 2.5946, + "step": 21313 + }, + { + "epoch": 0.6320315511668594, + "grad_norm": 0.09773247689008713, + "learning_rate": 0.00030391403463679134, + "loss": 2.6096, + "step": 21314 + }, + { + "epoch": 0.6320612045191709, + "grad_norm": 0.12830299139022827, + "learning_rate": 0.00030387075370553595, + "loss": 2.5903, + "step": 21315 + }, + { + "epoch": 0.6320908578714823, + "grad_norm": 0.11628111451864243, + "learning_rate": 0.00030382747451103854, + "loss": 2.5897, + "step": 21316 + }, + { + "epoch": 0.6321205112237939, + "grad_norm": 0.10798599570989609, + "learning_rate": 0.00030378419705368254, + "loss": 2.602, + "step": 21317 + }, + { + "epoch": 0.6321501645761053, + "grad_norm": 0.11395471543073654, + "learning_rate": 0.0003037409213338511, + "loss": 2.5789, + "step": 21318 + }, + { + "epoch": 0.6321798179284168, + "grad_norm": 0.10505622625350952, + "learning_rate": 0.00030369764735192756, + "loss": 2.5941, + "step": 21319 + }, + { + "epoch": 0.6322094712807282, + "grad_norm": 0.10160654038190842, + "learning_rate": 0.00030365437510829506, + "loss": 2.6172, + "step": 21320 + }, + { + "epoch": 0.6322391246330398, + "grad_norm": 0.12135060876607895, + "learning_rate": 0.00030361110460333675, + "loss": 2.6031, + "step": 21321 + }, + { + "epoch": 0.6322687779853512, + "grad_norm": 0.10828688740730286, + "learning_rate": 0.00030356783583743586, + "loss": 2.5967, + "step": 21322 + }, + { + "epoch": 0.6322984313376627, + "grad_norm": 0.10556177794933319, + "learning_rate": 0.00030352456881097543, + "loss": 2.6021, + "step": 21323 + }, + { + "epoch": 0.6323280846899743, + "grad_norm": 0.10666421800851822, + "learning_rate": 0.00030348130352433867, + "loss": 2.5714, + "step": 21324 + }, + { + "epoch": 0.6323577380422857, + "grad_norm": 0.10755782574415207, + "learning_rate": 0.00030343803997790885, + "loss": 2.6366, + "step": 21325 + }, + { + "epoch": 0.6323873913945972, + "grad_norm": 0.10587283223867416, + "learning_rate": 0.00030339477817206885, + "loss": 2.5742, + "step": 21326 + }, + { + "epoch": 0.6324170447469086, + "grad_norm": 0.09403908252716064, + "learning_rate": 0.0003033515181072017, + "loss": 2.5896, + "step": 21327 + }, + { + "epoch": 0.6324466980992202, + "grad_norm": 0.12098506093025208, + "learning_rate": 0.00030330825978369083, + "loss": 2.638, + "step": 21328 + }, + { + "epoch": 0.6324763514515316, + "grad_norm": 0.11694253981113434, + "learning_rate": 0.0003032650032019189, + "loss": 2.576, + "step": 21329 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 0.11496886610984802, + "learning_rate": 0.00030322174836226924, + "loss": 2.5924, + "step": 21330 + }, + { + "epoch": 0.6325356581561545, + "grad_norm": 0.09035950899124146, + "learning_rate": 0.00030317849526512457, + "loss": 2.6042, + "step": 21331 + }, + { + "epoch": 0.6325653115084661, + "grad_norm": 0.11116400361061096, + "learning_rate": 0.00030313524391086834, + "loss": 2.5688, + "step": 21332 + }, + { + "epoch": 0.6325949648607775, + "grad_norm": 0.11102651059627533, + "learning_rate": 0.0003030919942998832, + "loss": 2.6209, + "step": 21333 + }, + { + "epoch": 0.632624618213089, + "grad_norm": 0.10764135420322418, + "learning_rate": 0.0003030487464325523, + "loss": 2.6289, + "step": 21334 + }, + { + "epoch": 0.6326542715654004, + "grad_norm": 0.11055454611778259, + "learning_rate": 0.00030300550030925856, + "loss": 2.5834, + "step": 21335 + }, + { + "epoch": 0.632683924917712, + "grad_norm": 0.10254346579313278, + "learning_rate": 0.000302962255930385, + "loss": 2.5952, + "step": 21336 + }, + { + "epoch": 0.6327135782700234, + "grad_norm": 0.12119339406490326, + "learning_rate": 0.0003029190132963144, + "loss": 2.6201, + "step": 21337 + }, + { + "epoch": 0.6327432316223349, + "grad_norm": 0.09655521810054779, + "learning_rate": 0.0003028757724074298, + "loss": 2.629, + "step": 21338 + }, + { + "epoch": 0.6327728849746463, + "grad_norm": 0.11607124656438828, + "learning_rate": 0.000302832533264114, + "loss": 2.6048, + "step": 21339 + }, + { + "epoch": 0.6328025383269579, + "grad_norm": 0.0987282395362854, + "learning_rate": 0.00030278929586675, + "loss": 2.6256, + "step": 21340 + }, + { + "epoch": 0.6328321916792693, + "grad_norm": 0.10028879344463348, + "learning_rate": 0.00030274606021572065, + "loss": 2.5927, + "step": 21341 + }, + { + "epoch": 0.6328618450315808, + "grad_norm": 0.1071554645895958, + "learning_rate": 0.0003027028263114089, + "loss": 2.5985, + "step": 21342 + }, + { + "epoch": 0.6328914983838922, + "grad_norm": 0.09071043878793716, + "learning_rate": 0.00030265959415419735, + "loss": 2.6138, + "step": 21343 + }, + { + "epoch": 0.6329211517362038, + "grad_norm": 0.12184073776006699, + "learning_rate": 0.00030261636374446875, + "loss": 2.5976, + "step": 21344 + }, + { + "epoch": 0.6329508050885153, + "grad_norm": 0.11825849115848541, + "learning_rate": 0.00030257313508260644, + "loss": 2.5869, + "step": 21345 + }, + { + "epoch": 0.6329804584408267, + "grad_norm": 0.11472281068563461, + "learning_rate": 0.000302529908168993, + "loss": 2.5699, + "step": 21346 + }, + { + "epoch": 0.6330101117931383, + "grad_norm": 0.11582919210195541, + "learning_rate": 0.00030248668300401086, + "loss": 2.6181, + "step": 21347 + }, + { + "epoch": 0.6330397651454497, + "grad_norm": 0.11761980503797531, + "learning_rate": 0.00030244345958804314, + "loss": 2.6045, + "step": 21348 + }, + { + "epoch": 0.6330694184977612, + "grad_norm": 0.09683341532945633, + "learning_rate": 0.00030240023792147254, + "loss": 2.6363, + "step": 21349 + }, + { + "epoch": 0.6330990718500726, + "grad_norm": 0.1194581538438797, + "learning_rate": 0.0003023570180046816, + "loss": 2.5783, + "step": 21350 + }, + { + "epoch": 0.6331287252023842, + "grad_norm": 0.10466580837965012, + "learning_rate": 0.00030231379983805334, + "loss": 2.5796, + "step": 21351 + }, + { + "epoch": 0.6331583785546956, + "grad_norm": 0.10332873463630676, + "learning_rate": 0.0003022705834219703, + "loss": 2.6226, + "step": 21352 + }, + { + "epoch": 0.6331880319070071, + "grad_norm": 0.10525781661272049, + "learning_rate": 0.00030222736875681525, + "loss": 2.6006, + "step": 21353 + }, + { + "epoch": 0.6332176852593185, + "grad_norm": 0.10904806107282639, + "learning_rate": 0.00030218415584297063, + "loss": 2.5479, + "step": 21354 + }, + { + "epoch": 0.6332473386116301, + "grad_norm": 0.09967018663883209, + "learning_rate": 0.0003021409446808194, + "loss": 2.6265, + "step": 21355 + }, + { + "epoch": 0.6332769919639415, + "grad_norm": 0.11277929693460464, + "learning_rate": 0.000302097735270744, + "loss": 2.6335, + "step": 21356 + }, + { + "epoch": 0.633306645316253, + "grad_norm": 0.09253208339214325, + "learning_rate": 0.000302054527613127, + "loss": 2.5935, + "step": 21357 + }, + { + "epoch": 0.6333362986685644, + "grad_norm": 0.10287287831306458, + "learning_rate": 0.0003020113217083513, + "loss": 2.5711, + "step": 21358 + }, + { + "epoch": 0.633365952020876, + "grad_norm": 0.0870300903916359, + "learning_rate": 0.00030196811755679926, + "loss": 2.5947, + "step": 21359 + }, + { + "epoch": 0.6333956053731874, + "grad_norm": 0.11220881342887878, + "learning_rate": 0.00030192491515885356, + "loss": 2.6249, + "step": 21360 + }, + { + "epoch": 0.6334252587254989, + "grad_norm": 0.09657296538352966, + "learning_rate": 0.0003018817145148968, + "loss": 2.6015, + "step": 21361 + }, + { + "epoch": 0.6334549120778104, + "grad_norm": 0.10380705446004868, + "learning_rate": 0.00030183851562531135, + "loss": 2.5823, + "step": 21362 + }, + { + "epoch": 0.6334845654301219, + "grad_norm": 0.09573452174663544, + "learning_rate": 0.00030179531849048, + "loss": 2.6177, + "step": 21363 + }, + { + "epoch": 0.6335142187824333, + "grad_norm": 0.1133752167224884, + "learning_rate": 0.00030175212311078504, + "loss": 2.6285, + "step": 21364 + }, + { + "epoch": 0.6335438721347448, + "grad_norm": 0.12291416525840759, + "learning_rate": 0.0003017089294866091, + "loss": 2.5709, + "step": 21365 + }, + { + "epoch": 0.6335735254870564, + "grad_norm": 0.10215574502944946, + "learning_rate": 0.00030166573761833453, + "loss": 2.6068, + "step": 21366 + }, + { + "epoch": 0.6336031788393678, + "grad_norm": 0.1093880757689476, + "learning_rate": 0.000301622547506344, + "loss": 2.6105, + "step": 21367 + }, + { + "epoch": 0.6336328321916793, + "grad_norm": 0.11713410168886185, + "learning_rate": 0.00030157935915101975, + "loss": 2.6135, + "step": 21368 + }, + { + "epoch": 0.6336624855439907, + "grad_norm": 0.10362762212753296, + "learning_rate": 0.0003015361725527444, + "loss": 2.5704, + "step": 21369 + }, + { + "epoch": 0.6336921388963023, + "grad_norm": 0.10135796666145325, + "learning_rate": 0.0003014929877119002, + "loss": 2.6008, + "step": 21370 + }, + { + "epoch": 0.6337217922486137, + "grad_norm": 0.11276498436927795, + "learning_rate": 0.00030144980462886974, + "loss": 2.6211, + "step": 21371 + }, + { + "epoch": 0.6337514456009252, + "grad_norm": 0.09590252488851547, + "learning_rate": 0.0003014066233040354, + "loss": 2.5951, + "step": 21372 + }, + { + "epoch": 0.6337810989532366, + "grad_norm": 0.09985457360744476, + "learning_rate": 0.00030136344373777945, + "loss": 2.6076, + "step": 21373 + }, + { + "epoch": 0.6338107523055482, + "grad_norm": 0.09472500532865524, + "learning_rate": 0.00030132026593048444, + "loss": 2.6249, + "step": 21374 + }, + { + "epoch": 0.6338404056578596, + "grad_norm": 0.09850718080997467, + "learning_rate": 0.00030127708988253243, + "loss": 2.6075, + "step": 21375 + }, + { + "epoch": 0.6338700590101711, + "grad_norm": 0.10760962963104248, + "learning_rate": 0.0003012339155943059, + "loss": 2.5761, + "step": 21376 + }, + { + "epoch": 0.6338997123624825, + "grad_norm": 0.08801021426916122, + "learning_rate": 0.0003011907430661872, + "loss": 2.5976, + "step": 21377 + }, + { + "epoch": 0.6339293657147941, + "grad_norm": 0.10775182396173477, + "learning_rate": 0.0003011475722985586, + "loss": 2.6346, + "step": 21378 + }, + { + "epoch": 0.6339590190671055, + "grad_norm": 0.10192765295505524, + "learning_rate": 0.00030110440329180236, + "loss": 2.5885, + "step": 21379 + }, + { + "epoch": 0.633988672419417, + "grad_norm": 0.09474053978919983, + "learning_rate": 0.00030106123604630087, + "loss": 2.5787, + "step": 21380 + }, + { + "epoch": 0.6340183257717285, + "grad_norm": 0.11223964393138885, + "learning_rate": 0.0003010180705624362, + "loss": 2.615, + "step": 21381 + }, + { + "epoch": 0.63404797912404, + "grad_norm": 0.12311910092830658, + "learning_rate": 0.0003009749068405907, + "loss": 2.5947, + "step": 21382 + }, + { + "epoch": 0.6340776324763514, + "grad_norm": 0.10426148772239685, + "learning_rate": 0.0003009317448811463, + "loss": 2.6167, + "step": 21383 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 0.11254376918077469, + "learning_rate": 0.00030088858468448575, + "loss": 2.585, + "step": 21384 + }, + { + "epoch": 0.6341369391809744, + "grad_norm": 0.11329320073127747, + "learning_rate": 0.00030084542625099094, + "loss": 2.6067, + "step": 21385 + }, + { + "epoch": 0.6341665925332859, + "grad_norm": 0.11145190894603729, + "learning_rate": 0.000300802269581044, + "loss": 2.6261, + "step": 21386 + }, + { + "epoch": 0.6341962458855974, + "grad_norm": 0.11205977946519852, + "learning_rate": 0.0003007591146750272, + "loss": 2.6128, + "step": 21387 + }, + { + "epoch": 0.6342258992379088, + "grad_norm": 0.10733068734407425, + "learning_rate": 0.0003007159615333226, + "loss": 2.6294, + "step": 21388 + }, + { + "epoch": 0.6342555525902204, + "grad_norm": 0.10232194513082504, + "learning_rate": 0.0003006728101563124, + "loss": 2.6073, + "step": 21389 + }, + { + "epoch": 0.6342852059425318, + "grad_norm": 0.1028345376253128, + "learning_rate": 0.0003006296605443787, + "loss": 2.5844, + "step": 21390 + }, + { + "epoch": 0.6343148592948433, + "grad_norm": 0.10316794365644455, + "learning_rate": 0.0003005865126979036, + "loss": 2.6241, + "step": 21391 + }, + { + "epoch": 0.6343445126471547, + "grad_norm": 0.10205036401748657, + "learning_rate": 0.0003005433666172691, + "loss": 2.6253, + "step": 21392 + }, + { + "epoch": 0.6343741659994663, + "grad_norm": 0.10557355731725693, + "learning_rate": 0.0003005002223028573, + "loss": 2.5844, + "step": 21393 + }, + { + "epoch": 0.6344038193517777, + "grad_norm": 0.10546640306711197, + "learning_rate": 0.00030045707975505034, + "loss": 2.5744, + "step": 21394 + }, + { + "epoch": 0.6344334727040892, + "grad_norm": 0.10000888258218765, + "learning_rate": 0.00030041393897423015, + "loss": 2.6586, + "step": 21395 + }, + { + "epoch": 0.6344631260564007, + "grad_norm": 0.10286914557218552, + "learning_rate": 0.00030037079996077866, + "loss": 2.6148, + "step": 21396 + }, + { + "epoch": 0.6344927794087122, + "grad_norm": 0.1012711450457573, + "learning_rate": 0.00030032766271507815, + "loss": 2.6108, + "step": 21397 + }, + { + "epoch": 0.6345224327610236, + "grad_norm": 0.10669467598199844, + "learning_rate": 0.00030028452723751043, + "loss": 2.6195, + "step": 21398 + }, + { + "epoch": 0.6345520861133351, + "grad_norm": 0.09656219184398651, + "learning_rate": 0.0003002413935284575, + "loss": 2.5893, + "step": 21399 + }, + { + "epoch": 0.6345817394656466, + "grad_norm": 0.0983547493815422, + "learning_rate": 0.0003001982615883013, + "loss": 2.6264, + "step": 21400 + }, + { + "epoch": 0.6346113928179581, + "grad_norm": 0.09912202507257462, + "learning_rate": 0.000300155131417424, + "loss": 2.594, + "step": 21401 + }, + { + "epoch": 0.6346410461702695, + "grad_norm": 0.09341409057378769, + "learning_rate": 0.00030011200301620713, + "loss": 2.5777, + "step": 21402 + }, + { + "epoch": 0.634670699522581, + "grad_norm": 0.10747645795345306, + "learning_rate": 0.00030006887638503276, + "loss": 2.5663, + "step": 21403 + }, + { + "epoch": 0.6347003528748925, + "grad_norm": 0.10413121432065964, + "learning_rate": 0.00030002575152428284, + "loss": 2.6215, + "step": 21404 + }, + { + "epoch": 0.634730006227204, + "grad_norm": 0.11794167757034302, + "learning_rate": 0.0002999826284343392, + "loss": 2.63, + "step": 21405 + }, + { + "epoch": 0.6347596595795154, + "grad_norm": 0.11026021838188171, + "learning_rate": 0.0002999395071155837, + "loss": 2.5858, + "step": 21406 + }, + { + "epoch": 0.6347893129318269, + "grad_norm": 0.11340732127428055, + "learning_rate": 0.0002998963875683983, + "loss": 2.5815, + "step": 21407 + }, + { + "epoch": 0.6348189662841385, + "grad_norm": 0.10728152841329575, + "learning_rate": 0.00029985326979316474, + "loss": 2.6003, + "step": 21408 + }, + { + "epoch": 0.6348486196364499, + "grad_norm": 0.1187722384929657, + "learning_rate": 0.00029981015379026456, + "loss": 2.5793, + "step": 21409 + }, + { + "epoch": 0.6348782729887614, + "grad_norm": 0.11022797971963882, + "learning_rate": 0.00029976703956008, + "loss": 2.5902, + "step": 21410 + }, + { + "epoch": 0.6349079263410728, + "grad_norm": 0.10237380862236023, + "learning_rate": 0.00029972392710299274, + "loss": 2.5852, + "step": 21411 + }, + { + "epoch": 0.6349375796933844, + "grad_norm": 0.1191418394446373, + "learning_rate": 0.00029968081641938455, + "loss": 2.599, + "step": 21412 + }, + { + "epoch": 0.6349672330456958, + "grad_norm": 0.10566443204879761, + "learning_rate": 0.00029963770750963704, + "loss": 2.5953, + "step": 21413 + }, + { + "epoch": 0.6349968863980073, + "grad_norm": 0.10931594669818878, + "learning_rate": 0.000299594600374132, + "loss": 2.6187, + "step": 21414 + }, + { + "epoch": 0.6350265397503188, + "grad_norm": 0.09690174460411072, + "learning_rate": 0.00029955149501325115, + "loss": 2.6083, + "step": 21415 + }, + { + "epoch": 0.6350561931026303, + "grad_norm": 0.09774896502494812, + "learning_rate": 0.00029950839142737617, + "loss": 2.5948, + "step": 21416 + }, + { + "epoch": 0.6350858464549417, + "grad_norm": 0.10172770172357559, + "learning_rate": 0.0002994652896168889, + "loss": 2.5791, + "step": 21417 + }, + { + "epoch": 0.6351154998072532, + "grad_norm": 0.09630927443504333, + "learning_rate": 0.000299422189582171, + "loss": 2.6267, + "step": 21418 + }, + { + "epoch": 0.6351451531595647, + "grad_norm": 0.09612102061510086, + "learning_rate": 0.00029937909132360385, + "loss": 2.6113, + "step": 21419 + }, + { + "epoch": 0.6351748065118762, + "grad_norm": 0.10973964631557465, + "learning_rate": 0.0002993359948415694, + "loss": 2.6182, + "step": 21420 + }, + { + "epoch": 0.6352044598641876, + "grad_norm": 0.09442313760519028, + "learning_rate": 0.00029929290013644904, + "loss": 2.585, + "step": 21421 + }, + { + "epoch": 0.6352341132164991, + "grad_norm": 0.08788346499204636, + "learning_rate": 0.0002992498072086245, + "loss": 2.614, + "step": 21422 + }, + { + "epoch": 0.6352637665688106, + "grad_norm": 0.09740213304758072, + "learning_rate": 0.0002992067160584775, + "loss": 2.5947, + "step": 21423 + }, + { + "epoch": 0.6352934199211221, + "grad_norm": 0.09450922906398773, + "learning_rate": 0.00029916362668638944, + "loss": 2.6189, + "step": 21424 + }, + { + "epoch": 0.6353230732734335, + "grad_norm": 0.10569828003644943, + "learning_rate": 0.0002991205390927419, + "loss": 2.6087, + "step": 21425 + }, + { + "epoch": 0.635352726625745, + "grad_norm": 0.11135043948888779, + "learning_rate": 0.00029907745327791647, + "loss": 2.5813, + "step": 21426 + }, + { + "epoch": 0.6353823799780565, + "grad_norm": 0.09258726239204407, + "learning_rate": 0.0002990343692422948, + "loss": 2.6219, + "step": 21427 + }, + { + "epoch": 0.635412033330368, + "grad_norm": 0.10845506191253662, + "learning_rate": 0.0002989912869862581, + "loss": 2.6088, + "step": 21428 + }, + { + "epoch": 0.6354416866826795, + "grad_norm": 0.10517505556344986, + "learning_rate": 0.0002989482065101883, + "loss": 2.5931, + "step": 21429 + }, + { + "epoch": 0.635471340034991, + "grad_norm": 0.0977267399430275, + "learning_rate": 0.0002989051278144665, + "loss": 2.6235, + "step": 21430 + }, + { + "epoch": 0.6355009933873025, + "grad_norm": 0.10727468132972717, + "learning_rate": 0.00029886205089947425, + "loss": 2.5973, + "step": 21431 + }, + { + "epoch": 0.6355306467396139, + "grad_norm": 0.08545658737421036, + "learning_rate": 0.0002988189757655931, + "loss": 2.6176, + "step": 21432 + }, + { + "epoch": 0.6355603000919254, + "grad_norm": 0.09480585902929306, + "learning_rate": 0.00029877590241320453, + "loss": 2.6016, + "step": 21433 + }, + { + "epoch": 0.6355899534442369, + "grad_norm": 0.10468360781669617, + "learning_rate": 0.0002987328308426898, + "loss": 2.6038, + "step": 21434 + }, + { + "epoch": 0.6356196067965484, + "grad_norm": 0.0968359038233757, + "learning_rate": 0.00029868976105443035, + "loss": 2.6079, + "step": 21435 + }, + { + "epoch": 0.6356492601488598, + "grad_norm": 0.10706113278865814, + "learning_rate": 0.00029864669304880765, + "loss": 2.6154, + "step": 21436 + }, + { + "epoch": 0.6356789135011713, + "grad_norm": 0.10861489176750183, + "learning_rate": 0.0002986036268262031, + "loss": 2.622, + "step": 21437 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 0.12978589534759521, + "learning_rate": 0.00029856056238699804, + "loss": 2.6188, + "step": 21438 + }, + { + "epoch": 0.6357382202057943, + "grad_norm": 0.12009123712778091, + "learning_rate": 0.0002985174997315738, + "loss": 2.5718, + "step": 21439 + }, + { + "epoch": 0.6357678735581057, + "grad_norm": 0.0923011526465416, + "learning_rate": 0.00029847443886031166, + "loss": 2.5557, + "step": 21440 + }, + { + "epoch": 0.6357975269104172, + "grad_norm": 0.1058521568775177, + "learning_rate": 0.00029843137977359293, + "loss": 2.6133, + "step": 21441 + }, + { + "epoch": 0.6358271802627287, + "grad_norm": 0.10672993212938309, + "learning_rate": 0.00029838832247179885, + "loss": 2.5811, + "step": 21442 + }, + { + "epoch": 0.6358568336150402, + "grad_norm": 0.10389801859855652, + "learning_rate": 0.0002983452669553108, + "loss": 2.5693, + "step": 21443 + }, + { + "epoch": 0.6358864869673516, + "grad_norm": 0.10639487206935883, + "learning_rate": 0.0002983022132245101, + "loss": 2.6035, + "step": 21444 + }, + { + "epoch": 0.6359161403196631, + "grad_norm": 0.10861905664205551, + "learning_rate": 0.00029825916127977794, + "loss": 2.572, + "step": 21445 + }, + { + "epoch": 0.6359457936719746, + "grad_norm": 0.12350618094205856, + "learning_rate": 0.00029821611112149554, + "loss": 2.595, + "step": 21446 + }, + { + "epoch": 0.6359754470242861, + "grad_norm": 0.09694884717464447, + "learning_rate": 0.0002981730627500441, + "loss": 2.614, + "step": 21447 + }, + { + "epoch": 0.6360051003765975, + "grad_norm": 0.11695337295532227, + "learning_rate": 0.0002981300161658046, + "loss": 2.5911, + "step": 21448 + }, + { + "epoch": 0.636034753728909, + "grad_norm": 0.11156270653009415, + "learning_rate": 0.00029808697136915864, + "loss": 2.6028, + "step": 21449 + }, + { + "epoch": 0.6360644070812206, + "grad_norm": 0.12496618181467056, + "learning_rate": 0.0002980439283604873, + "loss": 2.606, + "step": 21450 + }, + { + "epoch": 0.636094060433532, + "grad_norm": 0.11704675108194351, + "learning_rate": 0.0002980008871401715, + "loss": 2.6163, + "step": 21451 + }, + { + "epoch": 0.6361237137858435, + "grad_norm": 0.09695173799991608, + "learning_rate": 0.00029795784770859256, + "loss": 2.6244, + "step": 21452 + }, + { + "epoch": 0.636153367138155, + "grad_norm": 0.13268399238586426, + "learning_rate": 0.0002979148100661316, + "loss": 2.6094, + "step": 21453 + }, + { + "epoch": 0.6361830204904665, + "grad_norm": 0.10468127578496933, + "learning_rate": 0.0002978717742131697, + "loss": 2.5904, + "step": 21454 + }, + { + "epoch": 0.6362126738427779, + "grad_norm": 0.09272780269384384, + "learning_rate": 0.00029782874015008785, + "loss": 2.5999, + "step": 21455 + }, + { + "epoch": 0.6362423271950894, + "grad_norm": 0.12745842337608337, + "learning_rate": 0.00029778570787726734, + "loss": 2.6232, + "step": 21456 + }, + { + "epoch": 0.6362719805474009, + "grad_norm": 0.10566312074661255, + "learning_rate": 0.00029774267739508895, + "loss": 2.5976, + "step": 21457 + }, + { + "epoch": 0.6363016338997124, + "grad_norm": 0.10067164152860641, + "learning_rate": 0.00029769964870393395, + "loss": 2.5955, + "step": 21458 + }, + { + "epoch": 0.6363312872520238, + "grad_norm": 0.11655815690755844, + "learning_rate": 0.00029765662180418327, + "loss": 2.6168, + "step": 21459 + }, + { + "epoch": 0.6363609406043353, + "grad_norm": 0.09316381067037582, + "learning_rate": 0.0002976135966962178, + "loss": 2.6134, + "step": 21460 + }, + { + "epoch": 0.6363905939566468, + "grad_norm": 0.10835113376379013, + "learning_rate": 0.0002975705733804188, + "loss": 2.6042, + "step": 21461 + }, + { + "epoch": 0.6364202473089583, + "grad_norm": 0.09934023022651672, + "learning_rate": 0.00029752755185716713, + "loss": 2.5804, + "step": 21462 + }, + { + "epoch": 0.6364499006612697, + "grad_norm": 0.1074681431055069, + "learning_rate": 0.00029748453212684366, + "loss": 2.6424, + "step": 21463 + }, + { + "epoch": 0.6364795540135813, + "grad_norm": 0.10784405469894409, + "learning_rate": 0.00029744151418982956, + "loss": 2.5919, + "step": 21464 + }, + { + "epoch": 0.6365092073658927, + "grad_norm": 0.10195277631282806, + "learning_rate": 0.00029739849804650555, + "loss": 2.6175, + "step": 21465 + }, + { + "epoch": 0.6365388607182042, + "grad_norm": 0.1037539467215538, + "learning_rate": 0.00029735548369725257, + "loss": 2.6065, + "step": 21466 + }, + { + "epoch": 0.6365685140705156, + "grad_norm": 0.09309058636426926, + "learning_rate": 0.00029731247114245176, + "loss": 2.5762, + "step": 21467 + }, + { + "epoch": 0.6365981674228272, + "grad_norm": 0.10128969699144363, + "learning_rate": 0.0002972694603824836, + "loss": 2.5909, + "step": 21468 + }, + { + "epoch": 0.6366278207751386, + "grad_norm": 0.10400018841028214, + "learning_rate": 0.0002972264514177292, + "loss": 2.5798, + "step": 21469 + }, + { + "epoch": 0.6366574741274501, + "grad_norm": 0.09714070707559586, + "learning_rate": 0.0002971834442485694, + "loss": 2.6282, + "step": 21470 + }, + { + "epoch": 0.6366871274797616, + "grad_norm": 0.1140994057059288, + "learning_rate": 0.000297140438875385, + "loss": 2.6093, + "step": 21471 + }, + { + "epoch": 0.6367167808320731, + "grad_norm": 0.1246233806014061, + "learning_rate": 0.0002970974352985569, + "loss": 2.5911, + "step": 21472 + }, + { + "epoch": 0.6367464341843846, + "grad_norm": 0.10390597581863403, + "learning_rate": 0.0002970544335184657, + "loss": 2.5809, + "step": 21473 + }, + { + "epoch": 0.636776087536696, + "grad_norm": 0.10120280086994171, + "learning_rate": 0.00029701143353549236, + "loss": 2.5893, + "step": 21474 + }, + { + "epoch": 0.6368057408890075, + "grad_norm": 0.13105767965316772, + "learning_rate": 0.0002969684353500177, + "loss": 2.5659, + "step": 21475 + }, + { + "epoch": 0.636835394241319, + "grad_norm": 0.13065454363822937, + "learning_rate": 0.00029692543896242237, + "loss": 2.6124, + "step": 21476 + }, + { + "epoch": 0.6368650475936305, + "grad_norm": 0.09136524796485901, + "learning_rate": 0.0002968824443730872, + "loss": 2.6067, + "step": 21477 + }, + { + "epoch": 0.6368947009459419, + "grad_norm": 0.12048756331205368, + "learning_rate": 0.00029683945158239276, + "loss": 2.6033, + "step": 21478 + }, + { + "epoch": 0.6369243542982534, + "grad_norm": 0.1295805126428604, + "learning_rate": 0.00029679646059071986, + "loss": 2.6204, + "step": 21479 + }, + { + "epoch": 0.6369540076505649, + "grad_norm": 0.12083890289068222, + "learning_rate": 0.0002967534713984492, + "loss": 2.5796, + "step": 21480 + }, + { + "epoch": 0.6369836610028764, + "grad_norm": 0.11919362097978592, + "learning_rate": 0.00029671048400596145, + "loss": 2.5964, + "step": 21481 + }, + { + "epoch": 0.6370133143551878, + "grad_norm": 0.13834881782531738, + "learning_rate": 0.0002966674984136372, + "loss": 2.6022, + "step": 21482 + }, + { + "epoch": 0.6370429677074994, + "grad_norm": 0.1386321634054184, + "learning_rate": 0.0002966245146218572, + "loss": 2.5996, + "step": 21483 + }, + { + "epoch": 0.6370726210598108, + "grad_norm": 0.12387285381555557, + "learning_rate": 0.0002965815326310021, + "loss": 2.6058, + "step": 21484 + }, + { + "epoch": 0.6371022744121223, + "grad_norm": 0.11186424642801285, + "learning_rate": 0.0002965385524414524, + "loss": 2.5907, + "step": 21485 + }, + { + "epoch": 0.6371319277644337, + "grad_norm": 0.11415860056877136, + "learning_rate": 0.0002964955740535885, + "loss": 2.6229, + "step": 21486 + }, + { + "epoch": 0.6371615811167453, + "grad_norm": 0.11681363731622696, + "learning_rate": 0.0002964525974677914, + "loss": 2.611, + "step": 21487 + }, + { + "epoch": 0.6371912344690567, + "grad_norm": 0.10092616081237793, + "learning_rate": 0.00029640962268444163, + "loss": 2.5813, + "step": 21488 + }, + { + "epoch": 0.6372208878213682, + "grad_norm": 0.12072810530662537, + "learning_rate": 0.00029636664970391946, + "loss": 2.6383, + "step": 21489 + }, + { + "epoch": 0.6372505411736796, + "grad_norm": 0.11167749762535095, + "learning_rate": 0.00029632367852660565, + "loss": 2.6347, + "step": 21490 + }, + { + "epoch": 0.6372801945259912, + "grad_norm": 0.09780009835958481, + "learning_rate": 0.0002962807091528805, + "loss": 2.6141, + "step": 21491 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 0.12967723608016968, + "learning_rate": 0.00029623774158312475, + "loss": 2.5983, + "step": 21492 + }, + { + "epoch": 0.6373395012306141, + "grad_norm": 0.09415643662214279, + "learning_rate": 0.0002961947758177187, + "loss": 2.6261, + "step": 21493 + }, + { + "epoch": 0.6373691545829256, + "grad_norm": 0.10743638128042221, + "learning_rate": 0.000296151811857043, + "loss": 2.6217, + "step": 21494 + }, + { + "epoch": 0.6373988079352371, + "grad_norm": 0.11542440205812454, + "learning_rate": 0.00029610884970147797, + "loss": 2.6057, + "step": 21495 + }, + { + "epoch": 0.6374284612875486, + "grad_norm": 0.10062111914157867, + "learning_rate": 0.00029606588935140397, + "loss": 2.586, + "step": 21496 + }, + { + "epoch": 0.63745811463986, + "grad_norm": 0.10083172470331192, + "learning_rate": 0.00029602293080720156, + "loss": 2.5823, + "step": 21497 + }, + { + "epoch": 0.6374877679921716, + "grad_norm": 0.10916724801063538, + "learning_rate": 0.00029597997406925113, + "loss": 2.58, + "step": 21498 + }, + { + "epoch": 0.637517421344483, + "grad_norm": 0.09294088929891586, + "learning_rate": 0.0002959370191379329, + "loss": 2.6152, + "step": 21499 + }, + { + "epoch": 0.6375470746967945, + "grad_norm": 0.10240332782268524, + "learning_rate": 0.00029589406601362756, + "loss": 2.6045, + "step": 21500 + }, + { + "epoch": 0.6375767280491059, + "grad_norm": 0.11567326635122299, + "learning_rate": 0.0002958511146967153, + "loss": 2.5965, + "step": 21501 + }, + { + "epoch": 0.6376063814014175, + "grad_norm": 0.09774640947580338, + "learning_rate": 0.0002958081651875764, + "loss": 2.6151, + "step": 21502 + }, + { + "epoch": 0.6376360347537289, + "grad_norm": 0.0996258482336998, + "learning_rate": 0.0002957652174865913, + "loss": 2.6141, + "step": 21503 + }, + { + "epoch": 0.6376656881060404, + "grad_norm": 0.1062513068318367, + "learning_rate": 0.00029572227159414024, + "loss": 2.5687, + "step": 21504 + }, + { + "epoch": 0.6376953414583518, + "grad_norm": 0.10608342289924622, + "learning_rate": 0.0002956793275106036, + "loss": 2.608, + "step": 21505 + }, + { + "epoch": 0.6377249948106634, + "grad_norm": 0.11170478910207748, + "learning_rate": 0.0002956363852363615, + "loss": 2.6064, + "step": 21506 + }, + { + "epoch": 0.6377546481629748, + "grad_norm": 0.10415136069059372, + "learning_rate": 0.0002955934447717943, + "loss": 2.6221, + "step": 21507 + }, + { + "epoch": 0.6377843015152863, + "grad_norm": 0.1142331063747406, + "learning_rate": 0.00029555050611728225, + "loss": 2.5923, + "step": 21508 + }, + { + "epoch": 0.6378139548675977, + "grad_norm": 0.12741339206695557, + "learning_rate": 0.00029550756927320554, + "loss": 2.5876, + "step": 21509 + }, + { + "epoch": 0.6378436082199093, + "grad_norm": 0.10619033873081207, + "learning_rate": 0.0002954646342399444, + "loss": 2.5885, + "step": 21510 + }, + { + "epoch": 0.6378732615722207, + "grad_norm": 0.11773961782455444, + "learning_rate": 0.00029542170101787914, + "loss": 2.6054, + "step": 21511 + }, + { + "epoch": 0.6379029149245322, + "grad_norm": 0.10892379283905029, + "learning_rate": 0.00029537876960738954, + "loss": 2.5917, + "step": 21512 + }, + { + "epoch": 0.6379325682768437, + "grad_norm": 0.10859421640634537, + "learning_rate": 0.0002953358400088563, + "loss": 2.5853, + "step": 21513 + }, + { + "epoch": 0.6379622216291552, + "grad_norm": 0.11757587641477585, + "learning_rate": 0.0002952929122226592, + "loss": 2.5896, + "step": 21514 + }, + { + "epoch": 0.6379918749814667, + "grad_norm": 0.10841882973909378, + "learning_rate": 0.00029524998624917866, + "loss": 2.5947, + "step": 21515 + }, + { + "epoch": 0.6380215283337781, + "grad_norm": 0.11666921526193619, + "learning_rate": 0.0002952070620887946, + "loss": 2.6322, + "step": 21516 + }, + { + "epoch": 0.6380511816860897, + "grad_norm": 0.13188932836055756, + "learning_rate": 0.00029516413974188706, + "loss": 2.5992, + "step": 21517 + }, + { + "epoch": 0.6380808350384011, + "grad_norm": 0.10190390795469284, + "learning_rate": 0.00029512121920883627, + "loss": 2.5822, + "step": 21518 + }, + { + "epoch": 0.6381104883907126, + "grad_norm": 0.1203138679265976, + "learning_rate": 0.0002950783004900223, + "loss": 2.6371, + "step": 21519 + }, + { + "epoch": 0.638140141743024, + "grad_norm": 0.10766691714525223, + "learning_rate": 0.000295035383585825, + "loss": 2.6065, + "step": 21520 + }, + { + "epoch": 0.6381697950953356, + "grad_norm": 0.10591478645801544, + "learning_rate": 0.00029499246849662474, + "loss": 2.5967, + "step": 21521 + }, + { + "epoch": 0.638199448447647, + "grad_norm": 0.11441361159086227, + "learning_rate": 0.0002949495552228014, + "loss": 2.6009, + "step": 21522 + }, + { + "epoch": 0.6382291017999585, + "grad_norm": 0.1019904688000679, + "learning_rate": 0.0002949066437647349, + "loss": 2.6146, + "step": 21523 + }, + { + "epoch": 0.6382587551522699, + "grad_norm": 0.10640569776296616, + "learning_rate": 0.0002948637341228051, + "loss": 2.6218, + "step": 21524 + }, + { + "epoch": 0.6382884085045815, + "grad_norm": 0.10982646048069, + "learning_rate": 0.00029482082629739205, + "loss": 2.6017, + "step": 21525 + }, + { + "epoch": 0.6383180618568929, + "grad_norm": 0.11171699315309525, + "learning_rate": 0.0002947779202888761, + "loss": 2.5909, + "step": 21526 + }, + { + "epoch": 0.6383477152092044, + "grad_norm": 0.09967344254255295, + "learning_rate": 0.0002947350160976368, + "loss": 2.5906, + "step": 21527 + }, + { + "epoch": 0.6383773685615158, + "grad_norm": 0.11465324461460114, + "learning_rate": 0.0002946921137240542, + "loss": 2.5896, + "step": 21528 + }, + { + "epoch": 0.6384070219138274, + "grad_norm": 0.11179099231958389, + "learning_rate": 0.00029464921316850815, + "loss": 2.6145, + "step": 21529 + }, + { + "epoch": 0.6384366752661388, + "grad_norm": 0.12443573027849197, + "learning_rate": 0.00029460631443137855, + "loss": 2.5778, + "step": 21530 + }, + { + "epoch": 0.6384663286184503, + "grad_norm": 0.1323346048593521, + "learning_rate": 0.00029456341751304526, + "loss": 2.6356, + "step": 21531 + }, + { + "epoch": 0.6384959819707619, + "grad_norm": 0.11426889151334763, + "learning_rate": 0.0002945205224138883, + "loss": 2.616, + "step": 21532 + }, + { + "epoch": 0.6385256353230733, + "grad_norm": 0.11236947029829025, + "learning_rate": 0.0002944776291342873, + "loss": 2.6058, + "step": 21533 + }, + { + "epoch": 0.6385552886753848, + "grad_norm": 0.12033069133758545, + "learning_rate": 0.0002944347376746221, + "loss": 2.6121, + "step": 21534 + }, + { + "epoch": 0.6385849420276962, + "grad_norm": 0.10332702845335007, + "learning_rate": 0.0002943918480352726, + "loss": 2.6036, + "step": 21535 + }, + { + "epoch": 0.6386145953800078, + "grad_norm": 0.10454359650611877, + "learning_rate": 0.00029434896021661867, + "loss": 2.6112, + "step": 21536 + }, + { + "epoch": 0.6386442487323192, + "grad_norm": 0.10825624316930771, + "learning_rate": 0.00029430607421903997, + "loss": 2.6172, + "step": 21537 + }, + { + "epoch": 0.6386739020846307, + "grad_norm": 0.0960453674197197, + "learning_rate": 0.00029426319004291615, + "loss": 2.5794, + "step": 21538 + }, + { + "epoch": 0.6387035554369421, + "grad_norm": 0.11381036788225174, + "learning_rate": 0.00029422030768862717, + "loss": 2.5853, + "step": 21539 + }, + { + "epoch": 0.6387332087892537, + "grad_norm": 0.09870703518390656, + "learning_rate": 0.00029417742715655276, + "loss": 2.6287, + "step": 21540 + }, + { + "epoch": 0.6387628621415651, + "grad_norm": 0.11278371512889862, + "learning_rate": 0.0002941345484470726, + "loss": 2.6146, + "step": 21541 + }, + { + "epoch": 0.6387925154938766, + "grad_norm": 0.11299946159124374, + "learning_rate": 0.00029409167156056623, + "loss": 2.6453, + "step": 21542 + }, + { + "epoch": 0.638822168846188, + "grad_norm": 0.1037638708949089, + "learning_rate": 0.0002940487964974136, + "loss": 2.5933, + "step": 21543 + }, + { + "epoch": 0.6388518221984996, + "grad_norm": 0.12375636398792267, + "learning_rate": 0.00029400592325799416, + "loss": 2.5683, + "step": 21544 + }, + { + "epoch": 0.638881475550811, + "grad_norm": 0.10828317701816559, + "learning_rate": 0.0002939630518426876, + "loss": 2.6127, + "step": 21545 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 0.09837977588176727, + "learning_rate": 0.0002939201822518735, + "loss": 2.6129, + "step": 21546 + }, + { + "epoch": 0.6389407822554339, + "grad_norm": 0.12097512930631638, + "learning_rate": 0.00029387731448593166, + "loss": 2.6057, + "step": 21547 + }, + { + "epoch": 0.6389704356077455, + "grad_norm": 0.09390046447515488, + "learning_rate": 0.00029383444854524156, + "loss": 2.5703, + "step": 21548 + }, + { + "epoch": 0.6390000889600569, + "grad_norm": 0.11529307812452316, + "learning_rate": 0.0002937915844301829, + "loss": 2.58, + "step": 21549 + }, + { + "epoch": 0.6390297423123684, + "grad_norm": 0.11641470342874527, + "learning_rate": 0.000293748722141135, + "loss": 2.6451, + "step": 21550 + }, + { + "epoch": 0.6390593956646798, + "grad_norm": 0.10844636708498001, + "learning_rate": 0.00029370586167847744, + "loss": 2.5817, + "step": 21551 + }, + { + "epoch": 0.6390890490169914, + "grad_norm": 0.10032488405704498, + "learning_rate": 0.0002936630030425901, + "loss": 2.6264, + "step": 21552 + }, + { + "epoch": 0.6391187023693029, + "grad_norm": 0.1081477701663971, + "learning_rate": 0.00029362014623385236, + "loss": 2.6196, + "step": 21553 + }, + { + "epoch": 0.6391483557216143, + "grad_norm": 0.10209762305021286, + "learning_rate": 0.00029357729125264353, + "loss": 2.6054, + "step": 21554 + }, + { + "epoch": 0.6391780090739259, + "grad_norm": 0.09460337460041046, + "learning_rate": 0.00029353443809934323, + "loss": 2.6098, + "step": 21555 + }, + { + "epoch": 0.6392076624262373, + "grad_norm": 0.092137910425663, + "learning_rate": 0.00029349158677433095, + "loss": 2.5844, + "step": 21556 + }, + { + "epoch": 0.6392373157785488, + "grad_norm": 0.09814447164535522, + "learning_rate": 0.00029344873727798603, + "loss": 2.6072, + "step": 21557 + }, + { + "epoch": 0.6392669691308602, + "grad_norm": 0.08750364184379578, + "learning_rate": 0.00029340588961068807, + "loss": 2.6118, + "step": 21558 + }, + { + "epoch": 0.6392966224831718, + "grad_norm": 0.0891343280673027, + "learning_rate": 0.0002933630437728164, + "loss": 2.5931, + "step": 21559 + }, + { + "epoch": 0.6393262758354832, + "grad_norm": 0.08456846326589584, + "learning_rate": 0.00029332019976475055, + "loss": 2.5727, + "step": 21560 + }, + { + "epoch": 0.6393559291877947, + "grad_norm": 0.09694851189851761, + "learning_rate": 0.0002932773575868697, + "loss": 2.6075, + "step": 21561 + }, + { + "epoch": 0.6393855825401061, + "grad_norm": 0.09070942550897598, + "learning_rate": 0.0002932345172395534, + "loss": 2.5826, + "step": 21562 + }, + { + "epoch": 0.6394152358924177, + "grad_norm": 0.08642160147428513, + "learning_rate": 0.00029319167872318084, + "loss": 2.5922, + "step": 21563 + }, + { + "epoch": 0.6394448892447291, + "grad_norm": 0.09491951018571854, + "learning_rate": 0.0002931488420381314, + "loss": 2.5753, + "step": 21564 + }, + { + "epoch": 0.6394745425970406, + "grad_norm": 0.09634276479482651, + "learning_rate": 0.00029310600718478457, + "loss": 2.6073, + "step": 21565 + }, + { + "epoch": 0.639504195949352, + "grad_norm": 0.08935066312551498, + "learning_rate": 0.0002930631741635196, + "loss": 2.5818, + "step": 21566 + }, + { + "epoch": 0.6395338493016636, + "grad_norm": 0.08844199031591415, + "learning_rate": 0.0002930203429747157, + "loss": 2.604, + "step": 21567 + }, + { + "epoch": 0.639563502653975, + "grad_norm": 0.08913496881723404, + "learning_rate": 0.00029297751361875217, + "loss": 2.5922, + "step": 21568 + }, + { + "epoch": 0.6395931560062865, + "grad_norm": 0.0974404588341713, + "learning_rate": 0.0002929346860960083, + "loss": 2.5777, + "step": 21569 + }, + { + "epoch": 0.6396228093585979, + "grad_norm": 0.09980494529008865, + "learning_rate": 0.0002928918604068634, + "loss": 2.6087, + "step": 21570 + }, + { + "epoch": 0.6396524627109095, + "grad_norm": 0.098385751247406, + "learning_rate": 0.00029284903655169665, + "loss": 2.5927, + "step": 21571 + }, + { + "epoch": 0.6396821160632209, + "grad_norm": 0.08895406872034073, + "learning_rate": 0.0002928062145308872, + "loss": 2.6048, + "step": 21572 + }, + { + "epoch": 0.6397117694155324, + "grad_norm": 0.10359552502632141, + "learning_rate": 0.0002927633943448142, + "loss": 2.5841, + "step": 21573 + }, + { + "epoch": 0.639741422767844, + "grad_norm": 0.10757359862327576, + "learning_rate": 0.00029272057599385694, + "loss": 2.5568, + "step": 21574 + }, + { + "epoch": 0.6397710761201554, + "grad_norm": 0.09166648238897324, + "learning_rate": 0.0002926777594783946, + "loss": 2.5972, + "step": 21575 + }, + { + "epoch": 0.6398007294724669, + "grad_norm": 0.09478229284286499, + "learning_rate": 0.00029263494479880625, + "loss": 2.5945, + "step": 21576 + }, + { + "epoch": 0.6398303828247783, + "grad_norm": 0.12768130004405975, + "learning_rate": 0.00029259213195547096, + "loss": 2.5812, + "step": 21577 + }, + { + "epoch": 0.6398600361770899, + "grad_norm": 0.11919175088405609, + "learning_rate": 0.00029254932094876807, + "loss": 2.5803, + "step": 21578 + }, + { + "epoch": 0.6398896895294013, + "grad_norm": 0.115818090736866, + "learning_rate": 0.0002925065117790765, + "loss": 2.584, + "step": 21579 + }, + { + "epoch": 0.6399193428817128, + "grad_norm": 0.10387950390577316, + "learning_rate": 0.00029246370444677546, + "loss": 2.589, + "step": 21580 + }, + { + "epoch": 0.6399489962340242, + "grad_norm": 0.13294503092765808, + "learning_rate": 0.000292420898952244, + "loss": 2.6118, + "step": 21581 + }, + { + "epoch": 0.6399786495863358, + "grad_norm": 0.10983418673276901, + "learning_rate": 0.00029237809529586103, + "loss": 2.6071, + "step": 21582 + }, + { + "epoch": 0.6400083029386472, + "grad_norm": 0.10981635749340057, + "learning_rate": 0.0002923352934780056, + "loss": 2.6058, + "step": 21583 + }, + { + "epoch": 0.6400379562909587, + "grad_norm": 0.10725318640470505, + "learning_rate": 0.0002922924934990568, + "loss": 2.6125, + "step": 21584 + }, + { + "epoch": 0.6400676096432701, + "grad_norm": 0.11079658567905426, + "learning_rate": 0.0002922496953593937, + "loss": 2.6187, + "step": 21585 + }, + { + "epoch": 0.6400972629955817, + "grad_norm": 0.09878289699554443, + "learning_rate": 0.0002922068990593951, + "loss": 2.6129, + "step": 21586 + }, + { + "epoch": 0.6401269163478931, + "grad_norm": 0.10786880552768707, + "learning_rate": 0.0002921641045994403, + "loss": 2.6064, + "step": 21587 + }, + { + "epoch": 0.6401565697002046, + "grad_norm": 0.0973551794886589, + "learning_rate": 0.0002921213119799079, + "loss": 2.6065, + "step": 21588 + }, + { + "epoch": 0.640186223052516, + "grad_norm": 0.10383190214633942, + "learning_rate": 0.0002920785212011769, + "loss": 2.6077, + "step": 21589 + }, + { + "epoch": 0.6402158764048276, + "grad_norm": 0.10414838790893555, + "learning_rate": 0.0002920357322636261, + "loss": 2.571, + "step": 21590 + }, + { + "epoch": 0.640245529757139, + "grad_norm": 0.09311489760875702, + "learning_rate": 0.0002919929451676349, + "loss": 2.6102, + "step": 21591 + }, + { + "epoch": 0.6402751831094505, + "grad_norm": 0.09694409370422363, + "learning_rate": 0.00029195015991358175, + "loss": 2.6156, + "step": 21592 + }, + { + "epoch": 0.640304836461762, + "grad_norm": 0.09404141455888748, + "learning_rate": 0.00029190737650184565, + "loss": 2.631, + "step": 21593 + }, + { + "epoch": 0.6403344898140735, + "grad_norm": 0.08977595716714859, + "learning_rate": 0.00029186459493280546, + "loss": 2.6159, + "step": 21594 + }, + { + "epoch": 0.640364143166385, + "grad_norm": 0.09614089131355286, + "learning_rate": 0.0002918218152068402, + "loss": 2.5937, + "step": 21595 + }, + { + "epoch": 0.6403937965186964, + "grad_norm": 0.09134788066148758, + "learning_rate": 0.0002917790373243283, + "loss": 2.588, + "step": 21596 + }, + { + "epoch": 0.640423449871008, + "grad_norm": 0.09593018144369125, + "learning_rate": 0.0002917362612856488, + "loss": 2.6119, + "step": 21597 + }, + { + "epoch": 0.6404531032233194, + "grad_norm": 0.10106837004423141, + "learning_rate": 0.0002916934870911805, + "loss": 2.5969, + "step": 21598 + }, + { + "epoch": 0.6404827565756309, + "grad_norm": 0.08785106986761093, + "learning_rate": 0.000291650714741302, + "loss": 2.5674, + "step": 21599 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 0.10034050792455673, + "learning_rate": 0.00029160794423639224, + "loss": 2.581, + "step": 21600 + }, + { + "epoch": 0.6405420632802539, + "grad_norm": 0.0970664769411087, + "learning_rate": 0.0002915651755768299, + "loss": 2.6075, + "step": 21601 + }, + { + "epoch": 0.6405717166325653, + "grad_norm": 0.10089979320764542, + "learning_rate": 0.00029152240876299373, + "loss": 2.5524, + "step": 21602 + }, + { + "epoch": 0.6406013699848768, + "grad_norm": 0.1027585044503212, + "learning_rate": 0.00029147964379526225, + "loss": 2.6071, + "step": 21603 + }, + { + "epoch": 0.6406310233371882, + "grad_norm": 0.09421837329864502, + "learning_rate": 0.00029143688067401446, + "loss": 2.6144, + "step": 21604 + }, + { + "epoch": 0.6406606766894998, + "grad_norm": 0.10192159563302994, + "learning_rate": 0.0002913941193996291, + "loss": 2.5754, + "step": 21605 + }, + { + "epoch": 0.6406903300418112, + "grad_norm": 0.11151101440191269, + "learning_rate": 0.0002913513599724844, + "loss": 2.641, + "step": 21606 + }, + { + "epoch": 0.6407199833941227, + "grad_norm": 0.11079593747854233, + "learning_rate": 0.0002913086023929593, + "loss": 2.6209, + "step": 21607 + }, + { + "epoch": 0.6407496367464341, + "grad_norm": 0.12249982357025146, + "learning_rate": 0.0002912658466614323, + "loss": 2.5896, + "step": 21608 + }, + { + "epoch": 0.6407792900987457, + "grad_norm": 0.09553223103284836, + "learning_rate": 0.0002912230927782821, + "loss": 2.5585, + "step": 21609 + }, + { + "epoch": 0.6408089434510571, + "grad_norm": 0.11148754507303238, + "learning_rate": 0.0002911803407438871, + "loss": 2.5941, + "step": 21610 + }, + { + "epoch": 0.6408385968033686, + "grad_norm": 0.12805718183517456, + "learning_rate": 0.00029113759055862616, + "loss": 2.6288, + "step": 21611 + }, + { + "epoch": 0.64086825015568, + "grad_norm": 0.10117946565151215, + "learning_rate": 0.00029109484222287764, + "loss": 2.554, + "step": 21612 + }, + { + "epoch": 0.6408979035079916, + "grad_norm": 0.12268004566431046, + "learning_rate": 0.0002910520957370202, + "loss": 2.6201, + "step": 21613 + }, + { + "epoch": 0.640927556860303, + "grad_norm": 0.11412245035171509, + "learning_rate": 0.00029100935110143233, + "loss": 2.5937, + "step": 21614 + }, + { + "epoch": 0.6409572102126145, + "grad_norm": 0.11099039763212204, + "learning_rate": 0.0002909666083164925, + "loss": 2.5779, + "step": 21615 + }, + { + "epoch": 0.6409868635649261, + "grad_norm": 0.10238245129585266, + "learning_rate": 0.0002909238673825792, + "loss": 2.6251, + "step": 21616 + }, + { + "epoch": 0.6410165169172375, + "grad_norm": 0.10955408960580826, + "learning_rate": 0.000290881128300071, + "loss": 2.585, + "step": 21617 + }, + { + "epoch": 0.641046170269549, + "grad_norm": 0.10297174006700516, + "learning_rate": 0.0002908383910693463, + "loss": 2.6141, + "step": 21618 + }, + { + "epoch": 0.6410758236218604, + "grad_norm": 0.11511728912591934, + "learning_rate": 0.00029079565569078346, + "loss": 2.6007, + "step": 21619 + }, + { + "epoch": 0.641105476974172, + "grad_norm": 0.09960290789604187, + "learning_rate": 0.00029075292216476113, + "loss": 2.5913, + "step": 21620 + }, + { + "epoch": 0.6411351303264834, + "grad_norm": 0.10150982439517975, + "learning_rate": 0.00029071019049165746, + "loss": 2.5782, + "step": 21621 + }, + { + "epoch": 0.6411647836787949, + "grad_norm": 0.09675665199756622, + "learning_rate": 0.0002906674606718511, + "loss": 2.5769, + "step": 21622 + }, + { + "epoch": 0.6411944370311063, + "grad_norm": 0.10835231095552444, + "learning_rate": 0.0002906247327057202, + "loss": 2.5877, + "step": 21623 + }, + { + "epoch": 0.6412240903834179, + "grad_norm": 0.10500773042440414, + "learning_rate": 0.00029058200659364333, + "loss": 2.6364, + "step": 21624 + }, + { + "epoch": 0.6412537437357293, + "grad_norm": 0.10731352865695953, + "learning_rate": 0.0002905392823359987, + "loss": 2.5833, + "step": 21625 + }, + { + "epoch": 0.6412833970880408, + "grad_norm": 0.12040591984987259, + "learning_rate": 0.0002904965599331646, + "loss": 2.6168, + "step": 21626 + }, + { + "epoch": 0.6413130504403523, + "grad_norm": 0.09093482792377472, + "learning_rate": 0.0002904538393855195, + "loss": 2.6176, + "step": 21627 + }, + { + "epoch": 0.6413427037926638, + "grad_norm": 0.1168464794754982, + "learning_rate": 0.00029041112069344165, + "loss": 2.5957, + "step": 21628 + }, + { + "epoch": 0.6413723571449752, + "grad_norm": 0.101498544216156, + "learning_rate": 0.00029036840385730924, + "loss": 2.5878, + "step": 21629 + }, + { + "epoch": 0.6414020104972867, + "grad_norm": 0.10703451186418533, + "learning_rate": 0.00029032568887750064, + "loss": 2.5855, + "step": 21630 + }, + { + "epoch": 0.6414316638495982, + "grad_norm": 0.11696980893611908, + "learning_rate": 0.000290282975754394, + "loss": 2.612, + "step": 21631 + }, + { + "epoch": 0.6414613172019097, + "grad_norm": 0.11008197069168091, + "learning_rate": 0.00029024026448836766, + "loss": 2.6024, + "step": 21632 + }, + { + "epoch": 0.6414909705542211, + "grad_norm": 0.10278960317373276, + "learning_rate": 0.00029019755507979974, + "loss": 2.5736, + "step": 21633 + }, + { + "epoch": 0.6415206239065326, + "grad_norm": 0.12447767704725266, + "learning_rate": 0.0002901548475290685, + "loss": 2.5914, + "step": 21634 + }, + { + "epoch": 0.6415502772588441, + "grad_norm": 0.10698909312486649, + "learning_rate": 0.0002901121418365521, + "loss": 2.5691, + "step": 21635 + }, + { + "epoch": 0.6415799306111556, + "grad_norm": 0.12845578789710999, + "learning_rate": 0.00029006943800262875, + "loss": 2.5999, + "step": 21636 + }, + { + "epoch": 0.6416095839634671, + "grad_norm": 0.12223771214485168, + "learning_rate": 0.00029002673602767656, + "loss": 2.6202, + "step": 21637 + }, + { + "epoch": 0.6416392373157785, + "grad_norm": 0.11896409094333649, + "learning_rate": 0.00028998403591207363, + "loss": 2.593, + "step": 21638 + }, + { + "epoch": 0.6416688906680901, + "grad_norm": 0.10560344159603119, + "learning_rate": 0.0002899413376561983, + "loss": 2.5869, + "step": 21639 + }, + { + "epoch": 0.6416985440204015, + "grad_norm": 0.09439299255609512, + "learning_rate": 0.0002898986412604283, + "loss": 2.5911, + "step": 21640 + }, + { + "epoch": 0.641728197372713, + "grad_norm": 0.11652366071939468, + "learning_rate": 0.0002898559467251419, + "loss": 2.626, + "step": 21641 + }, + { + "epoch": 0.6417578507250244, + "grad_norm": 0.09909101575613022, + "learning_rate": 0.00028981325405071697, + "loss": 2.5932, + "step": 21642 + }, + { + "epoch": 0.641787504077336, + "grad_norm": 0.11627589911222458, + "learning_rate": 0.00028977056323753193, + "loss": 2.6045, + "step": 21643 + }, + { + "epoch": 0.6418171574296474, + "grad_norm": 0.11787854880094528, + "learning_rate": 0.00028972787428596463, + "loss": 2.6269, + "step": 21644 + }, + { + "epoch": 0.6418468107819589, + "grad_norm": 0.11018336564302444, + "learning_rate": 0.00028968518719639313, + "loss": 2.6083, + "step": 21645 + }, + { + "epoch": 0.6418764641342704, + "grad_norm": 0.08833857625722885, + "learning_rate": 0.0002896425019691954, + "loss": 2.5767, + "step": 21646 + }, + { + "epoch": 0.6419061174865819, + "grad_norm": 0.10837599635124207, + "learning_rate": 0.0002895998186047494, + "loss": 2.6004, + "step": 21647 + }, + { + "epoch": 0.6419357708388933, + "grad_norm": 0.10117579251527786, + "learning_rate": 0.0002895571371034331, + "loss": 2.6181, + "step": 21648 + }, + { + "epoch": 0.6419654241912048, + "grad_norm": 0.09735157340765, + "learning_rate": 0.0002895144574656245, + "loss": 2.5932, + "step": 21649 + }, + { + "epoch": 0.6419950775435163, + "grad_norm": 0.09259669482707977, + "learning_rate": 0.0002894717796917017, + "loss": 2.5709, + "step": 21650 + }, + { + "epoch": 0.6420247308958278, + "grad_norm": 0.10226704180240631, + "learning_rate": 0.0002894291037820422, + "loss": 2.5834, + "step": 21651 + }, + { + "epoch": 0.6420543842481392, + "grad_norm": 0.08699026703834534, + "learning_rate": 0.0002893864297370242, + "loss": 2.5803, + "step": 21652 + }, + { + "epoch": 0.6420840376004507, + "grad_norm": 0.10669246315956116, + "learning_rate": 0.0002893437575570255, + "loss": 2.5901, + "step": 21653 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 0.09458012133836746, + "learning_rate": 0.0002893010872424238, + "loss": 2.5897, + "step": 21654 + }, + { + "epoch": 0.6421433443050737, + "grad_norm": 0.09572649002075195, + "learning_rate": 0.00028925841879359727, + "loss": 2.5844, + "step": 21655 + }, + { + "epoch": 0.6421729976573851, + "grad_norm": 0.10659069567918777, + "learning_rate": 0.0002892157522109236, + "loss": 2.6096, + "step": 21656 + }, + { + "epoch": 0.6422026510096966, + "grad_norm": 0.1327621042728424, + "learning_rate": 0.00028917308749478067, + "loss": 2.6347, + "step": 21657 + }, + { + "epoch": 0.6422323043620082, + "grad_norm": 0.10577812790870667, + "learning_rate": 0.0002891304246455463, + "loss": 2.6195, + "step": 21658 + }, + { + "epoch": 0.6422619577143196, + "grad_norm": 0.10615894198417664, + "learning_rate": 0.0002890877636635981, + "loss": 2.6036, + "step": 21659 + }, + { + "epoch": 0.6422916110666311, + "grad_norm": 0.11356477439403534, + "learning_rate": 0.00028904510454931423, + "loss": 2.6273, + "step": 21660 + }, + { + "epoch": 0.6423212644189426, + "grad_norm": 0.10904466360807419, + "learning_rate": 0.0002890024473030719, + "loss": 2.5863, + "step": 21661 + }, + { + "epoch": 0.6423509177712541, + "grad_norm": 0.10980620980262756, + "learning_rate": 0.00028895979192524926, + "loss": 2.628, + "step": 21662 + }, + { + "epoch": 0.6423805711235655, + "grad_norm": 0.11430283635854721, + "learning_rate": 0.00028891713841622377, + "loss": 2.5763, + "step": 21663 + }, + { + "epoch": 0.642410224475877, + "grad_norm": 0.09648822247982025, + "learning_rate": 0.0002888744867763732, + "loss": 2.5855, + "step": 21664 + }, + { + "epoch": 0.6424398778281885, + "grad_norm": 0.09705083817243576, + "learning_rate": 0.0002888318370060754, + "loss": 2.5964, + "step": 21665 + }, + { + "epoch": 0.6424695311805, + "grad_norm": 0.11722133308649063, + "learning_rate": 0.0002887891891057079, + "loss": 2.6336, + "step": 21666 + }, + { + "epoch": 0.6424991845328114, + "grad_norm": 0.11219053715467453, + "learning_rate": 0.0002887465430756482, + "loss": 2.5905, + "step": 21667 + }, + { + "epoch": 0.6425288378851229, + "grad_norm": 0.10087894648313522, + "learning_rate": 0.0002887038989162742, + "loss": 2.5699, + "step": 21668 + }, + { + "epoch": 0.6425584912374344, + "grad_norm": 0.11217057704925537, + "learning_rate": 0.0002886612566279636, + "loss": 2.5972, + "step": 21669 + }, + { + "epoch": 0.6425881445897459, + "grad_norm": 0.09705746173858643, + "learning_rate": 0.0002886186162110938, + "loss": 2.6017, + "step": 21670 + }, + { + "epoch": 0.6426177979420573, + "grad_norm": 0.10083301365375519, + "learning_rate": 0.0002885759776660426, + "loss": 2.6216, + "step": 21671 + }, + { + "epoch": 0.6426474512943688, + "grad_norm": 0.10190615057945251, + "learning_rate": 0.0002885333409931873, + "loss": 2.5859, + "step": 21672 + }, + { + "epoch": 0.6426771046466803, + "grad_norm": 0.10200007259845734, + "learning_rate": 0.00028849070619290545, + "loss": 2.6033, + "step": 21673 + }, + { + "epoch": 0.6427067579989918, + "grad_norm": 0.09886854141950607, + "learning_rate": 0.0002884480732655748, + "loss": 2.6177, + "step": 21674 + }, + { + "epoch": 0.6427364113513032, + "grad_norm": 0.11004962027072906, + "learning_rate": 0.00028840544221157274, + "loss": 2.5856, + "step": 21675 + }, + { + "epoch": 0.6427660647036147, + "grad_norm": 0.1040918156504631, + "learning_rate": 0.00028836281303127676, + "loss": 2.5704, + "step": 21676 + }, + { + "epoch": 0.6427957180559262, + "grad_norm": 0.10123570263385773, + "learning_rate": 0.00028832018572506446, + "loss": 2.6263, + "step": 21677 + }, + { + "epoch": 0.6428253714082377, + "grad_norm": 0.11359475553035736, + "learning_rate": 0.0002882775602933132, + "loss": 2.6197, + "step": 21678 + }, + { + "epoch": 0.6428550247605492, + "grad_norm": 0.1351536512374878, + "learning_rate": 0.0002882349367364005, + "loss": 2.6106, + "step": 21679 + }, + { + "epoch": 0.6428846781128607, + "grad_norm": 0.10635382682085037, + "learning_rate": 0.00028819231505470357, + "loss": 2.6113, + "step": 21680 + }, + { + "epoch": 0.6429143314651722, + "grad_norm": 0.11367517709732056, + "learning_rate": 0.00028814969524860047, + "loss": 2.5887, + "step": 21681 + }, + { + "epoch": 0.6429439848174836, + "grad_norm": 0.1075383648276329, + "learning_rate": 0.000288107077318468, + "loss": 2.5827, + "step": 21682 + }, + { + "epoch": 0.6429736381697951, + "grad_norm": 0.09469058364629745, + "learning_rate": 0.00028806446126468366, + "loss": 2.6421, + "step": 21683 + }, + { + "epoch": 0.6430032915221066, + "grad_norm": 0.105474554002285, + "learning_rate": 0.00028802184708762505, + "loss": 2.5855, + "step": 21684 + }, + { + "epoch": 0.6430329448744181, + "grad_norm": 0.10704318434000015, + "learning_rate": 0.0002879792347876692, + "loss": 2.621, + "step": 21685 + }, + { + "epoch": 0.6430625982267295, + "grad_norm": 0.09903233498334885, + "learning_rate": 0.0002879366243651937, + "loss": 2.5855, + "step": 21686 + }, + { + "epoch": 0.643092251579041, + "grad_norm": 0.09507939964532852, + "learning_rate": 0.0002878940158205757, + "loss": 2.5959, + "step": 21687 + }, + { + "epoch": 0.6431219049313525, + "grad_norm": 0.0920889675617218, + "learning_rate": 0.0002878514091541927, + "loss": 2.5903, + "step": 21688 + }, + { + "epoch": 0.643151558283664, + "grad_norm": 0.10623112320899963, + "learning_rate": 0.00028780880436642185, + "loss": 2.6026, + "step": 21689 + }, + { + "epoch": 0.6431812116359754, + "grad_norm": 0.09763939678668976, + "learning_rate": 0.00028776620145764056, + "loss": 2.5982, + "step": 21690 + }, + { + "epoch": 0.643210864988287, + "grad_norm": 0.10716892033815384, + "learning_rate": 0.0002877236004282259, + "loss": 2.5964, + "step": 21691 + }, + { + "epoch": 0.6432405183405984, + "grad_norm": 0.1018752008676529, + "learning_rate": 0.00028768100127855514, + "loss": 2.5942, + "step": 21692 + }, + { + "epoch": 0.6432701716929099, + "grad_norm": 0.09302591532468796, + "learning_rate": 0.0002876384040090056, + "loss": 2.6095, + "step": 21693 + }, + { + "epoch": 0.6432998250452213, + "grad_norm": 0.10222679376602173, + "learning_rate": 0.0002875958086199545, + "loss": 2.599, + "step": 21694 + }, + { + "epoch": 0.6433294783975329, + "grad_norm": 0.10801803320646286, + "learning_rate": 0.0002875532151117789, + "loss": 2.6, + "step": 21695 + }, + { + "epoch": 0.6433591317498443, + "grad_norm": 0.11381063610315323, + "learning_rate": 0.000287510623484856, + "loss": 2.6056, + "step": 21696 + }, + { + "epoch": 0.6433887851021558, + "grad_norm": 0.13030210137367249, + "learning_rate": 0.0002874680337395631, + "loss": 2.616, + "step": 21697 + }, + { + "epoch": 0.6434184384544672, + "grad_norm": 0.10335426777601242, + "learning_rate": 0.0002874254458762772, + "loss": 2.5949, + "step": 21698 + }, + { + "epoch": 0.6434480918067788, + "grad_norm": 0.1041821613907814, + "learning_rate": 0.0002873828598953755, + "loss": 2.5756, + "step": 21699 + }, + { + "epoch": 0.6434777451590903, + "grad_norm": 0.10406512022018433, + "learning_rate": 0.0002873402757972351, + "loss": 2.5967, + "step": 21700 + }, + { + "epoch": 0.6435073985114017, + "grad_norm": 0.1043548658490181, + "learning_rate": 0.000287297693582233, + "loss": 2.5863, + "step": 21701 + }, + { + "epoch": 0.6435370518637132, + "grad_norm": 0.10684970766305923, + "learning_rate": 0.00028725511325074633, + "loss": 2.5772, + "step": 21702 + }, + { + "epoch": 0.6435667052160247, + "grad_norm": 0.09803333133459091, + "learning_rate": 0.00028721253480315214, + "loss": 2.5957, + "step": 21703 + }, + { + "epoch": 0.6435963585683362, + "grad_norm": 0.10236649960279465, + "learning_rate": 0.0002871699582398275, + "loss": 2.5756, + "step": 21704 + }, + { + "epoch": 0.6436260119206476, + "grad_norm": 0.11319614946842194, + "learning_rate": 0.00028712738356114967, + "loss": 2.5839, + "step": 21705 + }, + { + "epoch": 0.6436556652729591, + "grad_norm": 0.09814745932817459, + "learning_rate": 0.0002870848107674949, + "loss": 2.5895, + "step": 21706 + }, + { + "epoch": 0.6436853186252706, + "grad_norm": 0.1135329157114029, + "learning_rate": 0.0002870422398592409, + "loss": 2.6498, + "step": 21707 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 0.12464484572410583, + "learning_rate": 0.00028699967083676447, + "loss": 2.6058, + "step": 21708 + }, + { + "epoch": 0.6437446253298935, + "grad_norm": 0.12868505716323853, + "learning_rate": 0.0002869571037004425, + "loss": 2.5957, + "step": 21709 + }, + { + "epoch": 0.643774278682205, + "grad_norm": 0.10703346878290176, + "learning_rate": 0.0002869145384506519, + "loss": 2.6327, + "step": 21710 + }, + { + "epoch": 0.6438039320345165, + "grad_norm": 0.12455935776233673, + "learning_rate": 0.00028687197508776965, + "loss": 2.6153, + "step": 21711 + }, + { + "epoch": 0.643833585386828, + "grad_norm": 0.11358088254928589, + "learning_rate": 0.00028682941361217267, + "loss": 2.6285, + "step": 21712 + }, + { + "epoch": 0.6438632387391394, + "grad_norm": 0.09499835222959518, + "learning_rate": 0.0002867868540242378, + "loss": 2.6227, + "step": 21713 + }, + { + "epoch": 0.643892892091451, + "grad_norm": 0.10013218969106674, + "learning_rate": 0.000286744296324342, + "loss": 2.6299, + "step": 21714 + }, + { + "epoch": 0.6439225454437624, + "grad_norm": 0.09832454472780228, + "learning_rate": 0.0002867017405128622, + "loss": 2.5954, + "step": 21715 + }, + { + "epoch": 0.6439521987960739, + "grad_norm": 0.10285354405641556, + "learning_rate": 0.0002866591865901749, + "loss": 2.6182, + "step": 21716 + }, + { + "epoch": 0.6439818521483853, + "grad_norm": 0.120327427983284, + "learning_rate": 0.0002866166345566571, + "loss": 2.6245, + "step": 21717 + }, + { + "epoch": 0.6440115055006969, + "grad_norm": 0.10930740833282471, + "learning_rate": 0.00028657408441268566, + "loss": 2.6181, + "step": 21718 + }, + { + "epoch": 0.6440411588530083, + "grad_norm": 0.10646657645702362, + "learning_rate": 0.00028653153615863715, + "loss": 2.5885, + "step": 21719 + }, + { + "epoch": 0.6440708122053198, + "grad_norm": 0.10129442811012268, + "learning_rate": 0.0002864889897948887, + "loss": 2.609, + "step": 21720 + }, + { + "epoch": 0.6441004655576313, + "grad_norm": 0.09554869681596756, + "learning_rate": 0.0002864464453218169, + "loss": 2.5653, + "step": 21721 + }, + { + "epoch": 0.6441301189099428, + "grad_norm": 0.11369015276432037, + "learning_rate": 0.00028640390273979857, + "loss": 2.6117, + "step": 21722 + }, + { + "epoch": 0.6441597722622543, + "grad_norm": 0.1002071276307106, + "learning_rate": 0.0002863613620492102, + "loss": 2.5859, + "step": 21723 + }, + { + "epoch": 0.6441894256145657, + "grad_norm": 0.11260433495044708, + "learning_rate": 0.0002863188232504287, + "loss": 2.5997, + "step": 21724 + }, + { + "epoch": 0.6442190789668772, + "grad_norm": 0.10685725510120392, + "learning_rate": 0.0002862762863438307, + "loss": 2.6075, + "step": 21725 + }, + { + "epoch": 0.6442487323191887, + "grad_norm": 0.10262204706668854, + "learning_rate": 0.00028623375132979313, + "loss": 2.5916, + "step": 21726 + }, + { + "epoch": 0.6442783856715002, + "grad_norm": 0.10068556666374207, + "learning_rate": 0.0002861912182086921, + "loss": 2.6237, + "step": 21727 + }, + { + "epoch": 0.6443080390238116, + "grad_norm": 0.09727798402309418, + "learning_rate": 0.0002861486869809046, + "loss": 2.5775, + "step": 21728 + }, + { + "epoch": 0.6443376923761232, + "grad_norm": 0.10189270228147507, + "learning_rate": 0.00028610615764680707, + "loss": 2.6227, + "step": 21729 + }, + { + "epoch": 0.6443673457284346, + "grad_norm": 0.0981331318616867, + "learning_rate": 0.0002860636302067763, + "loss": 2.5681, + "step": 21730 + }, + { + "epoch": 0.6443969990807461, + "grad_norm": 0.10352108627557755, + "learning_rate": 0.0002860211046611888, + "loss": 2.593, + "step": 21731 + }, + { + "epoch": 0.6444266524330575, + "grad_norm": 0.10313241928815842, + "learning_rate": 0.000285978581010421, + "loss": 2.6043, + "step": 21732 + }, + { + "epoch": 0.644456305785369, + "grad_norm": 0.10332652926445007, + "learning_rate": 0.00028593605925484976, + "loss": 2.6205, + "step": 21733 + }, + { + "epoch": 0.6444859591376805, + "grad_norm": 0.10433496534824371, + "learning_rate": 0.0002858935393948514, + "loss": 2.6055, + "step": 21734 + }, + { + "epoch": 0.644515612489992, + "grad_norm": 0.10947174578905106, + "learning_rate": 0.00028585102143080255, + "loss": 2.6033, + "step": 21735 + }, + { + "epoch": 0.6445452658423034, + "grad_norm": 0.10226946324110031, + "learning_rate": 0.0002858085053630799, + "loss": 2.6073, + "step": 21736 + }, + { + "epoch": 0.644574919194615, + "grad_norm": 0.10705472528934479, + "learning_rate": 0.00028576599119205947, + "loss": 2.5871, + "step": 21737 + }, + { + "epoch": 0.6446045725469264, + "grad_norm": 0.10062173753976822, + "learning_rate": 0.000285723478918118, + "loss": 2.5791, + "step": 21738 + }, + { + "epoch": 0.6446342258992379, + "grad_norm": 0.10485291481018066, + "learning_rate": 0.00028568096854163184, + "loss": 2.611, + "step": 21739 + }, + { + "epoch": 0.6446638792515494, + "grad_norm": 0.10350754112005234, + "learning_rate": 0.0002856384600629775, + "loss": 2.6191, + "step": 21740 + }, + { + "epoch": 0.6446935326038609, + "grad_norm": 0.10496684163808823, + "learning_rate": 0.0002855959534825314, + "loss": 2.5974, + "step": 21741 + }, + { + "epoch": 0.6447231859561724, + "grad_norm": 0.09880614280700684, + "learning_rate": 0.00028555344880067, + "loss": 2.6646, + "step": 21742 + }, + { + "epoch": 0.6447528393084838, + "grad_norm": 0.10372743010520935, + "learning_rate": 0.0002855109460177695, + "loss": 2.5923, + "step": 21743 + }, + { + "epoch": 0.6447824926607953, + "grad_norm": 0.09350363165140152, + "learning_rate": 0.00028546844513420644, + "loss": 2.5406, + "step": 21744 + }, + { + "epoch": 0.6448121460131068, + "grad_norm": 0.0991923063993454, + "learning_rate": 0.000285425946150357, + "loss": 2.5887, + "step": 21745 + }, + { + "epoch": 0.6448417993654183, + "grad_norm": 0.09948797523975372, + "learning_rate": 0.00028538344906659777, + "loss": 2.6034, + "step": 21746 + }, + { + "epoch": 0.6448714527177297, + "grad_norm": 0.10220704972743988, + "learning_rate": 0.00028534095388330506, + "loss": 2.6007, + "step": 21747 + }, + { + "epoch": 0.6449011060700413, + "grad_norm": 0.10085061192512512, + "learning_rate": 0.00028529846060085495, + "loss": 2.5982, + "step": 21748 + }, + { + "epoch": 0.6449307594223527, + "grad_norm": 0.10635431110858917, + "learning_rate": 0.0002852559692196238, + "loss": 2.5812, + "step": 21749 + }, + { + "epoch": 0.6449604127746642, + "grad_norm": 0.10654914379119873, + "learning_rate": 0.0002852134797399879, + "loss": 2.5684, + "step": 21750 + }, + { + "epoch": 0.6449900661269756, + "grad_norm": 0.11355885863304138, + "learning_rate": 0.00028517099216232355, + "loss": 2.5939, + "step": 21751 + }, + { + "epoch": 0.6450197194792872, + "grad_norm": 0.10268333554267883, + "learning_rate": 0.00028512850648700685, + "loss": 2.6229, + "step": 21752 + }, + { + "epoch": 0.6450493728315986, + "grad_norm": 0.11013083904981613, + "learning_rate": 0.0002850860227144142, + "loss": 2.5913, + "step": 21753 + }, + { + "epoch": 0.6450790261839101, + "grad_norm": 0.11052774637937546, + "learning_rate": 0.0002850435408449216, + "loss": 2.6102, + "step": 21754 + }, + { + "epoch": 0.6451086795362215, + "grad_norm": 0.09552481770515442, + "learning_rate": 0.00028500106087890544, + "loss": 2.6293, + "step": 21755 + }, + { + "epoch": 0.6451383328885331, + "grad_norm": 0.12312547862529755, + "learning_rate": 0.00028495858281674173, + "loss": 2.5953, + "step": 21756 + }, + { + "epoch": 0.6451679862408445, + "grad_norm": 0.1380128562450409, + "learning_rate": 0.00028491610665880675, + "loss": 2.6106, + "step": 21757 + }, + { + "epoch": 0.645197639593156, + "grad_norm": 0.10435571521520615, + "learning_rate": 0.0002848736324054765, + "loss": 2.6173, + "step": 21758 + }, + { + "epoch": 0.6452272929454674, + "grad_norm": 0.09248907119035721, + "learning_rate": 0.00028483116005712715, + "loss": 2.5974, + "step": 21759 + }, + { + "epoch": 0.645256946297779, + "grad_norm": 0.11191897094249725, + "learning_rate": 0.00028478868961413485, + "loss": 2.6163, + "step": 21760 + }, + { + "epoch": 0.6452865996500905, + "grad_norm": 0.09736433625221252, + "learning_rate": 0.00028474622107687567, + "loss": 2.6418, + "step": 21761 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 0.10588043928146362, + "learning_rate": 0.0002847037544457256, + "loss": 2.619, + "step": 21762 + }, + { + "epoch": 0.6453459063547134, + "grad_norm": 0.11181993037462234, + "learning_rate": 0.00028466128972106075, + "loss": 2.5943, + "step": 21763 + }, + { + "epoch": 0.6453755597070249, + "grad_norm": 0.09788785874843597, + "learning_rate": 0.0002846188269032571, + "loss": 2.5953, + "step": 21764 + }, + { + "epoch": 0.6454052130593364, + "grad_norm": 0.10795494168996811, + "learning_rate": 0.00028457636599269083, + "loss": 2.6106, + "step": 21765 + }, + { + "epoch": 0.6454348664116478, + "grad_norm": 0.10678835213184357, + "learning_rate": 0.00028453390698973766, + "loss": 2.6056, + "step": 21766 + }, + { + "epoch": 0.6454645197639594, + "grad_norm": 0.09904983639717102, + "learning_rate": 0.0002844914498947739, + "loss": 2.6136, + "step": 21767 + }, + { + "epoch": 0.6454941731162708, + "grad_norm": 0.10093239694833755, + "learning_rate": 0.00028444899470817523, + "loss": 2.5802, + "step": 21768 + }, + { + "epoch": 0.6455238264685823, + "grad_norm": 0.09955034404993057, + "learning_rate": 0.00028440654143031775, + "loss": 2.5899, + "step": 21769 + }, + { + "epoch": 0.6455534798208937, + "grad_norm": 0.09384530782699585, + "learning_rate": 0.0002843640900615775, + "loss": 2.6123, + "step": 21770 + }, + { + "epoch": 0.6455831331732053, + "grad_norm": 0.09672108292579651, + "learning_rate": 0.0002843216406023299, + "loss": 2.6083, + "step": 21771 + }, + { + "epoch": 0.6456127865255167, + "grad_norm": 0.10586507618427277, + "learning_rate": 0.00028427919305295136, + "loss": 2.5783, + "step": 21772 + }, + { + "epoch": 0.6456424398778282, + "grad_norm": 0.09653519839048386, + "learning_rate": 0.0002842367474138176, + "loss": 2.6003, + "step": 21773 + }, + { + "epoch": 0.6456720932301396, + "grad_norm": 0.09621778875589371, + "learning_rate": 0.00028419430368530453, + "loss": 2.6419, + "step": 21774 + }, + { + "epoch": 0.6457017465824512, + "grad_norm": 0.10553926974534988, + "learning_rate": 0.00028415186186778787, + "loss": 2.5994, + "step": 21775 + }, + { + "epoch": 0.6457313999347626, + "grad_norm": 0.11703573167324066, + "learning_rate": 0.0002841094219616436, + "loss": 2.5882, + "step": 21776 + }, + { + "epoch": 0.6457610532870741, + "grad_norm": 0.11920279264450073, + "learning_rate": 0.0002840669839672474, + "loss": 2.5892, + "step": 21777 + }, + { + "epoch": 0.6457907066393855, + "grad_norm": 0.0954967588186264, + "learning_rate": 0.0002840245478849751, + "loss": 2.5857, + "step": 21778 + }, + { + "epoch": 0.6458203599916971, + "grad_norm": 0.14029453694820404, + "learning_rate": 0.00028398211371520257, + "loss": 2.6065, + "step": 21779 + }, + { + "epoch": 0.6458500133440085, + "grad_norm": 0.09159006923437119, + "learning_rate": 0.0002839396814583054, + "loss": 2.5612, + "step": 21780 + }, + { + "epoch": 0.64587966669632, + "grad_norm": 0.10110441595315933, + "learning_rate": 0.0002838972511146597, + "loss": 2.6338, + "step": 21781 + }, + { + "epoch": 0.6459093200486316, + "grad_norm": 0.10758768022060394, + "learning_rate": 0.0002838548226846407, + "loss": 2.6138, + "step": 21782 + }, + { + "epoch": 0.645938973400943, + "grad_norm": 0.10038284212350845, + "learning_rate": 0.0002838123961686244, + "loss": 2.5644, + "step": 21783 + }, + { + "epoch": 0.6459686267532545, + "grad_norm": 0.10001664608716965, + "learning_rate": 0.0002837699715669863, + "loss": 2.5887, + "step": 21784 + }, + { + "epoch": 0.6459982801055659, + "grad_norm": 0.09831396490335464, + "learning_rate": 0.00028372754888010223, + "loss": 2.5895, + "step": 21785 + }, + { + "epoch": 0.6460279334578775, + "grad_norm": 0.10489919781684875, + "learning_rate": 0.000283685128108348, + "loss": 2.6008, + "step": 21786 + }, + { + "epoch": 0.6460575868101889, + "grad_norm": 0.10154688358306885, + "learning_rate": 0.000283642709252099, + "loss": 2.6043, + "step": 21787 + }, + { + "epoch": 0.6460872401625004, + "grad_norm": 0.10019668936729431, + "learning_rate": 0.00028360029231173104, + "loss": 2.6129, + "step": 21788 + }, + { + "epoch": 0.6461168935148118, + "grad_norm": 0.1013450101017952, + "learning_rate": 0.0002835578772876195, + "loss": 2.616, + "step": 21789 + }, + { + "epoch": 0.6461465468671234, + "grad_norm": 0.10546590387821198, + "learning_rate": 0.0002835154641801402, + "loss": 2.597, + "step": 21790 + }, + { + "epoch": 0.6461762002194348, + "grad_norm": 0.09960733354091644, + "learning_rate": 0.0002834730529896688, + "loss": 2.6283, + "step": 21791 + }, + { + "epoch": 0.6462058535717463, + "grad_norm": 0.09644275158643723, + "learning_rate": 0.0002834306437165805, + "loss": 2.5986, + "step": 21792 + }, + { + "epoch": 0.6462355069240577, + "grad_norm": 0.09921473264694214, + "learning_rate": 0.00028338823636125097, + "loss": 2.6267, + "step": 21793 + }, + { + "epoch": 0.6462651602763693, + "grad_norm": 0.09338457137346268, + "learning_rate": 0.00028334583092405586, + "loss": 2.5867, + "step": 21794 + }, + { + "epoch": 0.6462948136286807, + "grad_norm": 0.09216413646936417, + "learning_rate": 0.00028330342740537064, + "loss": 2.6035, + "step": 21795 + }, + { + "epoch": 0.6463244669809922, + "grad_norm": 0.09765036404132843, + "learning_rate": 0.00028326102580557066, + "loss": 2.6286, + "step": 21796 + }, + { + "epoch": 0.6463541203333036, + "grad_norm": 0.09317848831415176, + "learning_rate": 0.00028321862612503134, + "loss": 2.6228, + "step": 21797 + }, + { + "epoch": 0.6463837736856152, + "grad_norm": 0.0947369933128357, + "learning_rate": 0.0002831762283641285, + "loss": 2.6181, + "step": 21798 + }, + { + "epoch": 0.6464134270379266, + "grad_norm": 0.10400056093931198, + "learning_rate": 0.0002831338325232374, + "loss": 2.6071, + "step": 21799 + }, + { + "epoch": 0.6464430803902381, + "grad_norm": 0.09899081289768219, + "learning_rate": 0.00028309143860273336, + "loss": 2.5886, + "step": 21800 + }, + { + "epoch": 0.6464727337425495, + "grad_norm": 0.10344746708869934, + "learning_rate": 0.0002830490466029919, + "loss": 2.591, + "step": 21801 + }, + { + "epoch": 0.6465023870948611, + "grad_norm": 0.09637308120727539, + "learning_rate": 0.0002830066565243886, + "loss": 2.5818, + "step": 21802 + }, + { + "epoch": 0.6465320404471726, + "grad_norm": 0.09587262570858002, + "learning_rate": 0.0002829642683672984, + "loss": 2.6368, + "step": 21803 + }, + { + "epoch": 0.646561693799484, + "grad_norm": 0.10924255102872849, + "learning_rate": 0.00028292188213209686, + "loss": 2.6008, + "step": 21804 + }, + { + "epoch": 0.6465913471517956, + "grad_norm": 0.10813558846712112, + "learning_rate": 0.0002828794978191593, + "loss": 2.5837, + "step": 21805 + }, + { + "epoch": 0.646621000504107, + "grad_norm": 0.10290089249610901, + "learning_rate": 0.00028283711542886105, + "loss": 2.6269, + "step": 21806 + }, + { + "epoch": 0.6466506538564185, + "grad_norm": 0.12064394354820251, + "learning_rate": 0.00028279473496157744, + "loss": 2.6262, + "step": 21807 + }, + { + "epoch": 0.6466803072087299, + "grad_norm": 0.13847941160202026, + "learning_rate": 0.0002827523564176837, + "loss": 2.6176, + "step": 21808 + }, + { + "epoch": 0.6467099605610415, + "grad_norm": 0.11968721449375153, + "learning_rate": 0.0002827099797975551, + "loss": 2.5886, + "step": 21809 + }, + { + "epoch": 0.6467396139133529, + "grad_norm": 0.10248222202062607, + "learning_rate": 0.00028266760510156684, + "loss": 2.6184, + "step": 21810 + }, + { + "epoch": 0.6467692672656644, + "grad_norm": 0.1181664913892746, + "learning_rate": 0.00028262523233009434, + "loss": 2.6123, + "step": 21811 + }, + { + "epoch": 0.6467989206179758, + "grad_norm": 0.12186585366725922, + "learning_rate": 0.00028258286148351297, + "loss": 2.6064, + "step": 21812 + }, + { + "epoch": 0.6468285739702874, + "grad_norm": 0.1257348209619522, + "learning_rate": 0.0002825404925621975, + "loss": 2.6408, + "step": 21813 + }, + { + "epoch": 0.6468582273225988, + "grad_norm": 0.108983613550663, + "learning_rate": 0.0002824981255665232, + "loss": 2.6139, + "step": 21814 + }, + { + "epoch": 0.6468878806749103, + "grad_norm": 0.12149421125650406, + "learning_rate": 0.00028245576049686544, + "loss": 2.597, + "step": 21815 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 0.11786899715662003, + "learning_rate": 0.0002824133973535992, + "loss": 2.6111, + "step": 21816 + }, + { + "epoch": 0.6469471873795333, + "grad_norm": 0.11102428287267685, + "learning_rate": 0.00028237103613709967, + "loss": 2.6144, + "step": 21817 + }, + { + "epoch": 0.6469768407318447, + "grad_norm": 0.11356531083583832, + "learning_rate": 0.00028232867684774206, + "loss": 2.6195, + "step": 21818 + }, + { + "epoch": 0.6470064940841562, + "grad_norm": 0.0984138697385788, + "learning_rate": 0.0002822863194859013, + "loss": 2.6047, + "step": 21819 + }, + { + "epoch": 0.6470361474364676, + "grad_norm": 0.10948493331670761, + "learning_rate": 0.0002822439640519525, + "loss": 2.6257, + "step": 21820 + }, + { + "epoch": 0.6470658007887792, + "grad_norm": 0.10510957986116409, + "learning_rate": 0.0002822016105462709, + "loss": 2.6136, + "step": 21821 + }, + { + "epoch": 0.6470954541410906, + "grad_norm": 0.10501851886510849, + "learning_rate": 0.00028215925896923135, + "loss": 2.6291, + "step": 21822 + }, + { + "epoch": 0.6471251074934021, + "grad_norm": 0.09799160808324814, + "learning_rate": 0.000282116909321209, + "loss": 2.5722, + "step": 21823 + }, + { + "epoch": 0.6471547608457137, + "grad_norm": 0.10235584527254105, + "learning_rate": 0.00028207456160257873, + "loss": 2.5943, + "step": 21824 + }, + { + "epoch": 0.6471844141980251, + "grad_norm": 0.09671424329280853, + "learning_rate": 0.00028203221581371573, + "loss": 2.6044, + "step": 21825 + }, + { + "epoch": 0.6472140675503366, + "grad_norm": 0.10704326629638672, + "learning_rate": 0.0002819898719549948, + "loss": 2.5922, + "step": 21826 + }, + { + "epoch": 0.647243720902648, + "grad_norm": 0.09947574883699417, + "learning_rate": 0.00028194753002679096, + "loss": 2.6032, + "step": 21827 + }, + { + "epoch": 0.6472733742549596, + "grad_norm": 0.10030000656843185, + "learning_rate": 0.00028190519002947923, + "loss": 2.6369, + "step": 21828 + }, + { + "epoch": 0.647303027607271, + "grad_norm": 0.10840592533349991, + "learning_rate": 0.00028186285196343444, + "loss": 2.6097, + "step": 21829 + }, + { + "epoch": 0.6473326809595825, + "grad_norm": 0.09880552440881729, + "learning_rate": 0.00028182051582903157, + "loss": 2.6192, + "step": 21830 + }, + { + "epoch": 0.6473623343118939, + "grad_norm": 0.10170909017324448, + "learning_rate": 0.00028177818162664547, + "loss": 2.6149, + "step": 21831 + }, + { + "epoch": 0.6473919876642055, + "grad_norm": 0.1114564761519432, + "learning_rate": 0.00028173584935665096, + "loss": 2.5842, + "step": 21832 + }, + { + "epoch": 0.6474216410165169, + "grad_norm": 0.09191770851612091, + "learning_rate": 0.0002816935190194231, + "loss": 2.5875, + "step": 21833 + }, + { + "epoch": 0.6474512943688284, + "grad_norm": 0.10608425736427307, + "learning_rate": 0.0002816511906153365, + "loss": 2.553, + "step": 21834 + }, + { + "epoch": 0.6474809477211398, + "grad_norm": 0.10142860561609268, + "learning_rate": 0.00028160886414476614, + "loss": 2.5836, + "step": 21835 + }, + { + "epoch": 0.6475106010734514, + "grad_norm": 0.1001218780875206, + "learning_rate": 0.00028156653960808675, + "loss": 2.614, + "step": 21836 + }, + { + "epoch": 0.6475402544257628, + "grad_norm": 0.09916657954454422, + "learning_rate": 0.0002815242170056731, + "loss": 2.6099, + "step": 21837 + }, + { + "epoch": 0.6475699077780743, + "grad_norm": 0.10048488527536392, + "learning_rate": 0.0002814818963379001, + "loss": 2.6201, + "step": 21838 + }, + { + "epoch": 0.6475995611303857, + "grad_norm": 0.10352618992328644, + "learning_rate": 0.0002814395776051424, + "loss": 2.5723, + "step": 21839 + }, + { + "epoch": 0.6476292144826973, + "grad_norm": 0.10347362607717514, + "learning_rate": 0.0002813972608077747, + "loss": 2.5701, + "step": 21840 + }, + { + "epoch": 0.6476588678350087, + "grad_norm": 0.10477834939956665, + "learning_rate": 0.00028135494594617186, + "loss": 2.6247, + "step": 21841 + }, + { + "epoch": 0.6476885211873202, + "grad_norm": 0.10947606712579727, + "learning_rate": 0.0002813126330207084, + "loss": 2.5707, + "step": 21842 + }, + { + "epoch": 0.6477181745396317, + "grad_norm": 0.09853210300207138, + "learning_rate": 0.0002812703220317592, + "loss": 2.6103, + "step": 21843 + }, + { + "epoch": 0.6477478278919432, + "grad_norm": 0.10324884951114655, + "learning_rate": 0.0002812280129796988, + "loss": 2.6207, + "step": 21844 + }, + { + "epoch": 0.6477774812442547, + "grad_norm": 0.09025485068559647, + "learning_rate": 0.0002811857058649019, + "loss": 2.557, + "step": 21845 + }, + { + "epoch": 0.6478071345965661, + "grad_norm": 0.10431510210037231, + "learning_rate": 0.00028114340068774316, + "loss": 2.5909, + "step": 21846 + }, + { + "epoch": 0.6478367879488777, + "grad_norm": 0.09941544383764267, + "learning_rate": 0.0002811010974485973, + "loss": 2.5711, + "step": 21847 + }, + { + "epoch": 0.6478664413011891, + "grad_norm": 0.09373468905687332, + "learning_rate": 0.00028105879614783846, + "loss": 2.6045, + "step": 21848 + }, + { + "epoch": 0.6478960946535006, + "grad_norm": 0.11437901109457016, + "learning_rate": 0.0002810164967858417, + "loss": 2.5789, + "step": 21849 + }, + { + "epoch": 0.647925748005812, + "grad_norm": 0.09582936763763428, + "learning_rate": 0.0002809741993629815, + "loss": 2.6266, + "step": 21850 + }, + { + "epoch": 0.6479554013581236, + "grad_norm": 0.10533928126096725, + "learning_rate": 0.0002809319038796324, + "loss": 2.5841, + "step": 21851 + }, + { + "epoch": 0.647985054710435, + "grad_norm": 0.09954671561717987, + "learning_rate": 0.0002808896103361688, + "loss": 2.6183, + "step": 21852 + }, + { + "epoch": 0.6480147080627465, + "grad_norm": 0.09830434620380402, + "learning_rate": 0.0002808473187329654, + "loss": 2.5974, + "step": 21853 + }, + { + "epoch": 0.648044361415058, + "grad_norm": 0.11131601780653, + "learning_rate": 0.0002808050290703965, + "loss": 2.5955, + "step": 21854 + }, + { + "epoch": 0.6480740147673695, + "grad_norm": 0.10617861151695251, + "learning_rate": 0.0002807627413488368, + "loss": 2.6074, + "step": 21855 + }, + { + "epoch": 0.6481036681196809, + "grad_norm": 0.08926481753587723, + "learning_rate": 0.0002807204555686606, + "loss": 2.5755, + "step": 21856 + }, + { + "epoch": 0.6481333214719924, + "grad_norm": 0.10943276435136795, + "learning_rate": 0.00028067817173024263, + "loss": 2.5713, + "step": 21857 + }, + { + "epoch": 0.6481629748243039, + "grad_norm": 0.10685937106609344, + "learning_rate": 0.0002806358898339569, + "loss": 2.5647, + "step": 21858 + }, + { + "epoch": 0.6481926281766154, + "grad_norm": 0.08577004075050354, + "learning_rate": 0.00028059360988017803, + "loss": 2.5968, + "step": 21859 + }, + { + "epoch": 0.6482222815289268, + "grad_norm": 0.11360291391611099, + "learning_rate": 0.00028055133186928035, + "loss": 2.5876, + "step": 21860 + }, + { + "epoch": 0.6482519348812383, + "grad_norm": 0.10040341317653656, + "learning_rate": 0.0002805090558016381, + "loss": 2.6039, + "step": 21861 + }, + { + "epoch": 0.6482815882335498, + "grad_norm": 0.10440374910831451, + "learning_rate": 0.00028046678167762605, + "loss": 2.5866, + "step": 21862 + }, + { + "epoch": 0.6483112415858613, + "grad_norm": 0.1110834926366806, + "learning_rate": 0.00028042450949761834, + "loss": 2.6062, + "step": 21863 + }, + { + "epoch": 0.6483408949381727, + "grad_norm": 0.10414700955152512, + "learning_rate": 0.00028038223926198937, + "loss": 2.5898, + "step": 21864 + }, + { + "epoch": 0.6483705482904842, + "grad_norm": 0.09582381695508957, + "learning_rate": 0.00028033997097111325, + "loss": 2.6112, + "step": 21865 + }, + { + "epoch": 0.6484002016427958, + "grad_norm": 0.10220769047737122, + "learning_rate": 0.0002802977046253644, + "loss": 2.5957, + "step": 21866 + }, + { + "epoch": 0.6484298549951072, + "grad_norm": 0.10154739767313004, + "learning_rate": 0.00028025544022511735, + "loss": 2.5837, + "step": 21867 + }, + { + "epoch": 0.6484595083474187, + "grad_norm": 0.11315088719129562, + "learning_rate": 0.00028021317777074585, + "loss": 2.5845, + "step": 21868 + }, + { + "epoch": 0.6484891616997301, + "grad_norm": 0.110236756503582, + "learning_rate": 0.0002801709172626244, + "loss": 2.607, + "step": 21869 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 0.10963412374258041, + "learning_rate": 0.0002801286587011272, + "loss": 2.6178, + "step": 21870 + }, + { + "epoch": 0.6485484684043531, + "grad_norm": 0.10391244292259216, + "learning_rate": 0.0002800864020866285, + "loss": 2.5831, + "step": 21871 + }, + { + "epoch": 0.6485781217566646, + "grad_norm": 0.1032610610127449, + "learning_rate": 0.0002800441474195024, + "loss": 2.6039, + "step": 21872 + }, + { + "epoch": 0.648607775108976, + "grad_norm": 0.10703300684690475, + "learning_rate": 0.00028000189470012317, + "loss": 2.5783, + "step": 21873 + }, + { + "epoch": 0.6486374284612876, + "grad_norm": 0.10688591748476028, + "learning_rate": 0.00027995964392886475, + "loss": 2.5905, + "step": 21874 + }, + { + "epoch": 0.648667081813599, + "grad_norm": 0.09407629072666168, + "learning_rate": 0.0002799173951061016, + "loss": 2.5623, + "step": 21875 + }, + { + "epoch": 0.6486967351659105, + "grad_norm": 0.11166112124919891, + "learning_rate": 0.0002798751482322077, + "loss": 2.5751, + "step": 21876 + }, + { + "epoch": 0.648726388518222, + "grad_norm": 0.09804403781890869, + "learning_rate": 0.00027983290330755716, + "loss": 2.6018, + "step": 21877 + }, + { + "epoch": 0.6487560418705335, + "grad_norm": 0.09214220941066742, + "learning_rate": 0.00027979066033252426, + "loss": 2.6117, + "step": 21878 + }, + { + "epoch": 0.6487856952228449, + "grad_norm": 0.10169713199138641, + "learning_rate": 0.00027974841930748263, + "loss": 2.6086, + "step": 21879 + }, + { + "epoch": 0.6488153485751564, + "grad_norm": 0.09265056997537613, + "learning_rate": 0.0002797061802328066, + "loss": 2.6115, + "step": 21880 + }, + { + "epoch": 0.6488450019274679, + "grad_norm": 0.10110712796449661, + "learning_rate": 0.0002796639431088701, + "loss": 2.5947, + "step": 21881 + }, + { + "epoch": 0.6488746552797794, + "grad_norm": 0.10117928683757782, + "learning_rate": 0.00027962170793604727, + "loss": 2.6035, + "step": 21882 + }, + { + "epoch": 0.6489043086320908, + "grad_norm": 0.08858443796634674, + "learning_rate": 0.000279579474714712, + "loss": 2.5751, + "step": 21883 + }, + { + "epoch": 0.6489339619844023, + "grad_norm": 0.11299563199281693, + "learning_rate": 0.00027953724344523836, + "loss": 2.5907, + "step": 21884 + }, + { + "epoch": 0.6489636153367138, + "grad_norm": 0.11224985122680664, + "learning_rate": 0.0002794950141280002, + "loss": 2.6201, + "step": 21885 + }, + { + "epoch": 0.6489932686890253, + "grad_norm": 0.10532223433256149, + "learning_rate": 0.0002794527867633716, + "loss": 2.591, + "step": 21886 + }, + { + "epoch": 0.6490229220413368, + "grad_norm": 0.11774695664644241, + "learning_rate": 0.0002794105613517262, + "loss": 2.6266, + "step": 21887 + }, + { + "epoch": 0.6490525753936482, + "grad_norm": 0.10953547060489655, + "learning_rate": 0.00027936833789343854, + "loss": 2.586, + "step": 21888 + }, + { + "epoch": 0.6490822287459598, + "grad_norm": 0.11035161465406418, + "learning_rate": 0.0002793261163888819, + "loss": 2.5897, + "step": 21889 + }, + { + "epoch": 0.6491118820982712, + "grad_norm": 0.10332232713699341, + "learning_rate": 0.00027928389683843047, + "loss": 2.606, + "step": 21890 + }, + { + "epoch": 0.6491415354505827, + "grad_norm": 0.109404057264328, + "learning_rate": 0.0002792416792424579, + "loss": 2.5899, + "step": 21891 + }, + { + "epoch": 0.6491711888028942, + "grad_norm": 0.10900511592626572, + "learning_rate": 0.0002791994636013382, + "loss": 2.5989, + "step": 21892 + }, + { + "epoch": 0.6492008421552057, + "grad_norm": 0.11082078516483307, + "learning_rate": 0.0002791572499154452, + "loss": 2.6246, + "step": 21893 + }, + { + "epoch": 0.6492304955075171, + "grad_norm": 0.09884566813707352, + "learning_rate": 0.0002791150381851526, + "loss": 2.616, + "step": 21894 + }, + { + "epoch": 0.6492601488598286, + "grad_norm": 0.10429937392473221, + "learning_rate": 0.00027907282841083423, + "loss": 2.5867, + "step": 21895 + }, + { + "epoch": 0.64928980221214, + "grad_norm": 0.10291175544261932, + "learning_rate": 0.000279030620592864, + "loss": 2.6126, + "step": 21896 + }, + { + "epoch": 0.6493194555644516, + "grad_norm": 0.09993211925029755, + "learning_rate": 0.0002789884147316154, + "loss": 2.6223, + "step": 21897 + }, + { + "epoch": 0.649349108916763, + "grad_norm": 0.11028727889060974, + "learning_rate": 0.0002789462108274624, + "loss": 2.614, + "step": 21898 + }, + { + "epoch": 0.6493787622690745, + "grad_norm": 0.09651990979909897, + "learning_rate": 0.0002789040088807787, + "loss": 2.5931, + "step": 21899 + }, + { + "epoch": 0.649408415621386, + "grad_norm": 0.10454754531383514, + "learning_rate": 0.0002788618088919379, + "loss": 2.5915, + "step": 21900 + }, + { + "epoch": 0.6494380689736975, + "grad_norm": 0.11177030205726624, + "learning_rate": 0.00027881961086131376, + "loss": 2.612, + "step": 21901 + }, + { + "epoch": 0.6494677223260089, + "grad_norm": 0.10037756711244583, + "learning_rate": 0.00027877741478927987, + "loss": 2.5895, + "step": 21902 + }, + { + "epoch": 0.6494973756783204, + "grad_norm": 0.09725731611251831, + "learning_rate": 0.00027873522067621, + "loss": 2.5745, + "step": 21903 + }, + { + "epoch": 0.6495270290306319, + "grad_norm": 0.10196022689342499, + "learning_rate": 0.00027869302852247767, + "loss": 2.6094, + "step": 21904 + }, + { + "epoch": 0.6495566823829434, + "grad_norm": 0.09718642383813858, + "learning_rate": 0.0002786508383284566, + "loss": 2.5777, + "step": 21905 + }, + { + "epoch": 0.6495863357352548, + "grad_norm": 0.09142555296421051, + "learning_rate": 0.0002786086500945204, + "loss": 2.6063, + "step": 21906 + }, + { + "epoch": 0.6496159890875663, + "grad_norm": 0.11314166337251663, + "learning_rate": 0.00027856646382104256, + "loss": 2.606, + "step": 21907 + }, + { + "epoch": 0.6496456424398779, + "grad_norm": 0.12307481467723846, + "learning_rate": 0.0002785242795083967, + "loss": 2.6284, + "step": 21908 + }, + { + "epoch": 0.6496752957921893, + "grad_norm": 0.10140348970890045, + "learning_rate": 0.0002784820971569564, + "loss": 2.5917, + "step": 21909 + }, + { + "epoch": 0.6497049491445008, + "grad_norm": 0.10567609965801239, + "learning_rate": 0.0002784399167670951, + "loss": 2.5925, + "step": 21910 + }, + { + "epoch": 0.6497346024968123, + "grad_norm": 0.12696795165538788, + "learning_rate": 0.00027839773833918634, + "loss": 2.5738, + "step": 21911 + }, + { + "epoch": 0.6497642558491238, + "grad_norm": 0.11226661503314972, + "learning_rate": 0.0002783555618736039, + "loss": 2.619, + "step": 21912 + }, + { + "epoch": 0.6497939092014352, + "grad_norm": 0.10650788992643356, + "learning_rate": 0.00027831338737072055, + "loss": 2.6058, + "step": 21913 + }, + { + "epoch": 0.6498235625537467, + "grad_norm": 0.11536990106105804, + "learning_rate": 0.00027827121483091046, + "loss": 2.5719, + "step": 21914 + }, + { + "epoch": 0.6498532159060582, + "grad_norm": 0.11984628438949585, + "learning_rate": 0.00027822904425454676, + "loss": 2.6377, + "step": 21915 + }, + { + "epoch": 0.6498828692583697, + "grad_norm": 0.1113087460398674, + "learning_rate": 0.00027818687564200295, + "loss": 2.6386, + "step": 21916 + }, + { + "epoch": 0.6499125226106811, + "grad_norm": 0.10877716541290283, + "learning_rate": 0.0002781447089936525, + "loss": 2.6336, + "step": 21917 + }, + { + "epoch": 0.6499421759629926, + "grad_norm": 0.11677084118127823, + "learning_rate": 0.0002781025443098687, + "loss": 2.6038, + "step": 21918 + }, + { + "epoch": 0.6499718293153041, + "grad_norm": 0.11859283596277237, + "learning_rate": 0.000278060381591025, + "loss": 2.6324, + "step": 21919 + }, + { + "epoch": 0.6500014826676156, + "grad_norm": 0.09868799149990082, + "learning_rate": 0.0002780182208374946, + "loss": 2.62, + "step": 21920 + }, + { + "epoch": 0.650031136019927, + "grad_norm": 0.10781525075435638, + "learning_rate": 0.00027797606204965104, + "loss": 2.6094, + "step": 21921 + }, + { + "epoch": 0.6500607893722385, + "grad_norm": 0.09560985118150711, + "learning_rate": 0.00027793390522786756, + "loss": 2.562, + "step": 21922 + }, + { + "epoch": 0.65009044272455, + "grad_norm": 0.10272344946861267, + "learning_rate": 0.0002778917503725177, + "loss": 2.6094, + "step": 21923 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 0.0890025943517685, + "learning_rate": 0.0002778495974839742, + "loss": 2.5973, + "step": 21924 + }, + { + "epoch": 0.6501497494291729, + "grad_norm": 0.09983271360397339, + "learning_rate": 0.0002778074465626107, + "loss": 2.6116, + "step": 21925 + }, + { + "epoch": 0.6501794027814845, + "grad_norm": 0.09676724672317505, + "learning_rate": 0.0002777652976088002, + "loss": 2.6046, + "step": 21926 + }, + { + "epoch": 0.6502090561337959, + "grad_norm": 0.09916723519563675, + "learning_rate": 0.00027772315062291644, + "loss": 2.5876, + "step": 21927 + }, + { + "epoch": 0.6502387094861074, + "grad_norm": 0.1197381317615509, + "learning_rate": 0.00027768100560533217, + "loss": 2.6258, + "step": 21928 + }, + { + "epoch": 0.6502683628384189, + "grad_norm": 0.10889722406864166, + "learning_rate": 0.00027763886255642085, + "loss": 2.5829, + "step": 21929 + }, + { + "epoch": 0.6502980161907304, + "grad_norm": 0.09432931989431381, + "learning_rate": 0.00027759672147655554, + "loss": 2.5804, + "step": 21930 + }, + { + "epoch": 0.6503276695430419, + "grad_norm": 0.1148943081498146, + "learning_rate": 0.00027755458236610944, + "loss": 2.6007, + "step": 21931 + }, + { + "epoch": 0.6503573228953533, + "grad_norm": 0.09442976862192154, + "learning_rate": 0.00027751244522545573, + "loss": 2.5938, + "step": 21932 + }, + { + "epoch": 0.6503869762476648, + "grad_norm": 0.10272502154111862, + "learning_rate": 0.00027747031005496763, + "loss": 2.5792, + "step": 21933 + }, + { + "epoch": 0.6504166295999763, + "grad_norm": 0.09566013514995575, + "learning_rate": 0.00027742817685501796, + "loss": 2.5961, + "step": 21934 + }, + { + "epoch": 0.6504462829522878, + "grad_norm": 0.10553891956806183, + "learning_rate": 0.00027738604562598, + "loss": 2.6119, + "step": 21935 + }, + { + "epoch": 0.6504759363045992, + "grad_norm": 0.10060513764619827, + "learning_rate": 0.00027734391636822684, + "loss": 2.5766, + "step": 21936 + }, + { + "epoch": 0.6505055896569107, + "grad_norm": 0.09531545639038086, + "learning_rate": 0.00027730178908213154, + "loss": 2.5714, + "step": 21937 + }, + { + "epoch": 0.6505352430092222, + "grad_norm": 0.10454728454351425, + "learning_rate": 0.0002772596637680671, + "loss": 2.5874, + "step": 21938 + }, + { + "epoch": 0.6505648963615337, + "grad_norm": 0.1252889335155487, + "learning_rate": 0.00027721754042640633, + "loss": 2.602, + "step": 21939 + }, + { + "epoch": 0.6505945497138451, + "grad_norm": 0.1092253252863884, + "learning_rate": 0.0002771754190575227, + "loss": 2.5943, + "step": 21940 + }, + { + "epoch": 0.6506242030661566, + "grad_norm": 0.10490091890096664, + "learning_rate": 0.000277133299661789, + "loss": 2.5641, + "step": 21941 + }, + { + "epoch": 0.6506538564184681, + "grad_norm": 0.14782479405403137, + "learning_rate": 0.00027709118223957817, + "loss": 2.5912, + "step": 21942 + }, + { + "epoch": 0.6506835097707796, + "grad_norm": 0.12232114374637604, + "learning_rate": 0.0002770490667912634, + "loss": 2.5707, + "step": 21943 + }, + { + "epoch": 0.650713163123091, + "grad_norm": 0.10934565216302872, + "learning_rate": 0.00027700695331721716, + "loss": 2.6173, + "step": 21944 + }, + { + "epoch": 0.6507428164754026, + "grad_norm": 0.13217751681804657, + "learning_rate": 0.00027696484181781266, + "loss": 2.5621, + "step": 21945 + }, + { + "epoch": 0.650772469827714, + "grad_norm": 0.12627117335796356, + "learning_rate": 0.0002769227322934228, + "loss": 2.5889, + "step": 21946 + }, + { + "epoch": 0.6508021231800255, + "grad_norm": 0.10332922637462616, + "learning_rate": 0.00027688062474442044, + "loss": 2.6188, + "step": 21947 + }, + { + "epoch": 0.6508317765323369, + "grad_norm": 0.1200978234410286, + "learning_rate": 0.00027683851917117844, + "loss": 2.6054, + "step": 21948 + }, + { + "epoch": 0.6508614298846485, + "grad_norm": 0.10728005319833755, + "learning_rate": 0.0002767964155740695, + "loss": 2.5621, + "step": 21949 + }, + { + "epoch": 0.65089108323696, + "grad_norm": 0.0979684516787529, + "learning_rate": 0.00027675431395346683, + "loss": 2.6113, + "step": 21950 + }, + { + "epoch": 0.6509207365892714, + "grad_norm": 0.11002855002880096, + "learning_rate": 0.00027671221430974294, + "loss": 2.5903, + "step": 21951 + }, + { + "epoch": 0.6509503899415829, + "grad_norm": 0.1068970188498497, + "learning_rate": 0.00027667011664327053, + "loss": 2.5729, + "step": 21952 + }, + { + "epoch": 0.6509800432938944, + "grad_norm": 0.10712031275033951, + "learning_rate": 0.0002766280209544227, + "loss": 2.5787, + "step": 21953 + }, + { + "epoch": 0.6510096966462059, + "grad_norm": 0.10739073157310486, + "learning_rate": 0.0002765859272435724, + "loss": 2.5964, + "step": 21954 + }, + { + "epoch": 0.6510393499985173, + "grad_norm": 0.10641667991876602, + "learning_rate": 0.0002765438355110918, + "loss": 2.6, + "step": 21955 + }, + { + "epoch": 0.6510690033508288, + "grad_norm": 0.10303790867328644, + "learning_rate": 0.00027650174575735394, + "loss": 2.6173, + "step": 21956 + }, + { + "epoch": 0.6510986567031403, + "grad_norm": 0.10797443240880966, + "learning_rate": 0.0002764596579827314, + "loss": 2.6258, + "step": 21957 + }, + { + "epoch": 0.6511283100554518, + "grad_norm": 0.1071104034781456, + "learning_rate": 0.000276417572187597, + "loss": 2.6071, + "step": 21958 + }, + { + "epoch": 0.6511579634077632, + "grad_norm": 0.11227317154407501, + "learning_rate": 0.0002763754883723234, + "loss": 2.5985, + "step": 21959 + }, + { + "epoch": 0.6511876167600748, + "grad_norm": 0.10931815952062607, + "learning_rate": 0.00027633340653728325, + "loss": 2.5848, + "step": 21960 + }, + { + "epoch": 0.6512172701123862, + "grad_norm": 0.09576407819986343, + "learning_rate": 0.0002762913266828492, + "loss": 2.6271, + "step": 21961 + }, + { + "epoch": 0.6512469234646977, + "grad_norm": 0.10514792799949646, + "learning_rate": 0.0002762492488093937, + "loss": 2.5707, + "step": 21962 + }, + { + "epoch": 0.6512765768170091, + "grad_norm": 0.10075870156288147, + "learning_rate": 0.00027620717291728964, + "loss": 2.5918, + "step": 21963 + }, + { + "epoch": 0.6513062301693207, + "grad_norm": 0.098763607442379, + "learning_rate": 0.0002761650990069094, + "loss": 2.6333, + "step": 21964 + }, + { + "epoch": 0.6513358835216321, + "grad_norm": 0.09767083078622818, + "learning_rate": 0.00027612302707862566, + "loss": 2.5966, + "step": 21965 + }, + { + "epoch": 0.6513655368739436, + "grad_norm": 0.10675916075706482, + "learning_rate": 0.000276080957132811, + "loss": 2.5878, + "step": 21966 + }, + { + "epoch": 0.651395190226255, + "grad_norm": 0.09727312624454498, + "learning_rate": 0.0002760388891698379, + "loss": 2.6163, + "step": 21967 + }, + { + "epoch": 0.6514248435785666, + "grad_norm": 0.10755913704633713, + "learning_rate": 0.0002759968231900788, + "loss": 2.6102, + "step": 21968 + }, + { + "epoch": 0.6514544969308781, + "grad_norm": 0.10892988741397858, + "learning_rate": 0.00027595475919390633, + "loss": 2.5928, + "step": 21969 + }, + { + "epoch": 0.6514841502831895, + "grad_norm": 0.09882110357284546, + "learning_rate": 0.00027591269718169287, + "loss": 2.6127, + "step": 21970 + }, + { + "epoch": 0.651513803635501, + "grad_norm": 0.09748736768960953, + "learning_rate": 0.000275870637153811, + "loss": 2.5941, + "step": 21971 + }, + { + "epoch": 0.6515434569878125, + "grad_norm": 0.10520412772893906, + "learning_rate": 0.0002758285791106331, + "loss": 2.589, + "step": 21972 + }, + { + "epoch": 0.651573110340124, + "grad_norm": 0.09538232535123825, + "learning_rate": 0.0002757865230525316, + "loss": 2.5882, + "step": 21973 + }, + { + "epoch": 0.6516027636924354, + "grad_norm": 0.10556301474571228, + "learning_rate": 0.00027574446897987893, + "loss": 2.6265, + "step": 21974 + }, + { + "epoch": 0.651632417044747, + "grad_norm": 0.10175566375255585, + "learning_rate": 0.00027570241689304744, + "loss": 2.6017, + "step": 21975 + }, + { + "epoch": 0.6516620703970584, + "grad_norm": 0.09592302143573761, + "learning_rate": 0.0002756603667924096, + "loss": 2.617, + "step": 21976 + }, + { + "epoch": 0.6516917237493699, + "grad_norm": 0.10161551833152771, + "learning_rate": 0.0002756183186783377, + "loss": 2.6006, + "step": 21977 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 0.10939128696918488, + "learning_rate": 0.0002755762725512041, + "loss": 2.626, + "step": 21978 + }, + { + "epoch": 0.6517510304539929, + "grad_norm": 0.0890558585524559, + "learning_rate": 0.0002755342284113811, + "loss": 2.5919, + "step": 21979 + }, + { + "epoch": 0.6517806838063043, + "grad_norm": 0.10971397161483765, + "learning_rate": 0.0002754921862592411, + "loss": 2.6058, + "step": 21980 + }, + { + "epoch": 0.6518103371586158, + "grad_norm": 0.10191522538661957, + "learning_rate": 0.00027545014609515626, + "loss": 2.5885, + "step": 21981 + }, + { + "epoch": 0.6518399905109272, + "grad_norm": 0.09287208318710327, + "learning_rate": 0.000275408107919499, + "loss": 2.6159, + "step": 21982 + }, + { + "epoch": 0.6518696438632388, + "grad_norm": 0.11855081468820572, + "learning_rate": 0.00027536607173264145, + "loss": 2.5739, + "step": 21983 + }, + { + "epoch": 0.6518992972155502, + "grad_norm": 0.10021331161260605, + "learning_rate": 0.0002753240375349559, + "loss": 2.5825, + "step": 21984 + }, + { + "epoch": 0.6519289505678617, + "grad_norm": 0.11807014048099518, + "learning_rate": 0.0002752820053268146, + "loss": 2.6077, + "step": 21985 + }, + { + "epoch": 0.6519586039201731, + "grad_norm": 0.1102963462471962, + "learning_rate": 0.0002752399751085896, + "loss": 2.5763, + "step": 21986 + }, + { + "epoch": 0.6519882572724847, + "grad_norm": 0.09545262902975082, + "learning_rate": 0.00027519794688065323, + "loss": 2.5737, + "step": 21987 + }, + { + "epoch": 0.6520179106247961, + "grad_norm": 0.10436317324638367, + "learning_rate": 0.0002751559206433779, + "loss": 2.5857, + "step": 21988 + }, + { + "epoch": 0.6520475639771076, + "grad_norm": 0.11107156425714493, + "learning_rate": 0.00027511389639713524, + "loss": 2.6086, + "step": 21989 + }, + { + "epoch": 0.6520772173294191, + "grad_norm": 0.09438049048185349, + "learning_rate": 0.00027507187414229766, + "loss": 2.5807, + "step": 21990 + }, + { + "epoch": 0.6521068706817306, + "grad_norm": 0.11750459671020508, + "learning_rate": 0.00027502985387923705, + "loss": 2.6111, + "step": 21991 + }, + { + "epoch": 0.6521365240340421, + "grad_norm": 0.09859556704759598, + "learning_rate": 0.00027498783560832583, + "loss": 2.6154, + "step": 21992 + }, + { + "epoch": 0.6521661773863535, + "grad_norm": 0.11720595508813858, + "learning_rate": 0.000274945819329936, + "loss": 2.5698, + "step": 21993 + }, + { + "epoch": 0.652195830738665, + "grad_norm": 0.0950484424829483, + "learning_rate": 0.0002749038050444396, + "loss": 2.6108, + "step": 21994 + }, + { + "epoch": 0.6522254840909765, + "grad_norm": 0.0994667187333107, + "learning_rate": 0.0002748617927522086, + "loss": 2.5615, + "step": 21995 + }, + { + "epoch": 0.652255137443288, + "grad_norm": 0.09963881969451904, + "learning_rate": 0.00027481978245361507, + "loss": 2.5569, + "step": 21996 + }, + { + "epoch": 0.6522847907955994, + "grad_norm": 0.08990861475467682, + "learning_rate": 0.00027477777414903104, + "loss": 2.6008, + "step": 21997 + }, + { + "epoch": 0.652314444147911, + "grad_norm": 0.09934787452220917, + "learning_rate": 0.00027473576783882845, + "loss": 2.5915, + "step": 21998 + }, + { + "epoch": 0.6523440975002224, + "grad_norm": 0.09540470689535141, + "learning_rate": 0.00027469376352337957, + "loss": 2.5909, + "step": 21999 + }, + { + "epoch": 0.6523737508525339, + "grad_norm": 0.09108973294496536, + "learning_rate": 0.00027465176120305577, + "loss": 2.5868, + "step": 22000 + }, + { + "epoch": 0.6524034042048453, + "grad_norm": 0.1018790677189827, + "learning_rate": 0.00027460976087822944, + "loss": 2.5765, + "step": 22001 + }, + { + "epoch": 0.6524330575571569, + "grad_norm": 0.09412309527397156, + "learning_rate": 0.0002745677625492723, + "loss": 2.6048, + "step": 22002 + }, + { + "epoch": 0.6524627109094683, + "grad_norm": 0.10155782103538513, + "learning_rate": 0.0002745257662165562, + "loss": 2.6127, + "step": 22003 + }, + { + "epoch": 0.6524923642617798, + "grad_norm": 0.09955345839262009, + "learning_rate": 0.00027448377188045317, + "loss": 2.592, + "step": 22004 + }, + { + "epoch": 0.6525220176140912, + "grad_norm": 0.10189814120531082, + "learning_rate": 0.00027444177954133507, + "loss": 2.5958, + "step": 22005 + }, + { + "epoch": 0.6525516709664028, + "grad_norm": 0.09559714794158936, + "learning_rate": 0.0002743997891995738, + "loss": 2.6038, + "step": 22006 + }, + { + "epoch": 0.6525813243187142, + "grad_norm": 0.10267380625009537, + "learning_rate": 0.00027435780085554115, + "loss": 2.5757, + "step": 22007 + }, + { + "epoch": 0.6526109776710257, + "grad_norm": 0.10678134113550186, + "learning_rate": 0.00027431581450960887, + "loss": 2.5863, + "step": 22008 + }, + { + "epoch": 0.6526406310233371, + "grad_norm": 0.11895092576742172, + "learning_rate": 0.00027427383016214894, + "loss": 2.5797, + "step": 22009 + }, + { + "epoch": 0.6526702843756487, + "grad_norm": 0.10807029902935028, + "learning_rate": 0.0002742318478135328, + "loss": 2.6359, + "step": 22010 + }, + { + "epoch": 0.6526999377279602, + "grad_norm": 0.10605639219284058, + "learning_rate": 0.00027418986746413247, + "loss": 2.6317, + "step": 22011 + }, + { + "epoch": 0.6527295910802716, + "grad_norm": 0.10118740797042847, + "learning_rate": 0.00027414788911431963, + "loss": 2.6188, + "step": 22012 + }, + { + "epoch": 0.6527592444325832, + "grad_norm": 0.11859264224767685, + "learning_rate": 0.00027410591276446597, + "loss": 2.6064, + "step": 22013 + }, + { + "epoch": 0.6527888977848946, + "grad_norm": 0.11454518884420395, + "learning_rate": 0.00027406393841494315, + "loss": 2.6277, + "step": 22014 + }, + { + "epoch": 0.6528185511372061, + "grad_norm": 0.10965904593467712, + "learning_rate": 0.00027402196606612304, + "loss": 2.5887, + "step": 22015 + }, + { + "epoch": 0.6528482044895175, + "grad_norm": 0.11518357694149017, + "learning_rate": 0.00027397999571837693, + "loss": 2.6197, + "step": 22016 + }, + { + "epoch": 0.6528778578418291, + "grad_norm": 0.1075870543718338, + "learning_rate": 0.0002739380273720771, + "loss": 2.5851, + "step": 22017 + }, + { + "epoch": 0.6529075111941405, + "grad_norm": 0.1200215294957161, + "learning_rate": 0.0002738960610275947, + "loss": 2.5748, + "step": 22018 + }, + { + "epoch": 0.652937164546452, + "grad_norm": 0.13276997208595276, + "learning_rate": 0.0002738540966853015, + "loss": 2.5911, + "step": 22019 + }, + { + "epoch": 0.6529668178987634, + "grad_norm": 0.10084927082061768, + "learning_rate": 0.0002738121343455693, + "loss": 2.6035, + "step": 22020 + }, + { + "epoch": 0.652996471251075, + "grad_norm": 0.11588902026414871, + "learning_rate": 0.0002737701740087693, + "loss": 2.5845, + "step": 22021 + }, + { + "epoch": 0.6530261246033864, + "grad_norm": 0.1154862716794014, + "learning_rate": 0.0002737282156752732, + "loss": 2.633, + "step": 22022 + }, + { + "epoch": 0.6530557779556979, + "grad_norm": 0.12163212150335312, + "learning_rate": 0.00027368625934545265, + "loss": 2.63, + "step": 22023 + }, + { + "epoch": 0.6530854313080093, + "grad_norm": 0.10749166458845139, + "learning_rate": 0.0002736443050196791, + "loss": 2.6044, + "step": 22024 + }, + { + "epoch": 0.6531150846603209, + "grad_norm": 0.12760457396507263, + "learning_rate": 0.0002736023526983241, + "loss": 2.5995, + "step": 22025 + }, + { + "epoch": 0.6531447380126323, + "grad_norm": 0.10041314363479614, + "learning_rate": 0.00027356040238175905, + "loss": 2.579, + "step": 22026 + }, + { + "epoch": 0.6531743913649438, + "grad_norm": 0.11578913033008575, + "learning_rate": 0.0002735184540703556, + "loss": 2.5909, + "step": 22027 + }, + { + "epoch": 0.6532040447172552, + "grad_norm": 0.10878317058086395, + "learning_rate": 0.00027347650776448506, + "loss": 2.6033, + "step": 22028 + }, + { + "epoch": 0.6532336980695668, + "grad_norm": 0.11195103824138641, + "learning_rate": 0.0002734345634645187, + "loss": 2.6233, + "step": 22029 + }, + { + "epoch": 0.6532633514218782, + "grad_norm": 0.10801765322685242, + "learning_rate": 0.00027339262117082866, + "loss": 2.5876, + "step": 22030 + }, + { + "epoch": 0.6532930047741897, + "grad_norm": 0.10000947117805481, + "learning_rate": 0.00027335068088378555, + "loss": 2.6103, + "step": 22031 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 0.11607170104980469, + "learning_rate": 0.00027330874260376114, + "loss": 2.6247, + "step": 22032 + }, + { + "epoch": 0.6533523114788127, + "grad_norm": 0.1059996709227562, + "learning_rate": 0.00027326680633112674, + "loss": 2.5808, + "step": 22033 + }, + { + "epoch": 0.6533819648311242, + "grad_norm": 0.10186668485403061, + "learning_rate": 0.00027322487206625365, + "loss": 2.6435, + "step": 22034 + }, + { + "epoch": 0.6534116181834356, + "grad_norm": 0.12884241342544556, + "learning_rate": 0.00027318293980951327, + "loss": 2.6252, + "step": 22035 + }, + { + "epoch": 0.6534412715357472, + "grad_norm": 0.12516453862190247, + "learning_rate": 0.00027314100956127687, + "loss": 2.5999, + "step": 22036 + }, + { + "epoch": 0.6534709248880586, + "grad_norm": 0.10682416707277298, + "learning_rate": 0.00027309908132191574, + "loss": 2.5857, + "step": 22037 + }, + { + "epoch": 0.6535005782403701, + "grad_norm": 0.123136006295681, + "learning_rate": 0.00027305715509180116, + "loss": 2.6157, + "step": 22038 + }, + { + "epoch": 0.6535302315926815, + "grad_norm": 0.10751829296350479, + "learning_rate": 0.00027301523087130454, + "loss": 2.5809, + "step": 22039 + }, + { + "epoch": 0.6535598849449931, + "grad_norm": 0.10932411253452301, + "learning_rate": 0.00027297330866079693, + "loss": 2.5983, + "step": 22040 + }, + { + "epoch": 0.6535895382973045, + "grad_norm": 0.11702648550271988, + "learning_rate": 0.0002729313884606497, + "loss": 2.6018, + "step": 22041 + }, + { + "epoch": 0.653619191649616, + "grad_norm": 0.11992903798818588, + "learning_rate": 0.00027288947027123396, + "loss": 2.5813, + "step": 22042 + }, + { + "epoch": 0.6536488450019274, + "grad_norm": 0.09376614540815353, + "learning_rate": 0.00027284755409292096, + "loss": 2.5821, + "step": 22043 + }, + { + "epoch": 0.653678498354239, + "grad_norm": 0.11796408891677856, + "learning_rate": 0.00027280563992608184, + "loss": 2.6056, + "step": 22044 + }, + { + "epoch": 0.6537081517065504, + "grad_norm": 0.11477886885404587, + "learning_rate": 0.0002727637277710878, + "loss": 2.6117, + "step": 22045 + }, + { + "epoch": 0.6537378050588619, + "grad_norm": 0.10451444983482361, + "learning_rate": 0.00027272181762830997, + "loss": 2.5692, + "step": 22046 + }, + { + "epoch": 0.6537674584111733, + "grad_norm": 0.10851464420557022, + "learning_rate": 0.0002726799094981194, + "loss": 2.6194, + "step": 22047 + }, + { + "epoch": 0.6537971117634849, + "grad_norm": 0.11083017289638519, + "learning_rate": 0.0002726380033808872, + "loss": 2.6274, + "step": 22048 + }, + { + "epoch": 0.6538267651157963, + "grad_norm": 0.0990367904305458, + "learning_rate": 0.0002725960992769846, + "loss": 2.6316, + "step": 22049 + }, + { + "epoch": 0.6538564184681078, + "grad_norm": 0.10913848131895065, + "learning_rate": 0.0002725541971867826, + "loss": 2.591, + "step": 22050 + }, + { + "epoch": 0.6538860718204192, + "grad_norm": 0.09887660294771194, + "learning_rate": 0.0002725122971106522, + "loss": 2.5801, + "step": 22051 + }, + { + "epoch": 0.6539157251727308, + "grad_norm": 0.11216046661138535, + "learning_rate": 0.0002724703990489644, + "loss": 2.5944, + "step": 22052 + }, + { + "epoch": 0.6539453785250423, + "grad_norm": 0.0913851335644722, + "learning_rate": 0.0002724285030020903, + "loss": 2.5945, + "step": 22053 + }, + { + "epoch": 0.6539750318773537, + "grad_norm": 0.1161380186676979, + "learning_rate": 0.00027238660897040113, + "loss": 2.5611, + "step": 22054 + }, + { + "epoch": 0.6540046852296653, + "grad_norm": 0.09760963171720505, + "learning_rate": 0.0002723447169542671, + "loss": 2.5982, + "step": 22055 + }, + { + "epoch": 0.6540343385819767, + "grad_norm": 0.10854349285364151, + "learning_rate": 0.00027230282695405995, + "loss": 2.5905, + "step": 22056 + }, + { + "epoch": 0.6540639919342882, + "grad_norm": 0.11354134231805801, + "learning_rate": 0.00027226093897015036, + "loss": 2.613, + "step": 22057 + }, + { + "epoch": 0.6540936452865996, + "grad_norm": 0.10048316419124603, + "learning_rate": 0.00027221905300290917, + "loss": 2.5888, + "step": 22058 + }, + { + "epoch": 0.6541232986389112, + "grad_norm": 0.09109088778495789, + "learning_rate": 0.00027217716905270745, + "loss": 2.5888, + "step": 22059 + }, + { + "epoch": 0.6541529519912226, + "grad_norm": 0.11143268644809723, + "learning_rate": 0.00027213528711991596, + "loss": 2.6099, + "step": 22060 + }, + { + "epoch": 0.6541826053435341, + "grad_norm": 0.0939759686589241, + "learning_rate": 0.0002720934072049056, + "loss": 2.5939, + "step": 22061 + }, + { + "epoch": 0.6542122586958455, + "grad_norm": 0.09730090945959091, + "learning_rate": 0.0002720515293080473, + "loss": 2.5642, + "step": 22062 + }, + { + "epoch": 0.6542419120481571, + "grad_norm": 0.08937744051218033, + "learning_rate": 0.0002720096534297118, + "loss": 2.603, + "step": 22063 + }, + { + "epoch": 0.6542715654004685, + "grad_norm": 0.09685049206018448, + "learning_rate": 0.0002719677795702701, + "loss": 2.6307, + "step": 22064 + }, + { + "epoch": 0.65430121875278, + "grad_norm": 0.10036862641572952, + "learning_rate": 0.00027192590773009276, + "loss": 2.5872, + "step": 22065 + }, + { + "epoch": 0.6543308721050914, + "grad_norm": 0.09281132370233536, + "learning_rate": 0.00027188403790955057, + "loss": 2.5866, + "step": 22066 + }, + { + "epoch": 0.654360525457403, + "grad_norm": 0.0988016128540039, + "learning_rate": 0.0002718421701090144, + "loss": 2.5905, + "step": 22067 + }, + { + "epoch": 0.6543901788097144, + "grad_norm": 0.09719248116016388, + "learning_rate": 0.0002718003043288548, + "loss": 2.5959, + "step": 22068 + }, + { + "epoch": 0.6544198321620259, + "grad_norm": 0.1041623130440712, + "learning_rate": 0.0002717584405694429, + "loss": 2.6063, + "step": 22069 + }, + { + "epoch": 0.6544494855143373, + "grad_norm": 0.10205624997615814, + "learning_rate": 0.0002717165788311491, + "loss": 2.5839, + "step": 22070 + }, + { + "epoch": 0.6544791388666489, + "grad_norm": 0.09961047768592834, + "learning_rate": 0.00027167471911434426, + "loss": 2.6015, + "step": 22071 + }, + { + "epoch": 0.6545087922189603, + "grad_norm": 0.11036691069602966, + "learning_rate": 0.0002716328614193989, + "loss": 2.595, + "step": 22072 + }, + { + "epoch": 0.6545384455712718, + "grad_norm": 0.1060258001089096, + "learning_rate": 0.00027159100574668385, + "loss": 2.5967, + "step": 22073 + }, + { + "epoch": 0.6545680989235834, + "grad_norm": 0.10392207652330399, + "learning_rate": 0.00027154915209656955, + "loss": 2.5777, + "step": 22074 + }, + { + "epoch": 0.6545977522758948, + "grad_norm": 0.0945049449801445, + "learning_rate": 0.00027150730046942694, + "loss": 2.5923, + "step": 22075 + }, + { + "epoch": 0.6546274056282063, + "grad_norm": 0.09619325399398804, + "learning_rate": 0.0002714654508656262, + "loss": 2.5813, + "step": 22076 + }, + { + "epoch": 0.6546570589805177, + "grad_norm": 0.10190770775079727, + "learning_rate": 0.0002714236032855382, + "loss": 2.5779, + "step": 22077 + }, + { + "epoch": 0.6546867123328293, + "grad_norm": 0.11018124967813492, + "learning_rate": 0.0002713817577295333, + "loss": 2.6371, + "step": 22078 + }, + { + "epoch": 0.6547163656851407, + "grad_norm": 0.08972006291151047, + "learning_rate": 0.00027133991419798234, + "loss": 2.5768, + "step": 22079 + }, + { + "epoch": 0.6547460190374522, + "grad_norm": 0.10789070278406143, + "learning_rate": 0.0002712980726912556, + "loss": 2.5816, + "step": 22080 + }, + { + "epoch": 0.6547756723897636, + "grad_norm": 0.10911737382411957, + "learning_rate": 0.0002712562332097235, + "loss": 2.6137, + "step": 22081 + }, + { + "epoch": 0.6548053257420752, + "grad_norm": 0.1112731397151947, + "learning_rate": 0.00027121439575375684, + "loss": 2.6202, + "step": 22082 + }, + { + "epoch": 0.6548349790943866, + "grad_norm": 0.09879301488399506, + "learning_rate": 0.0002711725603237261, + "loss": 2.5842, + "step": 22083 + }, + { + "epoch": 0.6548646324466981, + "grad_norm": 0.1109067052602768, + "learning_rate": 0.00027113072692000153, + "loss": 2.572, + "step": 22084 + }, + { + "epoch": 0.6548942857990095, + "grad_norm": 0.10675542801618576, + "learning_rate": 0.0002710888955429538, + "loss": 2.5843, + "step": 22085 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 0.11238880455493927, + "learning_rate": 0.0002710470661929531, + "loss": 2.5507, + "step": 22086 + }, + { + "epoch": 0.6549535925036325, + "grad_norm": 0.11621986329555511, + "learning_rate": 0.0002710052388703699, + "loss": 2.6134, + "step": 22087 + }, + { + "epoch": 0.654983245855944, + "grad_norm": 0.10255452990531921, + "learning_rate": 0.00027096341357557465, + "loss": 2.5916, + "step": 22088 + }, + { + "epoch": 0.6550128992082555, + "grad_norm": 0.0992862656712532, + "learning_rate": 0.0002709215903089376, + "loss": 2.6207, + "step": 22089 + }, + { + "epoch": 0.655042552560567, + "grad_norm": 0.1306229531764984, + "learning_rate": 0.00027087976907082923, + "loss": 2.6398, + "step": 22090 + }, + { + "epoch": 0.6550722059128784, + "grad_norm": 0.12177838385105133, + "learning_rate": 0.0002708379498616199, + "loss": 2.5854, + "step": 22091 + }, + { + "epoch": 0.6551018592651899, + "grad_norm": 0.10875148326158524, + "learning_rate": 0.0002707961326816798, + "loss": 2.6016, + "step": 22092 + }, + { + "epoch": 0.6551315126175014, + "grad_norm": 0.11147094517946243, + "learning_rate": 0.00027075431753137923, + "loss": 2.6003, + "step": 22093 + }, + { + "epoch": 0.6551611659698129, + "grad_norm": 0.11097690463066101, + "learning_rate": 0.0002707125044110884, + "loss": 2.5971, + "step": 22094 + }, + { + "epoch": 0.6551908193221244, + "grad_norm": 0.10583702474832535, + "learning_rate": 0.0002706706933211779, + "loss": 2.6352, + "step": 22095 + }, + { + "epoch": 0.6552204726744358, + "grad_norm": 0.10758783668279648, + "learning_rate": 0.00027062888426201796, + "loss": 2.5922, + "step": 22096 + }, + { + "epoch": 0.6552501260267474, + "grad_norm": 0.10362184047698975, + "learning_rate": 0.00027058707723397847, + "loss": 2.599, + "step": 22097 + }, + { + "epoch": 0.6552797793790588, + "grad_norm": 0.10049945116043091, + "learning_rate": 0.0002705452722374298, + "loss": 2.6154, + "step": 22098 + }, + { + "epoch": 0.6553094327313703, + "grad_norm": 0.09433721750974655, + "learning_rate": 0.000270503469272742, + "loss": 2.6014, + "step": 22099 + }, + { + "epoch": 0.6553390860836817, + "grad_norm": 0.09674103558063507, + "learning_rate": 0.0002704616683402854, + "loss": 2.6045, + "step": 22100 + }, + { + "epoch": 0.6553687394359933, + "grad_norm": 0.09279389679431915, + "learning_rate": 0.0002704198694404302, + "loss": 2.5969, + "step": 22101 + }, + { + "epoch": 0.6553983927883047, + "grad_norm": 0.09594947844743729, + "learning_rate": 0.0002703780725735464, + "loss": 2.5986, + "step": 22102 + }, + { + "epoch": 0.6554280461406162, + "grad_norm": 0.1003556102514267, + "learning_rate": 0.00027033627774000426, + "loss": 2.5467, + "step": 22103 + }, + { + "epoch": 0.6554576994929276, + "grad_norm": 0.0995720773935318, + "learning_rate": 0.0002702944849401737, + "loss": 2.5991, + "step": 22104 + }, + { + "epoch": 0.6554873528452392, + "grad_norm": 0.0942750871181488, + "learning_rate": 0.000270252694174425, + "loss": 2.645, + "step": 22105 + }, + { + "epoch": 0.6555170061975506, + "grad_norm": 0.09829361736774445, + "learning_rate": 0.0002702109054431281, + "loss": 2.5855, + "step": 22106 + }, + { + "epoch": 0.6555466595498621, + "grad_norm": 0.1022769883275032, + "learning_rate": 0.0002701691187466531, + "loss": 2.5914, + "step": 22107 + }, + { + "epoch": 0.6555763129021736, + "grad_norm": 0.10446440428495407, + "learning_rate": 0.00027012733408536993, + "loss": 2.6283, + "step": 22108 + }, + { + "epoch": 0.6556059662544851, + "grad_norm": 0.1115044429898262, + "learning_rate": 0.00027008555145964877, + "loss": 2.6068, + "step": 22109 + }, + { + "epoch": 0.6556356196067965, + "grad_norm": 0.10706491768360138, + "learning_rate": 0.0002700437708698594, + "loss": 2.5902, + "step": 22110 + }, + { + "epoch": 0.655665272959108, + "grad_norm": 0.09647787362337112, + "learning_rate": 0.000270001992316372, + "loss": 2.631, + "step": 22111 + }, + { + "epoch": 0.6556949263114195, + "grad_norm": 0.1128201112151146, + "learning_rate": 0.0002699602157995564, + "loss": 2.5644, + "step": 22112 + }, + { + "epoch": 0.655724579663731, + "grad_norm": 0.11383502185344696, + "learning_rate": 0.0002699184413197826, + "loss": 2.5779, + "step": 22113 + }, + { + "epoch": 0.6557542330160424, + "grad_norm": 0.09557279199361801, + "learning_rate": 0.00026987666887742047, + "loss": 2.5577, + "step": 22114 + }, + { + "epoch": 0.6557838863683539, + "grad_norm": 0.09721441566944122, + "learning_rate": 0.00026983489847283996, + "loss": 2.5663, + "step": 22115 + }, + { + "epoch": 0.6558135397206655, + "grad_norm": 0.10434423387050629, + "learning_rate": 0.0002697931301064109, + "loss": 2.5949, + "step": 22116 + }, + { + "epoch": 0.6558431930729769, + "grad_norm": 0.1067771315574646, + "learning_rate": 0.0002697513637785032, + "loss": 2.579, + "step": 22117 + }, + { + "epoch": 0.6558728464252884, + "grad_norm": 0.11534196138381958, + "learning_rate": 0.00026970959948948673, + "loss": 2.6067, + "step": 22118 + }, + { + "epoch": 0.6559024997775998, + "grad_norm": 0.0943652093410492, + "learning_rate": 0.00026966783723973145, + "loss": 2.6095, + "step": 22119 + }, + { + "epoch": 0.6559321531299114, + "grad_norm": 0.11612441390752792, + "learning_rate": 0.00026962607702960657, + "loss": 2.6121, + "step": 22120 + }, + { + "epoch": 0.6559618064822228, + "grad_norm": 0.11203707009553909, + "learning_rate": 0.00026958431885948256, + "loss": 2.5911, + "step": 22121 + }, + { + "epoch": 0.6559914598345343, + "grad_norm": 0.10969680547714233, + "learning_rate": 0.000269542562729729, + "loss": 2.5876, + "step": 22122 + }, + { + "epoch": 0.6560211131868458, + "grad_norm": 0.1029895767569542, + "learning_rate": 0.0002695008086407155, + "loss": 2.5806, + "step": 22123 + }, + { + "epoch": 0.6560507665391573, + "grad_norm": 0.10852902382612228, + "learning_rate": 0.000269459056592812, + "loss": 2.6183, + "step": 22124 + }, + { + "epoch": 0.6560804198914687, + "grad_norm": 0.0878155529499054, + "learning_rate": 0.0002694173065863881, + "loss": 2.5916, + "step": 22125 + }, + { + "epoch": 0.6561100732437802, + "grad_norm": 0.11127188801765442, + "learning_rate": 0.0002693755586218135, + "loss": 2.6049, + "step": 22126 + }, + { + "epoch": 0.6561397265960917, + "grad_norm": 0.10182755440473557, + "learning_rate": 0.00026933381269945793, + "loss": 2.5853, + "step": 22127 + }, + { + "epoch": 0.6561693799484032, + "grad_norm": 0.10428395122289658, + "learning_rate": 0.000269292068819691, + "loss": 2.5746, + "step": 22128 + }, + { + "epoch": 0.6561990333007146, + "grad_norm": 0.09810175001621246, + "learning_rate": 0.00026925032698288257, + "loss": 2.5799, + "step": 22129 + }, + { + "epoch": 0.6562286866530261, + "grad_norm": 0.10662726312875748, + "learning_rate": 0.0002692085871894021, + "loss": 2.538, + "step": 22130 + }, + { + "epoch": 0.6562583400053376, + "grad_norm": 0.09677284955978394, + "learning_rate": 0.0002691668494396191, + "loss": 2.6045, + "step": 22131 + }, + { + "epoch": 0.6562879933576491, + "grad_norm": 0.09426628798246384, + "learning_rate": 0.00026912511373390326, + "loss": 2.6332, + "step": 22132 + }, + { + "epoch": 0.6563176467099605, + "grad_norm": 0.11471781879663467, + "learning_rate": 0.00026908338007262397, + "loss": 2.6284, + "step": 22133 + }, + { + "epoch": 0.656347300062272, + "grad_norm": 0.10763439536094666, + "learning_rate": 0.0002690416484561512, + "loss": 2.5793, + "step": 22134 + }, + { + "epoch": 0.6563769534145835, + "grad_norm": 0.1007058173418045, + "learning_rate": 0.0002689999188848542, + "loss": 2.6043, + "step": 22135 + }, + { + "epoch": 0.656406606766895, + "grad_norm": 0.11036007106304169, + "learning_rate": 0.00026895819135910263, + "loss": 2.6312, + "step": 22136 + }, + { + "epoch": 0.6564362601192065, + "grad_norm": 0.11381851881742477, + "learning_rate": 0.000268916465879266, + "loss": 2.5697, + "step": 22137 + }, + { + "epoch": 0.656465913471518, + "grad_norm": 0.11355333030223846, + "learning_rate": 0.00026887474244571363, + "loss": 2.5751, + "step": 22138 + }, + { + "epoch": 0.6564955668238295, + "grad_norm": 0.1115061342716217, + "learning_rate": 0.0002688330210588151, + "loss": 2.6092, + "step": 22139 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 0.09215257316827774, + "learning_rate": 0.0002687913017189401, + "loss": 2.5862, + "step": 22140 + }, + { + "epoch": 0.6565548735284524, + "grad_norm": 0.11526152491569519, + "learning_rate": 0.0002687495844264575, + "loss": 2.5616, + "step": 22141 + }, + { + "epoch": 0.6565845268807639, + "grad_norm": 0.08641430735588074, + "learning_rate": 0.00026870786918173714, + "loss": 2.608, + "step": 22142 + }, + { + "epoch": 0.6566141802330754, + "grad_norm": 0.10864599794149399, + "learning_rate": 0.0002686661559851482, + "loss": 2.6097, + "step": 22143 + }, + { + "epoch": 0.6566438335853868, + "grad_norm": 0.09746697545051575, + "learning_rate": 0.0002686244448370603, + "loss": 2.5845, + "step": 22144 + }, + { + "epoch": 0.6566734869376983, + "grad_norm": 0.09476712346076965, + "learning_rate": 0.0002685827357378425, + "loss": 2.5968, + "step": 22145 + }, + { + "epoch": 0.6567031402900098, + "grad_norm": 0.0938347801566124, + "learning_rate": 0.0002685410286878642, + "loss": 2.5924, + "step": 22146 + }, + { + "epoch": 0.6567327936423213, + "grad_norm": 0.09501393139362335, + "learning_rate": 0.00026849932368749494, + "loss": 2.5842, + "step": 22147 + }, + { + "epoch": 0.6567624469946327, + "grad_norm": 0.09508440643548965, + "learning_rate": 0.000268457620737104, + "loss": 2.6322, + "step": 22148 + }, + { + "epoch": 0.6567921003469442, + "grad_norm": 0.09801356494426727, + "learning_rate": 0.0002684159198370605, + "loss": 2.5632, + "step": 22149 + }, + { + "epoch": 0.6568217536992557, + "grad_norm": 0.10301590710878372, + "learning_rate": 0.0002683742209877338, + "loss": 2.6051, + "step": 22150 + }, + { + "epoch": 0.6568514070515672, + "grad_norm": 0.09427520632743835, + "learning_rate": 0.0002683325241894934, + "loss": 2.6013, + "step": 22151 + }, + { + "epoch": 0.6568810604038786, + "grad_norm": 0.11750732362270355, + "learning_rate": 0.000268290829442708, + "loss": 2.5757, + "step": 22152 + }, + { + "epoch": 0.6569107137561901, + "grad_norm": 0.13409188389778137, + "learning_rate": 0.00026824913674774705, + "loss": 2.5763, + "step": 22153 + }, + { + "epoch": 0.6569403671085016, + "grad_norm": 0.11735901981592178, + "learning_rate": 0.00026820744610497985, + "loss": 2.6244, + "step": 22154 + }, + { + "epoch": 0.6569700204608131, + "grad_norm": 0.1178555116057396, + "learning_rate": 0.0002681657575147754, + "loss": 2.5952, + "step": 22155 + }, + { + "epoch": 0.6569996738131245, + "grad_norm": 0.10041829943656921, + "learning_rate": 0.000268124070977503, + "loss": 2.6074, + "step": 22156 + }, + { + "epoch": 0.657029327165436, + "grad_norm": 0.11314065754413605, + "learning_rate": 0.0002680823864935318, + "loss": 2.6018, + "step": 22157 + }, + { + "epoch": 0.6570589805177476, + "grad_norm": 0.10539969056844711, + "learning_rate": 0.0002680407040632308, + "loss": 2.6206, + "step": 22158 + }, + { + "epoch": 0.657088633870059, + "grad_norm": 0.09443454444408417, + "learning_rate": 0.00026799902368696905, + "loss": 2.6212, + "step": 22159 + }, + { + "epoch": 0.6571182872223705, + "grad_norm": 0.10796497017145157, + "learning_rate": 0.00026795734536511594, + "loss": 2.5745, + "step": 22160 + }, + { + "epoch": 0.657147940574682, + "grad_norm": 0.09835054725408554, + "learning_rate": 0.00026791566909804056, + "loss": 2.5851, + "step": 22161 + }, + { + "epoch": 0.6571775939269935, + "grad_norm": 0.1107916384935379, + "learning_rate": 0.00026787399488611155, + "loss": 2.5947, + "step": 22162 + }, + { + "epoch": 0.6572072472793049, + "grad_norm": 0.10661685466766357, + "learning_rate": 0.00026783232272969813, + "loss": 2.6067, + "step": 22163 + }, + { + "epoch": 0.6572369006316164, + "grad_norm": 0.09342627227306366, + "learning_rate": 0.00026779065262916947, + "loss": 2.6009, + "step": 22164 + }, + { + "epoch": 0.6572665539839279, + "grad_norm": 0.10625098645687103, + "learning_rate": 0.0002677489845848944, + "loss": 2.6268, + "step": 22165 + }, + { + "epoch": 0.6572962073362394, + "grad_norm": 0.1001787856221199, + "learning_rate": 0.00026770731859724185, + "loss": 2.5995, + "step": 22166 + }, + { + "epoch": 0.6573258606885508, + "grad_norm": 0.09365074336528778, + "learning_rate": 0.0002676656546665809, + "loss": 2.5806, + "step": 22167 + }, + { + "epoch": 0.6573555140408623, + "grad_norm": 0.1123194769024849, + "learning_rate": 0.0002676239927932805, + "loss": 2.5996, + "step": 22168 + }, + { + "epoch": 0.6573851673931738, + "grad_norm": 0.09110651910305023, + "learning_rate": 0.0002675823329777095, + "loss": 2.5831, + "step": 22169 + }, + { + "epoch": 0.6574148207454853, + "grad_norm": 0.11387710273265839, + "learning_rate": 0.00026754067522023685, + "loss": 2.6411, + "step": 22170 + }, + { + "epoch": 0.6574444740977967, + "grad_norm": 0.10055898129940033, + "learning_rate": 0.0002674990195212314, + "loss": 2.6179, + "step": 22171 + }, + { + "epoch": 0.6574741274501082, + "grad_norm": 0.10603438317775726, + "learning_rate": 0.00026745736588106207, + "loss": 2.6042, + "step": 22172 + }, + { + "epoch": 0.6575037808024197, + "grad_norm": 0.12193764746189117, + "learning_rate": 0.0002674157143000977, + "loss": 2.5968, + "step": 22173 + }, + { + "epoch": 0.6575334341547312, + "grad_norm": 0.11531112343072891, + "learning_rate": 0.000267374064778707, + "loss": 2.6069, + "step": 22174 + }, + { + "epoch": 0.6575630875070426, + "grad_norm": 0.10673075169324875, + "learning_rate": 0.000267332417317259, + "loss": 2.6029, + "step": 22175 + }, + { + "epoch": 0.6575927408593542, + "grad_norm": 0.11105421185493469, + "learning_rate": 0.0002672907719161223, + "loss": 2.5995, + "step": 22176 + }, + { + "epoch": 0.6576223942116657, + "grad_norm": 0.1104259043931961, + "learning_rate": 0.0002672491285756658, + "loss": 2.5849, + "step": 22177 + }, + { + "epoch": 0.6576520475639771, + "grad_norm": 0.11665171384811401, + "learning_rate": 0.0002672074872962582, + "loss": 2.5872, + "step": 22178 + }, + { + "epoch": 0.6576817009162886, + "grad_norm": 0.09983772784471512, + "learning_rate": 0.0002671658480782683, + "loss": 2.626, + "step": 22179 + }, + { + "epoch": 0.6577113542686001, + "grad_norm": 0.10627587139606476, + "learning_rate": 0.00026712421092206474, + "loss": 2.6031, + "step": 22180 + }, + { + "epoch": 0.6577410076209116, + "grad_norm": 0.10726243257522583, + "learning_rate": 0.0002670825758280163, + "loss": 2.6353, + "step": 22181 + }, + { + "epoch": 0.657770660973223, + "grad_norm": 0.11984874308109283, + "learning_rate": 0.0002670409427964916, + "loss": 2.613, + "step": 22182 + }, + { + "epoch": 0.6578003143255345, + "grad_norm": 0.09464353322982788, + "learning_rate": 0.0002669993118278593, + "loss": 2.5787, + "step": 22183 + }, + { + "epoch": 0.657829967677846, + "grad_norm": 0.09314566105604172, + "learning_rate": 0.0002669576829224881, + "loss": 2.6047, + "step": 22184 + }, + { + "epoch": 0.6578596210301575, + "grad_norm": 0.09960594028234482, + "learning_rate": 0.0002669160560807467, + "loss": 2.5939, + "step": 22185 + }, + { + "epoch": 0.6578892743824689, + "grad_norm": 0.0965794250369072, + "learning_rate": 0.00026687443130300357, + "loss": 2.5715, + "step": 22186 + }, + { + "epoch": 0.6579189277347804, + "grad_norm": 0.09417466819286346, + "learning_rate": 0.00026683280858962743, + "loss": 2.5998, + "step": 22187 + }, + { + "epoch": 0.6579485810870919, + "grad_norm": 0.10856150835752487, + "learning_rate": 0.0002667911879409867, + "loss": 2.6258, + "step": 22188 + }, + { + "epoch": 0.6579782344394034, + "grad_norm": 0.09191747009754181, + "learning_rate": 0.0002667495693574501, + "loss": 2.5691, + "step": 22189 + }, + { + "epoch": 0.6580078877917148, + "grad_norm": 0.11164688318967819, + "learning_rate": 0.0002667079528393861, + "loss": 2.5689, + "step": 22190 + }, + { + "epoch": 0.6580375411440264, + "grad_norm": 0.10437297075986862, + "learning_rate": 0.00026666633838716316, + "loss": 2.5815, + "step": 22191 + }, + { + "epoch": 0.6580671944963378, + "grad_norm": 0.11558905243873596, + "learning_rate": 0.00026662472600114985, + "loss": 2.5975, + "step": 22192 + }, + { + "epoch": 0.6580968478486493, + "grad_norm": 0.11154269427061081, + "learning_rate": 0.0002665831156817147, + "loss": 2.6025, + "step": 22193 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 0.11850593984127045, + "learning_rate": 0.0002665415074292261, + "loss": 2.606, + "step": 22194 + }, + { + "epoch": 0.6581561545532723, + "grad_norm": 0.11397860199213028, + "learning_rate": 0.00026649990124405276, + "loss": 2.6403, + "step": 22195 + }, + { + "epoch": 0.6581858079055837, + "grad_norm": 0.0950775146484375, + "learning_rate": 0.00026645829712656263, + "loss": 2.6, + "step": 22196 + }, + { + "epoch": 0.6582154612578952, + "grad_norm": 0.11239446699619293, + "learning_rate": 0.00026641669507712417, + "loss": 2.588, + "step": 22197 + }, + { + "epoch": 0.6582451146102067, + "grad_norm": 0.1002776026725769, + "learning_rate": 0.0002663750950961062, + "loss": 2.5917, + "step": 22198 + }, + { + "epoch": 0.6582747679625182, + "grad_norm": 0.11682234704494476, + "learning_rate": 0.0002663334971838768, + "loss": 2.5982, + "step": 22199 + }, + { + "epoch": 0.6583044213148297, + "grad_norm": 0.09203720837831497, + "learning_rate": 0.00026629190134080445, + "loss": 2.5811, + "step": 22200 + }, + { + "epoch": 0.6583340746671411, + "grad_norm": 0.11984682828187943, + "learning_rate": 0.0002662503075672574, + "loss": 2.5887, + "step": 22201 + }, + { + "epoch": 0.6583637280194526, + "grad_norm": 0.11613484472036362, + "learning_rate": 0.00026620871586360405, + "loss": 2.56, + "step": 22202 + }, + { + "epoch": 0.6583933813717641, + "grad_norm": 0.10500022768974304, + "learning_rate": 0.0002661671262302126, + "loss": 2.6252, + "step": 22203 + }, + { + "epoch": 0.6584230347240756, + "grad_norm": 0.11796887218952179, + "learning_rate": 0.0002661255386674514, + "loss": 2.5568, + "step": 22204 + }, + { + "epoch": 0.658452688076387, + "grad_norm": 0.12103615701198578, + "learning_rate": 0.0002660839531756887, + "loss": 2.5932, + "step": 22205 + }, + { + "epoch": 0.6584823414286985, + "grad_norm": 0.11434759199619293, + "learning_rate": 0.0002660423697552929, + "loss": 2.6152, + "step": 22206 + }, + { + "epoch": 0.65851199478101, + "grad_norm": 0.10429432988166809, + "learning_rate": 0.00026600078840663193, + "loss": 2.5679, + "step": 22207 + }, + { + "epoch": 0.6585416481333215, + "grad_norm": 0.13117867708206177, + "learning_rate": 0.0002659592091300741, + "loss": 2.5818, + "step": 22208 + }, + { + "epoch": 0.6585713014856329, + "grad_norm": 0.11725269258022308, + "learning_rate": 0.00026591763192598773, + "loss": 2.6094, + "step": 22209 + }, + { + "epoch": 0.6586009548379445, + "grad_norm": 0.12150159478187561, + "learning_rate": 0.00026587605679474064, + "loss": 2.6083, + "step": 22210 + }, + { + "epoch": 0.6586306081902559, + "grad_norm": 0.09447730332612991, + "learning_rate": 0.00026583448373670147, + "loss": 2.622, + "step": 22211 + }, + { + "epoch": 0.6586602615425674, + "grad_norm": 0.11209473013877869, + "learning_rate": 0.00026579291275223815, + "loss": 2.571, + "step": 22212 + }, + { + "epoch": 0.6586899148948788, + "grad_norm": 0.10232444107532501, + "learning_rate": 0.0002657513438417187, + "loss": 2.6077, + "step": 22213 + }, + { + "epoch": 0.6587195682471904, + "grad_norm": 0.10419291257858276, + "learning_rate": 0.00026570977700551146, + "loss": 2.5751, + "step": 22214 + }, + { + "epoch": 0.6587492215995018, + "grad_norm": 0.11080538481473923, + "learning_rate": 0.0002656682122439843, + "loss": 2.6026, + "step": 22215 + }, + { + "epoch": 0.6587788749518133, + "grad_norm": 0.09624174237251282, + "learning_rate": 0.0002656266495575055, + "loss": 2.5908, + "step": 22216 + }, + { + "epoch": 0.6588085283041247, + "grad_norm": 0.10243187844753265, + "learning_rate": 0.0002655850889464428, + "loss": 2.6075, + "step": 22217 + }, + { + "epoch": 0.6588381816564363, + "grad_norm": 0.11156870424747467, + "learning_rate": 0.0002655435304111643, + "loss": 2.6213, + "step": 22218 + }, + { + "epoch": 0.6588678350087478, + "grad_norm": 0.0994221568107605, + "learning_rate": 0.0002655019739520381, + "loss": 2.5922, + "step": 22219 + }, + { + "epoch": 0.6588974883610592, + "grad_norm": 0.10031799226999283, + "learning_rate": 0.0002654604195694322, + "loss": 2.5842, + "step": 22220 + }, + { + "epoch": 0.6589271417133707, + "grad_norm": 0.09569647163152695, + "learning_rate": 0.00026541886726371463, + "loss": 2.589, + "step": 22221 + }, + { + "epoch": 0.6589567950656822, + "grad_norm": 0.10012544691562653, + "learning_rate": 0.00026537731703525316, + "loss": 2.5918, + "step": 22222 + }, + { + "epoch": 0.6589864484179937, + "grad_norm": 0.10016051679849625, + "learning_rate": 0.0002653357688844156, + "loss": 2.5689, + "step": 22223 + }, + { + "epoch": 0.6590161017703051, + "grad_norm": 0.10821705311536789, + "learning_rate": 0.00026529422281157037, + "loss": 2.6201, + "step": 22224 + }, + { + "epoch": 0.6590457551226166, + "grad_norm": 0.09520860016345978, + "learning_rate": 0.00026525267881708506, + "loss": 2.6138, + "step": 22225 + }, + { + "epoch": 0.6590754084749281, + "grad_norm": 0.08915134519338608, + "learning_rate": 0.00026521113690132747, + "loss": 2.6018, + "step": 22226 + }, + { + "epoch": 0.6591050618272396, + "grad_norm": 0.08951008319854736, + "learning_rate": 0.0002651695970646659, + "loss": 2.614, + "step": 22227 + }, + { + "epoch": 0.659134715179551, + "grad_norm": 0.0845719575881958, + "learning_rate": 0.0002651280593074676, + "loss": 2.56, + "step": 22228 + }, + { + "epoch": 0.6591643685318626, + "grad_norm": 0.08588720858097076, + "learning_rate": 0.0002650865236301006, + "loss": 2.5734, + "step": 22229 + }, + { + "epoch": 0.659194021884174, + "grad_norm": 0.08917475491762161, + "learning_rate": 0.0002650449900329328, + "loss": 2.5742, + "step": 22230 + }, + { + "epoch": 0.6592236752364855, + "grad_norm": 0.09051652252674103, + "learning_rate": 0.00026500345851633193, + "loss": 2.6123, + "step": 22231 + }, + { + "epoch": 0.6592533285887969, + "grad_norm": 0.08280765265226364, + "learning_rate": 0.00026496192908066584, + "loss": 2.6278, + "step": 22232 + }, + { + "epoch": 0.6592829819411085, + "grad_norm": 0.0984550416469574, + "learning_rate": 0.00026492040172630216, + "loss": 2.6151, + "step": 22233 + }, + { + "epoch": 0.6593126352934199, + "grad_norm": 0.09576394408941269, + "learning_rate": 0.00026487887645360866, + "loss": 2.5775, + "step": 22234 + }, + { + "epoch": 0.6593422886457314, + "grad_norm": 0.08977679908275604, + "learning_rate": 0.0002648373532629531, + "loss": 2.6219, + "step": 22235 + }, + { + "epoch": 0.6593719419980428, + "grad_norm": 0.10699670761823654, + "learning_rate": 0.0002647958321547029, + "loss": 2.6297, + "step": 22236 + }, + { + "epoch": 0.6594015953503544, + "grad_norm": 0.10590703040361404, + "learning_rate": 0.0002647543131292264, + "loss": 2.5665, + "step": 22237 + }, + { + "epoch": 0.6594312487026658, + "grad_norm": 0.10057058185338974, + "learning_rate": 0.00026471279618689057, + "loss": 2.5788, + "step": 22238 + }, + { + "epoch": 0.6594609020549773, + "grad_norm": 0.10462568700313568, + "learning_rate": 0.0002646712813280634, + "loss": 2.6015, + "step": 22239 + }, + { + "epoch": 0.6594905554072888, + "grad_norm": 0.11046327650547028, + "learning_rate": 0.00026462976855311243, + "loss": 2.5886, + "step": 22240 + }, + { + "epoch": 0.6595202087596003, + "grad_norm": 0.11056546121835709, + "learning_rate": 0.00026458825786240527, + "loss": 2.5881, + "step": 22241 + }, + { + "epoch": 0.6595498621119118, + "grad_norm": 0.12740568816661835, + "learning_rate": 0.00026454674925630945, + "loss": 2.5951, + "step": 22242 + }, + { + "epoch": 0.6595795154642232, + "grad_norm": 0.10263626277446747, + "learning_rate": 0.0002645052427351926, + "loss": 2.5786, + "step": 22243 + }, + { + "epoch": 0.6596091688165348, + "grad_norm": 0.10674449801445007, + "learning_rate": 0.0002644637382994223, + "loss": 2.6109, + "step": 22244 + }, + { + "epoch": 0.6596388221688462, + "grad_norm": 0.12387382984161377, + "learning_rate": 0.0002644222359493659, + "loss": 2.5787, + "step": 22245 + }, + { + "epoch": 0.6596684755211577, + "grad_norm": 0.11210950464010239, + "learning_rate": 0.0002643807356853911, + "loss": 2.6105, + "step": 22246 + }, + { + "epoch": 0.6596981288734691, + "grad_norm": 0.09620004892349243, + "learning_rate": 0.00026433923750786536, + "loss": 2.6021, + "step": 22247 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 0.1188095435500145, + "learning_rate": 0.0002642977414171561, + "loss": 2.5885, + "step": 22248 + }, + { + "epoch": 0.6597574355780921, + "grad_norm": 0.1069667637348175, + "learning_rate": 0.00026425624741363075, + "loss": 2.5438, + "step": 22249 + }, + { + "epoch": 0.6597870889304036, + "grad_norm": 0.1103726178407669, + "learning_rate": 0.0002642147554976568, + "loss": 2.5828, + "step": 22250 + }, + { + "epoch": 0.659816742282715, + "grad_norm": 0.09368092566728592, + "learning_rate": 0.00026417326566960175, + "loss": 2.6204, + "step": 22251 + }, + { + "epoch": 0.6598463956350266, + "grad_norm": 0.09982141852378845, + "learning_rate": 0.0002641317779298329, + "loss": 2.6084, + "step": 22252 + }, + { + "epoch": 0.659876048987338, + "grad_norm": 0.09768751263618469, + "learning_rate": 0.00026409029227871764, + "loss": 2.6285, + "step": 22253 + }, + { + "epoch": 0.6599057023396495, + "grad_norm": 0.11484561115503311, + "learning_rate": 0.0002640488087166233, + "loss": 2.5691, + "step": 22254 + }, + { + "epoch": 0.6599353556919609, + "grad_norm": 0.09947225451469421, + "learning_rate": 0.0002640073272439172, + "loss": 2.6142, + "step": 22255 + }, + { + "epoch": 0.6599650090442725, + "grad_norm": 0.10433846712112427, + "learning_rate": 0.0002639658478609668, + "loss": 2.577, + "step": 22256 + }, + { + "epoch": 0.6599946623965839, + "grad_norm": 0.09923277795314789, + "learning_rate": 0.00026392437056813934, + "loss": 2.5771, + "step": 22257 + }, + { + "epoch": 0.6600243157488954, + "grad_norm": 0.09257066994905472, + "learning_rate": 0.000263882895365802, + "loss": 2.6103, + "step": 22258 + }, + { + "epoch": 0.6600539691012068, + "grad_norm": 0.10630053281784058, + "learning_rate": 0.0002638414222543223, + "loss": 2.6126, + "step": 22259 + }, + { + "epoch": 0.6600836224535184, + "grad_norm": 0.10850315541028976, + "learning_rate": 0.00026379995123406726, + "loss": 2.6155, + "step": 22260 + }, + { + "epoch": 0.6601132758058299, + "grad_norm": 0.10323933511972427, + "learning_rate": 0.0002637584823054044, + "loss": 2.5601, + "step": 22261 + }, + { + "epoch": 0.6601429291581413, + "grad_norm": 0.10936348885297775, + "learning_rate": 0.00026371701546870033, + "loss": 2.5897, + "step": 22262 + }, + { + "epoch": 0.6601725825104529, + "grad_norm": 0.1002482920885086, + "learning_rate": 0.0002636755507243228, + "loss": 2.6001, + "step": 22263 + }, + { + "epoch": 0.6602022358627643, + "grad_norm": 0.1033790186047554, + "learning_rate": 0.0002636340880726389, + "loss": 2.6057, + "step": 22264 + }, + { + "epoch": 0.6602318892150758, + "grad_norm": 0.10071294009685516, + "learning_rate": 0.00026359262751401573, + "loss": 2.5882, + "step": 22265 + }, + { + "epoch": 0.6602615425673872, + "grad_norm": 0.09986747056245804, + "learning_rate": 0.00026355116904882035, + "loss": 2.5809, + "step": 22266 + }, + { + "epoch": 0.6602911959196988, + "grad_norm": 0.09593363851308823, + "learning_rate": 0.0002635097126774201, + "loss": 2.5898, + "step": 22267 + }, + { + "epoch": 0.6603208492720102, + "grad_norm": 0.10918699949979782, + "learning_rate": 0.0002634682584001818, + "loss": 2.5649, + "step": 22268 + }, + { + "epoch": 0.6603505026243217, + "grad_norm": 0.10378969460725784, + "learning_rate": 0.0002634268062174727, + "loss": 2.6024, + "step": 22269 + }, + { + "epoch": 0.6603801559766331, + "grad_norm": 0.11126845329999924, + "learning_rate": 0.0002633853561296599, + "loss": 2.5722, + "step": 22270 + }, + { + "epoch": 0.6604098093289447, + "grad_norm": 0.11952555179595947, + "learning_rate": 0.0002633439081371105, + "loss": 2.5852, + "step": 22271 + }, + { + "epoch": 0.6604394626812561, + "grad_norm": 0.10318255424499512, + "learning_rate": 0.0002633024622401912, + "loss": 2.6085, + "step": 22272 + }, + { + "epoch": 0.6604691160335676, + "grad_norm": 0.11293600499629974, + "learning_rate": 0.0002632610184392693, + "loss": 2.5869, + "step": 22273 + }, + { + "epoch": 0.660498769385879, + "grad_norm": 0.1105627715587616, + "learning_rate": 0.0002632195767347117, + "loss": 2.5996, + "step": 22274 + }, + { + "epoch": 0.6605284227381906, + "grad_norm": 0.1128723993897438, + "learning_rate": 0.0002631781371268852, + "loss": 2.6141, + "step": 22275 + }, + { + "epoch": 0.660558076090502, + "grad_norm": 0.09977971017360687, + "learning_rate": 0.00026313669961615713, + "loss": 2.583, + "step": 22276 + }, + { + "epoch": 0.6605877294428135, + "grad_norm": 0.10186896473169327, + "learning_rate": 0.0002630952642028942, + "loss": 2.5935, + "step": 22277 + }, + { + "epoch": 0.6606173827951249, + "grad_norm": 0.10103657841682434, + "learning_rate": 0.00026305383088746345, + "loss": 2.5668, + "step": 22278 + }, + { + "epoch": 0.6606470361474365, + "grad_norm": 0.09957321733236313, + "learning_rate": 0.0002630123996702316, + "loss": 2.6, + "step": 22279 + }, + { + "epoch": 0.6606766894997479, + "grad_norm": 0.09500398486852646, + "learning_rate": 0.0002629709705515657, + "loss": 2.5906, + "step": 22280 + }, + { + "epoch": 0.6607063428520594, + "grad_norm": 0.10834439843893051, + "learning_rate": 0.00026292954353183257, + "loss": 2.5857, + "step": 22281 + }, + { + "epoch": 0.660735996204371, + "grad_norm": 0.09810482710599899, + "learning_rate": 0.00026288811861139915, + "loss": 2.5396, + "step": 22282 + }, + { + "epoch": 0.6607656495566824, + "grad_norm": 0.08979988843202591, + "learning_rate": 0.00026284669579063204, + "loss": 2.5847, + "step": 22283 + }, + { + "epoch": 0.6607953029089939, + "grad_norm": 0.10588859766721725, + "learning_rate": 0.00026280527506989803, + "loss": 2.5885, + "step": 22284 + }, + { + "epoch": 0.6608249562613053, + "grad_norm": 0.09113048762083054, + "learning_rate": 0.00026276385644956405, + "loss": 2.5581, + "step": 22285 + }, + { + "epoch": 0.6608546096136169, + "grad_norm": 0.10248755663633347, + "learning_rate": 0.0002627224399299969, + "loss": 2.589, + "step": 22286 + }, + { + "epoch": 0.6608842629659283, + "grad_norm": 0.10010004043579102, + "learning_rate": 0.00026268102551156325, + "loss": 2.5947, + "step": 22287 + }, + { + "epoch": 0.6609139163182398, + "grad_norm": 0.09941178560256958, + "learning_rate": 0.00026263961319462957, + "loss": 2.5969, + "step": 22288 + }, + { + "epoch": 0.6609435696705512, + "grad_norm": 0.09646168351173401, + "learning_rate": 0.0002625982029795632, + "loss": 2.5626, + "step": 22289 + }, + { + "epoch": 0.6609732230228628, + "grad_norm": 0.10232897102832794, + "learning_rate": 0.0002625567948667304, + "loss": 2.6276, + "step": 22290 + }, + { + "epoch": 0.6610028763751742, + "grad_norm": 0.09716182947158813, + "learning_rate": 0.00026251538885649795, + "loss": 2.6032, + "step": 22291 + }, + { + "epoch": 0.6610325297274857, + "grad_norm": 0.11879003793001175, + "learning_rate": 0.0002624739849492327, + "loss": 2.547, + "step": 22292 + }, + { + "epoch": 0.6610621830797971, + "grad_norm": 0.10072686523199081, + "learning_rate": 0.0002624325831453009, + "loss": 2.5637, + "step": 22293 + }, + { + "epoch": 0.6610918364321087, + "grad_norm": 0.09959275275468826, + "learning_rate": 0.00026239118344506936, + "loss": 2.5958, + "step": 22294 + }, + { + "epoch": 0.6611214897844201, + "grad_norm": 0.0960678681731224, + "learning_rate": 0.00026234978584890466, + "loss": 2.5559, + "step": 22295 + }, + { + "epoch": 0.6611511431367316, + "grad_norm": 0.12010679394006729, + "learning_rate": 0.00026230839035717334, + "loss": 2.6363, + "step": 22296 + }, + { + "epoch": 0.661180796489043, + "grad_norm": 0.11117059737443924, + "learning_rate": 0.00026226699697024213, + "loss": 2.5809, + "step": 22297 + }, + { + "epoch": 0.6612104498413546, + "grad_norm": 0.09985045343637466, + "learning_rate": 0.00026222560568847745, + "loss": 2.63, + "step": 22298 + }, + { + "epoch": 0.661240103193666, + "grad_norm": 0.09355958551168442, + "learning_rate": 0.0002621842165122458, + "loss": 2.5913, + "step": 22299 + }, + { + "epoch": 0.6612697565459775, + "grad_norm": 0.1038907915353775, + "learning_rate": 0.0002621428294419137, + "loss": 2.5733, + "step": 22300 + }, + { + "epoch": 0.661299409898289, + "grad_norm": 0.09893348067998886, + "learning_rate": 0.0002621014444778476, + "loss": 2.5756, + "step": 22301 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 0.09332430362701416, + "learning_rate": 0.00026206006162041406, + "loss": 2.6335, + "step": 22302 + }, + { + "epoch": 0.661358716602912, + "grad_norm": 0.1003083884716034, + "learning_rate": 0.00026201868086997985, + "loss": 2.6169, + "step": 22303 + }, + { + "epoch": 0.6613883699552234, + "grad_norm": 0.10524637252092361, + "learning_rate": 0.00026197730222691086, + "loss": 2.6126, + "step": 22304 + }, + { + "epoch": 0.661418023307535, + "grad_norm": 0.11269422620534897, + "learning_rate": 0.00026193592569157367, + "loss": 2.6147, + "step": 22305 + }, + { + "epoch": 0.6614476766598464, + "grad_norm": 0.09670896828174591, + "learning_rate": 0.0002618945512643348, + "loss": 2.591, + "step": 22306 + }, + { + "epoch": 0.6614773300121579, + "grad_norm": 0.10279536992311478, + "learning_rate": 0.0002618531789455605, + "loss": 2.6282, + "step": 22307 + }, + { + "epoch": 0.6615069833644693, + "grad_norm": 0.1217806339263916, + "learning_rate": 0.0002618118087356171, + "loss": 2.6162, + "step": 22308 + }, + { + "epoch": 0.6615366367167809, + "grad_norm": 0.09916560351848602, + "learning_rate": 0.0002617704406348711, + "loss": 2.605, + "step": 22309 + }, + { + "epoch": 0.6615662900690923, + "grad_norm": 0.1179804801940918, + "learning_rate": 0.0002617290746436888, + "loss": 2.6056, + "step": 22310 + }, + { + "epoch": 0.6615959434214038, + "grad_norm": 0.1111825630068779, + "learning_rate": 0.0002616877107624363, + "loss": 2.5871, + "step": 22311 + }, + { + "epoch": 0.6616255967737152, + "grad_norm": 0.11307075619697571, + "learning_rate": 0.0002616463489914801, + "loss": 2.578, + "step": 22312 + }, + { + "epoch": 0.6616552501260268, + "grad_norm": 0.10484077781438828, + "learning_rate": 0.0002616049893311864, + "loss": 2.5875, + "step": 22313 + }, + { + "epoch": 0.6616849034783382, + "grad_norm": 0.10619726777076721, + "learning_rate": 0.00026156363178192146, + "loss": 2.6212, + "step": 22314 + }, + { + "epoch": 0.6617145568306497, + "grad_norm": 0.09504413604736328, + "learning_rate": 0.00026152227634405146, + "loss": 2.5833, + "step": 22315 + }, + { + "epoch": 0.6617442101829611, + "grad_norm": 0.10588784515857697, + "learning_rate": 0.0002614809230179426, + "loss": 2.6157, + "step": 22316 + }, + { + "epoch": 0.6617738635352727, + "grad_norm": 0.0903911218047142, + "learning_rate": 0.00026143957180396114, + "loss": 2.6242, + "step": 22317 + }, + { + "epoch": 0.6618035168875841, + "grad_norm": 0.09702757745981216, + "learning_rate": 0.00026139822270247325, + "loss": 2.5935, + "step": 22318 + }, + { + "epoch": 0.6618331702398956, + "grad_norm": 0.0954374372959137, + "learning_rate": 0.00026135687571384505, + "loss": 2.593, + "step": 22319 + }, + { + "epoch": 0.661862823592207, + "grad_norm": 0.09179248660802841, + "learning_rate": 0.0002613155308384426, + "loss": 2.5552, + "step": 22320 + }, + { + "epoch": 0.6618924769445186, + "grad_norm": 0.09341692179441452, + "learning_rate": 0.00026127418807663216, + "loss": 2.6105, + "step": 22321 + }, + { + "epoch": 0.66192213029683, + "grad_norm": 0.0936591848731041, + "learning_rate": 0.00026123284742877973, + "loss": 2.5885, + "step": 22322 + }, + { + "epoch": 0.6619517836491415, + "grad_norm": 0.09129258245229721, + "learning_rate": 0.00026119150889525143, + "loss": 2.5904, + "step": 22323 + }, + { + "epoch": 0.6619814370014531, + "grad_norm": 0.09131333976984024, + "learning_rate": 0.0002611501724764134, + "loss": 2.5877, + "step": 22324 + }, + { + "epoch": 0.6620110903537645, + "grad_norm": 0.09462624788284302, + "learning_rate": 0.0002611088381726315, + "loss": 2.5829, + "step": 22325 + }, + { + "epoch": 0.662040743706076, + "grad_norm": 0.08607855439186096, + "learning_rate": 0.00026106750598427187, + "loss": 2.6042, + "step": 22326 + }, + { + "epoch": 0.6620703970583874, + "grad_norm": 0.09845440834760666, + "learning_rate": 0.00026102617591170044, + "loss": 2.6024, + "step": 22327 + }, + { + "epoch": 0.662100050410699, + "grad_norm": 0.09986244142055511, + "learning_rate": 0.00026098484795528327, + "loss": 2.5755, + "step": 22328 + }, + { + "epoch": 0.6621297037630104, + "grad_norm": 0.09690942615270615, + "learning_rate": 0.0002609435221153863, + "loss": 2.632, + "step": 22329 + }, + { + "epoch": 0.6621593571153219, + "grad_norm": 0.10362321138381958, + "learning_rate": 0.0002609021983923755, + "loss": 2.6125, + "step": 22330 + }, + { + "epoch": 0.6621890104676333, + "grad_norm": 0.10178206115961075, + "learning_rate": 0.00026086087678661675, + "loss": 2.6181, + "step": 22331 + }, + { + "epoch": 0.6622186638199449, + "grad_norm": 0.09582190215587616, + "learning_rate": 0.00026081955729847595, + "loss": 2.5948, + "step": 22332 + }, + { + "epoch": 0.6622483171722563, + "grad_norm": 0.09637380391359329, + "learning_rate": 0.00026077823992831905, + "loss": 2.6051, + "step": 22333 + }, + { + "epoch": 0.6622779705245678, + "grad_norm": 0.09850858151912689, + "learning_rate": 0.00026073692467651187, + "loss": 2.5876, + "step": 22334 + }, + { + "epoch": 0.6623076238768792, + "grad_norm": 0.08631191402673721, + "learning_rate": 0.00026069561154342037, + "loss": 2.6134, + "step": 22335 + }, + { + "epoch": 0.6623372772291908, + "grad_norm": 0.11614422500133514, + "learning_rate": 0.0002606543005294103, + "loss": 2.6023, + "step": 22336 + }, + { + "epoch": 0.6623669305815022, + "grad_norm": 0.10534381866455078, + "learning_rate": 0.00026061299163484766, + "loss": 2.5505, + "step": 22337 + }, + { + "epoch": 0.6623965839338137, + "grad_norm": 0.10763747245073318, + "learning_rate": 0.0002605716848600978, + "loss": 2.6242, + "step": 22338 + }, + { + "epoch": 0.6624262372861252, + "grad_norm": 0.10547803342342377, + "learning_rate": 0.0002605303802055268, + "loss": 2.5631, + "step": 22339 + }, + { + "epoch": 0.6624558906384367, + "grad_norm": 0.09975163638591766, + "learning_rate": 0.00026048907767150023, + "loss": 2.5613, + "step": 22340 + }, + { + "epoch": 0.6624855439907481, + "grad_norm": 0.09493737667798996, + "learning_rate": 0.0002604477772583842, + "loss": 2.5775, + "step": 22341 + }, + { + "epoch": 0.6625151973430596, + "grad_norm": 0.10197218507528305, + "learning_rate": 0.00026040647896654413, + "loss": 2.6135, + "step": 22342 + }, + { + "epoch": 0.6625448506953711, + "grad_norm": 0.10027462244033813, + "learning_rate": 0.0002603651827963459, + "loss": 2.5847, + "step": 22343 + }, + { + "epoch": 0.6625745040476826, + "grad_norm": 0.08807658404111862, + "learning_rate": 0.00026032388874815506, + "loss": 2.5857, + "step": 22344 + }, + { + "epoch": 0.6626041573999941, + "grad_norm": 0.09514137357473373, + "learning_rate": 0.00026028259682233735, + "loss": 2.5919, + "step": 22345 + }, + { + "epoch": 0.6626338107523055, + "grad_norm": 0.09559033066034317, + "learning_rate": 0.0002602413070192584, + "loss": 2.5853, + "step": 22346 + }, + { + "epoch": 0.6626634641046171, + "grad_norm": 0.0969802513718605, + "learning_rate": 0.00026020001933928406, + "loss": 2.5992, + "step": 22347 + }, + { + "epoch": 0.6626931174569285, + "grad_norm": 0.0923692062497139, + "learning_rate": 0.0002601587337827794, + "loss": 2.6023, + "step": 22348 + }, + { + "epoch": 0.66272277080924, + "grad_norm": 0.09382818639278412, + "learning_rate": 0.0002601174503501104, + "loss": 2.586, + "step": 22349 + }, + { + "epoch": 0.6627524241615514, + "grad_norm": 0.10135402530431747, + "learning_rate": 0.00026007616904164254, + "loss": 2.6068, + "step": 22350 + }, + { + "epoch": 0.662782077513863, + "grad_norm": 0.10601574182510376, + "learning_rate": 0.00026003488985774145, + "loss": 2.5872, + "step": 22351 + }, + { + "epoch": 0.6628117308661744, + "grad_norm": 0.10684054344892502, + "learning_rate": 0.00025999361279877253, + "loss": 2.5865, + "step": 22352 + }, + { + "epoch": 0.6628413842184859, + "grad_norm": 0.10685031861066818, + "learning_rate": 0.0002599523378651012, + "loss": 2.5994, + "step": 22353 + }, + { + "epoch": 0.6628710375707974, + "grad_norm": 0.0938018187880516, + "learning_rate": 0.00025991106505709327, + "loss": 2.5858, + "step": 22354 + }, + { + "epoch": 0.6629006909231089, + "grad_norm": 0.08944125473499298, + "learning_rate": 0.00025986979437511406, + "loss": 2.6124, + "step": 22355 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 0.1057126373052597, + "learning_rate": 0.0002598285258195291, + "loss": 2.6414, + "step": 22356 + }, + { + "epoch": 0.6629599976277318, + "grad_norm": 0.1054132878780365, + "learning_rate": 0.0002597872593907037, + "loss": 2.6099, + "step": 22357 + }, + { + "epoch": 0.6629896509800433, + "grad_norm": 0.09231206029653549, + "learning_rate": 0.00025974599508900363, + "loss": 2.6538, + "step": 22358 + }, + { + "epoch": 0.6630193043323548, + "grad_norm": 0.09928229451179504, + "learning_rate": 0.0002597047329147938, + "loss": 2.5924, + "step": 22359 + }, + { + "epoch": 0.6630489576846662, + "grad_norm": 0.10848220437765121, + "learning_rate": 0.0002596634728684397, + "loss": 2.5698, + "step": 22360 + }, + { + "epoch": 0.6630786110369777, + "grad_norm": 0.09485985338687897, + "learning_rate": 0.0002596222149503069, + "loss": 2.6173, + "step": 22361 + }, + { + "epoch": 0.6631082643892892, + "grad_norm": 0.09674783796072006, + "learning_rate": 0.0002595809591607606, + "loss": 2.5721, + "step": 22362 + }, + { + "epoch": 0.6631379177416007, + "grad_norm": 0.0996054857969284, + "learning_rate": 0.00025953970550016625, + "loss": 2.6122, + "step": 22363 + }, + { + "epoch": 0.6631675710939121, + "grad_norm": 0.0952870324254036, + "learning_rate": 0.00025949845396888905, + "loss": 2.593, + "step": 22364 + }, + { + "epoch": 0.6631972244462236, + "grad_norm": 0.10193964838981628, + "learning_rate": 0.00025945720456729425, + "loss": 2.574, + "step": 22365 + }, + { + "epoch": 0.6632268777985352, + "grad_norm": 0.11022070050239563, + "learning_rate": 0.00025941595729574705, + "loss": 2.6013, + "step": 22366 + }, + { + "epoch": 0.6632565311508466, + "grad_norm": 0.10604635626077652, + "learning_rate": 0.0002593747121546131, + "loss": 2.563, + "step": 22367 + }, + { + "epoch": 0.6632861845031581, + "grad_norm": 0.11662688851356506, + "learning_rate": 0.0002593334691442575, + "loss": 2.5741, + "step": 22368 + }, + { + "epoch": 0.6633158378554695, + "grad_norm": 0.11224575340747833, + "learning_rate": 0.00025929222826504515, + "loss": 2.606, + "step": 22369 + }, + { + "epoch": 0.6633454912077811, + "grad_norm": 0.10698752850294113, + "learning_rate": 0.0002592509895173415, + "loss": 2.577, + "step": 22370 + }, + { + "epoch": 0.6633751445600925, + "grad_norm": 0.12177939713001251, + "learning_rate": 0.00025920975290151163, + "loss": 2.5876, + "step": 22371 + }, + { + "epoch": 0.663404797912404, + "grad_norm": 0.10665547102689743, + "learning_rate": 0.0002591685184179207, + "loss": 2.5977, + "step": 22372 + }, + { + "epoch": 0.6634344512647155, + "grad_norm": 0.10736048966646194, + "learning_rate": 0.0002591272860669338, + "loss": 2.6137, + "step": 22373 + }, + { + "epoch": 0.663464104617027, + "grad_norm": 0.1168103814125061, + "learning_rate": 0.00025908605584891626, + "loss": 2.5882, + "step": 22374 + }, + { + "epoch": 0.6634937579693384, + "grad_norm": 0.09498661011457443, + "learning_rate": 0.00025904482776423297, + "loss": 2.6244, + "step": 22375 + }, + { + "epoch": 0.6635234113216499, + "grad_norm": 0.12352916598320007, + "learning_rate": 0.00025900360181324914, + "loss": 2.5987, + "step": 22376 + }, + { + "epoch": 0.6635530646739614, + "grad_norm": 0.1349116414785385, + "learning_rate": 0.00025896237799632977, + "loss": 2.6005, + "step": 22377 + }, + { + "epoch": 0.6635827180262729, + "grad_norm": 0.10169361531734467, + "learning_rate": 0.00025892115631383987, + "loss": 2.5774, + "step": 22378 + }, + { + "epoch": 0.6636123713785843, + "grad_norm": 0.11855147778987885, + "learning_rate": 0.0002588799367661446, + "loss": 2.6001, + "step": 22379 + }, + { + "epoch": 0.6636420247308958, + "grad_norm": 0.10132705420255661, + "learning_rate": 0.0002588387193536088, + "loss": 2.5969, + "step": 22380 + }, + { + "epoch": 0.6636716780832073, + "grad_norm": 0.11275182664394379, + "learning_rate": 0.0002587975040765976, + "loss": 2.572, + "step": 22381 + }, + { + "epoch": 0.6637013314355188, + "grad_norm": 0.11526163667440414, + "learning_rate": 0.0002587562909354758, + "loss": 2.5868, + "step": 22382 + }, + { + "epoch": 0.6637309847878302, + "grad_norm": 0.11082517355680466, + "learning_rate": 0.00025871507993060854, + "loss": 2.5994, + "step": 22383 + }, + { + "epoch": 0.6637606381401417, + "grad_norm": 0.11587422341108322, + "learning_rate": 0.0002586738710623606, + "loss": 2.5985, + "step": 22384 + }, + { + "epoch": 0.6637902914924533, + "grad_norm": 0.1045142412185669, + "learning_rate": 0.00025863266433109704, + "loss": 2.5731, + "step": 22385 + }, + { + "epoch": 0.6638199448447647, + "grad_norm": 0.11927274614572525, + "learning_rate": 0.00025859145973718264, + "loss": 2.6002, + "step": 22386 + }, + { + "epoch": 0.6638495981970762, + "grad_norm": 0.10914955288171768, + "learning_rate": 0.00025855025728098224, + "loss": 2.5948, + "step": 22387 + }, + { + "epoch": 0.6638792515493877, + "grad_norm": 0.11497940868139267, + "learning_rate": 0.0002585090569628609, + "loss": 2.622, + "step": 22388 + }, + { + "epoch": 0.6639089049016992, + "grad_norm": 0.10460496693849564, + "learning_rate": 0.00025846785878318315, + "loss": 2.6162, + "step": 22389 + }, + { + "epoch": 0.6639385582540106, + "grad_norm": 0.1092778667807579, + "learning_rate": 0.000258426662742314, + "loss": 2.5766, + "step": 22390 + }, + { + "epoch": 0.6639682116063221, + "grad_norm": 0.10613405704498291, + "learning_rate": 0.0002583854688406183, + "loss": 2.6154, + "step": 22391 + }, + { + "epoch": 0.6639978649586336, + "grad_norm": 0.10623999685049057, + "learning_rate": 0.00025834427707846063, + "loss": 2.6008, + "step": 22392 + }, + { + "epoch": 0.6640275183109451, + "grad_norm": 0.10429105907678604, + "learning_rate": 0.000258303087456206, + "loss": 2.6236, + "step": 22393 + }, + { + "epoch": 0.6640571716632565, + "grad_norm": 0.11046423763036728, + "learning_rate": 0.0002582618999742189, + "loss": 2.608, + "step": 22394 + }, + { + "epoch": 0.664086825015568, + "grad_norm": 0.09768673032522202, + "learning_rate": 0.00025822071463286426, + "loss": 2.5838, + "step": 22395 + }, + { + "epoch": 0.6641164783678795, + "grad_norm": 0.09357928484678268, + "learning_rate": 0.0002581795314325066, + "loss": 2.5823, + "step": 22396 + }, + { + "epoch": 0.664146131720191, + "grad_norm": 0.11731670051813126, + "learning_rate": 0.00025813835037351074, + "loss": 2.5863, + "step": 22397 + }, + { + "epoch": 0.6641757850725024, + "grad_norm": 0.10018433630466461, + "learning_rate": 0.00025809717145624134, + "loss": 2.6052, + "step": 22398 + }, + { + "epoch": 0.6642054384248139, + "grad_norm": 0.10037249326705933, + "learning_rate": 0.000258055994681063, + "loss": 2.6244, + "step": 22399 + }, + { + "epoch": 0.6642350917771254, + "grad_norm": 0.12806951999664307, + "learning_rate": 0.0002580148200483403, + "loss": 2.616, + "step": 22400 + }, + { + "epoch": 0.6642647451294369, + "grad_norm": 0.09691144526004791, + "learning_rate": 0.000257973647558438, + "loss": 2.5586, + "step": 22401 + }, + { + "epoch": 0.6642943984817483, + "grad_norm": 0.10643202811479568, + "learning_rate": 0.00025793247721172055, + "loss": 2.5795, + "step": 22402 + }, + { + "epoch": 0.6643240518340598, + "grad_norm": 0.11836894601583481, + "learning_rate": 0.00025789130900855274, + "loss": 2.5758, + "step": 22403 + }, + { + "epoch": 0.6643537051863713, + "grad_norm": 0.09232264757156372, + "learning_rate": 0.00025785014294929856, + "loss": 2.577, + "step": 22404 + }, + { + "epoch": 0.6643833585386828, + "grad_norm": 0.10728148370981216, + "learning_rate": 0.0002578089790343231, + "loss": 2.5998, + "step": 22405 + }, + { + "epoch": 0.6644130118909943, + "grad_norm": 0.10533124953508377, + "learning_rate": 0.0002577678172639907, + "loss": 2.6438, + "step": 22406 + }, + { + "epoch": 0.6644426652433058, + "grad_norm": 0.10264769196510315, + "learning_rate": 0.00025772665763866586, + "loss": 2.5996, + "step": 22407 + }, + { + "epoch": 0.6644723185956173, + "grad_norm": 0.0929056704044342, + "learning_rate": 0.00025768550015871307, + "loss": 2.5936, + "step": 22408 + }, + { + "epoch": 0.6645019719479287, + "grad_norm": 0.09425362944602966, + "learning_rate": 0.0002576443448244968, + "loss": 2.5748, + "step": 22409 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 0.0880345031619072, + "learning_rate": 0.0002576031916363815, + "loss": 2.5918, + "step": 22410 + }, + { + "epoch": 0.6645612786525517, + "grad_norm": 0.09428051114082336, + "learning_rate": 0.00025756204059473143, + "loss": 2.6115, + "step": 22411 + }, + { + "epoch": 0.6645909320048632, + "grad_norm": 0.1003665179014206, + "learning_rate": 0.0002575208916999111, + "loss": 2.6251, + "step": 22412 + }, + { + "epoch": 0.6646205853571746, + "grad_norm": 0.10709485411643982, + "learning_rate": 0.00025747974495228515, + "loss": 2.5655, + "step": 22413 + }, + { + "epoch": 0.6646502387094861, + "grad_norm": 0.0990864560008049, + "learning_rate": 0.0002574386003522175, + "loss": 2.5915, + "step": 22414 + }, + { + "epoch": 0.6646798920617976, + "grad_norm": 0.10049568116664886, + "learning_rate": 0.00025739745790007265, + "loss": 2.586, + "step": 22415 + }, + { + "epoch": 0.6647095454141091, + "grad_norm": 0.10054584592580795, + "learning_rate": 0.000257356317596215, + "loss": 2.595, + "step": 22416 + }, + { + "epoch": 0.6647391987664205, + "grad_norm": 0.0929626077413559, + "learning_rate": 0.0002573151794410086, + "loss": 2.6086, + "step": 22417 + }, + { + "epoch": 0.664768852118732, + "grad_norm": 0.11352430284023285, + "learning_rate": 0.00025727404343481807, + "loss": 2.5803, + "step": 22418 + }, + { + "epoch": 0.6647985054710435, + "grad_norm": 0.12319411337375641, + "learning_rate": 0.0002572329095780076, + "loss": 2.6598, + "step": 22419 + }, + { + "epoch": 0.664828158823355, + "grad_norm": 0.10491172224283218, + "learning_rate": 0.00025719177787094136, + "loss": 2.5629, + "step": 22420 + }, + { + "epoch": 0.6648578121756664, + "grad_norm": 0.11425737291574478, + "learning_rate": 0.0002571506483139836, + "loss": 2.6125, + "step": 22421 + }, + { + "epoch": 0.664887465527978, + "grad_norm": 0.12813186645507812, + "learning_rate": 0.00025710952090749855, + "loss": 2.6154, + "step": 22422 + }, + { + "epoch": 0.6649171188802894, + "grad_norm": 0.10249669849872589, + "learning_rate": 0.0002570683956518506, + "loss": 2.5902, + "step": 22423 + }, + { + "epoch": 0.6649467722326009, + "grad_norm": 0.11853881925344467, + "learning_rate": 0.0002570272725474035, + "loss": 2.5752, + "step": 22424 + }, + { + "epoch": 0.6649764255849123, + "grad_norm": 0.12534858286380768, + "learning_rate": 0.0002569861515945216, + "loss": 2.5841, + "step": 22425 + }, + { + "epoch": 0.6650060789372239, + "grad_norm": 0.10043642669916153, + "learning_rate": 0.0002569450327935691, + "loss": 2.5928, + "step": 22426 + }, + { + "epoch": 0.6650357322895354, + "grad_norm": 0.13262981176376343, + "learning_rate": 0.00025690391614490994, + "loss": 2.5797, + "step": 22427 + }, + { + "epoch": 0.6650653856418468, + "grad_norm": 0.12388641387224197, + "learning_rate": 0.0002568628016489084, + "loss": 2.6238, + "step": 22428 + }, + { + "epoch": 0.6650950389941583, + "grad_norm": 0.09327857941389084, + "learning_rate": 0.00025682168930592843, + "loss": 2.6214, + "step": 22429 + }, + { + "epoch": 0.6651246923464698, + "grad_norm": 0.10104767978191376, + "learning_rate": 0.000256780579116334, + "loss": 2.5869, + "step": 22430 + }, + { + "epoch": 0.6651543456987813, + "grad_norm": 0.09090252220630646, + "learning_rate": 0.0002567394710804895, + "loss": 2.5915, + "step": 22431 + }, + { + "epoch": 0.6651839990510927, + "grad_norm": 0.09615789353847504, + "learning_rate": 0.0002566983651987587, + "loss": 2.6061, + "step": 22432 + }, + { + "epoch": 0.6652136524034042, + "grad_norm": 0.10555309057235718, + "learning_rate": 0.00025665726147150567, + "loss": 2.6106, + "step": 22433 + }, + { + "epoch": 0.6652433057557157, + "grad_norm": 0.10054396837949753, + "learning_rate": 0.0002566161598990945, + "loss": 2.5887, + "step": 22434 + }, + { + "epoch": 0.6652729591080272, + "grad_norm": 0.09129004180431366, + "learning_rate": 0.00025657506048188885, + "loss": 2.6048, + "step": 22435 + }, + { + "epoch": 0.6653026124603386, + "grad_norm": 0.09530645608901978, + "learning_rate": 0.0002565339632202528, + "loss": 2.575, + "step": 22436 + }, + { + "epoch": 0.6653322658126501, + "grad_norm": 0.08559183776378632, + "learning_rate": 0.00025649286811455033, + "loss": 2.5857, + "step": 22437 + }, + { + "epoch": 0.6653619191649616, + "grad_norm": 0.09923087805509567, + "learning_rate": 0.0002564517751651453, + "loss": 2.5866, + "step": 22438 + }, + { + "epoch": 0.6653915725172731, + "grad_norm": 0.09196523576974869, + "learning_rate": 0.0002564106843724016, + "loss": 2.614, + "step": 22439 + }, + { + "epoch": 0.6654212258695845, + "grad_norm": 0.09453459829092026, + "learning_rate": 0.0002563695957366831, + "loss": 2.624, + "step": 22440 + }, + { + "epoch": 0.665450879221896, + "grad_norm": 0.10054683685302734, + "learning_rate": 0.0002563285092583537, + "loss": 2.5775, + "step": 22441 + }, + { + "epoch": 0.6654805325742075, + "grad_norm": 0.09371811151504517, + "learning_rate": 0.00025628742493777714, + "loss": 2.5657, + "step": 22442 + }, + { + "epoch": 0.665510185926519, + "grad_norm": 0.10208088159561157, + "learning_rate": 0.000256246342775317, + "loss": 2.5862, + "step": 22443 + }, + { + "epoch": 0.6655398392788304, + "grad_norm": 0.11211620271205902, + "learning_rate": 0.0002562052627713378, + "loss": 2.5819, + "step": 22444 + }, + { + "epoch": 0.665569492631142, + "grad_norm": 0.10241100192070007, + "learning_rate": 0.00025616418492620263, + "loss": 2.5759, + "step": 22445 + }, + { + "epoch": 0.6655991459834534, + "grad_norm": 0.10294874012470245, + "learning_rate": 0.0002561231092402755, + "loss": 2.5988, + "step": 22446 + }, + { + "epoch": 0.6656287993357649, + "grad_norm": 0.09338085353374481, + "learning_rate": 0.00025608203571392, + "loss": 2.5928, + "step": 22447 + }, + { + "epoch": 0.6656584526880764, + "grad_norm": 0.10677701234817505, + "learning_rate": 0.00025604096434750004, + "loss": 2.6382, + "step": 22448 + }, + { + "epoch": 0.6656881060403879, + "grad_norm": 0.0947297215461731, + "learning_rate": 0.0002559998951413792, + "loss": 2.6035, + "step": 22449 + }, + { + "epoch": 0.6657177593926994, + "grad_norm": 0.0954231470823288, + "learning_rate": 0.00025595882809592113, + "loss": 2.5916, + "step": 22450 + }, + { + "epoch": 0.6657474127450108, + "grad_norm": 0.10474386066198349, + "learning_rate": 0.00025591776321148954, + "loss": 2.598, + "step": 22451 + }, + { + "epoch": 0.6657770660973223, + "grad_norm": 0.108037568628788, + "learning_rate": 0.000255876700488448, + "loss": 2.59, + "step": 22452 + }, + { + "epoch": 0.6658067194496338, + "grad_norm": 0.1094234362244606, + "learning_rate": 0.0002558356399271603, + "loss": 2.6243, + "step": 22453 + }, + { + "epoch": 0.6658363728019453, + "grad_norm": 0.09350166469812393, + "learning_rate": 0.0002557945815279898, + "loss": 2.5607, + "step": 22454 + }, + { + "epoch": 0.6658660261542567, + "grad_norm": 0.10750704258680344, + "learning_rate": 0.0002557535252913003, + "loss": 2.5872, + "step": 22455 + }, + { + "epoch": 0.6658956795065682, + "grad_norm": 0.08952858299016953, + "learning_rate": 0.0002557124712174552, + "loss": 2.5497, + "step": 22456 + }, + { + "epoch": 0.6659253328588797, + "grad_norm": 0.09863663464784622, + "learning_rate": 0.0002556714193068181, + "loss": 2.5639, + "step": 22457 + }, + { + "epoch": 0.6659549862111912, + "grad_norm": 0.1062910407781601, + "learning_rate": 0.00025563036955975255, + "loss": 2.5933, + "step": 22458 + }, + { + "epoch": 0.6659846395635026, + "grad_norm": 0.09137222170829773, + "learning_rate": 0.000255589321976622, + "loss": 2.6143, + "step": 22459 + }, + { + "epoch": 0.6660142929158142, + "grad_norm": 0.10301197320222855, + "learning_rate": 0.00025554827655779, + "loss": 2.5522, + "step": 22460 + }, + { + "epoch": 0.6660439462681256, + "grad_norm": 0.09707280993461609, + "learning_rate": 0.0002555072333036199, + "loss": 2.5926, + "step": 22461 + }, + { + "epoch": 0.6660735996204371, + "grad_norm": 0.10515382885932922, + "learning_rate": 0.0002554661922144753, + "loss": 2.6262, + "step": 22462 + }, + { + "epoch": 0.6661032529727485, + "grad_norm": 0.09891771525144577, + "learning_rate": 0.00025542515329071946, + "loss": 2.6043, + "step": 22463 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 0.10152220726013184, + "learning_rate": 0.00025538411653271587, + "loss": 2.6108, + "step": 22464 + }, + { + "epoch": 0.6661625596773715, + "grad_norm": 0.11289095133543015, + "learning_rate": 0.00025534308194082793, + "loss": 2.6011, + "step": 22465 + }, + { + "epoch": 0.666192213029683, + "grad_norm": 0.12436568737030029, + "learning_rate": 0.000255302049515419, + "loss": 2.583, + "step": 22466 + }, + { + "epoch": 0.6662218663819944, + "grad_norm": 0.10584220290184021, + "learning_rate": 0.0002552610192568524, + "loss": 2.6303, + "step": 22467 + }, + { + "epoch": 0.666251519734306, + "grad_norm": 0.09712471067905426, + "learning_rate": 0.00025521999116549175, + "loss": 2.599, + "step": 22468 + }, + { + "epoch": 0.6662811730866175, + "grad_norm": 0.10810289531946182, + "learning_rate": 0.0002551789652416997, + "loss": 2.5802, + "step": 22469 + }, + { + "epoch": 0.6663108264389289, + "grad_norm": 0.09101024270057678, + "learning_rate": 0.0002551379414858401, + "loss": 2.5764, + "step": 22470 + }, + { + "epoch": 0.6663404797912404, + "grad_norm": 0.12161345779895782, + "learning_rate": 0.000255096919898276, + "loss": 2.5643, + "step": 22471 + }, + { + "epoch": 0.6663701331435519, + "grad_norm": 0.12304031103849411, + "learning_rate": 0.0002550559004793708, + "loss": 2.5912, + "step": 22472 + }, + { + "epoch": 0.6663997864958634, + "grad_norm": 0.10165967047214508, + "learning_rate": 0.0002550148832294876, + "loss": 2.6216, + "step": 22473 + }, + { + "epoch": 0.6664294398481748, + "grad_norm": 0.1016843244433403, + "learning_rate": 0.0002549738681489896, + "loss": 2.6182, + "step": 22474 + }, + { + "epoch": 0.6664590932004864, + "grad_norm": 0.1045142263174057, + "learning_rate": 0.0002549328552382402, + "loss": 2.5955, + "step": 22475 + }, + { + "epoch": 0.6664887465527978, + "grad_norm": 0.10955747961997986, + "learning_rate": 0.0002548918444976023, + "loss": 2.6408, + "step": 22476 + }, + { + "epoch": 0.6665183999051093, + "grad_norm": 0.09666745364665985, + "learning_rate": 0.0002548508359274393, + "loss": 2.5872, + "step": 22477 + }, + { + "epoch": 0.6665480532574207, + "grad_norm": 0.09616382420063019, + "learning_rate": 0.00025480982952811416, + "loss": 2.5886, + "step": 22478 + }, + { + "epoch": 0.6665777066097323, + "grad_norm": 0.10843392461538315, + "learning_rate": 0.00025476882529999024, + "loss": 2.6032, + "step": 22479 + }, + { + "epoch": 0.6666073599620437, + "grad_norm": 0.08906615525484085, + "learning_rate": 0.00025472782324343035, + "loss": 2.6144, + "step": 22480 + }, + { + "epoch": 0.6666370133143552, + "grad_norm": 0.09459441900253296, + "learning_rate": 0.0002546868233587976, + "loss": 2.5789, + "step": 22481 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.10223112255334854, + "learning_rate": 0.00025464582564645497, + "loss": 2.6007, + "step": 22482 + }, + { + "epoch": 0.6666963200189782, + "grad_norm": 0.08778075128793716, + "learning_rate": 0.00025460483010676595, + "loss": 2.578, + "step": 22483 + }, + { + "epoch": 0.6667259733712896, + "grad_norm": 0.10471352934837341, + "learning_rate": 0.0002545638367400932, + "loss": 2.6162, + "step": 22484 + }, + { + "epoch": 0.6667556267236011, + "grad_norm": 0.09703826159238815, + "learning_rate": 0.00025452284554679976, + "loss": 2.5883, + "step": 22485 + }, + { + "epoch": 0.6667852800759125, + "grad_norm": 0.10594841092824936, + "learning_rate": 0.00025448185652724874, + "loss": 2.6038, + "step": 22486 + }, + { + "epoch": 0.6668149334282241, + "grad_norm": 0.10568813979625702, + "learning_rate": 0.00025444086968180296, + "loss": 2.5501, + "step": 22487 + }, + { + "epoch": 0.6668445867805355, + "grad_norm": 0.09675087034702301, + "learning_rate": 0.00025439988501082546, + "loss": 2.5588, + "step": 22488 + }, + { + "epoch": 0.666874240132847, + "grad_norm": 0.09909705817699432, + "learning_rate": 0.0002543589025146793, + "loss": 2.6146, + "step": 22489 + }, + { + "epoch": 0.6669038934851585, + "grad_norm": 0.11094318330287933, + "learning_rate": 0.0002543179221937271, + "loss": 2.5792, + "step": 22490 + }, + { + "epoch": 0.66693354683747, + "grad_norm": 0.08716003596782684, + "learning_rate": 0.0002542769440483318, + "loss": 2.5609, + "step": 22491 + }, + { + "epoch": 0.6669632001897815, + "grad_norm": 0.11280978471040726, + "learning_rate": 0.0002542359680788564, + "loss": 2.5881, + "step": 22492 + }, + { + "epoch": 0.6669928535420929, + "grad_norm": 0.10412560403347015, + "learning_rate": 0.00025419499428566364, + "loss": 2.6104, + "step": 22493 + }, + { + "epoch": 0.6670225068944045, + "grad_norm": 0.09759071469306946, + "learning_rate": 0.0002541540226691164, + "loss": 2.5787, + "step": 22494 + }, + { + "epoch": 0.6670521602467159, + "grad_norm": 0.10048042982816696, + "learning_rate": 0.00025411305322957736, + "loss": 2.5501, + "step": 22495 + }, + { + "epoch": 0.6670818135990274, + "grad_norm": 0.10068811476230621, + "learning_rate": 0.0002540720859674095, + "loss": 2.6005, + "step": 22496 + }, + { + "epoch": 0.6671114669513388, + "grad_norm": 0.09823792427778244, + "learning_rate": 0.00025403112088297566, + "loss": 2.5897, + "step": 22497 + }, + { + "epoch": 0.6671411203036504, + "grad_norm": 0.09952981024980545, + "learning_rate": 0.0002539901579766384, + "loss": 2.5978, + "step": 22498 + }, + { + "epoch": 0.6671707736559618, + "grad_norm": 0.10868088901042938, + "learning_rate": 0.0002539491972487605, + "loss": 2.5703, + "step": 22499 + }, + { + "epoch": 0.6672004270082733, + "grad_norm": 0.10646551847457886, + "learning_rate": 0.0002539082386997049, + "loss": 2.6323, + "step": 22500 + }, + { + "epoch": 0.6672300803605847, + "grad_norm": 0.10492299497127533, + "learning_rate": 0.0002538672823298339, + "loss": 2.5913, + "step": 22501 + }, + { + "epoch": 0.6672597337128963, + "grad_norm": 0.10765539109706879, + "learning_rate": 0.00025382632813951043, + "loss": 2.6338, + "step": 22502 + }, + { + "epoch": 0.6672893870652077, + "grad_norm": 0.09396011382341385, + "learning_rate": 0.000253785376129097, + "loss": 2.5848, + "step": 22503 + }, + { + "epoch": 0.6673190404175192, + "grad_norm": 0.11049705743789673, + "learning_rate": 0.0002537444262989564, + "loss": 2.5754, + "step": 22504 + }, + { + "epoch": 0.6673486937698306, + "grad_norm": 0.1033833771944046, + "learning_rate": 0.0002537034786494511, + "loss": 2.6038, + "step": 22505 + }, + { + "epoch": 0.6673783471221422, + "grad_norm": 0.11300014704465866, + "learning_rate": 0.00025366253318094377, + "loss": 2.6024, + "step": 22506 + }, + { + "epoch": 0.6674080004744536, + "grad_norm": 0.0948672667145729, + "learning_rate": 0.00025362158989379705, + "loss": 2.5954, + "step": 22507 + }, + { + "epoch": 0.6674376538267651, + "grad_norm": 0.10168657451868057, + "learning_rate": 0.0002535806487883732, + "loss": 2.6253, + "step": 22508 + }, + { + "epoch": 0.6674673071790765, + "grad_norm": 0.10591309517621994, + "learning_rate": 0.0002535397098650353, + "loss": 2.6007, + "step": 22509 + }, + { + "epoch": 0.6674969605313881, + "grad_norm": 0.09233316034078598, + "learning_rate": 0.0002534987731241456, + "loss": 2.5882, + "step": 22510 + }, + { + "epoch": 0.6675266138836996, + "grad_norm": 0.10747085511684418, + "learning_rate": 0.0002534578385660665, + "loss": 2.6054, + "step": 22511 + }, + { + "epoch": 0.667556267236011, + "grad_norm": 0.1025770753622055, + "learning_rate": 0.00025341690619116054, + "loss": 2.5926, + "step": 22512 + }, + { + "epoch": 0.6675859205883226, + "grad_norm": 0.09605494141578674, + "learning_rate": 0.0002533759759997901, + "loss": 2.57, + "step": 22513 + }, + { + "epoch": 0.667615573940634, + "grad_norm": 0.09964588284492493, + "learning_rate": 0.0002533350479923179, + "loss": 2.5967, + "step": 22514 + }, + { + "epoch": 0.6676452272929455, + "grad_norm": 0.09532269090414047, + "learning_rate": 0.0002532941221691061, + "loss": 2.5737, + "step": 22515 + }, + { + "epoch": 0.6676748806452569, + "grad_norm": 0.0951431468129158, + "learning_rate": 0.00025325319853051716, + "loss": 2.6013, + "step": 22516 + }, + { + "epoch": 0.6677045339975685, + "grad_norm": 0.0946141853928566, + "learning_rate": 0.0002532122770769135, + "loss": 2.5669, + "step": 22517 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 0.09768812358379364, + "learning_rate": 0.00025317135780865755, + "loss": 2.6026, + "step": 22518 + }, + { + "epoch": 0.6677638407021914, + "grad_norm": 0.0937032699584961, + "learning_rate": 0.0002531304407261116, + "loss": 2.5859, + "step": 22519 + }, + { + "epoch": 0.6677934940545028, + "grad_norm": 0.09588337689638138, + "learning_rate": 0.0002530895258296378, + "loss": 2.5813, + "step": 22520 + }, + { + "epoch": 0.6678231474068144, + "grad_norm": 0.0973559021949768, + "learning_rate": 0.00025304861311959884, + "loss": 2.582, + "step": 22521 + }, + { + "epoch": 0.6678528007591258, + "grad_norm": 0.10123299062252045, + "learning_rate": 0.00025300770259635677, + "loss": 2.5897, + "step": 22522 + }, + { + "epoch": 0.6678824541114373, + "grad_norm": 0.09612283110618591, + "learning_rate": 0.0002529667942602738, + "loss": 2.5707, + "step": 22523 + }, + { + "epoch": 0.6679121074637487, + "grad_norm": 0.10067794471979141, + "learning_rate": 0.0002529258881117123, + "loss": 2.6018, + "step": 22524 + }, + { + "epoch": 0.6679417608160603, + "grad_norm": 0.08584856241941452, + "learning_rate": 0.0002528849841510345, + "loss": 2.6012, + "step": 22525 + }, + { + "epoch": 0.6679714141683717, + "grad_norm": 0.10822553187608719, + "learning_rate": 0.0002528440823786026, + "loss": 2.6164, + "step": 22526 + }, + { + "epoch": 0.6680010675206832, + "grad_norm": 0.09824430197477341, + "learning_rate": 0.00025280318279477873, + "loss": 2.5825, + "step": 22527 + }, + { + "epoch": 0.6680307208729946, + "grad_norm": 0.11362503468990326, + "learning_rate": 0.00025276228539992506, + "loss": 2.583, + "step": 22528 + }, + { + "epoch": 0.6680603742253062, + "grad_norm": 0.1026049554347992, + "learning_rate": 0.0002527213901944039, + "loss": 2.5945, + "step": 22529 + }, + { + "epoch": 0.6680900275776176, + "grad_norm": 0.10172281414270401, + "learning_rate": 0.0002526804971785772, + "loss": 2.5498, + "step": 22530 + }, + { + "epoch": 0.6681196809299291, + "grad_norm": 0.1080598458647728, + "learning_rate": 0.00025263960635280713, + "loss": 2.5815, + "step": 22531 + }, + { + "epoch": 0.6681493342822407, + "grad_norm": 0.10280294716358185, + "learning_rate": 0.0002525987177174559, + "loss": 2.5853, + "step": 22532 + }, + { + "epoch": 0.6681789876345521, + "grad_norm": 0.0954602062702179, + "learning_rate": 0.0002525578312728855, + "loss": 2.6104, + "step": 22533 + }, + { + "epoch": 0.6682086409868636, + "grad_norm": 0.10928058624267578, + "learning_rate": 0.0002525169470194578, + "loss": 2.6044, + "step": 22534 + }, + { + "epoch": 0.668238294339175, + "grad_norm": 0.10237450897693634, + "learning_rate": 0.0002524760649575352, + "loss": 2.5967, + "step": 22535 + }, + { + "epoch": 0.6682679476914866, + "grad_norm": 0.09445108473300934, + "learning_rate": 0.00025243518508747943, + "loss": 2.5676, + "step": 22536 + }, + { + "epoch": 0.668297601043798, + "grad_norm": 0.10931757092475891, + "learning_rate": 0.00025239430740965266, + "loss": 2.577, + "step": 22537 + }, + { + "epoch": 0.6683272543961095, + "grad_norm": 0.09451793879270554, + "learning_rate": 0.0002523534319244167, + "loss": 2.6095, + "step": 22538 + }, + { + "epoch": 0.6683569077484209, + "grad_norm": 0.09805946797132492, + "learning_rate": 0.00025231255863213364, + "loss": 2.6255, + "step": 22539 + }, + { + "epoch": 0.6683865611007325, + "grad_norm": 0.09512082487344742, + "learning_rate": 0.0002522716875331654, + "loss": 2.5632, + "step": 22540 + }, + { + "epoch": 0.6684162144530439, + "grad_norm": 0.10275690257549286, + "learning_rate": 0.00025223081862787403, + "loss": 2.5865, + "step": 22541 + }, + { + "epoch": 0.6684458678053554, + "grad_norm": 0.08928638696670532, + "learning_rate": 0.0002521899519166211, + "loss": 2.5938, + "step": 22542 + }, + { + "epoch": 0.6684755211576668, + "grad_norm": 0.10301732271909714, + "learning_rate": 0.0002521490873997687, + "loss": 2.5786, + "step": 22543 + }, + { + "epoch": 0.6685051745099784, + "grad_norm": 0.10659157484769821, + "learning_rate": 0.00025210822507767895, + "loss": 2.5913, + "step": 22544 + }, + { + "epoch": 0.6685348278622898, + "grad_norm": 0.09925872087478638, + "learning_rate": 0.0002520673649507132, + "loss": 2.6274, + "step": 22545 + }, + { + "epoch": 0.6685644812146013, + "grad_norm": 0.09662050008773804, + "learning_rate": 0.0002520265070192335, + "loss": 2.6001, + "step": 22546 + }, + { + "epoch": 0.6685941345669127, + "grad_norm": 0.11341818422079086, + "learning_rate": 0.0002519856512836014, + "loss": 2.6114, + "step": 22547 + }, + { + "epoch": 0.6686237879192243, + "grad_norm": 0.10310184955596924, + "learning_rate": 0.0002519447977441792, + "loss": 2.6146, + "step": 22548 + }, + { + "epoch": 0.6686534412715357, + "grad_norm": 0.09956487268209457, + "learning_rate": 0.0002519039464013283, + "loss": 2.583, + "step": 22549 + }, + { + "epoch": 0.6686830946238472, + "grad_norm": 0.11436259001493454, + "learning_rate": 0.00025186309725541055, + "loss": 2.5958, + "step": 22550 + }, + { + "epoch": 0.6687127479761587, + "grad_norm": 0.10719480365514755, + "learning_rate": 0.00025182225030678774, + "loss": 2.5994, + "step": 22551 + }, + { + "epoch": 0.6687424013284702, + "grad_norm": 0.1312941163778305, + "learning_rate": 0.0002517814055558215, + "loss": 2.6136, + "step": 22552 + }, + { + "epoch": 0.6687720546807817, + "grad_norm": 0.11593261361122131, + "learning_rate": 0.00025174056300287346, + "loss": 2.5954, + "step": 22553 + }, + { + "epoch": 0.6688017080330931, + "grad_norm": 0.10152789950370789, + "learning_rate": 0.0002516997226483053, + "loss": 2.587, + "step": 22554 + }, + { + "epoch": 0.6688313613854047, + "grad_norm": 0.11591440439224243, + "learning_rate": 0.000251658884492479, + "loss": 2.5879, + "step": 22555 + }, + { + "epoch": 0.6688610147377161, + "grad_norm": 0.12404514849185944, + "learning_rate": 0.00025161804853575577, + "loss": 2.5747, + "step": 22556 + }, + { + "epoch": 0.6688906680900276, + "grad_norm": 0.10191851109266281, + "learning_rate": 0.00025157721477849724, + "loss": 2.5974, + "step": 22557 + }, + { + "epoch": 0.668920321442339, + "grad_norm": 0.10320957005023956, + "learning_rate": 0.00025153638322106514, + "loss": 2.5771, + "step": 22558 + }, + { + "epoch": 0.6689499747946506, + "grad_norm": 0.10769856721162796, + "learning_rate": 0.00025149555386382103, + "loss": 2.6011, + "step": 22559 + }, + { + "epoch": 0.668979628146962, + "grad_norm": 0.09291034191846848, + "learning_rate": 0.00025145472670712625, + "loss": 2.5975, + "step": 22560 + }, + { + "epoch": 0.6690092814992735, + "grad_norm": 0.10342124849557877, + "learning_rate": 0.00025141390175134273, + "loss": 2.6069, + "step": 22561 + }, + { + "epoch": 0.6690389348515849, + "grad_norm": 0.09092098474502563, + "learning_rate": 0.00025137307899683175, + "loss": 2.5773, + "step": 22562 + }, + { + "epoch": 0.6690685882038965, + "grad_norm": 0.1040770411491394, + "learning_rate": 0.0002513322584439549, + "loss": 2.5955, + "step": 22563 + }, + { + "epoch": 0.6690982415562079, + "grad_norm": 0.0982007309794426, + "learning_rate": 0.0002512914400930735, + "loss": 2.6129, + "step": 22564 + }, + { + "epoch": 0.6691278949085194, + "grad_norm": 0.09992796927690506, + "learning_rate": 0.00025125062394454936, + "loss": 2.566, + "step": 22565 + }, + { + "epoch": 0.6691575482608308, + "grad_norm": 0.09886657446622849, + "learning_rate": 0.00025120980999874333, + "loss": 2.5713, + "step": 22566 + }, + { + "epoch": 0.6691872016131424, + "grad_norm": 0.09457311034202576, + "learning_rate": 0.00025116899825601725, + "loss": 2.6062, + "step": 22567 + }, + { + "epoch": 0.6692168549654538, + "grad_norm": 0.10538893193006516, + "learning_rate": 0.0002511281887167324, + "loss": 2.6029, + "step": 22568 + }, + { + "epoch": 0.6692465083177653, + "grad_norm": 0.09672258049249649, + "learning_rate": 0.0002510873813812501, + "loss": 2.6417, + "step": 22569 + }, + { + "epoch": 0.6692761616700768, + "grad_norm": 0.11483702063560486, + "learning_rate": 0.00025104657624993177, + "loss": 2.6313, + "step": 22570 + }, + { + "epoch": 0.6693058150223883, + "grad_norm": 0.10488869994878769, + "learning_rate": 0.00025100577332313876, + "loss": 2.5846, + "step": 22571 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 0.09626225382089615, + "learning_rate": 0.0002509649726012322, + "loss": 2.6068, + "step": 22572 + }, + { + "epoch": 0.6693651217270112, + "grad_norm": 0.11977715790271759, + "learning_rate": 0.0002509241740845737, + "loss": 2.6248, + "step": 22573 + }, + { + "epoch": 0.6693947750793228, + "grad_norm": 0.1074523776769638, + "learning_rate": 0.00025088337777352435, + "loss": 2.5957, + "step": 22574 + }, + { + "epoch": 0.6694244284316342, + "grad_norm": 0.10720492154359818, + "learning_rate": 0.00025084258366844553, + "loss": 2.6048, + "step": 22575 + }, + { + "epoch": 0.6694540817839457, + "grad_norm": 0.11175384372472763, + "learning_rate": 0.00025080179176969856, + "loss": 2.5975, + "step": 22576 + }, + { + "epoch": 0.6694837351362571, + "grad_norm": 0.09771987795829773, + "learning_rate": 0.00025076100207764427, + "loss": 2.5923, + "step": 22577 + }, + { + "epoch": 0.6695133884885687, + "grad_norm": 0.10170648247003555, + "learning_rate": 0.0002507202145926442, + "loss": 2.6206, + "step": 22578 + }, + { + "epoch": 0.6695430418408801, + "grad_norm": 0.09573648869991302, + "learning_rate": 0.0002506794293150593, + "loss": 2.6079, + "step": 22579 + }, + { + "epoch": 0.6695726951931916, + "grad_norm": 0.11722738295793533, + "learning_rate": 0.0002506386462452509, + "loss": 2.5961, + "step": 22580 + }, + { + "epoch": 0.669602348545503, + "grad_norm": 0.10837739706039429, + "learning_rate": 0.0002505978653835801, + "loss": 2.5884, + "step": 22581 + }, + { + "epoch": 0.6696320018978146, + "grad_norm": 0.09694771468639374, + "learning_rate": 0.000250557086730408, + "loss": 2.6287, + "step": 22582 + }, + { + "epoch": 0.669661655250126, + "grad_norm": 0.09797707945108414, + "learning_rate": 0.00025051631028609575, + "loss": 2.5852, + "step": 22583 + }, + { + "epoch": 0.6696913086024375, + "grad_norm": 0.09903944283723831, + "learning_rate": 0.00025047553605100437, + "loss": 2.6169, + "step": 22584 + }, + { + "epoch": 0.669720961954749, + "grad_norm": 0.09087766706943512, + "learning_rate": 0.0002504347640254947, + "loss": 2.591, + "step": 22585 + }, + { + "epoch": 0.6697506153070605, + "grad_norm": 0.10211840271949768, + "learning_rate": 0.0002503939942099285, + "loss": 2.5788, + "step": 22586 + }, + { + "epoch": 0.6697802686593719, + "grad_norm": 0.11427754163742065, + "learning_rate": 0.0002503532266046661, + "loss": 2.5838, + "step": 22587 + }, + { + "epoch": 0.6698099220116834, + "grad_norm": 0.0938175842165947, + "learning_rate": 0.00025031246121006866, + "loss": 2.6025, + "step": 22588 + }, + { + "epoch": 0.6698395753639949, + "grad_norm": 0.10565444827079773, + "learning_rate": 0.00025027169802649727, + "loss": 2.5923, + "step": 22589 + }, + { + "epoch": 0.6698692287163064, + "grad_norm": 0.10641921311616898, + "learning_rate": 0.000250230937054313, + "loss": 2.5794, + "step": 22590 + }, + { + "epoch": 0.6698988820686178, + "grad_norm": 0.10329484939575195, + "learning_rate": 0.0002501901782938765, + "loss": 2.5819, + "step": 22591 + }, + { + "epoch": 0.6699285354209293, + "grad_norm": 0.10542138665914536, + "learning_rate": 0.0002501494217455489, + "loss": 2.6245, + "step": 22592 + }, + { + "epoch": 0.6699581887732409, + "grad_norm": 0.10699554532766342, + "learning_rate": 0.00025010866740969107, + "loss": 2.5732, + "step": 22593 + }, + { + "epoch": 0.6699878421255523, + "grad_norm": 0.08948145061731339, + "learning_rate": 0.0002500679152866638, + "loss": 2.5967, + "step": 22594 + }, + { + "epoch": 0.6700174954778638, + "grad_norm": 0.09521244466304779, + "learning_rate": 0.0002500271653768281, + "loss": 2.5888, + "step": 22595 + }, + { + "epoch": 0.6700471488301752, + "grad_norm": 0.09785138070583344, + "learning_rate": 0.00024998641768054483, + "loss": 2.5696, + "step": 22596 + }, + { + "epoch": 0.6700768021824868, + "grad_norm": 0.10619857907295227, + "learning_rate": 0.0002499456721981747, + "loss": 2.604, + "step": 22597 + }, + { + "epoch": 0.6701064555347982, + "grad_norm": 0.10581299662590027, + "learning_rate": 0.0002499049289300785, + "loss": 2.5625, + "step": 22598 + }, + { + "epoch": 0.6701361088871097, + "grad_norm": 0.09264855086803436, + "learning_rate": 0.00024986418787661713, + "loss": 2.5786, + "step": 22599 + }, + { + "epoch": 0.6701657622394211, + "grad_norm": 0.12142285704612732, + "learning_rate": 0.0002498234490381513, + "loss": 2.5458, + "step": 22600 + }, + { + "epoch": 0.6701954155917327, + "grad_norm": 0.12062817066907883, + "learning_rate": 0.00024978271241504177, + "loss": 2.592, + "step": 22601 + }, + { + "epoch": 0.6702250689440441, + "grad_norm": 0.1046297550201416, + "learning_rate": 0.00024974197800764925, + "loss": 2.5781, + "step": 22602 + }, + { + "epoch": 0.6702547222963556, + "grad_norm": 0.10314873605966568, + "learning_rate": 0.00024970124581633453, + "loss": 2.5799, + "step": 22603 + }, + { + "epoch": 0.670284375648667, + "grad_norm": 0.10082754492759705, + "learning_rate": 0.0002496605158414582, + "loss": 2.605, + "step": 22604 + }, + { + "epoch": 0.6703140290009786, + "grad_norm": 0.08923842012882233, + "learning_rate": 0.00024961978808338097, + "loss": 2.629, + "step": 22605 + }, + { + "epoch": 0.67034368235329, + "grad_norm": 0.11050272732973099, + "learning_rate": 0.0002495790625424635, + "loss": 2.591, + "step": 22606 + }, + { + "epoch": 0.6703733357056015, + "grad_norm": 0.09325924515724182, + "learning_rate": 0.00024953833921906646, + "loss": 2.608, + "step": 22607 + }, + { + "epoch": 0.670402989057913, + "grad_norm": 0.10823728889226913, + "learning_rate": 0.00024949761811355036, + "loss": 2.572, + "step": 22608 + }, + { + "epoch": 0.6704326424102245, + "grad_norm": 0.10753588378429413, + "learning_rate": 0.00024945689922627586, + "loss": 2.5964, + "step": 22609 + }, + { + "epoch": 0.6704622957625359, + "grad_norm": 0.10512631386518478, + "learning_rate": 0.0002494161825576037, + "loss": 2.5593, + "step": 22610 + }, + { + "epoch": 0.6704919491148474, + "grad_norm": 0.11024849861860275, + "learning_rate": 0.0002493754681078939, + "loss": 2.6038, + "step": 22611 + }, + { + "epoch": 0.6705216024671589, + "grad_norm": 0.11967554688453674, + "learning_rate": 0.00024933475587750754, + "loss": 2.6368, + "step": 22612 + }, + { + "epoch": 0.6705512558194704, + "grad_norm": 0.10522904247045517, + "learning_rate": 0.00024929404586680493, + "loss": 2.6015, + "step": 22613 + }, + { + "epoch": 0.6705809091717819, + "grad_norm": 0.10638045519590378, + "learning_rate": 0.00024925333807614657, + "loss": 2.6105, + "step": 22614 + }, + { + "epoch": 0.6706105625240933, + "grad_norm": 0.11364874243736267, + "learning_rate": 0.000249212632505893, + "loss": 2.6002, + "step": 22615 + }, + { + "epoch": 0.6706402158764049, + "grad_norm": 0.11477339267730713, + "learning_rate": 0.00024917192915640456, + "loss": 2.5859, + "step": 22616 + }, + { + "epoch": 0.6706698692287163, + "grad_norm": 0.09128707647323608, + "learning_rate": 0.0002491312280280418, + "loss": 2.6076, + "step": 22617 + }, + { + "epoch": 0.6706995225810278, + "grad_norm": 0.11943932622671127, + "learning_rate": 0.000249090529121165, + "loss": 2.6194, + "step": 22618 + }, + { + "epoch": 0.6707291759333393, + "grad_norm": 0.10535196214914322, + "learning_rate": 0.00024904983243613467, + "loss": 2.5959, + "step": 22619 + }, + { + "epoch": 0.6707588292856508, + "grad_norm": 0.12152882665395737, + "learning_rate": 0.00024900913797331134, + "loss": 2.6254, + "step": 22620 + }, + { + "epoch": 0.6707884826379622, + "grad_norm": 0.10484163463115692, + "learning_rate": 0.0002489684457330549, + "loss": 2.5882, + "step": 22621 + }, + { + "epoch": 0.6708181359902737, + "grad_norm": 0.10563971847295761, + "learning_rate": 0.00024892775571572613, + "loss": 2.5745, + "step": 22622 + }, + { + "epoch": 0.6708477893425852, + "grad_norm": 0.10157689452171326, + "learning_rate": 0.0002488870679216851, + "loss": 2.5933, + "step": 22623 + }, + { + "epoch": 0.6708774426948967, + "grad_norm": 0.09964672476053238, + "learning_rate": 0.0002488463823512919, + "loss": 2.5477, + "step": 22624 + }, + { + "epoch": 0.6709070960472081, + "grad_norm": 0.10479414463043213, + "learning_rate": 0.0002488056990049074, + "loss": 2.5589, + "step": 22625 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 0.10049928724765778, + "learning_rate": 0.00024876501788289153, + "loss": 2.5814, + "step": 22626 + }, + { + "epoch": 0.6709664027518311, + "grad_norm": 0.10682729631662369, + "learning_rate": 0.00024872433898560453, + "loss": 2.6059, + "step": 22627 + }, + { + "epoch": 0.6709960561041426, + "grad_norm": 0.09725833684206009, + "learning_rate": 0.0002486836623134067, + "loss": 2.5861, + "step": 22628 + }, + { + "epoch": 0.671025709456454, + "grad_norm": 0.09242740273475647, + "learning_rate": 0.00024864298786665814, + "loss": 2.633, + "step": 22629 + }, + { + "epoch": 0.6710553628087655, + "grad_norm": 0.09831448644399643, + "learning_rate": 0.000248602315645719, + "loss": 2.5846, + "step": 22630 + }, + { + "epoch": 0.671085016161077, + "grad_norm": 0.09839916974306107, + "learning_rate": 0.0002485616456509498, + "loss": 2.5953, + "step": 22631 + }, + { + "epoch": 0.6711146695133885, + "grad_norm": 0.10170678794384003, + "learning_rate": 0.0002485209778827101, + "loss": 2.568, + "step": 22632 + }, + { + "epoch": 0.6711443228656999, + "grad_norm": 0.10494419932365417, + "learning_rate": 0.0002484803123413604, + "loss": 2.5831, + "step": 22633 + }, + { + "epoch": 0.6711739762180114, + "grad_norm": 0.10745534300804138, + "learning_rate": 0.00024843964902726063, + "loss": 2.5896, + "step": 22634 + }, + { + "epoch": 0.671203629570323, + "grad_norm": 0.0958472415804863, + "learning_rate": 0.00024839898794077096, + "loss": 2.5973, + "step": 22635 + }, + { + "epoch": 0.6712332829226344, + "grad_norm": 0.1063031405210495, + "learning_rate": 0.0002483583290822515, + "loss": 2.6064, + "step": 22636 + }, + { + "epoch": 0.6712629362749459, + "grad_norm": 0.09594354778528214, + "learning_rate": 0.000248317672452062, + "loss": 2.579, + "step": 22637 + }, + { + "epoch": 0.6712925896272574, + "grad_norm": 0.09863672405481339, + "learning_rate": 0.0002482770180505629, + "loss": 2.6109, + "step": 22638 + }, + { + "epoch": 0.6713222429795689, + "grad_norm": 0.09780558198690414, + "learning_rate": 0.000248236365878114, + "loss": 2.6057, + "step": 22639 + }, + { + "epoch": 0.6713518963318803, + "grad_norm": 0.09435220062732697, + "learning_rate": 0.0002481957159350753, + "loss": 2.6062, + "step": 22640 + }, + { + "epoch": 0.6713815496841918, + "grad_norm": 0.09682976454496384, + "learning_rate": 0.000248155068221807, + "loss": 2.5811, + "step": 22641 + }, + { + "epoch": 0.6714112030365033, + "grad_norm": 0.09728296846151352, + "learning_rate": 0.0002481144227386685, + "loss": 2.6097, + "step": 22642 + }, + { + "epoch": 0.6714408563888148, + "grad_norm": 0.0920535996556282, + "learning_rate": 0.00024807377948602013, + "loss": 2.5931, + "step": 22643 + }, + { + "epoch": 0.6714705097411262, + "grad_norm": 0.10593336820602417, + "learning_rate": 0.00024803313846422163, + "loss": 2.5891, + "step": 22644 + }, + { + "epoch": 0.6715001630934377, + "grad_norm": 0.11119308322668076, + "learning_rate": 0.000247992499673633, + "loss": 2.6161, + "step": 22645 + }, + { + "epoch": 0.6715298164457492, + "grad_norm": 0.09938576817512512, + "learning_rate": 0.00024795186311461396, + "loss": 2.6057, + "step": 22646 + }, + { + "epoch": 0.6715594697980607, + "grad_norm": 0.09033264219760895, + "learning_rate": 0.0002479112287875245, + "loss": 2.6113, + "step": 22647 + }, + { + "epoch": 0.6715891231503721, + "grad_norm": 0.09729848802089691, + "learning_rate": 0.00024787059669272433, + "loss": 2.6007, + "step": 22648 + }, + { + "epoch": 0.6716187765026836, + "grad_norm": 0.09698684513568878, + "learning_rate": 0.0002478299668305733, + "loss": 2.5727, + "step": 22649 + }, + { + "epoch": 0.6716484298549951, + "grad_norm": 0.08965045213699341, + "learning_rate": 0.00024778933920143104, + "loss": 2.6069, + "step": 22650 + }, + { + "epoch": 0.6716780832073066, + "grad_norm": 0.10369832068681717, + "learning_rate": 0.00024774871380565765, + "loss": 2.5996, + "step": 22651 + }, + { + "epoch": 0.671707736559618, + "grad_norm": 0.10411905497312546, + "learning_rate": 0.00024770809064361286, + "loss": 2.5968, + "step": 22652 + }, + { + "epoch": 0.6717373899119295, + "grad_norm": 0.09823181480169296, + "learning_rate": 0.0002476674697156561, + "loss": 2.5709, + "step": 22653 + }, + { + "epoch": 0.671767043264241, + "grad_norm": 0.10929930210113525, + "learning_rate": 0.0002476268510221472, + "loss": 2.585, + "step": 22654 + }, + { + "epoch": 0.6717966966165525, + "grad_norm": 0.10137219727039337, + "learning_rate": 0.0002475862345634458, + "loss": 2.6164, + "step": 22655 + }, + { + "epoch": 0.671826349968864, + "grad_norm": 0.10342701524496078, + "learning_rate": 0.00024754562033991166, + "loss": 2.5931, + "step": 22656 + }, + { + "epoch": 0.6718560033211755, + "grad_norm": 0.09316851943731308, + "learning_rate": 0.0002475050083519044, + "loss": 2.563, + "step": 22657 + }, + { + "epoch": 0.671885656673487, + "grad_norm": 0.10305218398571014, + "learning_rate": 0.0002474643985997836, + "loss": 2.5726, + "step": 22658 + }, + { + "epoch": 0.6719153100257984, + "grad_norm": 0.10561788827180862, + "learning_rate": 0.00024742379108390897, + "loss": 2.6543, + "step": 22659 + }, + { + "epoch": 0.6719449633781099, + "grad_norm": 0.11396607756614685, + "learning_rate": 0.0002473831858046399, + "loss": 2.5816, + "step": 22660 + }, + { + "epoch": 0.6719746167304214, + "grad_norm": 0.09509358555078506, + "learning_rate": 0.00024734258276233617, + "loss": 2.5907, + "step": 22661 + }, + { + "epoch": 0.6720042700827329, + "grad_norm": 0.112420953810215, + "learning_rate": 0.0002473019819573572, + "loss": 2.5945, + "step": 22662 + }, + { + "epoch": 0.6720339234350443, + "grad_norm": 0.09973324835300446, + "learning_rate": 0.00024726138339006257, + "loss": 2.6141, + "step": 22663 + }, + { + "epoch": 0.6720635767873558, + "grad_norm": 0.10083156824111938, + "learning_rate": 0.00024722078706081174, + "loss": 2.5916, + "step": 22664 + }, + { + "epoch": 0.6720932301396673, + "grad_norm": 0.09809014946222305, + "learning_rate": 0.0002471801929699643, + "loss": 2.598, + "step": 22665 + }, + { + "epoch": 0.6721228834919788, + "grad_norm": 0.08991141617298126, + "learning_rate": 0.0002471396011178796, + "loss": 2.5809, + "step": 22666 + }, + { + "epoch": 0.6721525368442902, + "grad_norm": 0.09746401757001877, + "learning_rate": 0.00024709901150491716, + "loss": 2.5683, + "step": 22667 + }, + { + "epoch": 0.6721821901966017, + "grad_norm": 0.0874391421675682, + "learning_rate": 0.00024705842413143633, + "loss": 2.5793, + "step": 22668 + }, + { + "epoch": 0.6722118435489132, + "grad_norm": 0.10585005581378937, + "learning_rate": 0.0002470178389977966, + "loss": 2.5875, + "step": 22669 + }, + { + "epoch": 0.6722414969012247, + "grad_norm": 0.10009734332561493, + "learning_rate": 0.00024697725610435734, + "loss": 2.5836, + "step": 22670 + }, + { + "epoch": 0.6722711502535361, + "grad_norm": 0.10220520943403244, + "learning_rate": 0.0002469366754514779, + "loss": 2.5952, + "step": 22671 + }, + { + "epoch": 0.6723008036058477, + "grad_norm": 0.10241875797510147, + "learning_rate": 0.00024689609703951767, + "loss": 2.6002, + "step": 22672 + }, + { + "epoch": 0.6723304569581591, + "grad_norm": 0.10224048048257828, + "learning_rate": 0.0002468555208688359, + "loss": 2.5986, + "step": 22673 + }, + { + "epoch": 0.6723601103104706, + "grad_norm": 0.1170080155134201, + "learning_rate": 0.00024681494693979197, + "loss": 2.6115, + "step": 22674 + }, + { + "epoch": 0.672389763662782, + "grad_norm": 0.09271851181983948, + "learning_rate": 0.0002467743752527453, + "loss": 2.6119, + "step": 22675 + }, + { + "epoch": 0.6724194170150936, + "grad_norm": 0.10380575060844421, + "learning_rate": 0.0002467338058080546, + "loss": 2.5896, + "step": 22676 + }, + { + "epoch": 0.6724490703674051, + "grad_norm": 0.09430544823408127, + "learning_rate": 0.0002466932386060798, + "loss": 2.5666, + "step": 22677 + }, + { + "epoch": 0.6724787237197165, + "grad_norm": 0.1048199012875557, + "learning_rate": 0.0002466526736471798, + "loss": 2.6064, + "step": 22678 + }, + { + "epoch": 0.672508377072028, + "grad_norm": 0.10144872218370438, + "learning_rate": 0.00024661211093171385, + "loss": 2.5641, + "step": 22679 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 0.11403005570173264, + "learning_rate": 0.0002465715504600412, + "loss": 2.5752, + "step": 22680 + }, + { + "epoch": 0.672567683776651, + "grad_norm": 0.10178620368242264, + "learning_rate": 0.0002465309922325209, + "loss": 2.5813, + "step": 22681 + }, + { + "epoch": 0.6725973371289624, + "grad_norm": 0.10060810297727585, + "learning_rate": 0.00024649043624951224, + "loss": 2.5906, + "step": 22682 + }, + { + "epoch": 0.672626990481274, + "grad_norm": 0.10586945712566376, + "learning_rate": 0.00024644988251137424, + "loss": 2.593, + "step": 22683 + }, + { + "epoch": 0.6726566438335854, + "grad_norm": 0.10509448498487473, + "learning_rate": 0.00024640933101846606, + "loss": 2.581, + "step": 22684 + }, + { + "epoch": 0.6726862971858969, + "grad_norm": 0.11544812470674515, + "learning_rate": 0.0002463687817711468, + "loss": 2.536, + "step": 22685 + }, + { + "epoch": 0.6727159505382083, + "grad_norm": 0.10043544322252274, + "learning_rate": 0.00024632823476977565, + "loss": 2.572, + "step": 22686 + }, + { + "epoch": 0.6727456038905198, + "grad_norm": 0.1128062978386879, + "learning_rate": 0.0002462876900147114, + "loss": 2.5527, + "step": 22687 + }, + { + "epoch": 0.6727752572428313, + "grad_norm": 0.10941439867019653, + "learning_rate": 0.0002462471475063132, + "loss": 2.6244, + "step": 22688 + }, + { + "epoch": 0.6728049105951428, + "grad_norm": 0.09471511095762253, + "learning_rate": 0.0002462066072449399, + "loss": 2.5972, + "step": 22689 + }, + { + "epoch": 0.6728345639474542, + "grad_norm": 0.11450571566820145, + "learning_rate": 0.00024616606923095075, + "loss": 2.6214, + "step": 22690 + }, + { + "epoch": 0.6728642172997658, + "grad_norm": 0.08755644410848618, + "learning_rate": 0.0002461255334647047, + "loss": 2.6282, + "step": 22691 + }, + { + "epoch": 0.6728938706520772, + "grad_norm": 0.11173758655786514, + "learning_rate": 0.0002460849999465606, + "loss": 2.6154, + "step": 22692 + }, + { + "epoch": 0.6729235240043887, + "grad_norm": 0.0894852876663208, + "learning_rate": 0.0002460444686768774, + "loss": 2.5946, + "step": 22693 + }, + { + "epoch": 0.6729531773567001, + "grad_norm": 0.10161968320608139, + "learning_rate": 0.00024600393965601403, + "loss": 2.5809, + "step": 22694 + }, + { + "epoch": 0.6729828307090117, + "grad_norm": 0.09017317742109299, + "learning_rate": 0.0002459634128843294, + "loss": 2.5459, + "step": 22695 + }, + { + "epoch": 0.6730124840613231, + "grad_norm": 0.09854092448949814, + "learning_rate": 0.0002459228883621825, + "loss": 2.6194, + "step": 22696 + }, + { + "epoch": 0.6730421374136346, + "grad_norm": 0.09098537266254425, + "learning_rate": 0.00024588236608993186, + "loss": 2.5828, + "step": 22697 + }, + { + "epoch": 0.6730717907659461, + "grad_norm": 0.10421531647443771, + "learning_rate": 0.0002458418460679365, + "loss": 2.6028, + "step": 22698 + }, + { + "epoch": 0.6731014441182576, + "grad_norm": 0.10138029605150223, + "learning_rate": 0.0002458013282965552, + "loss": 2.6146, + "step": 22699 + }, + { + "epoch": 0.6731310974705691, + "grad_norm": 0.09629078954458237, + "learning_rate": 0.00024576081277614677, + "loss": 2.5657, + "step": 22700 + }, + { + "epoch": 0.6731607508228805, + "grad_norm": 0.10520893335342407, + "learning_rate": 0.00024572029950706997, + "loss": 2.5995, + "step": 22701 + }, + { + "epoch": 0.673190404175192, + "grad_norm": 0.10965502262115479, + "learning_rate": 0.00024567978848968343, + "loss": 2.5988, + "step": 22702 + }, + { + "epoch": 0.6732200575275035, + "grad_norm": 0.09787808358669281, + "learning_rate": 0.0002456392797243462, + "loss": 2.5685, + "step": 22703 + }, + { + "epoch": 0.673249710879815, + "grad_norm": 0.0909201130270958, + "learning_rate": 0.00024559877321141674, + "loss": 2.5938, + "step": 22704 + }, + { + "epoch": 0.6732793642321264, + "grad_norm": 0.11154184490442276, + "learning_rate": 0.0002455582689512539, + "loss": 2.5832, + "step": 22705 + }, + { + "epoch": 0.673309017584438, + "grad_norm": 0.10670086741447449, + "learning_rate": 0.0002455177669442162, + "loss": 2.5969, + "step": 22706 + }, + { + "epoch": 0.6733386709367494, + "grad_norm": 0.10391170531511307, + "learning_rate": 0.0002454772671906625, + "loss": 2.5947, + "step": 22707 + }, + { + "epoch": 0.6733683242890609, + "grad_norm": 0.09461294114589691, + "learning_rate": 0.0002454367696909512, + "loss": 2.5781, + "step": 22708 + }, + { + "epoch": 0.6733979776413723, + "grad_norm": 0.10828463733196259, + "learning_rate": 0.000245396274445441, + "loss": 2.5987, + "step": 22709 + }, + { + "epoch": 0.6734276309936839, + "grad_norm": 0.09758704900741577, + "learning_rate": 0.00024535578145449047, + "loss": 2.601, + "step": 22710 + }, + { + "epoch": 0.6734572843459953, + "grad_norm": 0.11129499971866608, + "learning_rate": 0.00024531529071845817, + "loss": 2.6057, + "step": 22711 + }, + { + "epoch": 0.6734869376983068, + "grad_norm": 0.09390496462583542, + "learning_rate": 0.00024527480223770267, + "loss": 2.6348, + "step": 22712 + }, + { + "epoch": 0.6735165910506182, + "grad_norm": 0.1168702244758606, + "learning_rate": 0.0002452343160125826, + "loss": 2.5777, + "step": 22713 + }, + { + "epoch": 0.6735462444029298, + "grad_norm": 0.09481272101402283, + "learning_rate": 0.0002451938320434563, + "loss": 2.5801, + "step": 22714 + }, + { + "epoch": 0.6735758977552412, + "grad_norm": 0.10467983037233353, + "learning_rate": 0.0002451533503306822, + "loss": 2.5904, + "step": 22715 + }, + { + "epoch": 0.6736055511075527, + "grad_norm": 0.0989471971988678, + "learning_rate": 0.00024511287087461913, + "loss": 2.5924, + "step": 22716 + }, + { + "epoch": 0.6736352044598641, + "grad_norm": 0.1015380397439003, + "learning_rate": 0.0002450723936756255, + "loss": 2.5691, + "step": 22717 + }, + { + "epoch": 0.6736648578121757, + "grad_norm": 0.10324236005544662, + "learning_rate": 0.00024503191873405947, + "loss": 2.5887, + "step": 22718 + }, + { + "epoch": 0.6736945111644872, + "grad_norm": 0.1050802692770958, + "learning_rate": 0.00024499144605027946, + "loss": 2.5881, + "step": 22719 + }, + { + "epoch": 0.6737241645167986, + "grad_norm": 0.11360292136669159, + "learning_rate": 0.000244950975624644, + "loss": 2.6064, + "step": 22720 + }, + { + "epoch": 0.6737538178691101, + "grad_norm": 0.09277037531137466, + "learning_rate": 0.0002449105074575115, + "loss": 2.587, + "step": 22721 + }, + { + "epoch": 0.6737834712214216, + "grad_norm": 0.09407371282577515, + "learning_rate": 0.00024487004154924013, + "loss": 2.6083, + "step": 22722 + }, + { + "epoch": 0.6738131245737331, + "grad_norm": 0.10013410449028015, + "learning_rate": 0.0002448295779001884, + "loss": 2.6292, + "step": 22723 + }, + { + "epoch": 0.6738427779260445, + "grad_norm": 0.09550046175718307, + "learning_rate": 0.00024478911651071457, + "loss": 2.5965, + "step": 22724 + }, + { + "epoch": 0.673872431278356, + "grad_norm": 0.09747979789972305, + "learning_rate": 0.00024474865738117693, + "loss": 2.6093, + "step": 22725 + }, + { + "epoch": 0.6739020846306675, + "grad_norm": 0.10482694953680038, + "learning_rate": 0.00024470820051193367, + "loss": 2.5888, + "step": 22726 + }, + { + "epoch": 0.673931737982979, + "grad_norm": 0.10770233720541, + "learning_rate": 0.00024466774590334317, + "loss": 2.5807, + "step": 22727 + }, + { + "epoch": 0.6739613913352904, + "grad_norm": 0.08780045062303543, + "learning_rate": 0.0002446272935557636, + "loss": 2.6043, + "step": 22728 + }, + { + "epoch": 0.673991044687602, + "grad_norm": 0.11381018906831741, + "learning_rate": 0.0002445868434695532, + "loss": 2.5917, + "step": 22729 + }, + { + "epoch": 0.6740206980399134, + "grad_norm": 0.16256776452064514, + "learning_rate": 0.0002445463956450701, + "loss": 2.5893, + "step": 22730 + }, + { + "epoch": 0.6740503513922249, + "grad_norm": 0.10001274943351746, + "learning_rate": 0.00024450595008267253, + "loss": 2.597, + "step": 22731 + }, + { + "epoch": 0.6740800047445363, + "grad_norm": 0.11020294576883316, + "learning_rate": 0.0002444655067827186, + "loss": 2.5766, + "step": 22732 + }, + { + "epoch": 0.6741096580968479, + "grad_norm": 0.09299509227275848, + "learning_rate": 0.0002444250657455665, + "loss": 2.5794, + "step": 22733 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 0.09208298474550247, + "learning_rate": 0.0002443846269715743, + "loss": 2.5831, + "step": 22734 + }, + { + "epoch": 0.6741689648014708, + "grad_norm": 0.09937067329883575, + "learning_rate": 0.00024434419046110014, + "loss": 2.6024, + "step": 22735 + }, + { + "epoch": 0.6741986181537822, + "grad_norm": 0.0998019278049469, + "learning_rate": 0.000244303756214502, + "loss": 2.5655, + "step": 22736 + }, + { + "epoch": 0.6742282715060938, + "grad_norm": 0.10209383815526962, + "learning_rate": 0.000244263324232138, + "loss": 2.5984, + "step": 22737 + }, + { + "epoch": 0.6742579248584052, + "grad_norm": 0.09710220247507095, + "learning_rate": 0.00024422289451436616, + "loss": 2.5813, + "step": 22738 + }, + { + "epoch": 0.6742875782107167, + "grad_norm": 0.09018569439649582, + "learning_rate": 0.0002441824670615445, + "loss": 2.5794, + "step": 22739 + }, + { + "epoch": 0.6743172315630283, + "grad_norm": 0.09745561331510544, + "learning_rate": 0.000244142041874031, + "loss": 2.5842, + "step": 22740 + }, + { + "epoch": 0.6743468849153397, + "grad_norm": 0.09368010610342026, + "learning_rate": 0.00024410161895218368, + "loss": 2.5873, + "step": 22741 + }, + { + "epoch": 0.6743765382676512, + "grad_norm": 0.09996526688337326, + "learning_rate": 0.00024406119829636043, + "loss": 2.6062, + "step": 22742 + }, + { + "epoch": 0.6744061916199626, + "grad_norm": 0.0938415601849556, + "learning_rate": 0.00024402077990691917, + "loss": 2.5939, + "step": 22743 + }, + { + "epoch": 0.6744358449722742, + "grad_norm": 0.1000298485159874, + "learning_rate": 0.0002439803637842179, + "loss": 2.5704, + "step": 22744 + }, + { + "epoch": 0.6744654983245856, + "grad_norm": 0.09745411574840546, + "learning_rate": 0.0002439399499286144, + "loss": 2.5643, + "step": 22745 + }, + { + "epoch": 0.6744951516768971, + "grad_norm": 0.11261343955993652, + "learning_rate": 0.00024389953834046664, + "loss": 2.5383, + "step": 22746 + }, + { + "epoch": 0.6745248050292085, + "grad_norm": 0.09055154025554657, + "learning_rate": 0.00024385912902013236, + "loss": 2.5667, + "step": 22747 + }, + { + "epoch": 0.6745544583815201, + "grad_norm": 0.13514475524425507, + "learning_rate": 0.00024381872196796951, + "loss": 2.5896, + "step": 22748 + }, + { + "epoch": 0.6745841117338315, + "grad_norm": 0.11028839647769928, + "learning_rate": 0.00024377831718433584, + "loss": 2.5684, + "step": 22749 + }, + { + "epoch": 0.674613765086143, + "grad_norm": 0.11215632408857346, + "learning_rate": 0.00024373791466958918, + "loss": 2.6089, + "step": 22750 + }, + { + "epoch": 0.6746434184384544, + "grad_norm": 0.1127561703324318, + "learning_rate": 0.0002436975144240874, + "loss": 2.6003, + "step": 22751 + }, + { + "epoch": 0.674673071790766, + "grad_norm": 0.10281964391469955, + "learning_rate": 0.00024365711644818794, + "loss": 2.6063, + "step": 22752 + }, + { + "epoch": 0.6747027251430774, + "grad_norm": 0.10444699227809906, + "learning_rate": 0.00024361672074224856, + "loss": 2.5943, + "step": 22753 + }, + { + "epoch": 0.6747323784953889, + "grad_norm": 0.1090390756726265, + "learning_rate": 0.00024357632730662732, + "loss": 2.577, + "step": 22754 + }, + { + "epoch": 0.6747620318477003, + "grad_norm": 0.10382605344057083, + "learning_rate": 0.0002435359361416817, + "loss": 2.5643, + "step": 22755 + }, + { + "epoch": 0.6747916852000119, + "grad_norm": 0.09794506430625916, + "learning_rate": 0.0002434955472477694, + "loss": 2.5753, + "step": 22756 + }, + { + "epoch": 0.6748213385523233, + "grad_norm": 0.10222528874874115, + "learning_rate": 0.00024345516062524803, + "loss": 2.5998, + "step": 22757 + }, + { + "epoch": 0.6748509919046348, + "grad_norm": 0.09470147639513016, + "learning_rate": 0.0002434147762744752, + "loss": 2.5956, + "step": 22758 + }, + { + "epoch": 0.6748806452569462, + "grad_norm": 0.09809118509292603, + "learning_rate": 0.0002433743941958087, + "loss": 2.5885, + "step": 22759 + }, + { + "epoch": 0.6749102986092578, + "grad_norm": 0.10489051789045334, + "learning_rate": 0.0002433340143896059, + "loss": 2.6221, + "step": 22760 + }, + { + "epoch": 0.6749399519615693, + "grad_norm": 0.10392676293849945, + "learning_rate": 0.00024329363685622447, + "loss": 2.5649, + "step": 22761 + }, + { + "epoch": 0.6749696053138807, + "grad_norm": 0.10107166320085526, + "learning_rate": 0.00024325326159602218, + "loss": 2.5732, + "step": 22762 + }, + { + "epoch": 0.6749992586661923, + "grad_norm": 0.1078493520617485, + "learning_rate": 0.00024321288860935608, + "loss": 2.6095, + "step": 22763 + }, + { + "epoch": 0.6750289120185037, + "grad_norm": 0.09569603204727173, + "learning_rate": 0.00024317251789658396, + "loss": 2.5833, + "step": 22764 + }, + { + "epoch": 0.6750585653708152, + "grad_norm": 0.12232884019613266, + "learning_rate": 0.0002431321494580633, + "loss": 2.6159, + "step": 22765 + }, + { + "epoch": 0.6750882187231266, + "grad_norm": 0.09767677634954453, + "learning_rate": 0.00024309178329415127, + "loss": 2.5854, + "step": 22766 + }, + { + "epoch": 0.6751178720754382, + "grad_norm": 0.09713480621576309, + "learning_rate": 0.00024305141940520586, + "loss": 2.5961, + "step": 22767 + }, + { + "epoch": 0.6751475254277496, + "grad_norm": 0.08745714277029037, + "learning_rate": 0.00024301105779158422, + "loss": 2.5526, + "step": 22768 + }, + { + "epoch": 0.6751771787800611, + "grad_norm": 0.10547450929880142, + "learning_rate": 0.00024297069845364378, + "loss": 2.5582, + "step": 22769 + }, + { + "epoch": 0.6752068321323725, + "grad_norm": 0.100446917116642, + "learning_rate": 0.00024293034139174192, + "loss": 2.6169, + "step": 22770 + }, + { + "epoch": 0.6752364854846841, + "grad_norm": 0.09083761274814606, + "learning_rate": 0.000242889986606236, + "loss": 2.564, + "step": 22771 + }, + { + "epoch": 0.6752661388369955, + "grad_norm": 0.11681728810071945, + "learning_rate": 0.00024284963409748362, + "loss": 2.6, + "step": 22772 + }, + { + "epoch": 0.675295792189307, + "grad_norm": 0.10555413365364075, + "learning_rate": 0.0002428092838658416, + "loss": 2.5861, + "step": 22773 + }, + { + "epoch": 0.6753254455416184, + "grad_norm": 0.0942823514342308, + "learning_rate": 0.0002427689359116676, + "loss": 2.5698, + "step": 22774 + }, + { + "epoch": 0.67535509889393, + "grad_norm": 0.10038150101900101, + "learning_rate": 0.0002427285902353188, + "loss": 2.606, + "step": 22775 + }, + { + "epoch": 0.6753847522462414, + "grad_norm": 0.11101600527763367, + "learning_rate": 0.00024268824683715246, + "loss": 2.5899, + "step": 22776 + }, + { + "epoch": 0.6754144055985529, + "grad_norm": 0.11740735918283463, + "learning_rate": 0.00024264790571752588, + "loss": 2.6094, + "step": 22777 + }, + { + "epoch": 0.6754440589508643, + "grad_norm": 0.10291556268930435, + "learning_rate": 0.00024260756687679625, + "loss": 2.5507, + "step": 22778 + }, + { + "epoch": 0.6754737123031759, + "grad_norm": 0.10200244933366776, + "learning_rate": 0.00024256723031532062, + "loss": 2.6338, + "step": 22779 + }, + { + "epoch": 0.6755033656554873, + "grad_norm": 0.11021123081445694, + "learning_rate": 0.00024252689603345657, + "loss": 2.5821, + "step": 22780 + }, + { + "epoch": 0.6755330190077988, + "grad_norm": 0.10334540158510208, + "learning_rate": 0.000242486564031561, + "loss": 2.5857, + "step": 22781 + }, + { + "epoch": 0.6755626723601104, + "grad_norm": 0.09682075679302216, + "learning_rate": 0.00024244623430999114, + "loss": 2.5969, + "step": 22782 + }, + { + "epoch": 0.6755923257124218, + "grad_norm": 0.1068994328379631, + "learning_rate": 0.0002424059068691043, + "loss": 2.5876, + "step": 22783 + }, + { + "epoch": 0.6756219790647333, + "grad_norm": 0.10661336034536362, + "learning_rate": 0.00024236558170925715, + "loss": 2.6222, + "step": 22784 + }, + { + "epoch": 0.6756516324170447, + "grad_norm": 0.0945221558213234, + "learning_rate": 0.00024232525883080696, + "loss": 2.6011, + "step": 22785 + }, + { + "epoch": 0.6756812857693563, + "grad_norm": 0.11958015710115433, + "learning_rate": 0.0002422849382341109, + "loss": 2.5614, + "step": 22786 + }, + { + "epoch": 0.6757109391216677, + "grad_norm": 0.11083384603261948, + "learning_rate": 0.0002422446199195259, + "loss": 2.58, + "step": 22787 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 0.0945986956357956, + "learning_rate": 0.00024220430388740904, + "loss": 2.5781, + "step": 22788 + }, + { + "epoch": 0.6757702458262906, + "grad_norm": 0.10662137717008591, + "learning_rate": 0.00024216399013811735, + "loss": 2.5607, + "step": 22789 + }, + { + "epoch": 0.6757998991786022, + "grad_norm": 0.10115602612495422, + "learning_rate": 0.00024212367867200774, + "loss": 2.5964, + "step": 22790 + }, + { + "epoch": 0.6758295525309136, + "grad_norm": 0.10390176624059677, + "learning_rate": 0.00024208336948943726, + "loss": 2.6325, + "step": 22791 + }, + { + "epoch": 0.6758592058832251, + "grad_norm": 0.09357946366071701, + "learning_rate": 0.0002420430625907626, + "loss": 2.5696, + "step": 22792 + }, + { + "epoch": 0.6758888592355365, + "grad_norm": 0.09550102055072784, + "learning_rate": 0.0002420027579763413, + "loss": 2.6041, + "step": 22793 + }, + { + "epoch": 0.6759185125878481, + "grad_norm": 0.10837163031101227, + "learning_rate": 0.00024196245564652962, + "loss": 2.6146, + "step": 22794 + }, + { + "epoch": 0.6759481659401595, + "grad_norm": 0.08871739357709885, + "learning_rate": 0.00024192215560168474, + "loss": 2.6214, + "step": 22795 + }, + { + "epoch": 0.675977819292471, + "grad_norm": 0.11069826781749725, + "learning_rate": 0.00024188185784216349, + "loss": 2.5496, + "step": 22796 + }, + { + "epoch": 0.6760074726447824, + "grad_norm": 0.09933391213417053, + "learning_rate": 0.00024184156236832266, + "loss": 2.5832, + "step": 22797 + }, + { + "epoch": 0.676037125997094, + "grad_norm": 0.09904305636882782, + "learning_rate": 0.00024180126918051909, + "loss": 2.5727, + "step": 22798 + }, + { + "epoch": 0.6760667793494054, + "grad_norm": 0.10166274011135101, + "learning_rate": 0.00024176097827910965, + "loss": 2.5867, + "step": 22799 + }, + { + "epoch": 0.6760964327017169, + "grad_norm": 0.10959278792142868, + "learning_rate": 0.00024172068966445105, + "loss": 2.5912, + "step": 22800 + }, + { + "epoch": 0.6761260860540285, + "grad_norm": 0.09891301393508911, + "learning_rate": 0.00024168040333690007, + "loss": 2.633, + "step": 22801 + }, + { + "epoch": 0.6761557394063399, + "grad_norm": 0.0978781059384346, + "learning_rate": 0.0002416401192968134, + "loss": 2.5526, + "step": 22802 + }, + { + "epoch": 0.6761853927586514, + "grad_norm": 0.09865908324718475, + "learning_rate": 0.0002415998375445479, + "loss": 2.6195, + "step": 22803 + }, + { + "epoch": 0.6762150461109628, + "grad_norm": 0.08822622895240784, + "learning_rate": 0.00024155955808046015, + "loss": 2.5776, + "step": 22804 + }, + { + "epoch": 0.6762446994632744, + "grad_norm": 0.10217060148715973, + "learning_rate": 0.00024151928090490694, + "loss": 2.5586, + "step": 22805 + }, + { + "epoch": 0.6762743528155858, + "grad_norm": 0.11098525673151016, + "learning_rate": 0.0002414790060182448, + "loss": 2.5972, + "step": 22806 + }, + { + "epoch": 0.6763040061678973, + "grad_norm": 0.0948210060596466, + "learning_rate": 0.00024143873342083046, + "loss": 2.549, + "step": 22807 + }, + { + "epoch": 0.6763336595202087, + "grad_norm": 0.09636973589658737, + "learning_rate": 0.0002413984631130205, + "loss": 2.6187, + "step": 22808 + }, + { + "epoch": 0.6763633128725203, + "grad_norm": 0.09235560148954391, + "learning_rate": 0.00024135819509517155, + "loss": 2.5925, + "step": 22809 + }, + { + "epoch": 0.6763929662248317, + "grad_norm": 0.08701540529727936, + "learning_rate": 0.0002413179293676402, + "loss": 2.5618, + "step": 22810 + }, + { + "epoch": 0.6764226195771432, + "grad_norm": 0.09621010720729828, + "learning_rate": 0.00024127766593078293, + "loss": 2.5896, + "step": 22811 + }, + { + "epoch": 0.6764522729294546, + "grad_norm": 0.08263956755399704, + "learning_rate": 0.00024123740478495636, + "loss": 2.5801, + "step": 22812 + }, + { + "epoch": 0.6764819262817662, + "grad_norm": 0.09262832254171371, + "learning_rate": 0.000241197145930517, + "loss": 2.5874, + "step": 22813 + }, + { + "epoch": 0.6765115796340776, + "grad_norm": 0.10328313708305359, + "learning_rate": 0.00024115688936782138, + "loss": 2.6201, + "step": 22814 + }, + { + "epoch": 0.6765412329863891, + "grad_norm": 0.10249236226081848, + "learning_rate": 0.00024111663509722585, + "loss": 2.5943, + "step": 22815 + }, + { + "epoch": 0.6765708863387006, + "grad_norm": 0.0867997333407402, + "learning_rate": 0.00024107638311908697, + "loss": 2.6046, + "step": 22816 + }, + { + "epoch": 0.6766005396910121, + "grad_norm": 0.10236413031816483, + "learning_rate": 0.0002410361334337614, + "loss": 2.6135, + "step": 22817 + }, + { + "epoch": 0.6766301930433235, + "grad_norm": 0.10469944030046463, + "learning_rate": 0.00024099588604160495, + "loss": 2.5996, + "step": 22818 + }, + { + "epoch": 0.676659846395635, + "grad_norm": 0.10029423981904984, + "learning_rate": 0.0002409556409429745, + "loss": 2.5981, + "step": 22819 + }, + { + "epoch": 0.6766894997479465, + "grad_norm": 0.12209001183509827, + "learning_rate": 0.00024091539813822632, + "loss": 2.611, + "step": 22820 + }, + { + "epoch": 0.676719153100258, + "grad_norm": 0.10291943699121475, + "learning_rate": 0.00024087515762771683, + "loss": 2.6021, + "step": 22821 + }, + { + "epoch": 0.6767488064525695, + "grad_norm": 0.1023663803935051, + "learning_rate": 0.00024083491941180224, + "loss": 2.6006, + "step": 22822 + }, + { + "epoch": 0.6767784598048809, + "grad_norm": 0.09829429537057877, + "learning_rate": 0.00024079468349083894, + "loss": 2.5977, + "step": 22823 + }, + { + "epoch": 0.6768081131571925, + "grad_norm": 0.10655366629362106, + "learning_rate": 0.00024075444986518325, + "loss": 2.56, + "step": 22824 + }, + { + "epoch": 0.6768377665095039, + "grad_norm": 0.1059376448392868, + "learning_rate": 0.00024071421853519138, + "loss": 2.5782, + "step": 22825 + }, + { + "epoch": 0.6768674198618154, + "grad_norm": 0.09994591027498245, + "learning_rate": 0.00024067398950121955, + "loss": 2.5911, + "step": 22826 + }, + { + "epoch": 0.6768970732141268, + "grad_norm": 0.10150426626205444, + "learning_rate": 0.00024063376276362431, + "loss": 2.5995, + "step": 22827 + }, + { + "epoch": 0.6769267265664384, + "grad_norm": 0.09800505638122559, + "learning_rate": 0.00024059353832276144, + "loss": 2.5381, + "step": 22828 + }, + { + "epoch": 0.6769563799187498, + "grad_norm": 0.10799865424633026, + "learning_rate": 0.0002405533161789873, + "loss": 2.5458, + "step": 22829 + }, + { + "epoch": 0.6769860332710613, + "grad_norm": 0.1006256714463234, + "learning_rate": 0.00024051309633265806, + "loss": 2.5826, + "step": 22830 + }, + { + "epoch": 0.6770156866233727, + "grad_norm": 0.1030672937631607, + "learning_rate": 0.0002404728787841297, + "loss": 2.5931, + "step": 22831 + }, + { + "epoch": 0.6770453399756843, + "grad_norm": 0.11166319996118546, + "learning_rate": 0.00024043266353375876, + "loss": 2.5864, + "step": 22832 + }, + { + "epoch": 0.6770749933279957, + "grad_norm": 0.11227133125066757, + "learning_rate": 0.00024039245058190113, + "loss": 2.5973, + "step": 22833 + }, + { + "epoch": 0.6771046466803072, + "grad_norm": 0.10041350871324539, + "learning_rate": 0.00024035223992891286, + "loss": 2.5838, + "step": 22834 + }, + { + "epoch": 0.6771343000326187, + "grad_norm": 0.11755827814340591, + "learning_rate": 0.00024031203157515014, + "loss": 2.5902, + "step": 22835 + }, + { + "epoch": 0.6771639533849302, + "grad_norm": 0.11004751920700073, + "learning_rate": 0.00024027182552096893, + "loss": 2.5627, + "step": 22836 + }, + { + "epoch": 0.6771936067372416, + "grad_norm": 0.1054110899567604, + "learning_rate": 0.0002402316217667253, + "loss": 2.5919, + "step": 22837 + }, + { + "epoch": 0.6772232600895531, + "grad_norm": 0.10359999537467957, + "learning_rate": 0.00024019142031277542, + "loss": 2.583, + "step": 22838 + }, + { + "epoch": 0.6772529134418646, + "grad_norm": 0.1204320639371872, + "learning_rate": 0.00024015122115947495, + "loss": 2.622, + "step": 22839 + }, + { + "epoch": 0.6772825667941761, + "grad_norm": 0.10705935955047607, + "learning_rate": 0.00024011102430718, + "loss": 2.5782, + "step": 22840 + }, + { + "epoch": 0.6773122201464875, + "grad_norm": 0.10397912561893463, + "learning_rate": 0.00024007082975624656, + "loss": 2.5912, + "step": 22841 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 0.10937528312206268, + "learning_rate": 0.00024003063750703047, + "loss": 2.5725, + "step": 22842 + }, + { + "epoch": 0.6773715268511106, + "grad_norm": 0.10640822350978851, + "learning_rate": 0.00023999044755988774, + "loss": 2.615, + "step": 22843 + }, + { + "epoch": 0.677401180203422, + "grad_norm": 0.10504863411188126, + "learning_rate": 0.0002399502599151741, + "loss": 2.561, + "step": 22844 + }, + { + "epoch": 0.6774308335557335, + "grad_norm": 0.105780228972435, + "learning_rate": 0.00023991007457324566, + "loss": 2.5631, + "step": 22845 + }, + { + "epoch": 0.677460486908045, + "grad_norm": 0.0987459272146225, + "learning_rate": 0.00023986989153445821, + "loss": 2.5887, + "step": 22846 + }, + { + "epoch": 0.6774901402603565, + "grad_norm": 0.10348275303840637, + "learning_rate": 0.0002398297107991675, + "loss": 2.5763, + "step": 22847 + }, + { + "epoch": 0.6775197936126679, + "grad_norm": 0.11464222520589828, + "learning_rate": 0.00023978953236772948, + "loss": 2.5736, + "step": 22848 + }, + { + "epoch": 0.6775494469649794, + "grad_norm": 0.09232750535011292, + "learning_rate": 0.0002397493562404997, + "loss": 2.5793, + "step": 22849 + }, + { + "epoch": 0.6775791003172908, + "grad_norm": 0.094545878469944, + "learning_rate": 0.00023970918241783402, + "loss": 2.5912, + "step": 22850 + }, + { + "epoch": 0.6776087536696024, + "grad_norm": 0.08659125864505768, + "learning_rate": 0.00023966901090008826, + "loss": 2.607, + "step": 22851 + }, + { + "epoch": 0.6776384070219138, + "grad_norm": 0.09039638191461563, + "learning_rate": 0.00023962884168761802, + "loss": 2.5919, + "step": 22852 + }, + { + "epoch": 0.6776680603742253, + "grad_norm": 0.09475419670343399, + "learning_rate": 0.0002395886747807791, + "loss": 2.5886, + "step": 22853 + }, + { + "epoch": 0.6776977137265368, + "grad_norm": 0.08710667490959167, + "learning_rate": 0.00023954851017992719, + "loss": 2.639, + "step": 22854 + }, + { + "epoch": 0.6777273670788483, + "grad_norm": 0.09494924545288086, + "learning_rate": 0.0002395083478854179, + "loss": 2.6147, + "step": 22855 + }, + { + "epoch": 0.6777570204311597, + "grad_norm": 0.10092190653085709, + "learning_rate": 0.00023946818789760693, + "loss": 2.586, + "step": 22856 + }, + { + "epoch": 0.6777866737834712, + "grad_norm": 0.0851038247346878, + "learning_rate": 0.00023942803021684966, + "loss": 2.5712, + "step": 22857 + }, + { + "epoch": 0.6778163271357827, + "grad_norm": 0.09526671469211578, + "learning_rate": 0.00023938787484350212, + "loss": 2.6021, + "step": 22858 + }, + { + "epoch": 0.6778459804880942, + "grad_norm": 0.08971159160137177, + "learning_rate": 0.00023934772177791985, + "loss": 2.6041, + "step": 22859 + }, + { + "epoch": 0.6778756338404056, + "grad_norm": 0.10387295484542847, + "learning_rate": 0.00023930757102045801, + "loss": 2.6153, + "step": 22860 + }, + { + "epoch": 0.6779052871927171, + "grad_norm": 0.0892038568854332, + "learning_rate": 0.00023926742257147248, + "loss": 2.6366, + "step": 22861 + }, + { + "epoch": 0.6779349405450286, + "grad_norm": 0.0896950215101242, + "learning_rate": 0.00023922727643131865, + "loss": 2.6112, + "step": 22862 + }, + { + "epoch": 0.6779645938973401, + "grad_norm": 0.10393080860376358, + "learning_rate": 0.00023918713260035198, + "loss": 2.6047, + "step": 22863 + }, + { + "epoch": 0.6779942472496516, + "grad_norm": 0.09495251625776291, + "learning_rate": 0.00023914699107892808, + "loss": 2.58, + "step": 22864 + }, + { + "epoch": 0.678023900601963, + "grad_norm": 0.09166432172060013, + "learning_rate": 0.00023910685186740233, + "loss": 2.5947, + "step": 22865 + }, + { + "epoch": 0.6780535539542746, + "grad_norm": 0.09986802190542221, + "learning_rate": 0.00023906671496613018, + "loss": 2.6221, + "step": 22866 + }, + { + "epoch": 0.678083207306586, + "grad_norm": 0.09606532007455826, + "learning_rate": 0.0002390265803754671, + "loss": 2.5941, + "step": 22867 + }, + { + "epoch": 0.6781128606588975, + "grad_norm": 0.10696258395910263, + "learning_rate": 0.00023898644809576837, + "loss": 2.5795, + "step": 22868 + }, + { + "epoch": 0.678142514011209, + "grad_norm": 0.09575216472148895, + "learning_rate": 0.00023894631812738947, + "loss": 2.6462, + "step": 22869 + }, + { + "epoch": 0.6781721673635205, + "grad_norm": 0.09557707607746124, + "learning_rate": 0.0002389061904706858, + "loss": 2.589, + "step": 22870 + }, + { + "epoch": 0.6782018207158319, + "grad_norm": 0.09439998865127563, + "learning_rate": 0.00023886606512601256, + "loss": 2.5871, + "step": 22871 + }, + { + "epoch": 0.6782314740681434, + "grad_norm": 0.11241336911916733, + "learning_rate": 0.00023882594209372515, + "loss": 2.5818, + "step": 22872 + }, + { + "epoch": 0.6782611274204549, + "grad_norm": 0.10137760639190674, + "learning_rate": 0.00023878582137417886, + "loss": 2.5687, + "step": 22873 + }, + { + "epoch": 0.6782907807727664, + "grad_norm": 0.10058632493019104, + "learning_rate": 0.00023874570296772895, + "loss": 2.5911, + "step": 22874 + }, + { + "epoch": 0.6783204341250778, + "grad_norm": 0.11513837426900864, + "learning_rate": 0.0002387055868747307, + "loss": 2.5951, + "step": 22875 + }, + { + "epoch": 0.6783500874773893, + "grad_norm": 0.10880178213119507, + "learning_rate": 0.00023866547309553933, + "loss": 2.6033, + "step": 22876 + }, + { + "epoch": 0.6783797408297008, + "grad_norm": 0.0938640907406807, + "learning_rate": 0.00023862536163051008, + "loss": 2.5535, + "step": 22877 + }, + { + "epoch": 0.6784093941820123, + "grad_norm": 0.11108577251434326, + "learning_rate": 0.00023858525247999807, + "loss": 2.5776, + "step": 22878 + }, + { + "epoch": 0.6784390475343237, + "grad_norm": 0.11289974302053452, + "learning_rate": 0.00023854514564435853, + "loss": 2.5657, + "step": 22879 + }, + { + "epoch": 0.6784687008866352, + "grad_norm": 0.12566278874874115, + "learning_rate": 0.0002385050411239466, + "loss": 2.6125, + "step": 22880 + }, + { + "epoch": 0.6784983542389467, + "grad_norm": 0.09328795224428177, + "learning_rate": 0.00023846493891911742, + "loss": 2.6074, + "step": 22881 + }, + { + "epoch": 0.6785280075912582, + "grad_norm": 0.11380762606859207, + "learning_rate": 0.0002384248390302261, + "loss": 2.5753, + "step": 22882 + }, + { + "epoch": 0.6785576609435696, + "grad_norm": 0.11872253566980362, + "learning_rate": 0.00023838474145762773, + "loss": 2.5708, + "step": 22883 + }, + { + "epoch": 0.6785873142958811, + "grad_norm": 0.10662880539894104, + "learning_rate": 0.00023834464620167735, + "loss": 2.6143, + "step": 22884 + }, + { + "epoch": 0.6786169676481927, + "grad_norm": 0.11539868265390396, + "learning_rate": 0.0002383045532627301, + "loss": 2.5855, + "step": 22885 + }, + { + "epoch": 0.6786466210005041, + "grad_norm": 0.10935761034488678, + "learning_rate": 0.00023826446264114089, + "loss": 2.5815, + "step": 22886 + }, + { + "epoch": 0.6786762743528156, + "grad_norm": 0.10605907440185547, + "learning_rate": 0.00023822437433726484, + "loss": 2.5984, + "step": 22887 + }, + { + "epoch": 0.678705927705127, + "grad_norm": 0.10087642818689346, + "learning_rate": 0.00023818428835145684, + "loss": 2.5633, + "step": 22888 + }, + { + "epoch": 0.6787355810574386, + "grad_norm": 0.11601412296295166, + "learning_rate": 0.00023814420468407194, + "loss": 2.608, + "step": 22889 + }, + { + "epoch": 0.67876523440975, + "grad_norm": 0.0983876958489418, + "learning_rate": 0.00023810412333546505, + "loss": 2.5835, + "step": 22890 + }, + { + "epoch": 0.6787948877620615, + "grad_norm": 0.12512937188148499, + "learning_rate": 0.0002380640443059911, + "loss": 2.6148, + "step": 22891 + }, + { + "epoch": 0.678824541114373, + "grad_norm": 0.10757461190223694, + "learning_rate": 0.00023802396759600502, + "loss": 2.5841, + "step": 22892 + }, + { + "epoch": 0.6788541944666845, + "grad_norm": 0.10631711035966873, + "learning_rate": 0.00023798389320586183, + "loss": 2.5952, + "step": 22893 + }, + { + "epoch": 0.6788838478189959, + "grad_norm": 0.11668973416090012, + "learning_rate": 0.00023794382113591605, + "loss": 2.5783, + "step": 22894 + }, + { + "epoch": 0.6789135011713074, + "grad_norm": 0.09922703355550766, + "learning_rate": 0.00023790375138652275, + "loss": 2.5941, + "step": 22895 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 0.11039542406797409, + "learning_rate": 0.00023786368395803647, + "loss": 2.6004, + "step": 22896 + }, + { + "epoch": 0.6789728078759304, + "grad_norm": 0.11218992620706558, + "learning_rate": 0.0002378236188508125, + "loss": 2.5887, + "step": 22897 + }, + { + "epoch": 0.6790024612282418, + "grad_norm": 0.0992288738489151, + "learning_rate": 0.00023778355606520542, + "loss": 2.6201, + "step": 22898 + }, + { + "epoch": 0.6790321145805533, + "grad_norm": 0.11693038791418076, + "learning_rate": 0.00023774349560156988, + "loss": 2.5575, + "step": 22899 + }, + { + "epoch": 0.6790617679328648, + "grad_norm": 0.0977066159248352, + "learning_rate": 0.00023770343746026075, + "loss": 2.6002, + "step": 22900 + }, + { + "epoch": 0.6790914212851763, + "grad_norm": 0.09734426438808441, + "learning_rate": 0.0002376633816416327, + "loss": 2.5649, + "step": 22901 + }, + { + "epoch": 0.6791210746374877, + "grad_norm": 0.10570559650659561, + "learning_rate": 0.00023762332814604044, + "loss": 2.5699, + "step": 22902 + }, + { + "epoch": 0.6791507279897993, + "grad_norm": 0.10304030776023865, + "learning_rate": 0.00023758327697383865, + "loss": 2.6042, + "step": 22903 + }, + { + "epoch": 0.6791803813421107, + "grad_norm": 0.09703207015991211, + "learning_rate": 0.00023754322812538214, + "loss": 2.5691, + "step": 22904 + }, + { + "epoch": 0.6792100346944222, + "grad_norm": 0.11377723515033722, + "learning_rate": 0.00023750318160102525, + "loss": 2.6317, + "step": 22905 + }, + { + "epoch": 0.6792396880467337, + "grad_norm": 0.10483873635530472, + "learning_rate": 0.0002374631374011227, + "loss": 2.604, + "step": 22906 + }, + { + "epoch": 0.6792693413990452, + "grad_norm": 0.10002832859754562, + "learning_rate": 0.00023742309552602915, + "loss": 2.5966, + "step": 22907 + }, + { + "epoch": 0.6792989947513567, + "grad_norm": 0.11230940371751785, + "learning_rate": 0.00023738305597609916, + "loss": 2.5563, + "step": 22908 + }, + { + "epoch": 0.6793286481036681, + "grad_norm": 0.09036876261234283, + "learning_rate": 0.00023734301875168713, + "loss": 2.5791, + "step": 22909 + }, + { + "epoch": 0.6793583014559796, + "grad_norm": 0.10089226067066193, + "learning_rate": 0.00023730298385314785, + "loss": 2.5738, + "step": 22910 + }, + { + "epoch": 0.6793879548082911, + "grad_norm": 0.09756836295127869, + "learning_rate": 0.0002372629512808358, + "loss": 2.5763, + "step": 22911 + }, + { + "epoch": 0.6794176081606026, + "grad_norm": 0.0902370736002922, + "learning_rate": 0.00023722292103510546, + "loss": 2.5879, + "step": 22912 + }, + { + "epoch": 0.679447261512914, + "grad_norm": 0.09944208711385727, + "learning_rate": 0.0002371828931163112, + "loss": 2.5976, + "step": 22913 + }, + { + "epoch": 0.6794769148652255, + "grad_norm": 0.10237947851419449, + "learning_rate": 0.00023714286752480774, + "loss": 2.6107, + "step": 22914 + }, + { + "epoch": 0.679506568217537, + "grad_norm": 0.09462464600801468, + "learning_rate": 0.00023710284426094912, + "loss": 2.5842, + "step": 22915 + }, + { + "epoch": 0.6795362215698485, + "grad_norm": 0.10172990709543228, + "learning_rate": 0.00023706282332508995, + "loss": 2.5663, + "step": 22916 + }, + { + "epoch": 0.6795658749221599, + "grad_norm": 0.09695501625537872, + "learning_rate": 0.0002370228047175846, + "loss": 2.6043, + "step": 22917 + }, + { + "epoch": 0.6795955282744714, + "grad_norm": 0.10336543619632721, + "learning_rate": 0.00023698278843878746, + "loss": 2.5897, + "step": 22918 + }, + { + "epoch": 0.6796251816267829, + "grad_norm": 0.09899317473173141, + "learning_rate": 0.00023694277448905283, + "loss": 2.5878, + "step": 22919 + }, + { + "epoch": 0.6796548349790944, + "grad_norm": 0.09367727488279343, + "learning_rate": 0.00023690276286873513, + "loss": 2.5303, + "step": 22920 + }, + { + "epoch": 0.6796844883314058, + "grad_norm": 0.10413524508476257, + "learning_rate": 0.0002368627535781886, + "loss": 2.5991, + "step": 22921 + }, + { + "epoch": 0.6797141416837174, + "grad_norm": 0.09825924038887024, + "learning_rate": 0.00023682274661776737, + "loss": 2.6044, + "step": 22922 + }, + { + "epoch": 0.6797437950360288, + "grad_norm": 0.09816140681505203, + "learning_rate": 0.00023678274198782613, + "loss": 2.6152, + "step": 22923 + }, + { + "epoch": 0.6797734483883403, + "grad_norm": 0.10971497744321823, + "learning_rate": 0.000236742739688719, + "loss": 2.6004, + "step": 22924 + }, + { + "epoch": 0.6798031017406517, + "grad_norm": 0.10377537459135056, + "learning_rate": 0.00023670273972079998, + "loss": 2.5838, + "step": 22925 + }, + { + "epoch": 0.6798327550929633, + "grad_norm": 0.09418292343616486, + "learning_rate": 0.00023666274208442335, + "loss": 2.5576, + "step": 22926 + }, + { + "epoch": 0.6798624084452748, + "grad_norm": 0.11983854323625565, + "learning_rate": 0.00023662274677994338, + "loss": 2.5622, + "step": 22927 + }, + { + "epoch": 0.6798920617975862, + "grad_norm": 0.11690518260002136, + "learning_rate": 0.00023658275380771416, + "loss": 2.6286, + "step": 22928 + }, + { + "epoch": 0.6799217151498977, + "grad_norm": 0.1006227359175682, + "learning_rate": 0.00023654276316808988, + "loss": 2.5601, + "step": 22929 + }, + { + "epoch": 0.6799513685022092, + "grad_norm": 0.09835516661405563, + "learning_rate": 0.00023650277486142462, + "loss": 2.6388, + "step": 22930 + }, + { + "epoch": 0.6799810218545207, + "grad_norm": 0.10263041406869888, + "learning_rate": 0.0002364627888880726, + "loss": 2.5764, + "step": 22931 + }, + { + "epoch": 0.6800106752068321, + "grad_norm": 0.09102708101272583, + "learning_rate": 0.00023642280524838777, + "loss": 2.5305, + "step": 22932 + }, + { + "epoch": 0.6800403285591436, + "grad_norm": 0.11092966049909592, + "learning_rate": 0.00023638282394272426, + "loss": 2.564, + "step": 22933 + }, + { + "epoch": 0.6800699819114551, + "grad_norm": 0.09044685959815979, + "learning_rate": 0.00023634284497143587, + "loss": 2.6095, + "step": 22934 + }, + { + "epoch": 0.6800996352637666, + "grad_norm": 0.09859812259674072, + "learning_rate": 0.00023630286833487723, + "loss": 2.569, + "step": 22935 + }, + { + "epoch": 0.680129288616078, + "grad_norm": 0.10812798887491226, + "learning_rate": 0.00023626289403340178, + "loss": 2.5521, + "step": 22936 + }, + { + "epoch": 0.6801589419683896, + "grad_norm": 0.08686301857233047, + "learning_rate": 0.00023622292206736367, + "loss": 2.5998, + "step": 22937 + }, + { + "epoch": 0.680188595320701, + "grad_norm": 0.11591526865959167, + "learning_rate": 0.00023618295243711684, + "loss": 2.5727, + "step": 22938 + }, + { + "epoch": 0.6802182486730125, + "grad_norm": 0.09899663180112839, + "learning_rate": 0.0002361429851430153, + "loss": 2.5898, + "step": 22939 + }, + { + "epoch": 0.6802479020253239, + "grad_norm": 0.10009031742811203, + "learning_rate": 0.00023610302018541284, + "loss": 2.5819, + "step": 22940 + }, + { + "epoch": 0.6802775553776355, + "grad_norm": 0.11083784699440002, + "learning_rate": 0.00023606305756466352, + "loss": 2.5736, + "step": 22941 + }, + { + "epoch": 0.6803072087299469, + "grad_norm": 0.12188642472028732, + "learning_rate": 0.0002360230972811211, + "loss": 2.5999, + "step": 22942 + }, + { + "epoch": 0.6803368620822584, + "grad_norm": 0.10946149379014969, + "learning_rate": 0.00023598313933513942, + "loss": 2.5612, + "step": 22943 + }, + { + "epoch": 0.6803665154345698, + "grad_norm": 0.09685441106557846, + "learning_rate": 0.00023594318372707242, + "loss": 2.5826, + "step": 22944 + }, + { + "epoch": 0.6803961687868814, + "grad_norm": 0.121064692735672, + "learning_rate": 0.0002359032304572738, + "loss": 2.6131, + "step": 22945 + }, + { + "epoch": 0.6804258221391928, + "grad_norm": 0.0975351631641388, + "learning_rate": 0.00023586327952609742, + "loss": 2.6078, + "step": 22946 + }, + { + "epoch": 0.6804554754915043, + "grad_norm": 0.11118225753307343, + "learning_rate": 0.00023582333093389706, + "loss": 2.595, + "step": 22947 + }, + { + "epoch": 0.6804851288438158, + "grad_norm": 0.10219752043485641, + "learning_rate": 0.00023578338468102644, + "loss": 2.5898, + "step": 22948 + }, + { + "epoch": 0.6805147821961273, + "grad_norm": 0.10715045779943466, + "learning_rate": 0.0002357434407678393, + "loss": 2.5719, + "step": 22949 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 0.09706207364797592, + "learning_rate": 0.00023570349919468936, + "loss": 2.5978, + "step": 22950 + }, + { + "epoch": 0.6805740889007502, + "grad_norm": 0.11038598418235779, + "learning_rate": 0.00023566355996193028, + "loss": 2.5839, + "step": 22951 + }, + { + "epoch": 0.6806037422530617, + "grad_norm": 0.09390974789857864, + "learning_rate": 0.00023562362306991575, + "loss": 2.5641, + "step": 22952 + }, + { + "epoch": 0.6806333956053732, + "grad_norm": 0.10174760967493057, + "learning_rate": 0.00023558368851899947, + "loss": 2.6001, + "step": 22953 + }, + { + "epoch": 0.6806630489576847, + "grad_norm": 0.09656016528606415, + "learning_rate": 0.00023554375630953494, + "loss": 2.6, + "step": 22954 + }, + { + "epoch": 0.6806927023099961, + "grad_norm": 0.10318858921527863, + "learning_rate": 0.0002355038264418759, + "loss": 2.5664, + "step": 22955 + }, + { + "epoch": 0.6807223556623077, + "grad_norm": 0.09817399829626083, + "learning_rate": 0.00023546389891637587, + "loss": 2.6193, + "step": 22956 + }, + { + "epoch": 0.6807520090146191, + "grad_norm": 0.10587884485721588, + "learning_rate": 0.00023542397373338837, + "loss": 2.5699, + "step": 22957 + }, + { + "epoch": 0.6807816623669306, + "grad_norm": 0.09334294497966766, + "learning_rate": 0.00023538405089326703, + "loss": 2.6172, + "step": 22958 + }, + { + "epoch": 0.680811315719242, + "grad_norm": 0.10135089606046677, + "learning_rate": 0.00023534413039636554, + "loss": 2.6073, + "step": 22959 + }, + { + "epoch": 0.6808409690715536, + "grad_norm": 0.09749408811330795, + "learning_rate": 0.00023530421224303682, + "loss": 2.5997, + "step": 22960 + }, + { + "epoch": 0.680870622423865, + "grad_norm": 0.098440021276474, + "learning_rate": 0.00023526429643363488, + "loss": 2.5898, + "step": 22961 + }, + { + "epoch": 0.6809002757761765, + "grad_norm": 0.10121189057826996, + "learning_rate": 0.00023522438296851313, + "loss": 2.5881, + "step": 22962 + }, + { + "epoch": 0.6809299291284879, + "grad_norm": 0.10220558941364288, + "learning_rate": 0.00023518447184802483, + "loss": 2.5722, + "step": 22963 + }, + { + "epoch": 0.6809595824807995, + "grad_norm": 0.10578126460313797, + "learning_rate": 0.00023514456307252351, + "loss": 2.5963, + "step": 22964 + }, + { + "epoch": 0.6809892358331109, + "grad_norm": 0.11026883870363235, + "learning_rate": 0.00023510465664236259, + "loss": 2.6106, + "step": 22965 + }, + { + "epoch": 0.6810188891854224, + "grad_norm": 0.10368113219738007, + "learning_rate": 0.00023506475255789534, + "loss": 2.6046, + "step": 22966 + }, + { + "epoch": 0.6810485425377338, + "grad_norm": 0.09798086434602737, + "learning_rate": 0.00023502485081947522, + "loss": 2.5491, + "step": 22967 + }, + { + "epoch": 0.6810781958900454, + "grad_norm": 0.10256599634885788, + "learning_rate": 0.00023498495142745552, + "loss": 2.5819, + "step": 22968 + }, + { + "epoch": 0.6811078492423569, + "grad_norm": 0.09688720107078552, + "learning_rate": 0.00023494505438218977, + "loss": 2.5996, + "step": 22969 + }, + { + "epoch": 0.6811375025946683, + "grad_norm": 0.09292041510343552, + "learning_rate": 0.00023490515968403082, + "loss": 2.5984, + "step": 22970 + }, + { + "epoch": 0.6811671559469799, + "grad_norm": 0.10699858516454697, + "learning_rate": 0.00023486526733333224, + "loss": 2.5915, + "step": 22971 + }, + { + "epoch": 0.6811968092992913, + "grad_norm": 0.09422704577445984, + "learning_rate": 0.00023482537733044716, + "loss": 2.5886, + "step": 22972 + }, + { + "epoch": 0.6812264626516028, + "grad_norm": 0.09466860443353653, + "learning_rate": 0.00023478548967572878, + "loss": 2.553, + "step": 22973 + }, + { + "epoch": 0.6812561160039142, + "grad_norm": 0.09684155136346817, + "learning_rate": 0.00023474560436953057, + "loss": 2.5932, + "step": 22974 + }, + { + "epoch": 0.6812857693562258, + "grad_norm": 0.09683999419212341, + "learning_rate": 0.00023470572141220554, + "loss": 2.608, + "step": 22975 + }, + { + "epoch": 0.6813154227085372, + "grad_norm": 0.10991301387548447, + "learning_rate": 0.00023466584080410693, + "loss": 2.5704, + "step": 22976 + }, + { + "epoch": 0.6813450760608487, + "grad_norm": 0.1016346886754036, + "learning_rate": 0.00023462596254558777, + "loss": 2.6008, + "step": 22977 + }, + { + "epoch": 0.6813747294131601, + "grad_norm": 0.08615979552268982, + "learning_rate": 0.00023458608663700132, + "loss": 2.57, + "step": 22978 + }, + { + "epoch": 0.6814043827654717, + "grad_norm": 0.10174208134412766, + "learning_rate": 0.0002345462130787006, + "loss": 2.5806, + "step": 22979 + }, + { + "epoch": 0.6814340361177831, + "grad_norm": 0.10236907750368118, + "learning_rate": 0.00023450634187103893, + "loss": 2.5905, + "step": 22980 + }, + { + "epoch": 0.6814636894700946, + "grad_norm": 0.09168838709592819, + "learning_rate": 0.00023446647301436895, + "loss": 2.5842, + "step": 22981 + }, + { + "epoch": 0.681493342822406, + "grad_norm": 0.10096348822116852, + "learning_rate": 0.00023442660650904395, + "loss": 2.5978, + "step": 22982 + }, + { + "epoch": 0.6815229961747176, + "grad_norm": 0.11174017935991287, + "learning_rate": 0.00023438674235541697, + "loss": 2.5798, + "step": 22983 + }, + { + "epoch": 0.681552649527029, + "grad_norm": 0.08606688678264618, + "learning_rate": 0.00023434688055384097, + "loss": 2.5789, + "step": 22984 + }, + { + "epoch": 0.6815823028793405, + "grad_norm": 0.12221105396747589, + "learning_rate": 0.0002343070211046689, + "loss": 2.6164, + "step": 22985 + }, + { + "epoch": 0.6816119562316519, + "grad_norm": 0.12508124113082886, + "learning_rate": 0.0002342671640082536, + "loss": 2.5878, + "step": 22986 + }, + { + "epoch": 0.6816416095839635, + "grad_norm": 0.09687233716249466, + "learning_rate": 0.00023422730926494839, + "loss": 2.5786, + "step": 22987 + }, + { + "epoch": 0.6816712629362749, + "grad_norm": 0.11426973342895508, + "learning_rate": 0.00023418745687510596, + "loss": 2.5769, + "step": 22988 + }, + { + "epoch": 0.6817009162885864, + "grad_norm": 0.09588959813117981, + "learning_rate": 0.0002341476068390792, + "loss": 2.5531, + "step": 22989 + }, + { + "epoch": 0.681730569640898, + "grad_norm": 0.10884872823953629, + "learning_rate": 0.00023410775915722122, + "loss": 2.5969, + "step": 22990 + }, + { + "epoch": 0.6817602229932094, + "grad_norm": 0.09586173295974731, + "learning_rate": 0.0002340679138298845, + "loss": 2.6231, + "step": 22991 + }, + { + "epoch": 0.6817898763455209, + "grad_norm": 0.09868766367435455, + "learning_rate": 0.000234028070857422, + "loss": 2.5666, + "step": 22992 + }, + { + "epoch": 0.6818195296978323, + "grad_norm": 0.0973467007279396, + "learning_rate": 0.00023398823024018667, + "loss": 2.617, + "step": 22993 + }, + { + "epoch": 0.6818491830501439, + "grad_norm": 0.10801970213651657, + "learning_rate": 0.00023394839197853114, + "loss": 2.5835, + "step": 22994 + }, + { + "epoch": 0.6818788364024553, + "grad_norm": 0.0915079116821289, + "learning_rate": 0.0002339085560728083, + "loss": 2.5944, + "step": 22995 + }, + { + "epoch": 0.6819084897547668, + "grad_norm": 0.10873392224311829, + "learning_rate": 0.0002338687225233709, + "loss": 2.5718, + "step": 22996 + }, + { + "epoch": 0.6819381431070782, + "grad_norm": 0.09598556160926819, + "learning_rate": 0.00023382889133057155, + "loss": 2.5772, + "step": 22997 + }, + { + "epoch": 0.6819677964593898, + "grad_norm": 0.10769723355770111, + "learning_rate": 0.00023378906249476317, + "loss": 2.6063, + "step": 22998 + }, + { + "epoch": 0.6819974498117012, + "grad_norm": 0.11798692494630814, + "learning_rate": 0.00023374923601629806, + "loss": 2.5661, + "step": 22999 + }, + { + "epoch": 0.6820271031640127, + "grad_norm": 0.12165889143943787, + "learning_rate": 0.0002337094118955296, + "loss": 2.6266, + "step": 23000 + }, + { + "epoch": 0.6820567565163241, + "grad_norm": 0.10595834255218506, + "learning_rate": 0.0002336695901328098, + "loss": 2.581, + "step": 23001 + }, + { + "epoch": 0.6820864098686357, + "grad_norm": 0.104228675365448, + "learning_rate": 0.00023362977072849145, + "loss": 2.5956, + "step": 23002 + }, + { + "epoch": 0.6821160632209471, + "grad_norm": 0.12341325730085373, + "learning_rate": 0.00023358995368292723, + "loss": 2.6056, + "step": 23003 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 0.10344825685024261, + "learning_rate": 0.00023355013899646976, + "loss": 2.6007, + "step": 23004 + }, + { + "epoch": 0.68217536992557, + "grad_norm": 0.1001797467470169, + "learning_rate": 0.00023351032666947148, + "loss": 2.5493, + "step": 23005 + }, + { + "epoch": 0.6822050232778816, + "grad_norm": 0.10552597045898438, + "learning_rate": 0.00023347051670228504, + "loss": 2.623, + "step": 23006 + }, + { + "epoch": 0.682234676630193, + "grad_norm": 0.1097143143415451, + "learning_rate": 0.00023343070909526286, + "loss": 2.5847, + "step": 23007 + }, + { + "epoch": 0.6822643299825045, + "grad_norm": 0.1141243651509285, + "learning_rate": 0.00023339090384875754, + "loss": 2.5871, + "step": 23008 + }, + { + "epoch": 0.682293983334816, + "grad_norm": 0.10985184460878372, + "learning_rate": 0.00023335110096312157, + "loss": 2.5499, + "step": 23009 + }, + { + "epoch": 0.6823236366871275, + "grad_norm": 0.10435216873884201, + "learning_rate": 0.0002333113004387073, + "loss": 2.6096, + "step": 23010 + }, + { + "epoch": 0.682353290039439, + "grad_norm": 0.10523874312639236, + "learning_rate": 0.0002332715022758673, + "loss": 2.5931, + "step": 23011 + }, + { + "epoch": 0.6823829433917504, + "grad_norm": 0.09908712655305862, + "learning_rate": 0.0002332317064749539, + "loss": 2.6052, + "step": 23012 + }, + { + "epoch": 0.682412596744062, + "grad_norm": 0.10842191427946091, + "learning_rate": 0.00023319191303631953, + "loss": 2.5901, + "step": 23013 + }, + { + "epoch": 0.6824422500963734, + "grad_norm": 0.10574564337730408, + "learning_rate": 0.00023315212196031655, + "loss": 2.5964, + "step": 23014 + }, + { + "epoch": 0.6824719034486849, + "grad_norm": 0.08920909464359283, + "learning_rate": 0.00023311233324729735, + "loss": 2.5844, + "step": 23015 + }, + { + "epoch": 0.6825015568009963, + "grad_norm": 0.10726510733366013, + "learning_rate": 0.00023307254689761427, + "loss": 2.5773, + "step": 23016 + }, + { + "epoch": 0.6825312101533079, + "grad_norm": 0.11123368889093399, + "learning_rate": 0.00023303276291161964, + "loss": 2.6073, + "step": 23017 + }, + { + "epoch": 0.6825608635056193, + "grad_norm": 0.10349652916193008, + "learning_rate": 0.0002329929812896656, + "loss": 2.5841, + "step": 23018 + }, + { + "epoch": 0.6825905168579308, + "grad_norm": 0.09862468391656876, + "learning_rate": 0.00023295320203210463, + "loss": 2.5686, + "step": 23019 + }, + { + "epoch": 0.6826201702102422, + "grad_norm": 0.11185482144355774, + "learning_rate": 0.00023291342513928888, + "loss": 2.5819, + "step": 23020 + }, + { + "epoch": 0.6826498235625538, + "grad_norm": 0.10711681842803955, + "learning_rate": 0.00023287365061157062, + "loss": 2.5507, + "step": 23021 + }, + { + "epoch": 0.6826794769148652, + "grad_norm": 0.10017567127943039, + "learning_rate": 0.00023283387844930199, + "loss": 2.5734, + "step": 23022 + }, + { + "epoch": 0.6827091302671767, + "grad_norm": 0.11139512062072754, + "learning_rate": 0.00023279410865283524, + "loss": 2.5793, + "step": 23023 + }, + { + "epoch": 0.6827387836194881, + "grad_norm": 0.09165733307600021, + "learning_rate": 0.0002327543412225227, + "loss": 2.5404, + "step": 23024 + }, + { + "epoch": 0.6827684369717997, + "grad_norm": 0.11443217843770981, + "learning_rate": 0.000232714576158716, + "loss": 2.5832, + "step": 23025 + }, + { + "epoch": 0.6827980903241111, + "grad_norm": 0.08697037398815155, + "learning_rate": 0.00023267481346176777, + "loss": 2.579, + "step": 23026 + }, + { + "epoch": 0.6828277436764226, + "grad_norm": 0.11281975358724594, + "learning_rate": 0.00023263505313202992, + "loss": 2.5956, + "step": 23027 + }, + { + "epoch": 0.682857397028734, + "grad_norm": 0.09842673689126968, + "learning_rate": 0.00023259529516985461, + "loss": 2.5755, + "step": 23028 + }, + { + "epoch": 0.6828870503810456, + "grad_norm": 0.09884537011384964, + "learning_rate": 0.00023255553957559388, + "loss": 2.5693, + "step": 23029 + }, + { + "epoch": 0.6829167037333571, + "grad_norm": 0.10640265792608261, + "learning_rate": 0.0002325157863495997, + "loss": 2.5925, + "step": 23030 + }, + { + "epoch": 0.6829463570856685, + "grad_norm": 0.09181427955627441, + "learning_rate": 0.0002324760354922242, + "loss": 2.5672, + "step": 23031 + }, + { + "epoch": 0.6829760104379801, + "grad_norm": 0.10308210551738739, + "learning_rate": 0.0002324362870038193, + "loss": 2.5753, + "step": 23032 + }, + { + "epoch": 0.6830056637902915, + "grad_norm": 0.09960468858480453, + "learning_rate": 0.00023239654088473699, + "loss": 2.5844, + "step": 23033 + }, + { + "epoch": 0.683035317142603, + "grad_norm": 0.10524209588766098, + "learning_rate": 0.00023235679713532926, + "loss": 2.583, + "step": 23034 + }, + { + "epoch": 0.6830649704949144, + "grad_norm": 0.10925544053316116, + "learning_rate": 0.0002323170557559482, + "loss": 2.5857, + "step": 23035 + }, + { + "epoch": 0.683094623847226, + "grad_norm": 0.10698278248310089, + "learning_rate": 0.00023227731674694535, + "loss": 2.599, + "step": 23036 + }, + { + "epoch": 0.6831242771995374, + "grad_norm": 0.09595329314470291, + "learning_rate": 0.0002322375801086729, + "loss": 2.5698, + "step": 23037 + }, + { + "epoch": 0.6831539305518489, + "grad_norm": 0.1041485071182251, + "learning_rate": 0.0002321978458414824, + "loss": 2.5847, + "step": 23038 + }, + { + "epoch": 0.6831835839041603, + "grad_norm": 0.09861104190349579, + "learning_rate": 0.00023215811394572611, + "loss": 2.5859, + "step": 23039 + }, + { + "epoch": 0.6832132372564719, + "grad_norm": 0.0977640300989151, + "learning_rate": 0.0002321183844217557, + "loss": 2.5838, + "step": 23040 + }, + { + "epoch": 0.6832428906087833, + "grad_norm": 0.10036028176546097, + "learning_rate": 0.000232078657269923, + "loss": 2.6258, + "step": 23041 + }, + { + "epoch": 0.6832725439610948, + "grad_norm": 0.09650789946317673, + "learning_rate": 0.0002320389324905798, + "loss": 2.5958, + "step": 23042 + }, + { + "epoch": 0.6833021973134062, + "grad_norm": 0.10758829116821289, + "learning_rate": 0.0002319992100840778, + "loss": 2.6044, + "step": 23043 + }, + { + "epoch": 0.6833318506657178, + "grad_norm": 0.09395194053649902, + "learning_rate": 0.00023195949005076882, + "loss": 2.5977, + "step": 23044 + }, + { + "epoch": 0.6833615040180292, + "grad_norm": 0.09980326890945435, + "learning_rate": 0.00023191977239100475, + "loss": 2.632, + "step": 23045 + }, + { + "epoch": 0.6833911573703407, + "grad_norm": 0.09500952064990997, + "learning_rate": 0.00023188005710513693, + "loss": 2.5906, + "step": 23046 + }, + { + "epoch": 0.6834208107226522, + "grad_norm": 0.10307374596595764, + "learning_rate": 0.00023184034419351725, + "loss": 2.5991, + "step": 23047 + }, + { + "epoch": 0.6834504640749637, + "grad_norm": 0.10190195590257645, + "learning_rate": 0.00023180063365649728, + "loss": 2.564, + "step": 23048 + }, + { + "epoch": 0.6834801174272751, + "grad_norm": 0.10574982315301895, + "learning_rate": 0.00023176092549442878, + "loss": 2.5666, + "step": 23049 + }, + { + "epoch": 0.6835097707795866, + "grad_norm": 0.10444339364767075, + "learning_rate": 0.0002317212197076633, + "loss": 2.5596, + "step": 23050 + }, + { + "epoch": 0.6835394241318982, + "grad_norm": 0.1195247620344162, + "learning_rate": 0.00023168151629655232, + "loss": 2.5604, + "step": 23051 + }, + { + "epoch": 0.6835690774842096, + "grad_norm": 0.10717704147100449, + "learning_rate": 0.00023164181526144772, + "loss": 2.5899, + "step": 23052 + }, + { + "epoch": 0.6835987308365211, + "grad_norm": 0.10927107185125351, + "learning_rate": 0.0002316021166027009, + "loss": 2.6002, + "step": 23053 + }, + { + "epoch": 0.6836283841888325, + "grad_norm": 0.10256877541542053, + "learning_rate": 0.0002315624203206635, + "loss": 2.5883, + "step": 23054 + }, + { + "epoch": 0.6836580375411441, + "grad_norm": 0.11513019353151321, + "learning_rate": 0.00023152272641568684, + "loss": 2.6435, + "step": 23055 + }, + { + "epoch": 0.6836876908934555, + "grad_norm": 0.09464965015649796, + "learning_rate": 0.00023148303488812277, + "loss": 2.5897, + "step": 23056 + }, + { + "epoch": 0.683717344245767, + "grad_norm": 0.11167563498020172, + "learning_rate": 0.00023144334573832232, + "loss": 2.5756, + "step": 23057 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 0.0933750793337822, + "learning_rate": 0.00023140365896663712, + "loss": 2.6013, + "step": 23058 + }, + { + "epoch": 0.68377665095039, + "grad_norm": 0.11039306223392487, + "learning_rate": 0.00023136397457341863, + "loss": 2.6375, + "step": 23059 + }, + { + "epoch": 0.6838063043027014, + "grad_norm": 0.10162515938282013, + "learning_rate": 0.00023132429255901828, + "loss": 2.5762, + "step": 23060 + }, + { + "epoch": 0.6838359576550129, + "grad_norm": 0.10060905665159225, + "learning_rate": 0.00023128461292378738, + "loss": 2.5503, + "step": 23061 + }, + { + "epoch": 0.6838656110073243, + "grad_norm": 0.08845459669828415, + "learning_rate": 0.0002312449356680774, + "loss": 2.5892, + "step": 23062 + }, + { + "epoch": 0.6838952643596359, + "grad_norm": 0.09781965613365173, + "learning_rate": 0.00023120526079223964, + "loss": 2.5793, + "step": 23063 + }, + { + "epoch": 0.6839249177119473, + "grad_norm": 0.09298159927129745, + "learning_rate": 0.00023116558829662525, + "loss": 2.5678, + "step": 23064 + }, + { + "epoch": 0.6839545710642588, + "grad_norm": 0.10600939393043518, + "learning_rate": 0.0002311259181815859, + "loss": 2.5932, + "step": 23065 + }, + { + "epoch": 0.6839842244165703, + "grad_norm": 0.09852370619773865, + "learning_rate": 0.0002310862504474729, + "loss": 2.6065, + "step": 23066 + }, + { + "epoch": 0.6840138777688818, + "grad_norm": 0.09847575426101685, + "learning_rate": 0.0002310465850946371, + "loss": 2.5638, + "step": 23067 + }, + { + "epoch": 0.6840435311211932, + "grad_norm": 0.10596712678670883, + "learning_rate": 0.00023100692212342993, + "loss": 2.6141, + "step": 23068 + }, + { + "epoch": 0.6840731844735047, + "grad_norm": 0.09805968403816223, + "learning_rate": 0.00023096726153420271, + "loss": 2.5885, + "step": 23069 + }, + { + "epoch": 0.6841028378258162, + "grad_norm": 0.10647395253181458, + "learning_rate": 0.0002309276033273065, + "loss": 2.5762, + "step": 23070 + }, + { + "epoch": 0.6841324911781277, + "grad_norm": 0.10864845663309097, + "learning_rate": 0.0002308879475030926, + "loss": 2.5848, + "step": 23071 + }, + { + "epoch": 0.6841621445304392, + "grad_norm": 0.10743484646081924, + "learning_rate": 0.0002308482940619121, + "loss": 2.6202, + "step": 23072 + }, + { + "epoch": 0.6841917978827506, + "grad_norm": 0.10964476317167282, + "learning_rate": 0.00023080864300411614, + "loss": 2.5462, + "step": 23073 + }, + { + "epoch": 0.6842214512350622, + "grad_norm": 0.10092970728874207, + "learning_rate": 0.00023076899433005588, + "loss": 2.6016, + "step": 23074 + }, + { + "epoch": 0.6842511045873736, + "grad_norm": 0.11308150738477707, + "learning_rate": 0.00023072934804008234, + "loss": 2.5708, + "step": 23075 + }, + { + "epoch": 0.6842807579396851, + "grad_norm": 0.09190548956394196, + "learning_rate": 0.00023068970413454672, + "loss": 2.5876, + "step": 23076 + }, + { + "epoch": 0.6843104112919965, + "grad_norm": 0.10347206145524979, + "learning_rate": 0.00023065006261379988, + "loss": 2.5906, + "step": 23077 + }, + { + "epoch": 0.6843400646443081, + "grad_norm": 0.10588718205690384, + "learning_rate": 0.00023061042347819307, + "loss": 2.5959, + "step": 23078 + }, + { + "epoch": 0.6843697179966195, + "grad_norm": 0.09665990620851517, + "learning_rate": 0.0002305707867280772, + "loss": 2.5741, + "step": 23079 + }, + { + "epoch": 0.684399371348931, + "grad_norm": 0.11920633912086487, + "learning_rate": 0.00023053115236380318, + "loss": 2.548, + "step": 23080 + }, + { + "epoch": 0.6844290247012424, + "grad_norm": 0.08450010418891907, + "learning_rate": 0.00023049152038572213, + "loss": 2.5605, + "step": 23081 + }, + { + "epoch": 0.684458678053554, + "grad_norm": 0.12927861511707306, + "learning_rate": 0.00023045189079418487, + "loss": 2.607, + "step": 23082 + }, + { + "epoch": 0.6844883314058654, + "grad_norm": 0.11460563540458679, + "learning_rate": 0.00023041226358954243, + "loss": 2.5963, + "step": 23083 + }, + { + "epoch": 0.6845179847581769, + "grad_norm": 0.1079990565776825, + "learning_rate": 0.0002303726387721457, + "loss": 2.5596, + "step": 23084 + }, + { + "epoch": 0.6845476381104884, + "grad_norm": 0.12661591172218323, + "learning_rate": 0.0002303330163423455, + "loss": 2.6029, + "step": 23085 + }, + { + "epoch": 0.6845772914627999, + "grad_norm": 0.12229833006858826, + "learning_rate": 0.00023029339630049268, + "loss": 2.5942, + "step": 23086 + }, + { + "epoch": 0.6846069448151113, + "grad_norm": 0.0989890843629837, + "learning_rate": 0.0002302537786469382, + "loss": 2.5509, + "step": 23087 + }, + { + "epoch": 0.6846365981674228, + "grad_norm": 0.1270717978477478, + "learning_rate": 0.00023021416338203277, + "loss": 2.5936, + "step": 23088 + }, + { + "epoch": 0.6846662515197343, + "grad_norm": 0.09679530560970306, + "learning_rate": 0.00023017455050612724, + "loss": 2.6092, + "step": 23089 + }, + { + "epoch": 0.6846959048720458, + "grad_norm": 0.10701075196266174, + "learning_rate": 0.0002301349400195724, + "loss": 2.5815, + "step": 23090 + }, + { + "epoch": 0.6847255582243572, + "grad_norm": 0.10141955316066742, + "learning_rate": 0.00023009533192271898, + "loss": 2.611, + "step": 23091 + }, + { + "epoch": 0.6847552115766687, + "grad_norm": 0.10953477025032043, + "learning_rate": 0.0002300557262159177, + "loss": 2.6117, + "step": 23092 + }, + { + "epoch": 0.6847848649289803, + "grad_norm": 0.11599334329366684, + "learning_rate": 0.00023001612289951935, + "loss": 2.6015, + "step": 23093 + }, + { + "epoch": 0.6848145182812917, + "grad_norm": 0.09331879019737244, + "learning_rate": 0.00022997652197387453, + "loss": 2.5707, + "step": 23094 + }, + { + "epoch": 0.6848441716336032, + "grad_norm": 0.106355220079422, + "learning_rate": 0.00022993692343933398, + "loss": 2.5984, + "step": 23095 + }, + { + "epoch": 0.6848738249859146, + "grad_norm": 0.09204382449388504, + "learning_rate": 0.0002298973272962483, + "loss": 2.5842, + "step": 23096 + }, + { + "epoch": 0.6849034783382262, + "grad_norm": 0.1037047877907753, + "learning_rate": 0.00022985773354496813, + "loss": 2.5546, + "step": 23097 + }, + { + "epoch": 0.6849331316905376, + "grad_norm": 0.09294001758098602, + "learning_rate": 0.00022981814218584417, + "loss": 2.6357, + "step": 23098 + }, + { + "epoch": 0.6849627850428491, + "grad_norm": 0.09952863305807114, + "learning_rate": 0.00022977855321922692, + "loss": 2.5649, + "step": 23099 + }, + { + "epoch": 0.6849924383951606, + "grad_norm": 0.11880473047494888, + "learning_rate": 0.00022973896664546712, + "loss": 2.5931, + "step": 23100 + }, + { + "epoch": 0.6850220917474721, + "grad_norm": 0.09922883659601212, + "learning_rate": 0.00022969938246491495, + "loss": 2.5838, + "step": 23101 + }, + { + "epoch": 0.6850517450997835, + "grad_norm": 0.10123763978481293, + "learning_rate": 0.00022965980067792119, + "loss": 2.5764, + "step": 23102 + }, + { + "epoch": 0.685081398452095, + "grad_norm": 0.11704976111650467, + "learning_rate": 0.00022962022128483612, + "loss": 2.6237, + "step": 23103 + }, + { + "epoch": 0.6851110518044065, + "grad_norm": 0.08966392278671265, + "learning_rate": 0.00022958064428601056, + "loss": 2.5725, + "step": 23104 + }, + { + "epoch": 0.685140705156718, + "grad_norm": 0.11061122268438339, + "learning_rate": 0.00022954106968179483, + "loss": 2.5799, + "step": 23105 + }, + { + "epoch": 0.6851703585090294, + "grad_norm": 0.10858208686113358, + "learning_rate": 0.00022950149747253936, + "loss": 2.6074, + "step": 23106 + }, + { + "epoch": 0.6852000118613409, + "grad_norm": 0.10069012641906738, + "learning_rate": 0.00022946192765859453, + "loss": 2.5865, + "step": 23107 + }, + { + "epoch": 0.6852296652136524, + "grad_norm": 0.10618720948696136, + "learning_rate": 0.00022942236024031077, + "loss": 2.5963, + "step": 23108 + }, + { + "epoch": 0.6852593185659639, + "grad_norm": 0.09851382672786713, + "learning_rate": 0.00022938279521803845, + "loss": 2.5908, + "step": 23109 + }, + { + "epoch": 0.6852889719182753, + "grad_norm": 0.10358653217554092, + "learning_rate": 0.00022934323259212797, + "loss": 2.5971, + "step": 23110 + }, + { + "epoch": 0.6853186252705868, + "grad_norm": 0.10278929769992828, + "learning_rate": 0.00022930367236292977, + "loss": 2.5709, + "step": 23111 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 0.10708330571651459, + "learning_rate": 0.00022926411453079389, + "loss": 2.5793, + "step": 23112 + }, + { + "epoch": 0.6853779319752098, + "grad_norm": 0.08832962065935135, + "learning_rate": 0.00022922455909607064, + "loss": 2.5532, + "step": 23113 + }, + { + "epoch": 0.6854075853275213, + "grad_norm": 0.11173281073570251, + "learning_rate": 0.00022918500605911046, + "loss": 2.596, + "step": 23114 + }, + { + "epoch": 0.6854372386798327, + "grad_norm": 0.09291159361600876, + "learning_rate": 0.00022914545542026356, + "loss": 2.5873, + "step": 23115 + }, + { + "epoch": 0.6854668920321443, + "grad_norm": 0.10978510975837708, + "learning_rate": 0.00022910590717987995, + "loss": 2.5819, + "step": 23116 + }, + { + "epoch": 0.6854965453844557, + "grad_norm": 0.09596797078847885, + "learning_rate": 0.00022906636133831015, + "loss": 2.6122, + "step": 23117 + }, + { + "epoch": 0.6855261987367672, + "grad_norm": 0.09090020507574081, + "learning_rate": 0.00022902681789590428, + "loss": 2.6324, + "step": 23118 + }, + { + "epoch": 0.6855558520890787, + "grad_norm": 0.11647626757621765, + "learning_rate": 0.00022898727685301246, + "loss": 2.6019, + "step": 23119 + }, + { + "epoch": 0.6855855054413902, + "grad_norm": 0.10721871256828308, + "learning_rate": 0.00022894773820998483, + "loss": 2.627, + "step": 23120 + }, + { + "epoch": 0.6856151587937016, + "grad_norm": 0.10727047175168991, + "learning_rate": 0.00022890820196717166, + "loss": 2.5751, + "step": 23121 + }, + { + "epoch": 0.6856448121460131, + "grad_norm": 0.1036454290151596, + "learning_rate": 0.00022886866812492267, + "loss": 2.6322, + "step": 23122 + }, + { + "epoch": 0.6856744654983246, + "grad_norm": 0.10089080780744553, + "learning_rate": 0.00022882913668358822, + "loss": 2.6234, + "step": 23123 + }, + { + "epoch": 0.6857041188506361, + "grad_norm": 0.09881773591041565, + "learning_rate": 0.00022878960764351831, + "loss": 2.5984, + "step": 23124 + }, + { + "epoch": 0.6857337722029475, + "grad_norm": 0.10607638210058212, + "learning_rate": 0.00022875008100506296, + "loss": 2.6158, + "step": 23125 + }, + { + "epoch": 0.685763425555259, + "grad_norm": 0.1053873598575592, + "learning_rate": 0.00022871055676857222, + "loss": 2.5916, + "step": 23126 + }, + { + "epoch": 0.6857930789075705, + "grad_norm": 0.09704536199569702, + "learning_rate": 0.00022867103493439607, + "loss": 2.5991, + "step": 23127 + }, + { + "epoch": 0.685822732259882, + "grad_norm": 0.10585497319698334, + "learning_rate": 0.00022863151550288425, + "loss": 2.5768, + "step": 23128 + }, + { + "epoch": 0.6858523856121934, + "grad_norm": 0.09657751023769379, + "learning_rate": 0.00022859199847438718, + "loss": 2.5817, + "step": 23129 + }, + { + "epoch": 0.685882038964505, + "grad_norm": 0.10077459365129471, + "learning_rate": 0.00022855248384925448, + "loss": 2.5879, + "step": 23130 + }, + { + "epoch": 0.6859116923168164, + "grad_norm": 0.10695181787014008, + "learning_rate": 0.00022851297162783618, + "loss": 2.5403, + "step": 23131 + }, + { + "epoch": 0.6859413456691279, + "grad_norm": 0.10364506393671036, + "learning_rate": 0.00022847346181048228, + "loss": 2.561, + "step": 23132 + }, + { + "epoch": 0.6859709990214393, + "grad_norm": 0.09600573778152466, + "learning_rate": 0.00022843395439754233, + "loss": 2.5918, + "step": 23133 + }, + { + "epoch": 0.6860006523737509, + "grad_norm": 0.10040443390607834, + "learning_rate": 0.00022839444938936628, + "loss": 2.6029, + "step": 23134 + }, + { + "epoch": 0.6860303057260624, + "grad_norm": 0.09710662811994553, + "learning_rate": 0.00022835494678630404, + "loss": 2.597, + "step": 23135 + }, + { + "epoch": 0.6860599590783738, + "grad_norm": 0.09478407353162766, + "learning_rate": 0.00022831544658870535, + "loss": 2.5903, + "step": 23136 + }, + { + "epoch": 0.6860896124306853, + "grad_norm": 0.10382603853940964, + "learning_rate": 0.00022827594879692005, + "loss": 2.6176, + "step": 23137 + }, + { + "epoch": 0.6861192657829968, + "grad_norm": 0.10063643008470535, + "learning_rate": 0.00022823645341129783, + "loss": 2.5824, + "step": 23138 + }, + { + "epoch": 0.6861489191353083, + "grad_norm": 0.09885693341493607, + "learning_rate": 0.00022819696043218846, + "loss": 2.5587, + "step": 23139 + }, + { + "epoch": 0.6861785724876197, + "grad_norm": 0.08885449171066284, + "learning_rate": 0.0002281574698599417, + "loss": 2.5515, + "step": 23140 + }, + { + "epoch": 0.6862082258399312, + "grad_norm": 0.10052414238452911, + "learning_rate": 0.00022811798169490694, + "loss": 2.5939, + "step": 23141 + }, + { + "epoch": 0.6862378791922427, + "grad_norm": 0.10448348522186279, + "learning_rate": 0.00022807849593743456, + "loss": 2.5607, + "step": 23142 + }, + { + "epoch": 0.6862675325445542, + "grad_norm": 0.09734176844358444, + "learning_rate": 0.00022803901258787356, + "loss": 2.5548, + "step": 23143 + }, + { + "epoch": 0.6862971858968656, + "grad_norm": 0.09142763167619705, + "learning_rate": 0.00022799953164657382, + "loss": 2.5957, + "step": 23144 + }, + { + "epoch": 0.6863268392491771, + "grad_norm": 0.10947546362876892, + "learning_rate": 0.0002279600531138849, + "loss": 2.5641, + "step": 23145 + }, + { + "epoch": 0.6863564926014886, + "grad_norm": 0.10119304805994034, + "learning_rate": 0.0002279205769901564, + "loss": 2.5751, + "step": 23146 + }, + { + "epoch": 0.6863861459538001, + "grad_norm": 0.10881724208593369, + "learning_rate": 0.00022788110327573785, + "loss": 2.5763, + "step": 23147 + }, + { + "epoch": 0.6864157993061115, + "grad_norm": 0.11175011098384857, + "learning_rate": 0.00022784163197097891, + "loss": 2.5719, + "step": 23148 + }, + { + "epoch": 0.686445452658423, + "grad_norm": 0.10218173265457153, + "learning_rate": 0.00022780216307622896, + "loss": 2.5748, + "step": 23149 + }, + { + "epoch": 0.6864751060107345, + "grad_norm": 0.10888667404651642, + "learning_rate": 0.00022776269659183763, + "loss": 2.5744, + "step": 23150 + }, + { + "epoch": 0.686504759363046, + "grad_norm": 0.08780563622713089, + "learning_rate": 0.00022772323251815435, + "loss": 2.5724, + "step": 23151 + }, + { + "epoch": 0.6865344127153574, + "grad_norm": 0.11638756096363068, + "learning_rate": 0.00022768377085552856, + "loss": 2.5753, + "step": 23152 + }, + { + "epoch": 0.686564066067669, + "grad_norm": 0.08623915910720825, + "learning_rate": 0.00022764431160430976, + "loss": 2.6129, + "step": 23153 + }, + { + "epoch": 0.6865937194199804, + "grad_norm": 0.0953705683350563, + "learning_rate": 0.00022760485476484727, + "loss": 2.58, + "step": 23154 + }, + { + "epoch": 0.6866233727722919, + "grad_norm": 0.10178584605455399, + "learning_rate": 0.00022756540033749058, + "loss": 2.5462, + "step": 23155 + }, + { + "epoch": 0.6866530261246034, + "grad_norm": 0.10276616364717484, + "learning_rate": 0.00022752594832258904, + "loss": 2.6128, + "step": 23156 + }, + { + "epoch": 0.6866826794769149, + "grad_norm": 0.09392359107732773, + "learning_rate": 0.00022748649872049198, + "loss": 2.5702, + "step": 23157 + }, + { + "epoch": 0.6867123328292264, + "grad_norm": 0.103483647108078, + "learning_rate": 0.00022744705153154876, + "loss": 2.6007, + "step": 23158 + }, + { + "epoch": 0.6867419861815378, + "grad_norm": 0.10038736462593079, + "learning_rate": 0.0002274076067561087, + "loss": 2.5779, + "step": 23159 + }, + { + "epoch": 0.6867716395338493, + "grad_norm": 0.09615093469619751, + "learning_rate": 0.00022736816439452106, + "loss": 2.5751, + "step": 23160 + }, + { + "epoch": 0.6868012928861608, + "grad_norm": 0.10696528851985931, + "learning_rate": 0.0002273287244471351, + "loss": 2.6271, + "step": 23161 + }, + { + "epoch": 0.6868309462384723, + "grad_norm": 0.10170494019985199, + "learning_rate": 0.0002272892869143001, + "loss": 2.567, + "step": 23162 + }, + { + "epoch": 0.6868605995907837, + "grad_norm": 0.10667736828327179, + "learning_rate": 0.00022724985179636533, + "loss": 2.6049, + "step": 23163 + }, + { + "epoch": 0.6868902529430952, + "grad_norm": 0.12112273275852203, + "learning_rate": 0.00022721041909367983, + "loss": 2.5799, + "step": 23164 + }, + { + "epoch": 0.6869199062954067, + "grad_norm": 0.11269161850214005, + "learning_rate": 0.00022717098880659298, + "loss": 2.6138, + "step": 23165 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 0.09777696430683136, + "learning_rate": 0.000227131560935454, + "loss": 2.5856, + "step": 23166 + }, + { + "epoch": 0.6869792130000296, + "grad_norm": 0.11002762615680695, + "learning_rate": 0.00022709213548061154, + "loss": 2.6026, + "step": 23167 + }, + { + "epoch": 0.6870088663523412, + "grad_norm": 0.12335235625505447, + "learning_rate": 0.00022705271244241522, + "loss": 2.6358, + "step": 23168 + }, + { + "epoch": 0.6870385197046526, + "grad_norm": 0.10133200138807297, + "learning_rate": 0.000227013291821214, + "loss": 2.5836, + "step": 23169 + }, + { + "epoch": 0.6870681730569641, + "grad_norm": 0.10469485819339752, + "learning_rate": 0.00022697387361735695, + "loss": 2.5708, + "step": 23170 + }, + { + "epoch": 0.6870978264092755, + "grad_norm": 0.09938935190439224, + "learning_rate": 0.0002269344578311931, + "loss": 2.5723, + "step": 23171 + }, + { + "epoch": 0.6871274797615871, + "grad_norm": 0.09814583510160446, + "learning_rate": 0.00022689504446307148, + "loss": 2.5871, + "step": 23172 + }, + { + "epoch": 0.6871571331138985, + "grad_norm": 0.10443343222141266, + "learning_rate": 0.00022685563351334116, + "loss": 2.567, + "step": 23173 + }, + { + "epoch": 0.68718678646621, + "grad_norm": 0.09394820779561996, + "learning_rate": 0.00022681622498235105, + "loss": 2.5816, + "step": 23174 + }, + { + "epoch": 0.6872164398185214, + "grad_norm": 0.101404570043087, + "learning_rate": 0.00022677681887045017, + "loss": 2.5658, + "step": 23175 + }, + { + "epoch": 0.687246093170833, + "grad_norm": 0.09366099536418915, + "learning_rate": 0.00022673741517798763, + "loss": 2.569, + "step": 23176 + }, + { + "epoch": 0.6872757465231445, + "grad_norm": 0.0913599357008934, + "learning_rate": 0.00022669801390531202, + "loss": 2.5762, + "step": 23177 + }, + { + "epoch": 0.6873053998754559, + "grad_norm": 0.09053559601306915, + "learning_rate": 0.0002266586150527724, + "loss": 2.5522, + "step": 23178 + }, + { + "epoch": 0.6873350532277674, + "grad_norm": 0.0908006951212883, + "learning_rate": 0.00022661921862071767, + "loss": 2.5915, + "step": 23179 + }, + { + "epoch": 0.6873647065800789, + "grad_norm": 0.09593084454536438, + "learning_rate": 0.0002265798246094965, + "loss": 2.5814, + "step": 23180 + }, + { + "epoch": 0.6873943599323904, + "grad_norm": 0.10448330640792847, + "learning_rate": 0.00022654043301945808, + "loss": 2.619, + "step": 23181 + }, + { + "epoch": 0.6874240132847018, + "grad_norm": 0.09670425206422806, + "learning_rate": 0.0002265010438509511, + "loss": 2.5953, + "step": 23182 + }, + { + "epoch": 0.6874536666370133, + "grad_norm": 0.09597953408956528, + "learning_rate": 0.00022646165710432425, + "loss": 2.6109, + "step": 23183 + }, + { + "epoch": 0.6874833199893248, + "grad_norm": 0.10980311781167984, + "learning_rate": 0.00022642227277992644, + "loss": 2.5883, + "step": 23184 + }, + { + "epoch": 0.6875129733416363, + "grad_norm": 0.09206653386354446, + "learning_rate": 0.00022638289087810638, + "loss": 2.5908, + "step": 23185 + }, + { + "epoch": 0.6875426266939477, + "grad_norm": 0.10534559935331345, + "learning_rate": 0.00022634351139921277, + "loss": 2.5962, + "step": 23186 + }, + { + "epoch": 0.6875722800462593, + "grad_norm": 0.11028921604156494, + "learning_rate": 0.00022630413434359447, + "loss": 2.5685, + "step": 23187 + }, + { + "epoch": 0.6876019333985707, + "grad_norm": 0.0942942425608635, + "learning_rate": 0.00022626475971159994, + "loss": 2.5766, + "step": 23188 + }, + { + "epoch": 0.6876315867508822, + "grad_norm": 0.1104929968714714, + "learning_rate": 0.0002262253875035779, + "loss": 2.5855, + "step": 23189 + }, + { + "epoch": 0.6876612401031936, + "grad_norm": 0.09617418795824051, + "learning_rate": 0.00022618601771987707, + "loss": 2.5818, + "step": 23190 + }, + { + "epoch": 0.6876908934555052, + "grad_norm": 0.08993186801671982, + "learning_rate": 0.00022614665036084603, + "loss": 2.586, + "step": 23191 + }, + { + "epoch": 0.6877205468078166, + "grad_norm": 0.11577510833740234, + "learning_rate": 0.0002261072854268334, + "loss": 2.5985, + "step": 23192 + }, + { + "epoch": 0.6877502001601281, + "grad_norm": 0.10289072245359421, + "learning_rate": 0.0002260679229181876, + "loss": 2.6148, + "step": 23193 + }, + { + "epoch": 0.6877798535124395, + "grad_norm": 0.09947143495082855, + "learning_rate": 0.0002260285628352575, + "loss": 2.5492, + "step": 23194 + }, + { + "epoch": 0.6878095068647511, + "grad_norm": 0.10709379613399506, + "learning_rate": 0.00022598920517839162, + "loss": 2.5802, + "step": 23195 + }, + { + "epoch": 0.6878391602170625, + "grad_norm": 0.10875999927520752, + "learning_rate": 0.00022594984994793826, + "loss": 2.5781, + "step": 23196 + }, + { + "epoch": 0.687868813569374, + "grad_norm": 0.10552042722702026, + "learning_rate": 0.00022591049714424622, + "loss": 2.5911, + "step": 23197 + }, + { + "epoch": 0.6878984669216855, + "grad_norm": 0.10516468435525894, + "learning_rate": 0.00022587114676766363, + "loss": 2.5923, + "step": 23198 + }, + { + "epoch": 0.687928120273997, + "grad_norm": 0.10688760876655579, + "learning_rate": 0.00022583179881853905, + "loss": 2.5923, + "step": 23199 + }, + { + "epoch": 0.6879577736263085, + "grad_norm": 0.09722016006708145, + "learning_rate": 0.000225792453297221, + "loss": 2.5759, + "step": 23200 + }, + { + "epoch": 0.6879874269786199, + "grad_norm": 0.10155322402715683, + "learning_rate": 0.00022575311020405774, + "loss": 2.5805, + "step": 23201 + }, + { + "epoch": 0.6880170803309315, + "grad_norm": 0.09188129752874374, + "learning_rate": 0.00022571376953939786, + "loss": 2.6094, + "step": 23202 + }, + { + "epoch": 0.6880467336832429, + "grad_norm": 0.10487121343612671, + "learning_rate": 0.0002256744313035896, + "loss": 2.5794, + "step": 23203 + }, + { + "epoch": 0.6880763870355544, + "grad_norm": 0.09201061725616455, + "learning_rate": 0.00022563509549698135, + "loss": 2.5966, + "step": 23204 + }, + { + "epoch": 0.6881060403878658, + "grad_norm": 0.10888608545064926, + "learning_rate": 0.00022559576211992144, + "loss": 2.5994, + "step": 23205 + }, + { + "epoch": 0.6881356937401774, + "grad_norm": 0.09718553721904755, + "learning_rate": 0.00022555643117275792, + "loss": 2.5919, + "step": 23206 + }, + { + "epoch": 0.6881653470924888, + "grad_norm": 0.10310909897089005, + "learning_rate": 0.00022551710265583953, + "loss": 2.609, + "step": 23207 + }, + { + "epoch": 0.6881950004448003, + "grad_norm": 0.10262768715620041, + "learning_rate": 0.00022547777656951445, + "loss": 2.5846, + "step": 23208 + }, + { + "epoch": 0.6882246537971117, + "grad_norm": 0.09169783443212509, + "learning_rate": 0.00022543845291413068, + "loss": 2.5911, + "step": 23209 + }, + { + "epoch": 0.6882543071494233, + "grad_norm": 0.09816569834947586, + "learning_rate": 0.00022539913169003644, + "loss": 2.6121, + "step": 23210 + }, + { + "epoch": 0.6882839605017347, + "grad_norm": 0.10371209681034088, + "learning_rate": 0.00022535981289758012, + "loss": 2.5842, + "step": 23211 + }, + { + "epoch": 0.6883136138540462, + "grad_norm": 0.09683282673358917, + "learning_rate": 0.00022532049653710973, + "loss": 2.5925, + "step": 23212 + }, + { + "epoch": 0.6883432672063576, + "grad_norm": 0.10681634396314621, + "learning_rate": 0.00022528118260897352, + "loss": 2.6032, + "step": 23213 + }, + { + "epoch": 0.6883729205586692, + "grad_norm": 0.10788025707006454, + "learning_rate": 0.00022524187111351958, + "loss": 2.5714, + "step": 23214 + }, + { + "epoch": 0.6884025739109806, + "grad_norm": 0.09496474266052246, + "learning_rate": 0.00022520256205109602, + "loss": 2.5853, + "step": 23215 + }, + { + "epoch": 0.6884322272632921, + "grad_norm": 0.10043781995773315, + "learning_rate": 0.00022516325542205095, + "loss": 2.6011, + "step": 23216 + }, + { + "epoch": 0.6884618806156035, + "grad_norm": 0.09629779309034348, + "learning_rate": 0.00022512395122673245, + "loss": 2.6225, + "step": 23217 + }, + { + "epoch": 0.6884915339679151, + "grad_norm": 0.10673926770687103, + "learning_rate": 0.00022508464946548857, + "loss": 2.5863, + "step": 23218 + }, + { + "epoch": 0.6885211873202266, + "grad_norm": 0.09273140877485275, + "learning_rate": 0.00022504535013866722, + "loss": 2.5992, + "step": 23219 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 0.10133853554725647, + "learning_rate": 0.00022500605324661654, + "loss": 2.5957, + "step": 23220 + }, + { + "epoch": 0.6885804940248496, + "grad_norm": 0.09292785078287125, + "learning_rate": 0.0002249667587896845, + "loss": 2.58, + "step": 23221 + }, + { + "epoch": 0.688610147377161, + "grad_norm": 0.11127161979675293, + "learning_rate": 0.00022492746676821895, + "loss": 2.6277, + "step": 23222 + }, + { + "epoch": 0.6886398007294725, + "grad_norm": 0.10800132155418396, + "learning_rate": 0.00022488817718256793, + "loss": 2.5847, + "step": 23223 + }, + { + "epoch": 0.6886694540817839, + "grad_norm": 0.10490860044956207, + "learning_rate": 0.00022484889003307934, + "loss": 2.58, + "step": 23224 + }, + { + "epoch": 0.6886991074340955, + "grad_norm": 0.10136806964874268, + "learning_rate": 0.00022480960532010103, + "loss": 2.5724, + "step": 23225 + }, + { + "epoch": 0.6887287607864069, + "grad_norm": 0.10761708766222, + "learning_rate": 0.00022477032304398092, + "loss": 2.5918, + "step": 23226 + }, + { + "epoch": 0.6887584141387184, + "grad_norm": 0.10461663454771042, + "learning_rate": 0.00022473104320506682, + "loss": 2.6109, + "step": 23227 + }, + { + "epoch": 0.6887880674910298, + "grad_norm": 0.10465726256370544, + "learning_rate": 0.0002246917658037066, + "loss": 2.5825, + "step": 23228 + }, + { + "epoch": 0.6888177208433414, + "grad_norm": 0.09488188475370407, + "learning_rate": 0.00022465249084024802, + "loss": 2.5971, + "step": 23229 + }, + { + "epoch": 0.6888473741956528, + "grad_norm": 0.1090768352150917, + "learning_rate": 0.00022461321831503895, + "loss": 2.6011, + "step": 23230 + }, + { + "epoch": 0.6888770275479643, + "grad_norm": 0.08507020026445389, + "learning_rate": 0.00022457394822842725, + "loss": 2.5888, + "step": 23231 + }, + { + "epoch": 0.6889066809002757, + "grad_norm": 0.11146048456430435, + "learning_rate": 0.00022453468058076015, + "loss": 2.5892, + "step": 23232 + }, + { + "epoch": 0.6889363342525873, + "grad_norm": 0.08996928483247757, + "learning_rate": 0.00022449541537238589, + "loss": 2.5549, + "step": 23233 + }, + { + "epoch": 0.6889659876048987, + "grad_norm": 0.08937665075063705, + "learning_rate": 0.00022445615260365204, + "loss": 2.5803, + "step": 23234 + }, + { + "epoch": 0.6889956409572102, + "grad_norm": 0.10106058418750763, + "learning_rate": 0.0002244168922749063, + "loss": 2.5849, + "step": 23235 + }, + { + "epoch": 0.6890252943095216, + "grad_norm": 0.08880747854709625, + "learning_rate": 0.0002243776343864962, + "loss": 2.6011, + "step": 23236 + }, + { + "epoch": 0.6890549476618332, + "grad_norm": 0.09641940146684647, + "learning_rate": 0.00022433837893876953, + "loss": 2.5358, + "step": 23237 + }, + { + "epoch": 0.6890846010141447, + "grad_norm": 0.09172357618808746, + "learning_rate": 0.0002242991259320738, + "loss": 2.5931, + "step": 23238 + }, + { + "epoch": 0.6891142543664561, + "grad_norm": 0.1039312481880188, + "learning_rate": 0.00022425987536675663, + "loss": 2.6334, + "step": 23239 + }, + { + "epoch": 0.6891439077187677, + "grad_norm": 0.09147179126739502, + "learning_rate": 0.0002242206272431656, + "loss": 2.5967, + "step": 23240 + }, + { + "epoch": 0.6891735610710791, + "grad_norm": 0.09660951793193817, + "learning_rate": 0.00022418138156164824, + "loss": 2.6076, + "step": 23241 + }, + { + "epoch": 0.6892032144233906, + "grad_norm": 0.10302799940109253, + "learning_rate": 0.00022414213832255232, + "loss": 2.5774, + "step": 23242 + }, + { + "epoch": 0.689232867775702, + "grad_norm": 0.0918363630771637, + "learning_rate": 0.00022410289752622487, + "loss": 2.5764, + "step": 23243 + }, + { + "epoch": 0.6892625211280136, + "grad_norm": 0.10082481056451797, + "learning_rate": 0.00022406365917301363, + "loss": 2.5862, + "step": 23244 + }, + { + "epoch": 0.689292174480325, + "grad_norm": 0.09739775955677032, + "learning_rate": 0.00022402442326326593, + "loss": 2.5443, + "step": 23245 + }, + { + "epoch": 0.6893218278326365, + "grad_norm": 0.10720457136631012, + "learning_rate": 0.00022398518979732947, + "loss": 2.581, + "step": 23246 + }, + { + "epoch": 0.6893514811849479, + "grad_norm": 0.10802160948514938, + "learning_rate": 0.00022394595877555152, + "loss": 2.6046, + "step": 23247 + }, + { + "epoch": 0.6893811345372595, + "grad_norm": 0.09510496258735657, + "learning_rate": 0.0002239067301982795, + "loss": 2.6123, + "step": 23248 + }, + { + "epoch": 0.6894107878895709, + "grad_norm": 0.09138516336679459, + "learning_rate": 0.00022386750406586076, + "loss": 2.5873, + "step": 23249 + }, + { + "epoch": 0.6894404412418824, + "grad_norm": 0.08731725811958313, + "learning_rate": 0.00022382828037864272, + "loss": 2.5922, + "step": 23250 + }, + { + "epoch": 0.6894700945941938, + "grad_norm": 0.09561340510845184, + "learning_rate": 0.00022378905913697262, + "loss": 2.5704, + "step": 23251 + }, + { + "epoch": 0.6894997479465054, + "grad_norm": 0.08903990685939789, + "learning_rate": 0.000223749840341198, + "loss": 2.5777, + "step": 23252 + }, + { + "epoch": 0.6895294012988168, + "grad_norm": 0.09483412653207779, + "learning_rate": 0.0002237106239916658, + "loss": 2.6063, + "step": 23253 + }, + { + "epoch": 0.6895590546511283, + "grad_norm": 0.10152825713157654, + "learning_rate": 0.00022367141008872344, + "loss": 2.6019, + "step": 23254 + }, + { + "epoch": 0.6895887080034397, + "grad_norm": 0.09356899559497833, + "learning_rate": 0.00022363219863271816, + "loss": 2.5866, + "step": 23255 + }, + { + "epoch": 0.6896183613557513, + "grad_norm": 0.10106606781482697, + "learning_rate": 0.00022359298962399722, + "loss": 2.5917, + "step": 23256 + }, + { + "epoch": 0.6896480147080627, + "grad_norm": 0.11504020541906357, + "learning_rate": 0.00022355378306290774, + "loss": 2.5423, + "step": 23257 + }, + { + "epoch": 0.6896776680603742, + "grad_norm": 0.11077439785003662, + "learning_rate": 0.0002235145789497968, + "loss": 2.5634, + "step": 23258 + }, + { + "epoch": 0.6897073214126858, + "grad_norm": 0.11883527040481567, + "learning_rate": 0.00022347537728501199, + "loss": 2.5881, + "step": 23259 + }, + { + "epoch": 0.6897369747649972, + "grad_norm": 0.09493055939674377, + "learning_rate": 0.0002234361780689001, + "loss": 2.5769, + "step": 23260 + }, + { + "epoch": 0.6897666281173087, + "grad_norm": 0.12477324903011322, + "learning_rate": 0.00022339698130180837, + "loss": 2.5855, + "step": 23261 + }, + { + "epoch": 0.6897962814696201, + "grad_norm": 0.10471511632204056, + "learning_rate": 0.0002233577869840838, + "loss": 2.5791, + "step": 23262 + }, + { + "epoch": 0.6898259348219317, + "grad_norm": 0.10912060737609863, + "learning_rate": 0.0002233185951160737, + "loss": 2.5657, + "step": 23263 + }, + { + "epoch": 0.6898555881742431, + "grad_norm": 0.09269613027572632, + "learning_rate": 0.00022327940569812478, + "loss": 2.6162, + "step": 23264 + }, + { + "epoch": 0.6898852415265546, + "grad_norm": 0.10779938101768494, + "learning_rate": 0.0002232402187305842, + "loss": 2.5769, + "step": 23265 + }, + { + "epoch": 0.689914894878866, + "grad_norm": 0.09724462777376175, + "learning_rate": 0.000223201034213799, + "loss": 2.6264, + "step": 23266 + }, + { + "epoch": 0.6899445482311776, + "grad_norm": 0.09541089087724686, + "learning_rate": 0.00022316185214811614, + "loss": 2.5676, + "step": 23267 + }, + { + "epoch": 0.689974201583489, + "grad_norm": 0.09889770299196243, + "learning_rate": 0.00022312267253388264, + "loss": 2.6344, + "step": 23268 + }, + { + "epoch": 0.6900038549358005, + "grad_norm": 0.10458838939666748, + "learning_rate": 0.00022308349537144534, + "loss": 2.5719, + "step": 23269 + }, + { + "epoch": 0.6900335082881119, + "grad_norm": 0.10771074146032333, + "learning_rate": 0.00022304432066115127, + "loss": 2.5714, + "step": 23270 + }, + { + "epoch": 0.6900631616404235, + "grad_norm": 0.10027630627155304, + "learning_rate": 0.00022300514840334706, + "loss": 2.5816, + "step": 23271 + }, + { + "epoch": 0.6900928149927349, + "grad_norm": 0.11403650045394897, + "learning_rate": 0.00022296597859838003, + "loss": 2.5649, + "step": 23272 + }, + { + "epoch": 0.6901224683450464, + "grad_norm": 0.09676124155521393, + "learning_rate": 0.00022292681124659697, + "loss": 2.5845, + "step": 23273 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 0.12079606205224991, + "learning_rate": 0.00022288764634834435, + "loss": 2.5868, + "step": 23274 + }, + { + "epoch": 0.6901817750496694, + "grad_norm": 0.0955289974808693, + "learning_rate": 0.00022284848390396921, + "loss": 2.5768, + "step": 23275 + }, + { + "epoch": 0.6902114284019808, + "grad_norm": 0.11530248820781708, + "learning_rate": 0.0002228093239138183, + "loss": 2.5668, + "step": 23276 + }, + { + "epoch": 0.6902410817542923, + "grad_norm": 0.1034378632903099, + "learning_rate": 0.00022277016637823843, + "loss": 2.5459, + "step": 23277 + }, + { + "epoch": 0.6902707351066037, + "grad_norm": 0.1129317432641983, + "learning_rate": 0.00022273101129757634, + "loss": 2.5616, + "step": 23278 + }, + { + "epoch": 0.6903003884589153, + "grad_norm": 0.10707971453666687, + "learning_rate": 0.00022269185867217868, + "loss": 2.5327, + "step": 23279 + }, + { + "epoch": 0.6903300418112268, + "grad_norm": 0.11697856336832047, + "learning_rate": 0.00022265270850239228, + "loss": 2.5995, + "step": 23280 + }, + { + "epoch": 0.6903596951635382, + "grad_norm": 0.09447577595710754, + "learning_rate": 0.0002226135607885637, + "loss": 2.6125, + "step": 23281 + }, + { + "epoch": 0.6903893485158498, + "grad_norm": 0.10924120247364044, + "learning_rate": 0.00022257441553103963, + "loss": 2.6321, + "step": 23282 + }, + { + "epoch": 0.6904190018681612, + "grad_norm": 0.1051364541053772, + "learning_rate": 0.00022253527273016676, + "loss": 2.5652, + "step": 23283 + }, + { + "epoch": 0.6904486552204727, + "grad_norm": 0.11737345904111862, + "learning_rate": 0.00022249613238629174, + "loss": 2.5847, + "step": 23284 + }, + { + "epoch": 0.6904783085727841, + "grad_norm": 0.1022961363196373, + "learning_rate": 0.00022245699449976102, + "loss": 2.5739, + "step": 23285 + }, + { + "epoch": 0.6905079619250957, + "grad_norm": 0.10794362425804138, + "learning_rate": 0.00022241785907092126, + "loss": 2.6158, + "step": 23286 + }, + { + "epoch": 0.6905376152774071, + "grad_norm": 0.11050593107938766, + "learning_rate": 0.00022237872610011907, + "loss": 2.5827, + "step": 23287 + }, + { + "epoch": 0.6905672686297186, + "grad_norm": 0.0929642841219902, + "learning_rate": 0.00022233959558770085, + "loss": 2.5604, + "step": 23288 + }, + { + "epoch": 0.69059692198203, + "grad_norm": 0.11458656936883926, + "learning_rate": 0.00022230046753401317, + "loss": 2.5761, + "step": 23289 + }, + { + "epoch": 0.6906265753343416, + "grad_norm": 0.09848017990589142, + "learning_rate": 0.00022226134193940257, + "loss": 2.6074, + "step": 23290 + }, + { + "epoch": 0.690656228686653, + "grad_norm": 0.09966551512479782, + "learning_rate": 0.0002222222188042154, + "loss": 2.5747, + "step": 23291 + }, + { + "epoch": 0.6906858820389645, + "grad_norm": 0.09448873996734619, + "learning_rate": 0.00022218309812879817, + "loss": 2.5513, + "step": 23292 + }, + { + "epoch": 0.690715535391276, + "grad_norm": 0.10564722120761871, + "learning_rate": 0.00022214397991349733, + "loss": 2.605, + "step": 23293 + }, + { + "epoch": 0.6907451887435875, + "grad_norm": 0.09781834483146667, + "learning_rate": 0.00022210486415865922, + "loss": 2.5888, + "step": 23294 + }, + { + "epoch": 0.6907748420958989, + "grad_norm": 0.10892864316701889, + "learning_rate": 0.0002220657508646302, + "loss": 2.6027, + "step": 23295 + }, + { + "epoch": 0.6908044954482104, + "grad_norm": 0.09976469725370407, + "learning_rate": 0.0002220266400317567, + "loss": 2.567, + "step": 23296 + }, + { + "epoch": 0.6908341488005219, + "grad_norm": 0.10370903462171555, + "learning_rate": 0.00022198753166038506, + "loss": 2.5586, + "step": 23297 + }, + { + "epoch": 0.6908638021528334, + "grad_norm": 0.1045202985405922, + "learning_rate": 0.00022194842575086148, + "loss": 2.6126, + "step": 23298 + }, + { + "epoch": 0.6908934555051448, + "grad_norm": 0.09413676708936691, + "learning_rate": 0.00022190932230353234, + "loss": 2.602, + "step": 23299 + }, + { + "epoch": 0.6909231088574563, + "grad_norm": 0.11108455061912537, + "learning_rate": 0.0002218702213187439, + "loss": 2.5721, + "step": 23300 + }, + { + "epoch": 0.6909527622097679, + "grad_norm": 0.10622724890708923, + "learning_rate": 0.00022183112279684235, + "loss": 2.6029, + "step": 23301 + }, + { + "epoch": 0.6909824155620793, + "grad_norm": 0.10454485565423965, + "learning_rate": 0.000221792026738174, + "loss": 2.5657, + "step": 23302 + }, + { + "epoch": 0.6910120689143908, + "grad_norm": 0.10380850732326508, + "learning_rate": 0.00022175293314308497, + "loss": 2.5525, + "step": 23303 + }, + { + "epoch": 0.6910417222667022, + "grad_norm": 0.09596198052167892, + "learning_rate": 0.00022171384201192151, + "loss": 2.5904, + "step": 23304 + }, + { + "epoch": 0.6910713756190138, + "grad_norm": 0.11105331033468246, + "learning_rate": 0.00022167475334502974, + "loss": 2.6356, + "step": 23305 + }, + { + "epoch": 0.6911010289713252, + "grad_norm": 0.10699927806854248, + "learning_rate": 0.00022163566714275578, + "loss": 2.5644, + "step": 23306 + }, + { + "epoch": 0.6911306823236367, + "grad_norm": 0.11804255843162537, + "learning_rate": 0.00022159658340544598, + "loss": 2.591, + "step": 23307 + }, + { + "epoch": 0.6911603356759481, + "grad_norm": 0.1105034351348877, + "learning_rate": 0.000221557502133446, + "loss": 2.5187, + "step": 23308 + }, + { + "epoch": 0.6911899890282597, + "grad_norm": 0.10354247689247131, + "learning_rate": 0.00022151842332710197, + "loss": 2.578, + "step": 23309 + }, + { + "epoch": 0.6912196423805711, + "grad_norm": 0.10230755805969238, + "learning_rate": 0.0002214793469867603, + "loss": 2.6067, + "step": 23310 + }, + { + "epoch": 0.6912492957328826, + "grad_norm": 0.1070307046175003, + "learning_rate": 0.00022144027311276683, + "loss": 2.5515, + "step": 23311 + }, + { + "epoch": 0.691278949085194, + "grad_norm": 0.11339680105447769, + "learning_rate": 0.00022140120170546752, + "loss": 2.6059, + "step": 23312 + }, + { + "epoch": 0.6913086024375056, + "grad_norm": 0.1006256490945816, + "learning_rate": 0.0002213621327652084, + "loss": 2.6083, + "step": 23313 + }, + { + "epoch": 0.691338255789817, + "grad_norm": 0.10474833101034164, + "learning_rate": 0.00022132306629233539, + "loss": 2.5504, + "step": 23314 + }, + { + "epoch": 0.6913679091421285, + "grad_norm": 0.09781502187252045, + "learning_rate": 0.00022128400228719453, + "loss": 2.5988, + "step": 23315 + }, + { + "epoch": 0.69139756249444, + "grad_norm": 0.09996291249990463, + "learning_rate": 0.00022124494075013163, + "loss": 2.5428, + "step": 23316 + }, + { + "epoch": 0.6914272158467515, + "grad_norm": 0.10291806608438492, + "learning_rate": 0.00022120588168149263, + "loss": 2.578, + "step": 23317 + }, + { + "epoch": 0.6914568691990629, + "grad_norm": 0.10004386305809021, + "learning_rate": 0.00022116682508162362, + "loss": 2.58, + "step": 23318 + }, + { + "epoch": 0.6914865225513744, + "grad_norm": 0.10412706434726715, + "learning_rate": 0.00022112777095087, + "loss": 2.5533, + "step": 23319 + }, + { + "epoch": 0.6915161759036859, + "grad_norm": 0.10482881963253021, + "learning_rate": 0.00022108871928957786, + "loss": 2.5931, + "step": 23320 + }, + { + "epoch": 0.6915458292559974, + "grad_norm": 0.10005856305360794, + "learning_rate": 0.00022104967009809297, + "loss": 2.5919, + "step": 23321 + }, + { + "epoch": 0.6915754826083089, + "grad_norm": 0.107405886054039, + "learning_rate": 0.000221010623376761, + "loss": 2.5662, + "step": 23322 + }, + { + "epoch": 0.6916051359606203, + "grad_norm": 0.09479781985282898, + "learning_rate": 0.00022097157912592797, + "loss": 2.581, + "step": 23323 + }, + { + "epoch": 0.6916347893129319, + "grad_norm": 0.10288790613412857, + "learning_rate": 0.00022093253734593955, + "loss": 2.596, + "step": 23324 + }, + { + "epoch": 0.6916644426652433, + "grad_norm": 0.10496409237384796, + "learning_rate": 0.00022089349803714137, + "loss": 2.5546, + "step": 23325 + }, + { + "epoch": 0.6916940960175548, + "grad_norm": 0.08688211441040039, + "learning_rate": 0.0002208544611998792, + "loss": 2.5697, + "step": 23326 + }, + { + "epoch": 0.6917237493698662, + "grad_norm": 0.09107652306556702, + "learning_rate": 0.00022081542683449867, + "loss": 2.5662, + "step": 23327 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 0.09496913105249405, + "learning_rate": 0.00022077639494134566, + "loss": 2.5852, + "step": 23328 + }, + { + "epoch": 0.6917830560744892, + "grad_norm": 0.10009171813726425, + "learning_rate": 0.00022073736552076539, + "loss": 2.6066, + "step": 23329 + }, + { + "epoch": 0.6918127094268007, + "grad_norm": 0.0955422893166542, + "learning_rate": 0.00022069833857310374, + "loss": 2.594, + "step": 23330 + }, + { + "epoch": 0.6918423627791122, + "grad_norm": 0.10175716876983643, + "learning_rate": 0.00022065931409870622, + "loss": 2.5976, + "step": 23331 + }, + { + "epoch": 0.6918720161314237, + "grad_norm": 0.11649920046329498, + "learning_rate": 0.0002206202920979184, + "loss": 2.5844, + "step": 23332 + }, + { + "epoch": 0.6919016694837351, + "grad_norm": 0.09711548686027527, + "learning_rate": 0.00022058127257108584, + "loss": 2.5791, + "step": 23333 + }, + { + "epoch": 0.6919313228360466, + "grad_norm": 0.09727802872657776, + "learning_rate": 0.00022054225551855405, + "loss": 2.5759, + "step": 23334 + }, + { + "epoch": 0.6919609761883581, + "grad_norm": 0.10861510783433914, + "learning_rate": 0.00022050324094066842, + "loss": 2.5448, + "step": 23335 + }, + { + "epoch": 0.6919906295406696, + "grad_norm": 0.09080181270837784, + "learning_rate": 0.0002204642288377748, + "loss": 2.5656, + "step": 23336 + }, + { + "epoch": 0.692020282892981, + "grad_norm": 0.09736625850200653, + "learning_rate": 0.0002204252192102183, + "loss": 2.5887, + "step": 23337 + }, + { + "epoch": 0.6920499362452925, + "grad_norm": 0.09853426367044449, + "learning_rate": 0.00022038621205834453, + "loss": 2.5809, + "step": 23338 + }, + { + "epoch": 0.692079589597604, + "grad_norm": 0.08748432993888855, + "learning_rate": 0.00022034720738249903, + "loss": 2.5827, + "step": 23339 + }, + { + "epoch": 0.6921092429499155, + "grad_norm": 0.10072434693574905, + "learning_rate": 0.00022030820518302685, + "loss": 2.596, + "step": 23340 + }, + { + "epoch": 0.6921388963022269, + "grad_norm": 0.0989924967288971, + "learning_rate": 0.0002202692054602735, + "loss": 2.5917, + "step": 23341 + }, + { + "epoch": 0.6921685496545384, + "grad_norm": 0.09346270561218262, + "learning_rate": 0.00022023020821458432, + "loss": 2.5805, + "step": 23342 + }, + { + "epoch": 0.69219820300685, + "grad_norm": 0.101913683116436, + "learning_rate": 0.00022019121344630472, + "loss": 2.6062, + "step": 23343 + }, + { + "epoch": 0.6922278563591614, + "grad_norm": 0.10002894699573517, + "learning_rate": 0.00022015222115577993, + "loss": 2.5762, + "step": 23344 + }, + { + "epoch": 0.6922575097114729, + "grad_norm": 0.09555372595787048, + "learning_rate": 0.00022011323134335525, + "loss": 2.5929, + "step": 23345 + }, + { + "epoch": 0.6922871630637843, + "grad_norm": 0.1014479547739029, + "learning_rate": 0.00022007424400937597, + "loss": 2.5837, + "step": 23346 + }, + { + "epoch": 0.6923168164160959, + "grad_norm": 0.11492804437875748, + "learning_rate": 0.00022003525915418733, + "loss": 2.6083, + "step": 23347 + }, + { + "epoch": 0.6923464697684073, + "grad_norm": 0.10210546106100082, + "learning_rate": 0.00021999627677813426, + "loss": 2.5911, + "step": 23348 + }, + { + "epoch": 0.6923761231207188, + "grad_norm": 0.10543201118707657, + "learning_rate": 0.00021995729688156264, + "loss": 2.5892, + "step": 23349 + }, + { + "epoch": 0.6924057764730303, + "grad_norm": 0.1051689088344574, + "learning_rate": 0.00021991831946481706, + "loss": 2.5807, + "step": 23350 + }, + { + "epoch": 0.6924354298253418, + "grad_norm": 0.10647020488977432, + "learning_rate": 0.00021987934452824282, + "loss": 2.6093, + "step": 23351 + }, + { + "epoch": 0.6924650831776532, + "grad_norm": 0.09982873499393463, + "learning_rate": 0.00021984037207218506, + "loss": 2.5806, + "step": 23352 + }, + { + "epoch": 0.6924947365299647, + "grad_norm": 0.10097519308328629, + "learning_rate": 0.0002198014020969889, + "loss": 2.5961, + "step": 23353 + }, + { + "epoch": 0.6925243898822762, + "grad_norm": 0.10499858111143112, + "learning_rate": 0.0002197624346029994, + "loss": 2.5625, + "step": 23354 + }, + { + "epoch": 0.6925540432345877, + "grad_norm": 0.09729132801294327, + "learning_rate": 0.00021972346959056168, + "loss": 2.6189, + "step": 23355 + }, + { + "epoch": 0.6925836965868991, + "grad_norm": 0.09903573244810104, + "learning_rate": 0.0002196845070600207, + "loss": 2.5752, + "step": 23356 + }, + { + "epoch": 0.6926133499392106, + "grad_norm": 0.08922035247087479, + "learning_rate": 0.0002196455470117215, + "loss": 2.5558, + "step": 23357 + }, + { + "epoch": 0.6926430032915221, + "grad_norm": 0.1049145981669426, + "learning_rate": 0.00021960658944600919, + "loss": 2.5753, + "step": 23358 + }, + { + "epoch": 0.6926726566438336, + "grad_norm": 0.08604810386896133, + "learning_rate": 0.00021956763436322863, + "loss": 2.5888, + "step": 23359 + }, + { + "epoch": 0.692702309996145, + "grad_norm": 0.09976980835199356, + "learning_rate": 0.00021952868176372477, + "loss": 2.5559, + "step": 23360 + }, + { + "epoch": 0.6927319633484565, + "grad_norm": 0.09237531572580338, + "learning_rate": 0.00021948973164784258, + "loss": 2.5846, + "step": 23361 + }, + { + "epoch": 0.692761616700768, + "grad_norm": 0.10461558401584625, + "learning_rate": 0.000219450784015927, + "loss": 2.6252, + "step": 23362 + }, + { + "epoch": 0.6927912700530795, + "grad_norm": 0.10338771343231201, + "learning_rate": 0.0002194118388683229, + "loss": 2.5505, + "step": 23363 + }, + { + "epoch": 0.692820923405391, + "grad_norm": 0.09625531733036041, + "learning_rate": 0.0002193728962053751, + "loss": 2.6233, + "step": 23364 + }, + { + "epoch": 0.6928505767577025, + "grad_norm": 0.10903558135032654, + "learning_rate": 0.00021933395602742844, + "loss": 2.5667, + "step": 23365 + }, + { + "epoch": 0.692880230110014, + "grad_norm": 0.09519852697849274, + "learning_rate": 0.0002192950183348278, + "loss": 2.5778, + "step": 23366 + }, + { + "epoch": 0.6929098834623254, + "grad_norm": 0.10337422788143158, + "learning_rate": 0.00021925608312791794, + "loss": 2.6114, + "step": 23367 + }, + { + "epoch": 0.6929395368146369, + "grad_norm": 0.10313696414232254, + "learning_rate": 0.0002192171504070437, + "loss": 2.5827, + "step": 23368 + }, + { + "epoch": 0.6929691901669484, + "grad_norm": 0.121028371155262, + "learning_rate": 0.00021917822017254978, + "loss": 2.6093, + "step": 23369 + }, + { + "epoch": 0.6929988435192599, + "grad_norm": 0.11145712435245514, + "learning_rate": 0.00021913929242478087, + "loss": 2.566, + "step": 23370 + }, + { + "epoch": 0.6930284968715713, + "grad_norm": 0.10080195218324661, + "learning_rate": 0.00021910036716408176, + "loss": 2.5807, + "step": 23371 + }, + { + "epoch": 0.6930581502238828, + "grad_norm": 0.10765004903078079, + "learning_rate": 0.00021906144439079716, + "loss": 2.5898, + "step": 23372 + }, + { + "epoch": 0.6930878035761943, + "grad_norm": 0.10500722378492355, + "learning_rate": 0.00021902252410527185, + "loss": 2.6152, + "step": 23373 + }, + { + "epoch": 0.6931174569285058, + "grad_norm": 0.09541740268468857, + "learning_rate": 0.00021898360630784991, + "loss": 2.5921, + "step": 23374 + }, + { + "epoch": 0.6931471102808172, + "grad_norm": 0.09943270683288574, + "learning_rate": 0.0002189446909988766, + "loss": 2.5499, + "step": 23375 + }, + { + "epoch": 0.6931767636331287, + "grad_norm": 0.10045922547578812, + "learning_rate": 0.0002189057781786963, + "loss": 2.5529, + "step": 23376 + }, + { + "epoch": 0.6932064169854402, + "grad_norm": 0.10434068739414215, + "learning_rate": 0.00021886686784765354, + "loss": 2.5836, + "step": 23377 + }, + { + "epoch": 0.6932360703377517, + "grad_norm": 0.10752362757921219, + "learning_rate": 0.00021882796000609296, + "loss": 2.5457, + "step": 23378 + }, + { + "epoch": 0.6932657236900631, + "grad_norm": 0.09662674367427826, + "learning_rate": 0.00021878905465435905, + "loss": 2.5935, + "step": 23379 + }, + { + "epoch": 0.6932953770423746, + "grad_norm": 0.10927026718854904, + "learning_rate": 0.00021875015179279627, + "loss": 2.5956, + "step": 23380 + }, + { + "epoch": 0.6933250303946861, + "grad_norm": 0.09885934740304947, + "learning_rate": 0.00021871125142174924, + "loss": 2.6226, + "step": 23381 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 0.10478315502405167, + "learning_rate": 0.00021867235354156234, + "loss": 2.5906, + "step": 23382 + }, + { + "epoch": 0.693384337099309, + "grad_norm": 0.09774193167686462, + "learning_rate": 0.00021863345815258006, + "loss": 2.549, + "step": 23383 + }, + { + "epoch": 0.6934139904516206, + "grad_norm": 0.10225481539964676, + "learning_rate": 0.00021859456525514698, + "loss": 2.5972, + "step": 23384 + }, + { + "epoch": 0.6934436438039321, + "grad_norm": 0.11477799713611603, + "learning_rate": 0.00021855567484960716, + "loss": 2.5955, + "step": 23385 + }, + { + "epoch": 0.6934732971562435, + "grad_norm": 0.10256399214267731, + "learning_rate": 0.0002185167869363051, + "loss": 2.5561, + "step": 23386 + }, + { + "epoch": 0.693502950508555, + "grad_norm": 0.10479653626680374, + "learning_rate": 0.00021847790151558505, + "loss": 2.5487, + "step": 23387 + }, + { + "epoch": 0.6935326038608665, + "grad_norm": 0.10614316910505295, + "learning_rate": 0.0002184390185877917, + "loss": 2.5891, + "step": 23388 + }, + { + "epoch": 0.693562257213178, + "grad_norm": 0.10110669583082199, + "learning_rate": 0.00021840013815326915, + "loss": 2.5744, + "step": 23389 + }, + { + "epoch": 0.6935919105654894, + "grad_norm": 0.09320105612277985, + "learning_rate": 0.0002183612602123617, + "loss": 2.6183, + "step": 23390 + }, + { + "epoch": 0.6936215639178009, + "grad_norm": 0.103060282766819, + "learning_rate": 0.00021832238476541366, + "loss": 2.5792, + "step": 23391 + }, + { + "epoch": 0.6936512172701124, + "grad_norm": 0.09663552790880203, + "learning_rate": 0.00021828351181276922, + "loss": 2.5838, + "step": 23392 + }, + { + "epoch": 0.6936808706224239, + "grad_norm": 0.09299343079328537, + "learning_rate": 0.00021824464135477268, + "loss": 2.5705, + "step": 23393 + }, + { + "epoch": 0.6937105239747353, + "grad_norm": 0.09747537225484848, + "learning_rate": 0.0002182057733917684, + "loss": 2.6146, + "step": 23394 + }, + { + "epoch": 0.6937401773270468, + "grad_norm": 0.09580762684345245, + "learning_rate": 0.0002181669079241001, + "loss": 2.6221, + "step": 23395 + }, + { + "epoch": 0.6937698306793583, + "grad_norm": 0.10474912822246552, + "learning_rate": 0.00021812804495211231, + "loss": 2.6082, + "step": 23396 + }, + { + "epoch": 0.6937994840316698, + "grad_norm": 0.10042140632867813, + "learning_rate": 0.00021808918447614902, + "loss": 2.6133, + "step": 23397 + }, + { + "epoch": 0.6938291373839812, + "grad_norm": 0.10244983434677124, + "learning_rate": 0.00021805032649655436, + "loss": 2.5624, + "step": 23398 + }, + { + "epoch": 0.6938587907362928, + "grad_norm": 0.09565524756908417, + "learning_rate": 0.00021801147101367248, + "loss": 2.5835, + "step": 23399 + }, + { + "epoch": 0.6938884440886042, + "grad_norm": 0.08940941840410233, + "learning_rate": 0.00021797261802784725, + "loss": 2.6047, + "step": 23400 + }, + { + "epoch": 0.6939180974409157, + "grad_norm": 0.11154299974441528, + "learning_rate": 0.00021793376753942307, + "loss": 2.5964, + "step": 23401 + }, + { + "epoch": 0.6939477507932271, + "grad_norm": 0.10166268795728683, + "learning_rate": 0.0002178949195487438, + "loss": 2.5799, + "step": 23402 + }, + { + "epoch": 0.6939774041455387, + "grad_norm": 0.09351127594709396, + "learning_rate": 0.00021785607405615343, + "loss": 2.5958, + "step": 23403 + }, + { + "epoch": 0.6940070574978501, + "grad_norm": 0.10764729231595993, + "learning_rate": 0.00021781723106199615, + "loss": 2.5592, + "step": 23404 + }, + { + "epoch": 0.6940367108501616, + "grad_norm": 0.10474539548158646, + "learning_rate": 0.00021777839056661552, + "loss": 2.5937, + "step": 23405 + }, + { + "epoch": 0.6940663642024731, + "grad_norm": 0.1099478229880333, + "learning_rate": 0.0002177395525703557, + "loss": 2.5884, + "step": 23406 + }, + { + "epoch": 0.6940960175547846, + "grad_norm": 0.09108500927686691, + "learning_rate": 0.00021770071707356058, + "loss": 2.581, + "step": 23407 + }, + { + "epoch": 0.6941256709070961, + "grad_norm": 0.10646521300077438, + "learning_rate": 0.00021766188407657406, + "loss": 2.5516, + "step": 23408 + }, + { + "epoch": 0.6941553242594075, + "grad_norm": 0.10636548697948456, + "learning_rate": 0.00021762305357974005, + "loss": 2.5815, + "step": 23409 + }, + { + "epoch": 0.694184977611719, + "grad_norm": 0.08991283178329468, + "learning_rate": 0.00021758422558340235, + "loss": 2.5586, + "step": 23410 + }, + { + "epoch": 0.6942146309640305, + "grad_norm": 0.08980374783277512, + "learning_rate": 0.00021754540008790484, + "loss": 2.5697, + "step": 23411 + }, + { + "epoch": 0.694244284316342, + "grad_norm": 0.09338288754224777, + "learning_rate": 0.00021750657709359127, + "loss": 2.5648, + "step": 23412 + }, + { + "epoch": 0.6942739376686534, + "grad_norm": 0.10016138106584549, + "learning_rate": 0.00021746775660080525, + "loss": 2.612, + "step": 23413 + }, + { + "epoch": 0.694303591020965, + "grad_norm": 0.10515288263559341, + "learning_rate": 0.00021742893860989093, + "loss": 2.5922, + "step": 23414 + }, + { + "epoch": 0.6943332443732764, + "grad_norm": 0.08991410583257675, + "learning_rate": 0.00021739012312119207, + "loss": 2.5529, + "step": 23415 + }, + { + "epoch": 0.6943628977255879, + "grad_norm": 0.10838736593723297, + "learning_rate": 0.00021735131013505198, + "loss": 2.5892, + "step": 23416 + }, + { + "epoch": 0.6943925510778993, + "grad_norm": 0.09324055910110474, + "learning_rate": 0.00021731249965181455, + "loss": 2.56, + "step": 23417 + }, + { + "epoch": 0.6944222044302109, + "grad_norm": 0.10668040066957474, + "learning_rate": 0.00021727369167182347, + "loss": 2.5742, + "step": 23418 + }, + { + "epoch": 0.6944518577825223, + "grad_norm": 0.10735748708248138, + "learning_rate": 0.00021723488619542237, + "loss": 2.5979, + "step": 23419 + }, + { + "epoch": 0.6944815111348338, + "grad_norm": 0.09921902418136597, + "learning_rate": 0.00021719608322295493, + "loss": 2.5804, + "step": 23420 + }, + { + "epoch": 0.6945111644871452, + "grad_norm": 0.10947654396295547, + "learning_rate": 0.00021715728275476464, + "loss": 2.5833, + "step": 23421 + }, + { + "epoch": 0.6945408178394568, + "grad_norm": 0.0955362319946289, + "learning_rate": 0.00021711848479119523, + "loss": 2.5957, + "step": 23422 + }, + { + "epoch": 0.6945704711917682, + "grad_norm": 0.10028194636106491, + "learning_rate": 0.00021707968933259015, + "loss": 2.5931, + "step": 23423 + }, + { + "epoch": 0.6946001245440797, + "grad_norm": 0.10058518499135971, + "learning_rate": 0.00021704089637929297, + "loss": 2.5893, + "step": 23424 + }, + { + "epoch": 0.6946297778963911, + "grad_norm": 0.09671647101640701, + "learning_rate": 0.00021700210593164726, + "loss": 2.6251, + "step": 23425 + }, + { + "epoch": 0.6946594312487027, + "grad_norm": 0.10551842302083969, + "learning_rate": 0.00021696331798999648, + "loss": 2.5822, + "step": 23426 + }, + { + "epoch": 0.6946890846010142, + "grad_norm": 0.09386103600263596, + "learning_rate": 0.0002169245325546841, + "loss": 2.6054, + "step": 23427 + }, + { + "epoch": 0.6947187379533256, + "grad_norm": 0.09485294669866562, + "learning_rate": 0.0002168857496260535, + "loss": 2.5986, + "step": 23428 + }, + { + "epoch": 0.6947483913056371, + "grad_norm": 0.0947047621011734, + "learning_rate": 0.0002168469692044483, + "loss": 2.5851, + "step": 23429 + }, + { + "epoch": 0.6947780446579486, + "grad_norm": 0.10361006110906601, + "learning_rate": 0.00021680819129021173, + "loss": 2.5943, + "step": 23430 + }, + { + "epoch": 0.6948076980102601, + "grad_norm": 0.10257230699062347, + "learning_rate": 0.0002167694158836872, + "loss": 2.5854, + "step": 23431 + }, + { + "epoch": 0.6948373513625715, + "grad_norm": 0.09703881293535233, + "learning_rate": 0.00021673064298521815, + "loss": 2.5687, + "step": 23432 + }, + { + "epoch": 0.694867004714883, + "grad_norm": 0.09886428713798523, + "learning_rate": 0.0002166918725951479, + "loss": 2.5354, + "step": 23433 + }, + { + "epoch": 0.6948966580671945, + "grad_norm": 0.10778346657752991, + "learning_rate": 0.00021665310471381973, + "loss": 2.6164, + "step": 23434 + }, + { + "epoch": 0.694926311419506, + "grad_norm": 0.09603622555732727, + "learning_rate": 0.00021661433934157693, + "loss": 2.5924, + "step": 23435 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 0.10118774324655533, + "learning_rate": 0.0002165755764787628, + "loss": 2.6228, + "step": 23436 + }, + { + "epoch": 0.694985618124129, + "grad_norm": 0.1051597073674202, + "learning_rate": 0.00021653681612572064, + "loss": 2.5977, + "step": 23437 + }, + { + "epoch": 0.6950152714764404, + "grad_norm": 0.10345710813999176, + "learning_rate": 0.00021649805828279358, + "loss": 2.5818, + "step": 23438 + }, + { + "epoch": 0.6950449248287519, + "grad_norm": 0.09390953183174133, + "learning_rate": 0.0002164593029503249, + "loss": 2.5727, + "step": 23439 + }, + { + "epoch": 0.6950745781810633, + "grad_norm": 0.0974838063120842, + "learning_rate": 0.00021642055012865773, + "loss": 2.5663, + "step": 23440 + }, + { + "epoch": 0.6951042315333749, + "grad_norm": 0.10846444964408875, + "learning_rate": 0.00021638179981813528, + "loss": 2.6036, + "step": 23441 + }, + { + "epoch": 0.6951338848856863, + "grad_norm": 0.09944786131381989, + "learning_rate": 0.0002163430520191007, + "loss": 2.592, + "step": 23442 + }, + { + "epoch": 0.6951635382379978, + "grad_norm": 0.10708030313253403, + "learning_rate": 0.00021630430673189705, + "loss": 2.5801, + "step": 23443 + }, + { + "epoch": 0.6951931915903092, + "grad_norm": 0.11253681778907776, + "learning_rate": 0.00021626556395686747, + "loss": 2.6015, + "step": 23444 + }, + { + "epoch": 0.6952228449426208, + "grad_norm": 0.09102161973714828, + "learning_rate": 0.00021622682369435505, + "loss": 2.5839, + "step": 23445 + }, + { + "epoch": 0.6952524982949323, + "grad_norm": 0.11437150835990906, + "learning_rate": 0.0002161880859447028, + "loss": 2.5886, + "step": 23446 + }, + { + "epoch": 0.6952821516472437, + "grad_norm": 0.11342314630746841, + "learning_rate": 0.00021614935070825376, + "loss": 2.6192, + "step": 23447 + }, + { + "epoch": 0.6953118049995552, + "grad_norm": 0.09977376461029053, + "learning_rate": 0.00021611061798535092, + "loss": 2.5504, + "step": 23448 + }, + { + "epoch": 0.6953414583518667, + "grad_norm": 0.10985807329416275, + "learning_rate": 0.00021607188777633752, + "loss": 2.5907, + "step": 23449 + }, + { + "epoch": 0.6953711117041782, + "grad_norm": 0.09822751581668854, + "learning_rate": 0.00021603316008155605, + "loss": 2.5417, + "step": 23450 + }, + { + "epoch": 0.6954007650564896, + "grad_norm": 0.10465718805789948, + "learning_rate": 0.00021599443490134975, + "loss": 2.5575, + "step": 23451 + }, + { + "epoch": 0.6954304184088012, + "grad_norm": 0.09529207646846771, + "learning_rate": 0.0002159557122360612, + "loss": 2.6066, + "step": 23452 + }, + { + "epoch": 0.6954600717611126, + "grad_norm": 0.09616398066282272, + "learning_rate": 0.00021591699208603382, + "loss": 2.5943, + "step": 23453 + }, + { + "epoch": 0.6954897251134241, + "grad_norm": 0.09569235891103745, + "learning_rate": 0.00021587827445161022, + "loss": 2.5938, + "step": 23454 + }, + { + "epoch": 0.6955193784657355, + "grad_norm": 0.09421908855438232, + "learning_rate": 0.0002158395593331333, + "loss": 2.5722, + "step": 23455 + }, + { + "epoch": 0.6955490318180471, + "grad_norm": 0.10566753894090652, + "learning_rate": 0.00021580084673094584, + "loss": 2.5919, + "step": 23456 + }, + { + "epoch": 0.6955786851703585, + "grad_norm": 0.09327124059200287, + "learning_rate": 0.00021576213664539068, + "loss": 2.6331, + "step": 23457 + }, + { + "epoch": 0.69560833852267, + "grad_norm": 0.11045411229133606, + "learning_rate": 0.0002157234290768106, + "loss": 2.636, + "step": 23458 + }, + { + "epoch": 0.6956379918749814, + "grad_norm": 0.09856033325195312, + "learning_rate": 0.0002156847240255483, + "loss": 2.6046, + "step": 23459 + }, + { + "epoch": 0.695667645227293, + "grad_norm": 0.10423186421394348, + "learning_rate": 0.0002156460214919468, + "loss": 2.5927, + "step": 23460 + }, + { + "epoch": 0.6956972985796044, + "grad_norm": 0.09846053272485733, + "learning_rate": 0.00021560732147634837, + "loss": 2.5942, + "step": 23461 + }, + { + "epoch": 0.6957269519319159, + "grad_norm": 0.09822485595941544, + "learning_rate": 0.00021556862397909593, + "loss": 2.5486, + "step": 23462 + }, + { + "epoch": 0.6957566052842273, + "grad_norm": 0.08989689499139786, + "learning_rate": 0.00021552992900053213, + "loss": 2.5792, + "step": 23463 + }, + { + "epoch": 0.6957862586365389, + "grad_norm": 0.09955987334251404, + "learning_rate": 0.00021549123654099968, + "loss": 2.607, + "step": 23464 + }, + { + "epoch": 0.6958159119888503, + "grad_norm": 0.10116146504878998, + "learning_rate": 0.00021545254660084096, + "loss": 2.612, + "step": 23465 + }, + { + "epoch": 0.6958455653411618, + "grad_norm": 0.09470920264720917, + "learning_rate": 0.00021541385918039897, + "loss": 2.5657, + "step": 23466 + }, + { + "epoch": 0.6958752186934734, + "grad_norm": 0.10315898060798645, + "learning_rate": 0.00021537517428001614, + "loss": 2.5633, + "step": 23467 + }, + { + "epoch": 0.6959048720457848, + "grad_norm": 0.09138283133506775, + "learning_rate": 0.0002153364919000349, + "loss": 2.5978, + "step": 23468 + }, + { + "epoch": 0.6959345253980963, + "grad_norm": 0.09931250661611557, + "learning_rate": 0.00021529781204079795, + "loss": 2.5608, + "step": 23469 + }, + { + "epoch": 0.6959641787504077, + "grad_norm": 0.08576098829507828, + "learning_rate": 0.00021525913470264797, + "loss": 2.5756, + "step": 23470 + }, + { + "epoch": 0.6959938321027193, + "grad_norm": 0.10359574854373932, + "learning_rate": 0.000215220459885927, + "loss": 2.5756, + "step": 23471 + }, + { + "epoch": 0.6960234854550307, + "grad_norm": 0.08407346159219742, + "learning_rate": 0.00021518178759097773, + "loss": 2.567, + "step": 23472 + }, + { + "epoch": 0.6960531388073422, + "grad_norm": 0.10879731178283691, + "learning_rate": 0.0002151431178181426, + "loss": 2.5721, + "step": 23473 + }, + { + "epoch": 0.6960827921596536, + "grad_norm": 0.09392384439706802, + "learning_rate": 0.00021510445056776407, + "loss": 2.5897, + "step": 23474 + }, + { + "epoch": 0.6961124455119652, + "grad_norm": 0.1226295605301857, + "learning_rate": 0.00021506578584018455, + "loss": 2.6071, + "step": 23475 + }, + { + "epoch": 0.6961420988642766, + "grad_norm": 0.10302985459566116, + "learning_rate": 0.0002150271236357464, + "loss": 2.5878, + "step": 23476 + }, + { + "epoch": 0.6961717522165881, + "grad_norm": 0.1021229550242424, + "learning_rate": 0.00021498846395479198, + "loss": 2.5791, + "step": 23477 + }, + { + "epoch": 0.6962014055688995, + "grad_norm": 0.11233489215373993, + "learning_rate": 0.00021494980679766346, + "loss": 2.5751, + "step": 23478 + }, + { + "epoch": 0.6962310589212111, + "grad_norm": 0.10022997856140137, + "learning_rate": 0.00021491115216470353, + "loss": 2.5857, + "step": 23479 + }, + { + "epoch": 0.6962607122735225, + "grad_norm": 0.12076768279075623, + "learning_rate": 0.00021487250005625442, + "loss": 2.6088, + "step": 23480 + }, + { + "epoch": 0.696290365625834, + "grad_norm": 0.09785038977861404, + "learning_rate": 0.00021483385047265814, + "loss": 2.5759, + "step": 23481 + }, + { + "epoch": 0.6963200189781454, + "grad_norm": 0.12138565629720688, + "learning_rate": 0.000214795203414257, + "loss": 2.5595, + "step": 23482 + }, + { + "epoch": 0.696349672330457, + "grad_norm": 0.09749618917703629, + "learning_rate": 0.00021475655888139334, + "loss": 2.588, + "step": 23483 + }, + { + "epoch": 0.6963793256827684, + "grad_norm": 0.11124671995639801, + "learning_rate": 0.00021471791687440928, + "loss": 2.5682, + "step": 23484 + }, + { + "epoch": 0.6964089790350799, + "grad_norm": 0.09481917321681976, + "learning_rate": 0.00021467927739364702, + "loss": 2.5725, + "step": 23485 + }, + { + "epoch": 0.6964386323873913, + "grad_norm": 0.11566115915775299, + "learning_rate": 0.00021464064043944875, + "loss": 2.6002, + "step": 23486 + }, + { + "epoch": 0.6964682857397029, + "grad_norm": 0.09976369142532349, + "learning_rate": 0.00021460200601215658, + "loss": 2.5568, + "step": 23487 + }, + { + "epoch": 0.6964979390920144, + "grad_norm": 0.10780713707208633, + "learning_rate": 0.00021456337411211268, + "loss": 2.5886, + "step": 23488 + }, + { + "epoch": 0.6965275924443258, + "grad_norm": 0.09659069776535034, + "learning_rate": 0.00021452474473965906, + "loss": 2.6042, + "step": 23489 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 0.1022278219461441, + "learning_rate": 0.00021448611789513767, + "loss": 2.6078, + "step": 23490 + }, + { + "epoch": 0.6965868991489488, + "grad_norm": 0.0862179845571518, + "learning_rate": 0.00021444749357889104, + "loss": 2.5928, + "step": 23491 + }, + { + "epoch": 0.6966165525012603, + "grad_norm": 0.09844324737787247, + "learning_rate": 0.00021440887179126068, + "loss": 2.5886, + "step": 23492 + }, + { + "epoch": 0.6966462058535717, + "grad_norm": 0.09066429734230042, + "learning_rate": 0.0002143702525325888, + "loss": 2.5973, + "step": 23493 + }, + { + "epoch": 0.6966758592058833, + "grad_norm": 0.09615874290466309, + "learning_rate": 0.0002143316358032174, + "loss": 2.6039, + "step": 23494 + }, + { + "epoch": 0.6967055125581947, + "grad_norm": 0.10330694913864136, + "learning_rate": 0.00021429302160348834, + "loss": 2.6125, + "step": 23495 + }, + { + "epoch": 0.6967351659105062, + "grad_norm": 0.08730742335319519, + "learning_rate": 0.00021425440993374367, + "loss": 2.5341, + "step": 23496 + }, + { + "epoch": 0.6967648192628176, + "grad_norm": 0.09577508270740509, + "learning_rate": 0.0002142158007943252, + "loss": 2.5924, + "step": 23497 + }, + { + "epoch": 0.6967944726151292, + "grad_norm": 0.09606233984231949, + "learning_rate": 0.00021417719418557492, + "loss": 2.5869, + "step": 23498 + }, + { + "epoch": 0.6968241259674406, + "grad_norm": 0.08603270351886749, + "learning_rate": 0.0002141385901078346, + "loss": 2.5717, + "step": 23499 + }, + { + "epoch": 0.6968537793197521, + "grad_norm": 0.08989644795656204, + "learning_rate": 0.00021409998856144615, + "loss": 2.6038, + "step": 23500 + }, + { + "epoch": 0.6968834326720635, + "grad_norm": 0.09015519171953201, + "learning_rate": 0.00021406138954675136, + "loss": 2.594, + "step": 23501 + }, + { + "epoch": 0.6969130860243751, + "grad_norm": 0.0858294665813446, + "learning_rate": 0.00021402279306409206, + "loss": 2.5991, + "step": 23502 + }, + { + "epoch": 0.6969427393766865, + "grad_norm": 0.10627111792564392, + "learning_rate": 0.00021398419911381, + "loss": 2.5831, + "step": 23503 + }, + { + "epoch": 0.696972392728998, + "grad_norm": 0.10027817636728287, + "learning_rate": 0.00021394560769624695, + "loss": 2.5808, + "step": 23504 + }, + { + "epoch": 0.6970020460813094, + "grad_norm": 0.10049968957901001, + "learning_rate": 0.0002139070188117447, + "loss": 2.6123, + "step": 23505 + }, + { + "epoch": 0.697031699433621, + "grad_norm": 0.10142470896244049, + "learning_rate": 0.00021386843246064486, + "loss": 2.6139, + "step": 23506 + }, + { + "epoch": 0.6970613527859324, + "grad_norm": 0.09440693259239197, + "learning_rate": 0.00021382984864328915, + "loss": 2.5926, + "step": 23507 + }, + { + "epoch": 0.6970910061382439, + "grad_norm": 0.10227787494659424, + "learning_rate": 0.00021379126736001926, + "loss": 2.5698, + "step": 23508 + }, + { + "epoch": 0.6971206594905555, + "grad_norm": 0.10467097908258438, + "learning_rate": 0.0002137526886111768, + "loss": 2.604, + "step": 23509 + }, + { + "epoch": 0.6971503128428669, + "grad_norm": 0.09700801968574524, + "learning_rate": 0.00021371411239710347, + "loss": 2.5564, + "step": 23510 + }, + { + "epoch": 0.6971799661951784, + "grad_norm": 0.09519834071397781, + "learning_rate": 0.00021367553871814082, + "loss": 2.5936, + "step": 23511 + }, + { + "epoch": 0.6972096195474898, + "grad_norm": 0.10161690413951874, + "learning_rate": 0.00021363696757463035, + "loss": 2.6047, + "step": 23512 + }, + { + "epoch": 0.6972392728998014, + "grad_norm": 0.08308887481689453, + "learning_rate": 0.00021359839896691374, + "loss": 2.5924, + "step": 23513 + }, + { + "epoch": 0.6972689262521128, + "grad_norm": 0.09717999398708344, + "learning_rate": 0.00021355983289533248, + "loss": 2.5846, + "step": 23514 + }, + { + "epoch": 0.6972985796044243, + "grad_norm": 0.08448558300733566, + "learning_rate": 0.0002135212693602282, + "loss": 2.583, + "step": 23515 + }, + { + "epoch": 0.6973282329567357, + "grad_norm": 0.09237919002771378, + "learning_rate": 0.00021348270836194194, + "loss": 2.5522, + "step": 23516 + }, + { + "epoch": 0.6973578863090473, + "grad_norm": 0.09641916304826736, + "learning_rate": 0.0002134441499008156, + "loss": 2.5711, + "step": 23517 + }, + { + "epoch": 0.6973875396613587, + "grad_norm": 0.0896209105849266, + "learning_rate": 0.00021340559397719055, + "loss": 2.6148, + "step": 23518 + }, + { + "epoch": 0.6974171930136702, + "grad_norm": 0.0976729765534401, + "learning_rate": 0.00021336704059140817, + "loss": 2.579, + "step": 23519 + }, + { + "epoch": 0.6974468463659816, + "grad_norm": 0.09178119152784348, + "learning_rate": 0.0002133284897438098, + "loss": 2.5938, + "step": 23520 + }, + { + "epoch": 0.6974764997182932, + "grad_norm": 0.08943423628807068, + "learning_rate": 0.00021328994143473691, + "loss": 2.6008, + "step": 23521 + }, + { + "epoch": 0.6975061530706046, + "grad_norm": 0.08297950029373169, + "learning_rate": 0.00021325139566453078, + "loss": 2.6278, + "step": 23522 + }, + { + "epoch": 0.6975358064229161, + "grad_norm": 0.08834080398082733, + "learning_rate": 0.00021321285243353273, + "loss": 2.5529, + "step": 23523 + }, + { + "epoch": 0.6975654597752275, + "grad_norm": 0.09400256723165512, + "learning_rate": 0.00021317431174208414, + "loss": 2.5947, + "step": 23524 + }, + { + "epoch": 0.6975951131275391, + "grad_norm": 0.09361403435468674, + "learning_rate": 0.0002131357735905264, + "loss": 2.5987, + "step": 23525 + }, + { + "epoch": 0.6976247664798505, + "grad_norm": 0.09893859177827835, + "learning_rate": 0.00021309723797920043, + "loss": 2.5914, + "step": 23526 + }, + { + "epoch": 0.697654419832162, + "grad_norm": 0.09045186638832092, + "learning_rate": 0.00021305870490844769, + "loss": 2.5741, + "step": 23527 + }, + { + "epoch": 0.6976840731844735, + "grad_norm": 0.10179567337036133, + "learning_rate": 0.0002130201743786094, + "loss": 2.5978, + "step": 23528 + }, + { + "epoch": 0.697713726536785, + "grad_norm": 0.10318366438150406, + "learning_rate": 0.0002129816463900265, + "loss": 2.5864, + "step": 23529 + }, + { + "epoch": 0.6977433798890965, + "grad_norm": 0.11586906015872955, + "learning_rate": 0.00021294312094304057, + "loss": 2.6011, + "step": 23530 + }, + { + "epoch": 0.6977730332414079, + "grad_norm": 0.08521836996078491, + "learning_rate": 0.00021290459803799261, + "loss": 2.5521, + "step": 23531 + }, + { + "epoch": 0.6978026865937195, + "grad_norm": 0.09889724105596542, + "learning_rate": 0.0002128660776752237, + "loss": 2.5714, + "step": 23532 + }, + { + "epoch": 0.6978323399460309, + "grad_norm": 0.08146417886018753, + "learning_rate": 0.00021282755985507497, + "loss": 2.5831, + "step": 23533 + }, + { + "epoch": 0.6978619932983424, + "grad_norm": 0.09901834279298782, + "learning_rate": 0.00021278904457788745, + "loss": 2.5734, + "step": 23534 + }, + { + "epoch": 0.6978916466506538, + "grad_norm": 0.08825419843196869, + "learning_rate": 0.0002127505318440023, + "loss": 2.615, + "step": 23535 + }, + { + "epoch": 0.6979213000029654, + "grad_norm": 0.08817917108535767, + "learning_rate": 0.0002127120216537606, + "loss": 2.5901, + "step": 23536 + }, + { + "epoch": 0.6979509533552768, + "grad_norm": 0.09382302314043045, + "learning_rate": 0.00021267351400750312, + "loss": 2.5601, + "step": 23537 + }, + { + "epoch": 0.6979806067075883, + "grad_norm": 0.10100851953029633, + "learning_rate": 0.00021263500890557097, + "loss": 2.5901, + "step": 23538 + }, + { + "epoch": 0.6980102600598997, + "grad_norm": 0.09395107626914978, + "learning_rate": 0.00021259650634830517, + "loss": 2.5578, + "step": 23539 + }, + { + "epoch": 0.6980399134122113, + "grad_norm": 0.09307531267404556, + "learning_rate": 0.00021255800633604666, + "loss": 2.6251, + "step": 23540 + }, + { + "epoch": 0.6980695667645227, + "grad_norm": 0.11337538063526154, + "learning_rate": 0.0002125195088691363, + "loss": 2.5843, + "step": 23541 + }, + { + "epoch": 0.6980992201168342, + "grad_norm": 0.10441461950540543, + "learning_rate": 0.00021248101394791486, + "loss": 2.58, + "step": 23542 + }, + { + "epoch": 0.6981288734691456, + "grad_norm": 0.09351150691509247, + "learning_rate": 0.0002124425215727236, + "loss": 2.6004, + "step": 23543 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 0.08939523249864578, + "learning_rate": 0.00021240403174390315, + "loss": 2.5812, + "step": 23544 + }, + { + "epoch": 0.6981881801737686, + "grad_norm": 0.10417721420526505, + "learning_rate": 0.00021236554446179434, + "loss": 2.5906, + "step": 23545 + }, + { + "epoch": 0.6982178335260801, + "grad_norm": 0.09555395692586899, + "learning_rate": 0.0002123270597267382, + "loss": 2.5825, + "step": 23546 + }, + { + "epoch": 0.6982474868783916, + "grad_norm": 0.09668980538845062, + "learning_rate": 0.00021228857753907522, + "loss": 2.5815, + "step": 23547 + }, + { + "epoch": 0.6982771402307031, + "grad_norm": 0.10091330856084824, + "learning_rate": 0.00021225009789914618, + "loss": 2.5954, + "step": 23548 + }, + { + "epoch": 0.6983067935830145, + "grad_norm": 0.10488195717334747, + "learning_rate": 0.00021221162080729196, + "loss": 2.5931, + "step": 23549 + }, + { + "epoch": 0.698336446935326, + "grad_norm": 0.09608180820941925, + "learning_rate": 0.0002121731462638532, + "loss": 2.594, + "step": 23550 + }, + { + "epoch": 0.6983661002876376, + "grad_norm": 0.10797765105962753, + "learning_rate": 0.00021213467426917066, + "loss": 2.593, + "step": 23551 + }, + { + "epoch": 0.698395753639949, + "grad_norm": 0.1153869777917862, + "learning_rate": 0.00021209620482358498, + "loss": 2.5573, + "step": 23552 + }, + { + "epoch": 0.6984254069922605, + "grad_norm": 0.09802794456481934, + "learning_rate": 0.00021205773792743683, + "loss": 2.6164, + "step": 23553 + }, + { + "epoch": 0.6984550603445719, + "grad_norm": 0.12031194567680359, + "learning_rate": 0.00021201927358106682, + "loss": 2.5859, + "step": 23554 + }, + { + "epoch": 0.6984847136968835, + "grad_norm": 0.1129804253578186, + "learning_rate": 0.00021198081178481543, + "loss": 2.6057, + "step": 23555 + }, + { + "epoch": 0.6985143670491949, + "grad_norm": 0.11152355372905731, + "learning_rate": 0.00021194235253902357, + "loss": 2.5655, + "step": 23556 + }, + { + "epoch": 0.6985440204015064, + "grad_norm": 0.1129215881228447, + "learning_rate": 0.00021190389584403174, + "loss": 2.5861, + "step": 23557 + }, + { + "epoch": 0.6985736737538178, + "grad_norm": 0.09441889077425003, + "learning_rate": 0.00021186544170018025, + "loss": 2.5748, + "step": 23558 + }, + { + "epoch": 0.6986033271061294, + "grad_norm": 0.11962998658418655, + "learning_rate": 0.00021182699010780965, + "loss": 2.598, + "step": 23559 + }, + { + "epoch": 0.6986329804584408, + "grad_norm": 0.09126736968755722, + "learning_rate": 0.00021178854106726057, + "loss": 2.5614, + "step": 23560 + }, + { + "epoch": 0.6986626338107523, + "grad_norm": 0.10901550203561783, + "learning_rate": 0.00021175009457887346, + "loss": 2.5672, + "step": 23561 + }, + { + "epoch": 0.6986922871630638, + "grad_norm": 0.10433545708656311, + "learning_rate": 0.00021171165064298868, + "loss": 2.5362, + "step": 23562 + }, + { + "epoch": 0.6987219405153753, + "grad_norm": 0.1034875437617302, + "learning_rate": 0.00021167320925994678, + "loss": 2.5836, + "step": 23563 + }, + { + "epoch": 0.6987515938676867, + "grad_norm": 0.10505042970180511, + "learning_rate": 0.00021163477043008804, + "loss": 2.5427, + "step": 23564 + }, + { + "epoch": 0.6987812472199982, + "grad_norm": 0.09676529467105865, + "learning_rate": 0.00021159633415375297, + "loss": 2.5982, + "step": 23565 + }, + { + "epoch": 0.6988109005723097, + "grad_norm": 0.10055842250585556, + "learning_rate": 0.0002115579004312818, + "loss": 2.5741, + "step": 23566 + }, + { + "epoch": 0.6988405539246212, + "grad_norm": 0.10230743139982224, + "learning_rate": 0.00021151946926301497, + "loss": 2.573, + "step": 23567 + }, + { + "epoch": 0.6988702072769326, + "grad_norm": 0.09967560321092606, + "learning_rate": 0.00021148104064929273, + "loss": 2.5681, + "step": 23568 + }, + { + "epoch": 0.6988998606292441, + "grad_norm": 0.10222459584474564, + "learning_rate": 0.00021144261459045543, + "loss": 2.5911, + "step": 23569 + }, + { + "epoch": 0.6989295139815556, + "grad_norm": 0.11023221164941788, + "learning_rate": 0.00021140419108684334, + "loss": 2.5552, + "step": 23570 + }, + { + "epoch": 0.6989591673338671, + "grad_norm": 0.10274103283882141, + "learning_rate": 0.0002113657701387966, + "loss": 2.5834, + "step": 23571 + }, + { + "epoch": 0.6989888206861786, + "grad_norm": 0.10059476643800735, + "learning_rate": 0.00021132735174665557, + "loss": 2.565, + "step": 23572 + }, + { + "epoch": 0.69901847403849, + "grad_norm": 0.10616970807313919, + "learning_rate": 0.0002112889359107603, + "loss": 2.5644, + "step": 23573 + }, + { + "epoch": 0.6990481273908016, + "grad_norm": 0.09253467619419098, + "learning_rate": 0.00021125052263145118, + "loss": 2.58, + "step": 23574 + }, + { + "epoch": 0.699077780743113, + "grad_norm": 0.11239703744649887, + "learning_rate": 0.00021121211190906815, + "loss": 2.5369, + "step": 23575 + }, + { + "epoch": 0.6991074340954245, + "grad_norm": 0.09586618840694427, + "learning_rate": 0.0002111737037439515, + "loss": 2.5464, + "step": 23576 + }, + { + "epoch": 0.699137087447736, + "grad_norm": 0.10232804715633392, + "learning_rate": 0.00021113529813644122, + "loss": 2.621, + "step": 23577 + }, + { + "epoch": 0.6991667408000475, + "grad_norm": 0.0961645320057869, + "learning_rate": 0.00021109689508687753, + "loss": 2.5982, + "step": 23578 + }, + { + "epoch": 0.6991963941523589, + "grad_norm": 0.10260947048664093, + "learning_rate": 0.00021105849459560034, + "loss": 2.6017, + "step": 23579 + }, + { + "epoch": 0.6992260475046704, + "grad_norm": 0.0942702367901802, + "learning_rate": 0.00021102009666295002, + "loss": 2.5985, + "step": 23580 + }, + { + "epoch": 0.6992557008569819, + "grad_norm": 0.10356798022985458, + "learning_rate": 0.00021098170128926598, + "loss": 2.6101, + "step": 23581 + }, + { + "epoch": 0.6992853542092934, + "grad_norm": 0.09830443561077118, + "learning_rate": 0.00021094330847488873, + "loss": 2.6092, + "step": 23582 + }, + { + "epoch": 0.6993150075616048, + "grad_norm": 0.10229874402284622, + "learning_rate": 0.00021090491822015812, + "loss": 2.5739, + "step": 23583 + }, + { + "epoch": 0.6993446609139163, + "grad_norm": 0.10169542580842972, + "learning_rate": 0.0002108665305254141, + "loss": 2.5895, + "step": 23584 + }, + { + "epoch": 0.6993743142662278, + "grad_norm": 0.09516365826129913, + "learning_rate": 0.00021082814539099653, + "loss": 2.582, + "step": 23585 + }, + { + "epoch": 0.6994039676185393, + "grad_norm": 0.09054850041866302, + "learning_rate": 0.00021078976281724542, + "loss": 2.5942, + "step": 23586 + }, + { + "epoch": 0.6994336209708507, + "grad_norm": 0.0891595408320427, + "learning_rate": 0.00021075138280450062, + "loss": 2.5657, + "step": 23587 + }, + { + "epoch": 0.6994632743231622, + "grad_norm": 0.0927916020154953, + "learning_rate": 0.0002107130053531019, + "loss": 2.6077, + "step": 23588 + }, + { + "epoch": 0.6994929276754737, + "grad_norm": 0.08724731206893921, + "learning_rate": 0.00021067463046338925, + "loss": 2.5738, + "step": 23589 + }, + { + "epoch": 0.6995225810277852, + "grad_norm": 0.09143324941396713, + "learning_rate": 0.00021063625813570237, + "loss": 2.5825, + "step": 23590 + }, + { + "epoch": 0.6995522343800966, + "grad_norm": 0.09435846656560898, + "learning_rate": 0.00021059788837038125, + "loss": 2.5732, + "step": 23591 + }, + { + "epoch": 0.6995818877324081, + "grad_norm": 0.0949467197060585, + "learning_rate": 0.00021055952116776533, + "loss": 2.5739, + "step": 23592 + }, + { + "epoch": 0.6996115410847197, + "grad_norm": 0.08869681507349014, + "learning_rate": 0.0002105211565281946, + "loss": 2.6251, + "step": 23593 + }, + { + "epoch": 0.6996411944370311, + "grad_norm": 0.095002681016922, + "learning_rate": 0.00021048279445200847, + "loss": 2.5903, + "step": 23594 + }, + { + "epoch": 0.6996708477893426, + "grad_norm": 0.08708539605140686, + "learning_rate": 0.00021044443493954706, + "loss": 2.5831, + "step": 23595 + }, + { + "epoch": 0.699700501141654, + "grad_norm": 0.08605396747589111, + "learning_rate": 0.00021040607799114992, + "loss": 2.5633, + "step": 23596 + }, + { + "epoch": 0.6997301544939656, + "grad_norm": 0.08729807287454605, + "learning_rate": 0.00021036772360715666, + "loss": 2.5895, + "step": 23597 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 0.10537195950746536, + "learning_rate": 0.0002103293717879069, + "loss": 2.5886, + "step": 23598 + }, + { + "epoch": 0.6997894611985885, + "grad_norm": 0.08733624964952469, + "learning_rate": 0.00021029102253374032, + "loss": 2.5968, + "step": 23599 + }, + { + "epoch": 0.6998191145509, + "grad_norm": 0.10340000689029694, + "learning_rate": 0.00021025267584499641, + "loss": 2.5808, + "step": 23600 + }, + { + "epoch": 0.6998487679032115, + "grad_norm": 0.09480572491884232, + "learning_rate": 0.00021021433172201503, + "loss": 2.5976, + "step": 23601 + }, + { + "epoch": 0.6998784212555229, + "grad_norm": 0.10138516128063202, + "learning_rate": 0.00021017599016513527, + "loss": 2.5444, + "step": 23602 + }, + { + "epoch": 0.6999080746078344, + "grad_norm": 0.09398947656154633, + "learning_rate": 0.00021013765117469684, + "loss": 2.587, + "step": 23603 + }, + { + "epoch": 0.6999377279601459, + "grad_norm": 0.09613438695669174, + "learning_rate": 0.00021009931475103927, + "loss": 2.5541, + "step": 23604 + }, + { + "epoch": 0.6999673813124574, + "grad_norm": 0.0942244604229927, + "learning_rate": 0.0002100609808945021, + "loss": 2.5736, + "step": 23605 + }, + { + "epoch": 0.6999970346647688, + "grad_norm": 0.09301714599132538, + "learning_rate": 0.0002100226496054246, + "loss": 2.6065, + "step": 23606 + }, + { + "epoch": 0.7000266880170803, + "grad_norm": 0.10517916083335876, + "learning_rate": 0.00020998432088414621, + "loss": 2.5834, + "step": 23607 + }, + { + "epoch": 0.7000563413693918, + "grad_norm": 0.08721762895584106, + "learning_rate": 0.00020994599473100661, + "loss": 2.5351, + "step": 23608 + }, + { + "epoch": 0.7000859947217033, + "grad_norm": 0.10929516702890396, + "learning_rate": 0.00020990767114634502, + "loss": 2.5754, + "step": 23609 + }, + { + "epoch": 0.7001156480740147, + "grad_norm": 0.0882544219493866, + "learning_rate": 0.00020986935013050074, + "loss": 2.5807, + "step": 23610 + }, + { + "epoch": 0.7001453014263262, + "grad_norm": 0.1007949709892273, + "learning_rate": 0.00020983103168381324, + "loss": 2.59, + "step": 23611 + }, + { + "epoch": 0.7001749547786377, + "grad_norm": 0.09902002662420273, + "learning_rate": 0.00020979271580662192, + "loss": 2.6092, + "step": 23612 + }, + { + "epoch": 0.7002046081309492, + "grad_norm": 0.09123171865940094, + "learning_rate": 0.0002097544024992657, + "loss": 2.56, + "step": 23613 + }, + { + "epoch": 0.7002342614832607, + "grad_norm": 0.10405527800321579, + "learning_rate": 0.00020971609176208405, + "loss": 2.569, + "step": 23614 + }, + { + "epoch": 0.7002639148355722, + "grad_norm": 0.09400951117277145, + "learning_rate": 0.00020967778359541627, + "loss": 2.5849, + "step": 23615 + }, + { + "epoch": 0.7002935681878837, + "grad_norm": 0.10779685527086258, + "learning_rate": 0.00020963947799960153, + "loss": 2.5666, + "step": 23616 + }, + { + "epoch": 0.7003232215401951, + "grad_norm": 0.08774092048406601, + "learning_rate": 0.0002096011749749791, + "loss": 2.5682, + "step": 23617 + }, + { + "epoch": 0.7003528748925066, + "grad_norm": 0.10656418651342392, + "learning_rate": 0.0002095628745218881, + "loss": 2.5964, + "step": 23618 + }, + { + "epoch": 0.7003825282448181, + "grad_norm": 0.10164295881986618, + "learning_rate": 0.00020952457664066766, + "loss": 2.6163, + "step": 23619 + }, + { + "epoch": 0.7004121815971296, + "grad_norm": 0.09669645875692368, + "learning_rate": 0.0002094862813316568, + "loss": 2.5918, + "step": 23620 + }, + { + "epoch": 0.700441834949441, + "grad_norm": 0.11761365830898285, + "learning_rate": 0.00020944798859519494, + "loss": 2.589, + "step": 23621 + }, + { + "epoch": 0.7004714883017525, + "grad_norm": 0.11244653910398483, + "learning_rate": 0.0002094096984316212, + "loss": 2.5956, + "step": 23622 + }, + { + "epoch": 0.700501141654064, + "grad_norm": 0.09849077463150024, + "learning_rate": 0.0002093714108412743, + "loss": 2.5712, + "step": 23623 + }, + { + "epoch": 0.7005307950063755, + "grad_norm": 0.08273118734359741, + "learning_rate": 0.00020933312582449343, + "loss": 2.5755, + "step": 23624 + }, + { + "epoch": 0.7005604483586869, + "grad_norm": 0.09453452378511429, + "learning_rate": 0.00020929484338161764, + "loss": 2.578, + "step": 23625 + }, + { + "epoch": 0.7005901017109984, + "grad_norm": 0.08674938231706619, + "learning_rate": 0.0002092565635129859, + "loss": 2.5939, + "step": 23626 + }, + { + "epoch": 0.7006197550633099, + "grad_norm": 0.0989314541220665, + "learning_rate": 0.00020921828621893717, + "loss": 2.6055, + "step": 23627 + }, + { + "epoch": 0.7006494084156214, + "grad_norm": 0.09066957235336304, + "learning_rate": 0.00020918001149981046, + "loss": 2.5878, + "step": 23628 + }, + { + "epoch": 0.7006790617679328, + "grad_norm": 0.08972296863794327, + "learning_rate": 0.00020914173935594467, + "loss": 2.572, + "step": 23629 + }, + { + "epoch": 0.7007087151202444, + "grad_norm": 0.0866297110915184, + "learning_rate": 0.00020910346978767868, + "loss": 2.6198, + "step": 23630 + }, + { + "epoch": 0.7007383684725558, + "grad_norm": 0.09916099905967712, + "learning_rate": 0.0002090652027953514, + "loss": 2.5988, + "step": 23631 + }, + { + "epoch": 0.7007680218248673, + "grad_norm": 0.08904929459095001, + "learning_rate": 0.0002090269383793017, + "loss": 2.6027, + "step": 23632 + }, + { + "epoch": 0.7007976751771787, + "grad_norm": 0.0958869457244873, + "learning_rate": 0.0002089886765398684, + "loss": 2.5799, + "step": 23633 + }, + { + "epoch": 0.7008273285294903, + "grad_norm": 0.09680566936731339, + "learning_rate": 0.00020895041727739033, + "loss": 2.594, + "step": 23634 + }, + { + "epoch": 0.7008569818818018, + "grad_norm": 0.10125337541103363, + "learning_rate": 0.0002089121605922063, + "loss": 2.5909, + "step": 23635 + }, + { + "epoch": 0.7008866352341132, + "grad_norm": 0.09621629863977432, + "learning_rate": 0.000208873906484655, + "loss": 2.5833, + "step": 23636 + }, + { + "epoch": 0.7009162885864247, + "grad_norm": 0.09355610609054565, + "learning_rate": 0.00020883565495507522, + "loss": 2.5724, + "step": 23637 + }, + { + "epoch": 0.7009459419387362, + "grad_norm": 0.09010285139083862, + "learning_rate": 0.00020879740600380576, + "loss": 2.5826, + "step": 23638 + }, + { + "epoch": 0.7009755952910477, + "grad_norm": 0.09441730380058289, + "learning_rate": 0.00020875915963118518, + "loss": 2.5822, + "step": 23639 + }, + { + "epoch": 0.7010052486433591, + "grad_norm": 0.10343588888645172, + "learning_rate": 0.00020872091583755233, + "loss": 2.6161, + "step": 23640 + }, + { + "epoch": 0.7010349019956706, + "grad_norm": 0.09363285452127457, + "learning_rate": 0.00020868267462324569, + "loss": 2.5867, + "step": 23641 + }, + { + "epoch": 0.7010645553479821, + "grad_norm": 0.09146258980035782, + "learning_rate": 0.00020864443598860393, + "loss": 2.5727, + "step": 23642 + }, + { + "epoch": 0.7010942087002936, + "grad_norm": 0.0896865501999855, + "learning_rate": 0.0002086061999339658, + "loss": 2.5976, + "step": 23643 + }, + { + "epoch": 0.701123862052605, + "grad_norm": 0.0926935225725174, + "learning_rate": 0.00020856796645966975, + "loss": 2.5681, + "step": 23644 + }, + { + "epoch": 0.7011535154049165, + "grad_norm": 0.09307846426963806, + "learning_rate": 0.00020852973556605438, + "loss": 2.5936, + "step": 23645 + }, + { + "epoch": 0.701183168757228, + "grad_norm": 0.09704049676656723, + "learning_rate": 0.00020849150725345817, + "loss": 2.5836, + "step": 23646 + }, + { + "epoch": 0.7012128221095395, + "grad_norm": 0.10017220675945282, + "learning_rate": 0.00020845328152221975, + "loss": 2.5998, + "step": 23647 + }, + { + "epoch": 0.7012424754618509, + "grad_norm": 0.09339065849781036, + "learning_rate": 0.0002084150583726776, + "loss": 2.5786, + "step": 23648 + }, + { + "epoch": 0.7012721288141625, + "grad_norm": 0.09062656760215759, + "learning_rate": 0.00020837683780517013, + "loss": 2.5908, + "step": 23649 + }, + { + "epoch": 0.7013017821664739, + "grad_norm": 0.09698555618524551, + "learning_rate": 0.00020833861982003581, + "loss": 2.5646, + "step": 23650 + }, + { + "epoch": 0.7013314355187854, + "grad_norm": 0.10558909922838211, + "learning_rate": 0.00020830040441761305, + "loss": 2.5795, + "step": 23651 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 0.09537976235151291, + "learning_rate": 0.00020826219159824033, + "loss": 2.5684, + "step": 23652 + }, + { + "epoch": 0.7013907422234084, + "grad_norm": 0.10698717087507248, + "learning_rate": 0.00020822398136225595, + "loss": 2.6014, + "step": 23653 + }, + { + "epoch": 0.7014203955757199, + "grad_norm": 0.09313822537660599, + "learning_rate": 0.0002081857737099983, + "loss": 2.5622, + "step": 23654 + }, + { + "epoch": 0.7014500489280313, + "grad_norm": 0.1006317064166069, + "learning_rate": 0.00020814756864180573, + "loss": 2.5899, + "step": 23655 + }, + { + "epoch": 0.7014797022803428, + "grad_norm": 0.11181958019733429, + "learning_rate": 0.0002081093661580167, + "loss": 2.5783, + "step": 23656 + }, + { + "epoch": 0.7015093556326543, + "grad_norm": 0.09987838566303253, + "learning_rate": 0.00020807116625896915, + "loss": 2.5467, + "step": 23657 + }, + { + "epoch": 0.7015390089849658, + "grad_norm": 0.10721808671951294, + "learning_rate": 0.00020803296894500158, + "loss": 2.5656, + "step": 23658 + }, + { + "epoch": 0.7015686623372772, + "grad_norm": 0.10930117219686508, + "learning_rate": 0.00020799477421645196, + "loss": 2.5907, + "step": 23659 + }, + { + "epoch": 0.7015983156895887, + "grad_norm": 0.11024441570043564, + "learning_rate": 0.0002079565820736589, + "loss": 2.5886, + "step": 23660 + }, + { + "epoch": 0.7016279690419002, + "grad_norm": 0.10875558108091354, + "learning_rate": 0.00020791839251696042, + "loss": 2.5795, + "step": 23661 + }, + { + "epoch": 0.7016576223942117, + "grad_norm": 0.09945036470890045, + "learning_rate": 0.00020788020554669479, + "loss": 2.5503, + "step": 23662 + }, + { + "epoch": 0.7016872757465231, + "grad_norm": 0.10297225415706635, + "learning_rate": 0.0002078420211632, + "loss": 2.5741, + "step": 23663 + }, + { + "epoch": 0.7017169290988347, + "grad_norm": 0.08203399926424026, + "learning_rate": 0.00020780383936681436, + "loss": 2.583, + "step": 23664 + }, + { + "epoch": 0.7017465824511461, + "grad_norm": 0.09943817555904388, + "learning_rate": 0.0002077656601578758, + "loss": 2.5881, + "step": 23665 + }, + { + "epoch": 0.7017762358034576, + "grad_norm": 0.09954164177179337, + "learning_rate": 0.0002077274835367225, + "loss": 2.5829, + "step": 23666 + }, + { + "epoch": 0.701805889155769, + "grad_norm": 0.09798432886600494, + "learning_rate": 0.00020768930950369264, + "loss": 2.6031, + "step": 23667 + }, + { + "epoch": 0.7018355425080806, + "grad_norm": 0.09607556462287903, + "learning_rate": 0.000207651138059124, + "loss": 2.584, + "step": 23668 + }, + { + "epoch": 0.701865195860392, + "grad_norm": 0.09993263334035873, + "learning_rate": 0.0002076129692033547, + "loss": 2.5523, + "step": 23669 + }, + { + "epoch": 0.7018948492127035, + "grad_norm": 0.09832915663719177, + "learning_rate": 0.0002075748029367227, + "loss": 2.594, + "step": 23670 + }, + { + "epoch": 0.7019245025650149, + "grad_norm": 0.099977046251297, + "learning_rate": 0.00020753663925956607, + "loss": 2.5977, + "step": 23671 + }, + { + "epoch": 0.7019541559173265, + "grad_norm": 0.10049672424793243, + "learning_rate": 0.0002074984781722225, + "loss": 2.5832, + "step": 23672 + }, + { + "epoch": 0.7019838092696379, + "grad_norm": 0.10115431249141693, + "learning_rate": 0.00020746031967503026, + "loss": 2.5784, + "step": 23673 + }, + { + "epoch": 0.7020134626219494, + "grad_norm": 0.09997611492872238, + "learning_rate": 0.00020742216376832718, + "loss": 2.5978, + "step": 23674 + }, + { + "epoch": 0.702043115974261, + "grad_norm": 0.09798227250576019, + "learning_rate": 0.000207384010452451, + "loss": 2.6144, + "step": 23675 + }, + { + "epoch": 0.7020727693265724, + "grad_norm": 0.10036837309598923, + "learning_rate": 0.00020734585972773968, + "loss": 2.5905, + "step": 23676 + }, + { + "epoch": 0.7021024226788839, + "grad_norm": 0.09487152099609375, + "learning_rate": 0.0002073077115945311, + "loss": 2.5604, + "step": 23677 + }, + { + "epoch": 0.7021320760311953, + "grad_norm": 0.09871075302362442, + "learning_rate": 0.0002072695660531629, + "loss": 2.6092, + "step": 23678 + }, + { + "epoch": 0.7021617293835068, + "grad_norm": 0.0984124094247818, + "learning_rate": 0.00020723142310397286, + "loss": 2.573, + "step": 23679 + }, + { + "epoch": 0.7021913827358183, + "grad_norm": 0.10628464072942734, + "learning_rate": 0.00020719328274729887, + "loss": 2.6305, + "step": 23680 + }, + { + "epoch": 0.7022210360881298, + "grad_norm": 0.10326077789068222, + "learning_rate": 0.00020715514498347858, + "loss": 2.5917, + "step": 23681 + }, + { + "epoch": 0.7022506894404412, + "grad_norm": 0.10453777760267258, + "learning_rate": 0.00020711700981284977, + "loss": 2.5662, + "step": 23682 + }, + { + "epoch": 0.7022803427927528, + "grad_norm": 0.10218070447444916, + "learning_rate": 0.00020707887723575008, + "loss": 2.6016, + "step": 23683 + }, + { + "epoch": 0.7023099961450642, + "grad_norm": 0.10392530262470245, + "learning_rate": 0.0002070407472525171, + "loss": 2.5795, + "step": 23684 + }, + { + "epoch": 0.7023396494973757, + "grad_norm": 0.0900774896144867, + "learning_rate": 0.00020700261986348874, + "loss": 2.5729, + "step": 23685 + }, + { + "epoch": 0.7023693028496871, + "grad_norm": 0.10420387238264084, + "learning_rate": 0.00020696449506900244, + "loss": 2.5776, + "step": 23686 + }, + { + "epoch": 0.7023989562019987, + "grad_norm": 0.10100209712982178, + "learning_rate": 0.00020692637286939586, + "loss": 2.5787, + "step": 23687 + }, + { + "epoch": 0.7024286095543101, + "grad_norm": 0.10034369677305222, + "learning_rate": 0.00020688825326500672, + "loss": 2.592, + "step": 23688 + }, + { + "epoch": 0.7024582629066216, + "grad_norm": 0.0992790162563324, + "learning_rate": 0.0002068501362561722, + "loss": 2.5919, + "step": 23689 + }, + { + "epoch": 0.702487916258933, + "grad_norm": 0.10095280408859253, + "learning_rate": 0.00020681202184323012, + "loss": 2.5742, + "step": 23690 + }, + { + "epoch": 0.7025175696112446, + "grad_norm": 0.10238370299339294, + "learning_rate": 0.00020677391002651784, + "loss": 2.5684, + "step": 23691 + }, + { + "epoch": 0.702547222963556, + "grad_norm": 0.10388617217540741, + "learning_rate": 0.000206735800806373, + "loss": 2.6076, + "step": 23692 + }, + { + "epoch": 0.7025768763158675, + "grad_norm": 0.10188118368387222, + "learning_rate": 0.00020669769418313295, + "loss": 2.6182, + "step": 23693 + }, + { + "epoch": 0.7026065296681789, + "grad_norm": 0.09440293908119202, + "learning_rate": 0.00020665959015713514, + "loss": 2.5912, + "step": 23694 + }, + { + "epoch": 0.7026361830204905, + "grad_norm": 0.0952242985367775, + "learning_rate": 0.000206621488728717, + "loss": 2.618, + "step": 23695 + }, + { + "epoch": 0.702665836372802, + "grad_norm": 0.09289059787988663, + "learning_rate": 0.00020658338989821596, + "loss": 2.5396, + "step": 23696 + }, + { + "epoch": 0.7026954897251134, + "grad_norm": 0.09811908006668091, + "learning_rate": 0.00020654529366596918, + "loss": 2.5653, + "step": 23697 + }, + { + "epoch": 0.702725143077425, + "grad_norm": 0.10425291210412979, + "learning_rate": 0.00020650720003231456, + "loss": 2.5691, + "step": 23698 + }, + { + "epoch": 0.7027547964297364, + "grad_norm": 0.10305064171552658, + "learning_rate": 0.00020646910899758887, + "loss": 2.5701, + "step": 23699 + }, + { + "epoch": 0.7027844497820479, + "grad_norm": 0.0957137867808342, + "learning_rate": 0.00020643102056212958, + "loss": 2.5626, + "step": 23700 + }, + { + "epoch": 0.7028141031343593, + "grad_norm": 0.12225441634654999, + "learning_rate": 0.00020639293472627401, + "loss": 2.5523, + "step": 23701 + }, + { + "epoch": 0.7028437564866709, + "grad_norm": 0.08477520942687988, + "learning_rate": 0.00020635485149035943, + "loss": 2.5698, + "step": 23702 + }, + { + "epoch": 0.7028734098389823, + "grad_norm": 0.10931409895420074, + "learning_rate": 0.00020631677085472305, + "loss": 2.5876, + "step": 23703 + }, + { + "epoch": 0.7029030631912938, + "grad_norm": 0.09444581717252731, + "learning_rate": 0.000206278692819702, + "loss": 2.5106, + "step": 23704 + }, + { + "epoch": 0.7029327165436052, + "grad_norm": 0.10135070979595184, + "learning_rate": 0.00020624061738563367, + "loss": 2.5791, + "step": 23705 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 0.1043381616473198, + "learning_rate": 0.00020620254455285499, + "loss": 2.5542, + "step": 23706 + }, + { + "epoch": 0.7029920232482282, + "grad_norm": 0.10580751299858093, + "learning_rate": 0.0002061644743217032, + "loss": 2.5632, + "step": 23707 + }, + { + "epoch": 0.7030216766005397, + "grad_norm": 0.09124849736690521, + "learning_rate": 0.0002061264066925155, + "loss": 2.6209, + "step": 23708 + }, + { + "epoch": 0.7030513299528511, + "grad_norm": 0.09821916371583939, + "learning_rate": 0.00020608834166562884, + "loss": 2.5606, + "step": 23709 + }, + { + "epoch": 0.7030809833051627, + "grad_norm": 0.09457121789455414, + "learning_rate": 0.00020605027924138042, + "loss": 2.5785, + "step": 23710 + }, + { + "epoch": 0.7031106366574741, + "grad_norm": 0.09573716670274734, + "learning_rate": 0.00020601221942010724, + "loss": 2.6183, + "step": 23711 + }, + { + "epoch": 0.7031402900097856, + "grad_norm": 0.09870936721563339, + "learning_rate": 0.00020597416220214626, + "loss": 2.5697, + "step": 23712 + }, + { + "epoch": 0.703169943362097, + "grad_norm": 0.09279797971248627, + "learning_rate": 0.0002059361075878346, + "loss": 2.5704, + "step": 23713 + }, + { + "epoch": 0.7031995967144086, + "grad_norm": 0.10252590477466583, + "learning_rate": 0.00020589805557750913, + "loss": 2.6321, + "step": 23714 + }, + { + "epoch": 0.70322925006672, + "grad_norm": 0.10129925608634949, + "learning_rate": 0.00020586000617150692, + "loss": 2.5312, + "step": 23715 + }, + { + "epoch": 0.7032589034190315, + "grad_norm": 0.091777503490448, + "learning_rate": 0.0002058219593701648, + "loss": 2.5678, + "step": 23716 + }, + { + "epoch": 0.703288556771343, + "grad_norm": 0.10606324672698975, + "learning_rate": 0.00020578391517381972, + "loss": 2.6024, + "step": 23717 + }, + { + "epoch": 0.7033182101236545, + "grad_norm": 0.10149769484996796, + "learning_rate": 0.00020574587358280854, + "loss": 2.579, + "step": 23718 + }, + { + "epoch": 0.703347863475966, + "grad_norm": 0.10048262029886246, + "learning_rate": 0.00020570783459746822, + "loss": 2.589, + "step": 23719 + }, + { + "epoch": 0.7033775168282774, + "grad_norm": 0.0956495851278305, + "learning_rate": 0.00020566979821813554, + "loss": 2.5502, + "step": 23720 + }, + { + "epoch": 0.703407170180589, + "grad_norm": 0.09939489513635635, + "learning_rate": 0.0002056317644451473, + "loss": 2.5965, + "step": 23721 + }, + { + "epoch": 0.7034368235329004, + "grad_norm": 0.08895467966794968, + "learning_rate": 0.00020559373327884052, + "loss": 2.5841, + "step": 23722 + }, + { + "epoch": 0.7034664768852119, + "grad_norm": 0.1132376417517662, + "learning_rate": 0.00020555570471955138, + "loss": 2.5571, + "step": 23723 + }, + { + "epoch": 0.7034961302375233, + "grad_norm": 0.0903337374329567, + "learning_rate": 0.00020551767876761718, + "loss": 2.5809, + "step": 23724 + }, + { + "epoch": 0.7035257835898349, + "grad_norm": 0.09586881101131439, + "learning_rate": 0.00020547965542337448, + "loss": 2.6046, + "step": 23725 + }, + { + "epoch": 0.7035554369421463, + "grad_norm": 0.10415524989366531, + "learning_rate": 0.00020544163468716003, + "loss": 2.6136, + "step": 23726 + }, + { + "epoch": 0.7035850902944578, + "grad_norm": 0.0931917279958725, + "learning_rate": 0.00020540361655931044, + "loss": 2.5685, + "step": 23727 + }, + { + "epoch": 0.7036147436467692, + "grad_norm": 0.10398087650537491, + "learning_rate": 0.00020536560104016238, + "loss": 2.5821, + "step": 23728 + }, + { + "epoch": 0.7036443969990808, + "grad_norm": 0.10551458597183228, + "learning_rate": 0.00020532758813005247, + "loss": 2.5734, + "step": 23729 + }, + { + "epoch": 0.7036740503513922, + "grad_norm": 0.09870994836091995, + "learning_rate": 0.0002052895778293174, + "loss": 2.6054, + "step": 23730 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.09207555651664734, + "learning_rate": 0.00020525157013829372, + "loss": 2.5656, + "step": 23731 + }, + { + "epoch": 0.7037333570560151, + "grad_norm": 0.10015605390071869, + "learning_rate": 0.0002052135650573181, + "loss": 2.5509, + "step": 23732 + }, + { + "epoch": 0.7037630104083267, + "grad_norm": 0.09591449052095413, + "learning_rate": 0.0002051755625867268, + "loss": 2.5581, + "step": 23733 + }, + { + "epoch": 0.7037926637606381, + "grad_norm": 0.09548117965459824, + "learning_rate": 0.00020513756272685652, + "loss": 2.5536, + "step": 23734 + }, + { + "epoch": 0.7038223171129496, + "grad_norm": 0.09816114604473114, + "learning_rate": 0.0002050995654780437, + "loss": 2.5708, + "step": 23735 + }, + { + "epoch": 0.703851970465261, + "grad_norm": 0.09896830469369888, + "learning_rate": 0.00020506157084062472, + "loss": 2.5608, + "step": 23736 + }, + { + "epoch": 0.7038816238175726, + "grad_norm": 0.10648342967033386, + "learning_rate": 0.00020502357881493628, + "loss": 2.5983, + "step": 23737 + }, + { + "epoch": 0.7039112771698841, + "grad_norm": 0.09180229902267456, + "learning_rate": 0.00020498558940131473, + "loss": 2.5865, + "step": 23738 + }, + { + "epoch": 0.7039409305221955, + "grad_norm": 0.10833921283483505, + "learning_rate": 0.00020494760260009637, + "loss": 2.6196, + "step": 23739 + }, + { + "epoch": 0.7039705838745071, + "grad_norm": 0.0871930867433548, + "learning_rate": 0.0002049096184116177, + "loss": 2.5435, + "step": 23740 + }, + { + "epoch": 0.7040002372268185, + "grad_norm": 0.1076134517788887, + "learning_rate": 0.00020487163683621497, + "loss": 2.5553, + "step": 23741 + }, + { + "epoch": 0.70402989057913, + "grad_norm": 0.08872517198324203, + "learning_rate": 0.00020483365787422452, + "loss": 2.5619, + "step": 23742 + }, + { + "epoch": 0.7040595439314414, + "grad_norm": 0.10019256919622421, + "learning_rate": 0.00020479568152598294, + "loss": 2.5637, + "step": 23743 + }, + { + "epoch": 0.704089197283753, + "grad_norm": 0.09512294828891754, + "learning_rate": 0.00020475770779182606, + "loss": 2.5987, + "step": 23744 + }, + { + "epoch": 0.7041188506360644, + "grad_norm": 0.09885336458683014, + "learning_rate": 0.00020471973667209037, + "loss": 2.6158, + "step": 23745 + }, + { + "epoch": 0.7041485039883759, + "grad_norm": 0.1111283004283905, + "learning_rate": 0.0002046817681671121, + "loss": 2.5741, + "step": 23746 + }, + { + "epoch": 0.7041781573406873, + "grad_norm": 0.08832849562168121, + "learning_rate": 0.00020464380227722747, + "loss": 2.5814, + "step": 23747 + }, + { + "epoch": 0.7042078106929989, + "grad_norm": 0.1089738979935646, + "learning_rate": 0.00020460583900277262, + "loss": 2.582, + "step": 23748 + }, + { + "epoch": 0.7042374640453103, + "grad_norm": 0.09767656773328781, + "learning_rate": 0.0002045678783440836, + "loss": 2.5825, + "step": 23749 + }, + { + "epoch": 0.7042671173976218, + "grad_norm": 0.10938931256532669, + "learning_rate": 0.0002045299203014969, + "loss": 2.5906, + "step": 23750 + }, + { + "epoch": 0.7042967707499332, + "grad_norm": 0.10964134335517883, + "learning_rate": 0.00020449196487534854, + "loss": 2.5667, + "step": 23751 + }, + { + "epoch": 0.7043264241022448, + "grad_norm": 0.1055867150425911, + "learning_rate": 0.00020445401206597446, + "loss": 2.5464, + "step": 23752 + }, + { + "epoch": 0.7043560774545562, + "grad_norm": 0.1062094122171402, + "learning_rate": 0.00020441606187371109, + "loss": 2.5923, + "step": 23753 + }, + { + "epoch": 0.7043857308068677, + "grad_norm": 0.09742888808250427, + "learning_rate": 0.000204378114298894, + "loss": 2.5309, + "step": 23754 + }, + { + "epoch": 0.7044153841591791, + "grad_norm": 0.10711871832609177, + "learning_rate": 0.0002043401693418595, + "loss": 2.5648, + "step": 23755 + }, + { + "epoch": 0.7044450375114907, + "grad_norm": 0.09843114018440247, + "learning_rate": 0.00020430222700294355, + "loss": 2.5464, + "step": 23756 + }, + { + "epoch": 0.7044746908638021, + "grad_norm": 0.09865523874759674, + "learning_rate": 0.0002042642872824821, + "loss": 2.5696, + "step": 23757 + }, + { + "epoch": 0.7045043442161136, + "grad_norm": 0.1149701178073883, + "learning_rate": 0.00020422635018081114, + "loss": 2.6122, + "step": 23758 + }, + { + "epoch": 0.7045339975684252, + "grad_norm": 0.09811332821846008, + "learning_rate": 0.00020418841569826664, + "loss": 2.5776, + "step": 23759 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 0.10313533246517181, + "learning_rate": 0.00020415048383518448, + "loss": 2.5801, + "step": 23760 + }, + { + "epoch": 0.7045933042730481, + "grad_norm": 0.11709129065275192, + "learning_rate": 0.0002041125545919006, + "loss": 2.595, + "step": 23761 + }, + { + "epoch": 0.7046229576253595, + "grad_norm": 0.09130392968654633, + "learning_rate": 0.00020407462796875065, + "loss": 2.6072, + "step": 23762 + }, + { + "epoch": 0.7046526109776711, + "grad_norm": 0.12562939524650574, + "learning_rate": 0.00020403670396607082, + "loss": 2.5971, + "step": 23763 + }, + { + "epoch": 0.7046822643299825, + "grad_norm": 0.11021239310503006, + "learning_rate": 0.00020399878258419696, + "loss": 2.5815, + "step": 23764 + }, + { + "epoch": 0.704711917682294, + "grad_norm": 0.10626329481601715, + "learning_rate": 0.00020396086382346452, + "loss": 2.574, + "step": 23765 + }, + { + "epoch": 0.7047415710346054, + "grad_norm": 0.10851849615573883, + "learning_rate": 0.00020392294768420944, + "loss": 2.5953, + "step": 23766 + }, + { + "epoch": 0.704771224386917, + "grad_norm": 0.1168091893196106, + "learning_rate": 0.00020388503416676746, + "loss": 2.6049, + "step": 23767 + }, + { + "epoch": 0.7048008777392284, + "grad_norm": 0.09407796710729599, + "learning_rate": 0.00020384712327147436, + "loss": 2.5923, + "step": 23768 + }, + { + "epoch": 0.7048305310915399, + "grad_norm": 0.11966369301080704, + "learning_rate": 0.0002038092149986658, + "loss": 2.5938, + "step": 23769 + }, + { + "epoch": 0.7048601844438513, + "grad_norm": 0.09372889250516891, + "learning_rate": 0.0002037713093486775, + "loss": 2.6102, + "step": 23770 + }, + { + "epoch": 0.7048898377961629, + "grad_norm": 0.10632555186748505, + "learning_rate": 0.00020373340632184506, + "loss": 2.5894, + "step": 23771 + }, + { + "epoch": 0.7049194911484743, + "grad_norm": 0.10801270604133606, + "learning_rate": 0.00020369550591850421, + "loss": 2.5965, + "step": 23772 + }, + { + "epoch": 0.7049491445007858, + "grad_norm": 0.09882547706365585, + "learning_rate": 0.0002036576081389905, + "loss": 2.5917, + "step": 23773 + }, + { + "epoch": 0.7049787978530972, + "grad_norm": 0.11609471589326859, + "learning_rate": 0.0002036197129836395, + "loss": 2.5866, + "step": 23774 + }, + { + "epoch": 0.7050084512054088, + "grad_norm": 0.10010112822055817, + "learning_rate": 0.0002035818204527869, + "loss": 2.5931, + "step": 23775 + }, + { + "epoch": 0.7050381045577202, + "grad_norm": 0.10814245790243149, + "learning_rate": 0.00020354393054676807, + "loss": 2.5967, + "step": 23776 + }, + { + "epoch": 0.7050677579100317, + "grad_norm": 0.08900287747383118, + "learning_rate": 0.00020350604326591865, + "loss": 2.5839, + "step": 23777 + }, + { + "epoch": 0.7050974112623432, + "grad_norm": 0.12114682048559189, + "learning_rate": 0.00020346815861057416, + "loss": 2.5785, + "step": 23778 + }, + { + "epoch": 0.7051270646146547, + "grad_norm": 0.09634983539581299, + "learning_rate": 0.00020343027658106995, + "loss": 2.5659, + "step": 23779 + }, + { + "epoch": 0.7051567179669662, + "grad_norm": 0.11400051414966583, + "learning_rate": 0.00020339239717774162, + "loss": 2.5819, + "step": 23780 + }, + { + "epoch": 0.7051863713192776, + "grad_norm": 0.10593201220035553, + "learning_rate": 0.00020335452040092444, + "loss": 2.5696, + "step": 23781 + }, + { + "epoch": 0.7052160246715892, + "grad_norm": 0.0948345810174942, + "learning_rate": 0.00020331664625095397, + "loss": 2.5624, + "step": 23782 + }, + { + "epoch": 0.7052456780239006, + "grad_norm": 0.10052917152643204, + "learning_rate": 0.00020327877472816548, + "loss": 2.5859, + "step": 23783 + }, + { + "epoch": 0.7052753313762121, + "grad_norm": 0.09440735727548599, + "learning_rate": 0.00020324090583289438, + "loss": 2.5639, + "step": 23784 + }, + { + "epoch": 0.7053049847285235, + "grad_norm": 0.10139370709657669, + "learning_rate": 0.00020320303956547603, + "loss": 2.5617, + "step": 23785 + }, + { + "epoch": 0.7053346380808351, + "grad_norm": 0.09642818570137024, + "learning_rate": 0.00020316517592624568, + "loss": 2.5782, + "step": 23786 + }, + { + "epoch": 0.7053642914331465, + "grad_norm": 0.08838961273431778, + "learning_rate": 0.00020312731491553866, + "loss": 2.5403, + "step": 23787 + }, + { + "epoch": 0.705393944785458, + "grad_norm": 0.09176374226808548, + "learning_rate": 0.0002030894565336902, + "loss": 2.5698, + "step": 23788 + }, + { + "epoch": 0.7054235981377694, + "grad_norm": 0.09612051397562027, + "learning_rate": 0.00020305160078103556, + "loss": 2.5844, + "step": 23789 + }, + { + "epoch": 0.705453251490081, + "grad_norm": 0.09789866954088211, + "learning_rate": 0.00020301374765790996, + "loss": 2.5496, + "step": 23790 + }, + { + "epoch": 0.7054829048423924, + "grad_norm": 0.09029321372509003, + "learning_rate": 0.00020297589716464866, + "loss": 2.5692, + "step": 23791 + }, + { + "epoch": 0.7055125581947039, + "grad_norm": 0.09864623099565506, + "learning_rate": 0.00020293804930158677, + "loss": 2.5397, + "step": 23792 + }, + { + "epoch": 0.7055422115470154, + "grad_norm": 0.09543900936841965, + "learning_rate": 0.0002029002040690594, + "loss": 2.5736, + "step": 23793 + }, + { + "epoch": 0.7055718648993269, + "grad_norm": 0.1017148420214653, + "learning_rate": 0.00020286236146740172, + "loss": 2.5773, + "step": 23794 + }, + { + "epoch": 0.7056015182516383, + "grad_norm": 0.0872400626540184, + "learning_rate": 0.00020282452149694886, + "loss": 2.5685, + "step": 23795 + }, + { + "epoch": 0.7056311716039498, + "grad_norm": 0.0980583131313324, + "learning_rate": 0.00020278668415803585, + "loss": 2.5479, + "step": 23796 + }, + { + "epoch": 0.7056608249562613, + "grad_norm": 0.09371768683195114, + "learning_rate": 0.00020274884945099781, + "loss": 2.5691, + "step": 23797 + }, + { + "epoch": 0.7056904783085728, + "grad_norm": 0.0926402360200882, + "learning_rate": 0.00020271101737616987, + "loss": 2.5854, + "step": 23798 + }, + { + "epoch": 0.7057201316608842, + "grad_norm": 0.09283892065286636, + "learning_rate": 0.00020267318793388672, + "loss": 2.5553, + "step": 23799 + }, + { + "epoch": 0.7057497850131957, + "grad_norm": 0.08753272891044617, + "learning_rate": 0.00020263536112448356, + "loss": 2.5869, + "step": 23800 + }, + { + "epoch": 0.7057794383655073, + "grad_norm": 0.09214161336421967, + "learning_rate": 0.00020259753694829507, + "loss": 2.6291, + "step": 23801 + }, + { + "epoch": 0.7058090917178187, + "grad_norm": 0.09912925213575363, + "learning_rate": 0.0002025597154056567, + "loss": 2.5653, + "step": 23802 + }, + { + "epoch": 0.7058387450701302, + "grad_norm": 0.09968999773263931, + "learning_rate": 0.00020252189649690305, + "loss": 2.5444, + "step": 23803 + }, + { + "epoch": 0.7058683984224416, + "grad_norm": 0.10141503810882568, + "learning_rate": 0.0002024840802223691, + "loss": 2.5882, + "step": 23804 + }, + { + "epoch": 0.7058980517747532, + "grad_norm": 0.08458980917930603, + "learning_rate": 0.0002024462665823897, + "loss": 2.5627, + "step": 23805 + }, + { + "epoch": 0.7059277051270646, + "grad_norm": 0.0883575826883316, + "learning_rate": 0.00020240845557729963, + "loss": 2.556, + "step": 23806 + }, + { + "epoch": 0.7059573584793761, + "grad_norm": 0.08524489402770996, + "learning_rate": 0.0002023706472074338, + "loss": 2.6034, + "step": 23807 + }, + { + "epoch": 0.7059870118316875, + "grad_norm": 0.10135511308908463, + "learning_rate": 0.00020233284147312713, + "loss": 2.5466, + "step": 23808 + }, + { + "epoch": 0.7060166651839991, + "grad_norm": 0.09561225771903992, + "learning_rate": 0.0002022950383747141, + "loss": 2.5561, + "step": 23809 + }, + { + "epoch": 0.7060463185363105, + "grad_norm": 0.09663384407758713, + "learning_rate": 0.0002022572379125296, + "loss": 2.5718, + "step": 23810 + }, + { + "epoch": 0.706075971888622, + "grad_norm": 0.09653647243976593, + "learning_rate": 0.00020221944008690836, + "loss": 2.609, + "step": 23811 + }, + { + "epoch": 0.7061056252409335, + "grad_norm": 0.08800274133682251, + "learning_rate": 0.00020218164489818507, + "loss": 2.5568, + "step": 23812 + }, + { + "epoch": 0.706135278593245, + "grad_norm": 0.09607461839914322, + "learning_rate": 0.00020214385234669442, + "loss": 2.5736, + "step": 23813 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 0.10288777947425842, + "learning_rate": 0.00020210606243277097, + "loss": 2.5685, + "step": 23814 + }, + { + "epoch": 0.7061945852978679, + "grad_norm": 0.10008669644594193, + "learning_rate": 0.0002020682751567496, + "loss": 2.6074, + "step": 23815 + }, + { + "epoch": 0.7062242386501794, + "grad_norm": 0.10555814206600189, + "learning_rate": 0.00020203049051896482, + "loss": 2.5965, + "step": 23816 + }, + { + "epoch": 0.7062538920024909, + "grad_norm": 0.10085151344537735, + "learning_rate": 0.00020199270851975116, + "loss": 2.5709, + "step": 23817 + }, + { + "epoch": 0.7062835453548023, + "grad_norm": 0.09628547728061676, + "learning_rate": 0.00020195492915944325, + "loss": 2.5828, + "step": 23818 + }, + { + "epoch": 0.7063131987071138, + "grad_norm": 0.10085857659578323, + "learning_rate": 0.00020191715243837578, + "loss": 2.6261, + "step": 23819 + }, + { + "epoch": 0.7063428520594253, + "grad_norm": 0.09481004625558853, + "learning_rate": 0.00020187937835688285, + "loss": 2.6133, + "step": 23820 + }, + { + "epoch": 0.7063725054117368, + "grad_norm": 0.10973133146762848, + "learning_rate": 0.0002018416069152993, + "loss": 2.5656, + "step": 23821 + }, + { + "epoch": 0.7064021587640483, + "grad_norm": 0.0936284214258194, + "learning_rate": 0.0002018038381139594, + "loss": 2.59, + "step": 23822 + }, + { + "epoch": 0.7064318121163597, + "grad_norm": 0.09700234979391098, + "learning_rate": 0.00020176607195319775, + "loss": 2.5831, + "step": 23823 + }, + { + "epoch": 0.7064614654686713, + "grad_norm": 0.0977705642580986, + "learning_rate": 0.00020172830843334872, + "loss": 2.5751, + "step": 23824 + }, + { + "epoch": 0.7064911188209827, + "grad_norm": 0.09889817982912064, + "learning_rate": 0.0002016905475547467, + "loss": 2.5829, + "step": 23825 + }, + { + "epoch": 0.7065207721732942, + "grad_norm": 0.09816557914018631, + "learning_rate": 0.00020165278931772612, + "loss": 2.6033, + "step": 23826 + }, + { + "epoch": 0.7065504255256057, + "grad_norm": 0.10057731717824936, + "learning_rate": 0.0002016150337226211, + "loss": 2.5859, + "step": 23827 + }, + { + "epoch": 0.7065800788779172, + "grad_norm": 0.08936526626348495, + "learning_rate": 0.00020157728076976633, + "loss": 2.5831, + "step": 23828 + }, + { + "epoch": 0.7066097322302286, + "grad_norm": 0.10278304666280746, + "learning_rate": 0.0002015395304594962, + "loss": 2.5736, + "step": 23829 + }, + { + "epoch": 0.7066393855825401, + "grad_norm": 0.09268791228532791, + "learning_rate": 0.00020150178279214453, + "loss": 2.5954, + "step": 23830 + }, + { + "epoch": 0.7066690389348516, + "grad_norm": 0.09670621156692505, + "learning_rate": 0.00020146403776804585, + "loss": 2.5874, + "step": 23831 + }, + { + "epoch": 0.7066986922871631, + "grad_norm": 0.11350274831056595, + "learning_rate": 0.00020142629538753433, + "loss": 2.6002, + "step": 23832 + }, + { + "epoch": 0.7067283456394745, + "grad_norm": 0.10591328144073486, + "learning_rate": 0.00020138855565094416, + "loss": 2.5576, + "step": 23833 + }, + { + "epoch": 0.706757998991786, + "grad_norm": 0.09859855473041534, + "learning_rate": 0.00020135081855860966, + "loss": 2.5764, + "step": 23834 + }, + { + "epoch": 0.7067876523440975, + "grad_norm": 0.10798625648021698, + "learning_rate": 0.0002013130841108649, + "loss": 2.5647, + "step": 23835 + }, + { + "epoch": 0.706817305696409, + "grad_norm": 0.09978312253952026, + "learning_rate": 0.00020127535230804407, + "loss": 2.5998, + "step": 23836 + }, + { + "epoch": 0.7068469590487204, + "grad_norm": 0.10095397382974625, + "learning_rate": 0.00020123762315048122, + "loss": 2.5891, + "step": 23837 + }, + { + "epoch": 0.706876612401032, + "grad_norm": 0.10523652285337448, + "learning_rate": 0.00020119989663851056, + "loss": 2.5506, + "step": 23838 + }, + { + "epoch": 0.7069062657533434, + "grad_norm": 0.0934721827507019, + "learning_rate": 0.00020116217277246608, + "loss": 2.5554, + "step": 23839 + }, + { + "epoch": 0.7069359191056549, + "grad_norm": 0.10262279957532883, + "learning_rate": 0.00020112445155268184, + "loss": 2.5834, + "step": 23840 + }, + { + "epoch": 0.7069655724579663, + "grad_norm": 0.09316235780715942, + "learning_rate": 0.0002010867329794919, + "loss": 2.5238, + "step": 23841 + }, + { + "epoch": 0.7069952258102778, + "grad_norm": 0.10024455189704895, + "learning_rate": 0.00020104901705323027, + "loss": 2.5656, + "step": 23842 + }, + { + "epoch": 0.7070248791625894, + "grad_norm": 0.09694229811429977, + "learning_rate": 0.00020101130377423088, + "loss": 2.6172, + "step": 23843 + }, + { + "epoch": 0.7070545325149008, + "grad_norm": 0.09189872443675995, + "learning_rate": 0.0002009735931428277, + "loss": 2.5676, + "step": 23844 + }, + { + "epoch": 0.7070841858672123, + "grad_norm": 0.08896569907665253, + "learning_rate": 0.00020093588515935468, + "loss": 2.5945, + "step": 23845 + }, + { + "epoch": 0.7071138392195238, + "grad_norm": 0.11620738357305527, + "learning_rate": 0.00020089817982414575, + "loss": 2.5795, + "step": 23846 + }, + { + "epoch": 0.7071434925718353, + "grad_norm": 0.10877172648906708, + "learning_rate": 0.0002008604771375348, + "loss": 2.5832, + "step": 23847 + }, + { + "epoch": 0.7071731459241467, + "grad_norm": 0.09904666244983673, + "learning_rate": 0.00020082277709985562, + "loss": 2.6054, + "step": 23848 + }, + { + "epoch": 0.7072027992764582, + "grad_norm": 0.1340995579957962, + "learning_rate": 0.00020078507971144215, + "loss": 2.5959, + "step": 23849 + }, + { + "epoch": 0.7072324526287697, + "grad_norm": 0.11091718822717667, + "learning_rate": 0.0002007473849726281, + "loss": 2.5994, + "step": 23850 + }, + { + "epoch": 0.7072621059810812, + "grad_norm": 0.10852380841970444, + "learning_rate": 0.0002007096928837474, + "loss": 2.5899, + "step": 23851 + }, + { + "epoch": 0.7072917593333926, + "grad_norm": 0.12900130450725555, + "learning_rate": 0.0002006720034451337, + "loss": 2.5501, + "step": 23852 + }, + { + "epoch": 0.7073214126857041, + "grad_norm": 0.08991740643978119, + "learning_rate": 0.0002006343166571208, + "loss": 2.6029, + "step": 23853 + }, + { + "epoch": 0.7073510660380156, + "grad_norm": 0.12584199011325836, + "learning_rate": 0.0002005966325200424, + "loss": 2.6144, + "step": 23854 + }, + { + "epoch": 0.7073807193903271, + "grad_norm": 0.1098950132727623, + "learning_rate": 0.00020055895103423223, + "loss": 2.6028, + "step": 23855 + }, + { + "epoch": 0.7074103727426385, + "grad_norm": 0.11129483580589294, + "learning_rate": 0.00020052127220002391, + "loss": 2.5954, + "step": 23856 + }, + { + "epoch": 0.70744002609495, + "grad_norm": 0.1262284219264984, + "learning_rate": 0.00020048359601775113, + "loss": 2.6005, + "step": 23857 + }, + { + "epoch": 0.7074696794472615, + "grad_norm": 0.10468518733978271, + "learning_rate": 0.00020044592248774756, + "loss": 2.5818, + "step": 23858 + }, + { + "epoch": 0.707499332799573, + "grad_norm": 0.10289044678211212, + "learning_rate": 0.00020040825161034672, + "loss": 2.5636, + "step": 23859 + }, + { + "epoch": 0.7075289861518844, + "grad_norm": 0.10267002880573273, + "learning_rate": 0.00020037058338588225, + "loss": 2.6069, + "step": 23860 + }, + { + "epoch": 0.707558639504196, + "grad_norm": 0.09878773242235184, + "learning_rate": 0.0002003329178146877, + "loss": 2.5463, + "step": 23861 + }, + { + "epoch": 0.7075882928565075, + "grad_norm": 0.09866869449615479, + "learning_rate": 0.0002002952548970966, + "loss": 2.6265, + "step": 23862 + }, + { + "epoch": 0.7076179462088189, + "grad_norm": 0.10722196102142334, + "learning_rate": 0.00020025759463344247, + "loss": 2.5813, + "step": 23863 + }, + { + "epoch": 0.7076475995611304, + "grad_norm": 0.09830480068922043, + "learning_rate": 0.000200219937024059, + "loss": 2.5833, + "step": 23864 + }, + { + "epoch": 0.7076772529134419, + "grad_norm": 0.11361147463321686, + "learning_rate": 0.00020018228206927901, + "loss": 2.5965, + "step": 23865 + }, + { + "epoch": 0.7077069062657534, + "grad_norm": 0.09275253117084503, + "learning_rate": 0.0002001446297694366, + "loss": 2.6048, + "step": 23866 + }, + { + "epoch": 0.7077365596180648, + "grad_norm": 0.11091553419828415, + "learning_rate": 0.00020010698012486495, + "loss": 2.5837, + "step": 23867 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 0.09234457463026047, + "learning_rate": 0.0002000693331358975, + "loss": 2.5626, + "step": 23868 + }, + { + "epoch": 0.7077958663226878, + "grad_norm": 0.10325109213590622, + "learning_rate": 0.00020003168880286754, + "loss": 2.5713, + "step": 23869 + }, + { + "epoch": 0.7078255196749993, + "grad_norm": 0.09129665046930313, + "learning_rate": 0.00019999404712610846, + "loss": 2.521, + "step": 23870 + }, + { + "epoch": 0.7078551730273107, + "grad_norm": 0.09275693446397781, + "learning_rate": 0.00019995640810595367, + "loss": 2.5662, + "step": 23871 + }, + { + "epoch": 0.7078848263796222, + "grad_norm": 0.0908278375864029, + "learning_rate": 0.0001999187717427363, + "loss": 2.5619, + "step": 23872 + }, + { + "epoch": 0.7079144797319337, + "grad_norm": 0.08729864656925201, + "learning_rate": 0.00019988113803678977, + "loss": 2.5569, + "step": 23873 + }, + { + "epoch": 0.7079441330842452, + "grad_norm": 0.09203671663999557, + "learning_rate": 0.00019984350698844738, + "loss": 2.574, + "step": 23874 + }, + { + "epoch": 0.7079737864365566, + "grad_norm": 0.09387239068746567, + "learning_rate": 0.00019980587859804217, + "loss": 2.5876, + "step": 23875 + }, + { + "epoch": 0.7080034397888681, + "grad_norm": 0.09632261097431183, + "learning_rate": 0.00019976825286590737, + "loss": 2.5864, + "step": 23876 + }, + { + "epoch": 0.7080330931411796, + "grad_norm": 0.08629574626684189, + "learning_rate": 0.00019973062979237628, + "loss": 2.5841, + "step": 23877 + }, + { + "epoch": 0.7080627464934911, + "grad_norm": 0.10314365476369858, + "learning_rate": 0.00019969300937778178, + "loss": 2.5954, + "step": 23878 + }, + { + "epoch": 0.7080923998458025, + "grad_norm": 0.09805966913700104, + "learning_rate": 0.00019965539162245744, + "loss": 2.6024, + "step": 23879 + }, + { + "epoch": 0.708122053198114, + "grad_norm": 0.1007842943072319, + "learning_rate": 0.00019961777652673614, + "loss": 2.5814, + "step": 23880 + }, + { + "epoch": 0.7081517065504255, + "grad_norm": 0.09802605956792831, + "learning_rate": 0.000199580164090951, + "loss": 2.5778, + "step": 23881 + }, + { + "epoch": 0.708181359902737, + "grad_norm": 0.10617540776729584, + "learning_rate": 0.00019954255431543506, + "loss": 2.6081, + "step": 23882 + }, + { + "epoch": 0.7082110132550485, + "grad_norm": 0.08523427695035934, + "learning_rate": 0.00019950494720052137, + "loss": 2.5745, + "step": 23883 + }, + { + "epoch": 0.70824066660736, + "grad_norm": 0.10385403782129288, + "learning_rate": 0.0001994673427465431, + "loss": 2.5359, + "step": 23884 + }, + { + "epoch": 0.7082703199596715, + "grad_norm": 0.09375720471143723, + "learning_rate": 0.00019942974095383298, + "loss": 2.5697, + "step": 23885 + }, + { + "epoch": 0.7082999733119829, + "grad_norm": 0.10252655297517776, + "learning_rate": 0.0001993921418227241, + "loss": 2.5366, + "step": 23886 + }, + { + "epoch": 0.7083296266642944, + "grad_norm": 0.10545673966407776, + "learning_rate": 0.00019935454535354935, + "loss": 2.5824, + "step": 23887 + }, + { + "epoch": 0.7083592800166059, + "grad_norm": 0.09491468966007233, + "learning_rate": 0.0001993169515466417, + "loss": 2.6035, + "step": 23888 + }, + { + "epoch": 0.7083889333689174, + "grad_norm": 0.11389113962650299, + "learning_rate": 0.00019927936040233413, + "loss": 2.5892, + "step": 23889 + }, + { + "epoch": 0.7084185867212288, + "grad_norm": 0.0852053239941597, + "learning_rate": 0.00019924177192095938, + "loss": 2.5646, + "step": 23890 + }, + { + "epoch": 0.7084482400735403, + "grad_norm": 0.10765748471021652, + "learning_rate": 0.00019920418610285017, + "loss": 2.5997, + "step": 23891 + }, + { + "epoch": 0.7084778934258518, + "grad_norm": 0.09290298819541931, + "learning_rate": 0.00019916660294833977, + "loss": 2.5723, + "step": 23892 + }, + { + "epoch": 0.7085075467781633, + "grad_norm": 0.09446170181035995, + "learning_rate": 0.0001991290224577607, + "loss": 2.6019, + "step": 23893 + }, + { + "epoch": 0.7085372001304747, + "grad_norm": 0.09387177973985672, + "learning_rate": 0.0001990914446314458, + "loss": 2.591, + "step": 23894 + }, + { + "epoch": 0.7085668534827863, + "grad_norm": 0.0849972814321518, + "learning_rate": 0.0001990538694697279, + "loss": 2.5688, + "step": 23895 + }, + { + "epoch": 0.7085965068350977, + "grad_norm": 0.10091184079647064, + "learning_rate": 0.00019901629697293961, + "loss": 2.6204, + "step": 23896 + }, + { + "epoch": 0.7086261601874092, + "grad_norm": 0.08765359967947006, + "learning_rate": 0.00019897872714141356, + "loss": 2.5824, + "step": 23897 + }, + { + "epoch": 0.7086558135397206, + "grad_norm": 0.09567553550004959, + "learning_rate": 0.00019894115997548257, + "loss": 2.6071, + "step": 23898 + }, + { + "epoch": 0.7086854668920322, + "grad_norm": 0.08697070181369781, + "learning_rate": 0.0001989035954754793, + "loss": 2.6086, + "step": 23899 + }, + { + "epoch": 0.7087151202443436, + "grad_norm": 0.10073152929544449, + "learning_rate": 0.00019886603364173639, + "loss": 2.5615, + "step": 23900 + }, + { + "epoch": 0.7087447735966551, + "grad_norm": 0.09095535427331924, + "learning_rate": 0.00019882847447458645, + "loss": 2.5934, + "step": 23901 + }, + { + "epoch": 0.7087744269489665, + "grad_norm": 0.08946356177330017, + "learning_rate": 0.00019879091797436204, + "loss": 2.5732, + "step": 23902 + }, + { + "epoch": 0.7088040803012781, + "grad_norm": 0.08531086146831512, + "learning_rate": 0.00019875336414139572, + "loss": 2.5861, + "step": 23903 + }, + { + "epoch": 0.7088337336535896, + "grad_norm": 0.0887083038687706, + "learning_rate": 0.00019871581297601992, + "loss": 2.5655, + "step": 23904 + }, + { + "epoch": 0.708863387005901, + "grad_norm": 0.09812656790018082, + "learning_rate": 0.0001986782644785677, + "loss": 2.5928, + "step": 23905 + }, + { + "epoch": 0.7088930403582125, + "grad_norm": 0.08822423219680786, + "learning_rate": 0.00019864071864937095, + "loss": 2.5865, + "step": 23906 + }, + { + "epoch": 0.708922693710524, + "grad_norm": 0.09112558513879776, + "learning_rate": 0.00019860317548876238, + "loss": 2.6233, + "step": 23907 + }, + { + "epoch": 0.7089523470628355, + "grad_norm": 0.0995173454284668, + "learning_rate": 0.00019856563499707442, + "loss": 2.5928, + "step": 23908 + }, + { + "epoch": 0.7089820004151469, + "grad_norm": 0.09028954803943634, + "learning_rate": 0.00019852809717463954, + "loss": 2.5862, + "step": 23909 + }, + { + "epoch": 0.7090116537674584, + "grad_norm": 0.0987422838807106, + "learning_rate": 0.00019849056202179006, + "loss": 2.5873, + "step": 23910 + }, + { + "epoch": 0.7090413071197699, + "grad_norm": 0.09731251001358032, + "learning_rate": 0.0001984530295388584, + "loss": 2.5655, + "step": 23911 + }, + { + "epoch": 0.7090709604720814, + "grad_norm": 0.10057444870471954, + "learning_rate": 0.00019841549972617696, + "loss": 2.5986, + "step": 23912 + }, + { + "epoch": 0.7091006138243928, + "grad_norm": 0.10609120875597, + "learning_rate": 0.000198377972584078, + "loss": 2.579, + "step": 23913 + }, + { + "epoch": 0.7091302671767044, + "grad_norm": 0.09868987649679184, + "learning_rate": 0.00019834044811289393, + "loss": 2.6066, + "step": 23914 + }, + { + "epoch": 0.7091599205290158, + "grad_norm": 0.09071817249059677, + "learning_rate": 0.00019830292631295687, + "loss": 2.5706, + "step": 23915 + }, + { + "epoch": 0.7091895738813273, + "grad_norm": 0.09892532974481583, + "learning_rate": 0.00019826540718459928, + "loss": 2.6132, + "step": 23916 + }, + { + "epoch": 0.7092192272336387, + "grad_norm": 0.10731639713048935, + "learning_rate": 0.00019822789072815322, + "loss": 2.5677, + "step": 23917 + }, + { + "epoch": 0.7092488805859503, + "grad_norm": 0.09667076915502548, + "learning_rate": 0.00019819037694395104, + "loss": 2.5839, + "step": 23918 + }, + { + "epoch": 0.7092785339382617, + "grad_norm": 0.1110658124089241, + "learning_rate": 0.0001981528658323249, + "loss": 2.6033, + "step": 23919 + }, + { + "epoch": 0.7093081872905732, + "grad_norm": 0.10690604150295258, + "learning_rate": 0.00019811535739360685, + "loss": 2.5729, + "step": 23920 + }, + { + "epoch": 0.7093378406428846, + "grad_norm": 0.09286917001008987, + "learning_rate": 0.0001980778516281292, + "loss": 2.5472, + "step": 23921 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 0.10690095275640488, + "learning_rate": 0.000198040348536224, + "loss": 2.5935, + "step": 23922 + }, + { + "epoch": 0.7093971473475076, + "grad_norm": 0.09327170252799988, + "learning_rate": 0.00019800284811822332, + "loss": 2.5558, + "step": 23923 + }, + { + "epoch": 0.7094268006998191, + "grad_norm": 0.111866794526577, + "learning_rate": 0.0001979653503744593, + "loss": 2.5501, + "step": 23924 + }, + { + "epoch": 0.7094564540521306, + "grad_norm": 0.09623749554157257, + "learning_rate": 0.00019792785530526385, + "loss": 2.5643, + "step": 23925 + }, + { + "epoch": 0.7094861074044421, + "grad_norm": 0.1361645758152008, + "learning_rate": 0.00019789036291096917, + "loss": 2.5427, + "step": 23926 + }, + { + "epoch": 0.7095157607567536, + "grad_norm": 0.09918307512998581, + "learning_rate": 0.0001978528731919072, + "loss": 2.5746, + "step": 23927 + }, + { + "epoch": 0.709545414109065, + "grad_norm": 0.1236235499382019, + "learning_rate": 0.00019781538614840983, + "loss": 2.6209, + "step": 23928 + }, + { + "epoch": 0.7095750674613766, + "grad_norm": 0.10924067348241806, + "learning_rate": 0.00019777790178080934, + "loss": 2.6041, + "step": 23929 + }, + { + "epoch": 0.709604720813688, + "grad_norm": 0.09677613526582718, + "learning_rate": 0.000197740420089437, + "loss": 2.5914, + "step": 23930 + }, + { + "epoch": 0.7096343741659995, + "grad_norm": 0.10630865395069122, + "learning_rate": 0.00019770294107462528, + "loss": 2.5793, + "step": 23931 + }, + { + "epoch": 0.7096640275183109, + "grad_norm": 0.09525787830352783, + "learning_rate": 0.00019766546473670593, + "loss": 2.585, + "step": 23932 + }, + { + "epoch": 0.7096936808706225, + "grad_norm": 0.11927289515733719, + "learning_rate": 0.0001976279910760108, + "loss": 2.6082, + "step": 23933 + }, + { + "epoch": 0.7097233342229339, + "grad_norm": 0.08664209395647049, + "learning_rate": 0.0001975905200928717, + "loss": 2.5792, + "step": 23934 + }, + { + "epoch": 0.7097529875752454, + "grad_norm": 0.0915154367685318, + "learning_rate": 0.00019755305178762044, + "loss": 2.5874, + "step": 23935 + }, + { + "epoch": 0.7097826409275568, + "grad_norm": 0.08825980126857758, + "learning_rate": 0.00019751558616058884, + "loss": 2.5548, + "step": 23936 + }, + { + "epoch": 0.7098122942798684, + "grad_norm": 0.0903063639998436, + "learning_rate": 0.00019747812321210868, + "loss": 2.5973, + "step": 23937 + }, + { + "epoch": 0.7098419476321798, + "grad_norm": 0.09017007797956467, + "learning_rate": 0.00019744066294251163, + "loss": 2.5499, + "step": 23938 + }, + { + "epoch": 0.7098716009844913, + "grad_norm": 0.0898907408118248, + "learning_rate": 0.00019740320535212942, + "loss": 2.5806, + "step": 23939 + }, + { + "epoch": 0.7099012543368027, + "grad_norm": 0.09795501083135605, + "learning_rate": 0.00019736575044129395, + "loss": 2.5556, + "step": 23940 + }, + { + "epoch": 0.7099309076891143, + "grad_norm": 0.09105461090803146, + "learning_rate": 0.00019732829821033653, + "loss": 2.5935, + "step": 23941 + }, + { + "epoch": 0.7099605610414257, + "grad_norm": 0.10067497938871384, + "learning_rate": 0.000197290848659589, + "loss": 2.614, + "step": 23942 + }, + { + "epoch": 0.7099902143937372, + "grad_norm": 0.10353010147809982, + "learning_rate": 0.0001972534017893828, + "loss": 2.5641, + "step": 23943 + }, + { + "epoch": 0.7100198677460486, + "grad_norm": 0.09340808540582657, + "learning_rate": 0.00019721595760004984, + "loss": 2.5494, + "step": 23944 + }, + { + "epoch": 0.7100495210983602, + "grad_norm": 0.10572446882724762, + "learning_rate": 0.00019717851609192156, + "loss": 2.5589, + "step": 23945 + }, + { + "epoch": 0.7100791744506717, + "grad_norm": 0.09425012022256851, + "learning_rate": 0.00019714107726532949, + "loss": 2.58, + "step": 23946 + }, + { + "epoch": 0.7101088278029831, + "grad_norm": 0.09876695275306702, + "learning_rate": 0.00019710364112060519, + "loss": 2.6137, + "step": 23947 + }, + { + "epoch": 0.7101384811552947, + "grad_norm": 0.09130099415779114, + "learning_rate": 0.00019706620765808008, + "loss": 2.5565, + "step": 23948 + }, + { + "epoch": 0.7101681345076061, + "grad_norm": 0.09489033371210098, + "learning_rate": 0.00019702877687808573, + "loss": 2.5756, + "step": 23949 + }, + { + "epoch": 0.7101977878599176, + "grad_norm": 0.09306388348340988, + "learning_rate": 0.00019699134878095376, + "loss": 2.5436, + "step": 23950 + }, + { + "epoch": 0.710227441212229, + "grad_norm": 0.09570413827896118, + "learning_rate": 0.00019695392336701524, + "loss": 2.5631, + "step": 23951 + }, + { + "epoch": 0.7102570945645406, + "grad_norm": 0.08956272155046463, + "learning_rate": 0.00019691650063660172, + "loss": 2.5974, + "step": 23952 + }, + { + "epoch": 0.710286747916852, + "grad_norm": 0.088827945291996, + "learning_rate": 0.0001968790805900446, + "loss": 2.5834, + "step": 23953 + }, + { + "epoch": 0.7103164012691635, + "grad_norm": 0.09569460153579712, + "learning_rate": 0.00019684166322767532, + "loss": 2.5673, + "step": 23954 + }, + { + "epoch": 0.7103460546214749, + "grad_norm": 0.09439900517463684, + "learning_rate": 0.00019680424854982508, + "loss": 2.5647, + "step": 23955 + }, + { + "epoch": 0.7103757079737865, + "grad_norm": 0.09690933674573898, + "learning_rate": 0.00019676683655682508, + "loss": 2.5765, + "step": 23956 + }, + { + "epoch": 0.7104053613260979, + "grad_norm": 0.09910652786493301, + "learning_rate": 0.00019672942724900699, + "loss": 2.5955, + "step": 23957 + }, + { + "epoch": 0.7104350146784094, + "grad_norm": 0.10067705810070038, + "learning_rate": 0.0001966920206267019, + "loss": 2.5685, + "step": 23958 + }, + { + "epoch": 0.7104646680307208, + "grad_norm": 0.09424453973770142, + "learning_rate": 0.00019665461669024105, + "loss": 2.6208, + "step": 23959 + }, + { + "epoch": 0.7104943213830324, + "grad_norm": 0.11552242189645767, + "learning_rate": 0.0001966172154399556, + "loss": 2.5719, + "step": 23960 + }, + { + "epoch": 0.7105239747353438, + "grad_norm": 0.08545660972595215, + "learning_rate": 0.00019657981687617692, + "loss": 2.5756, + "step": 23961 + }, + { + "epoch": 0.7105536280876553, + "grad_norm": 0.11039025336503983, + "learning_rate": 0.00019654242099923592, + "loss": 2.5608, + "step": 23962 + }, + { + "epoch": 0.7105832814399667, + "grad_norm": 0.10330456495285034, + "learning_rate": 0.00019650502780946383, + "loss": 2.5847, + "step": 23963 + }, + { + "epoch": 0.7106129347922783, + "grad_norm": 0.09554030746221542, + "learning_rate": 0.00019646763730719186, + "loss": 2.5872, + "step": 23964 + }, + { + "epoch": 0.7106425881445897, + "grad_norm": 0.0981389582157135, + "learning_rate": 0.00019643024949275102, + "loss": 2.5574, + "step": 23965 + }, + { + "epoch": 0.7106722414969012, + "grad_norm": 0.10011310130357742, + "learning_rate": 0.00019639286436647248, + "loss": 2.6068, + "step": 23966 + }, + { + "epoch": 0.7107018948492128, + "grad_norm": 0.09445574879646301, + "learning_rate": 0.0001963554819286872, + "loss": 2.5648, + "step": 23967 + }, + { + "epoch": 0.7107315482015242, + "grad_norm": 0.10002325475215912, + "learning_rate": 0.00019631810217972623, + "loss": 2.5917, + "step": 23968 + }, + { + "epoch": 0.7107612015538357, + "grad_norm": 0.10417220741510391, + "learning_rate": 0.00019628072511992045, + "loss": 2.5445, + "step": 23969 + }, + { + "epoch": 0.7107908549061471, + "grad_norm": 0.08995714783668518, + "learning_rate": 0.00019624335074960116, + "loss": 2.5985, + "step": 23970 + }, + { + "epoch": 0.7108205082584587, + "grad_norm": 0.10878366231918335, + "learning_rate": 0.00019620597906909925, + "loss": 2.5723, + "step": 23971 + }, + { + "epoch": 0.7108501616107701, + "grad_norm": 0.100090391933918, + "learning_rate": 0.00019616861007874543, + "loss": 2.5886, + "step": 23972 + }, + { + "epoch": 0.7108798149630816, + "grad_norm": 0.10470622777938843, + "learning_rate": 0.0001961312437788707, + "loss": 2.5281, + "step": 23973 + }, + { + "epoch": 0.710909468315393, + "grad_norm": 0.11058637499809265, + "learning_rate": 0.000196093880169806, + "loss": 2.5868, + "step": 23974 + }, + { + "epoch": 0.7109391216677046, + "grad_norm": 0.09523297846317291, + "learning_rate": 0.00019605651925188211, + "loss": 2.5417, + "step": 23975 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 0.09529983252286911, + "learning_rate": 0.00019601916102542994, + "loss": 2.5669, + "step": 23976 + }, + { + "epoch": 0.7109984283723275, + "grad_norm": 0.09750334918498993, + "learning_rate": 0.00019598180549078027, + "loss": 2.5784, + "step": 23977 + }, + { + "epoch": 0.7110280817246389, + "grad_norm": 0.09960266947746277, + "learning_rate": 0.00019594445264826388, + "loss": 2.5623, + "step": 23978 + }, + { + "epoch": 0.7110577350769505, + "grad_norm": 0.10331480205059052, + "learning_rate": 0.0001959071024982116, + "loss": 2.5938, + "step": 23979 + }, + { + "epoch": 0.7110873884292619, + "grad_norm": 0.08912942558526993, + "learning_rate": 0.00019586975504095407, + "loss": 2.5799, + "step": 23980 + }, + { + "epoch": 0.7111170417815734, + "grad_norm": 0.09003341197967529, + "learning_rate": 0.00019583241027682207, + "loss": 2.5463, + "step": 23981 + }, + { + "epoch": 0.7111466951338848, + "grad_norm": 0.0987754836678505, + "learning_rate": 0.0001957950682061463, + "loss": 2.5827, + "step": 23982 + }, + { + "epoch": 0.7111763484861964, + "grad_norm": 0.09144240617752075, + "learning_rate": 0.0001957577288292574, + "loss": 2.5949, + "step": 23983 + }, + { + "epoch": 0.7112060018385078, + "grad_norm": 0.10176005959510803, + "learning_rate": 0.00019572039214648608, + "loss": 2.5857, + "step": 23984 + }, + { + "epoch": 0.7112356551908193, + "grad_norm": 0.08848375082015991, + "learning_rate": 0.00019568305815816285, + "loss": 2.5631, + "step": 23985 + }, + { + "epoch": 0.7112653085431307, + "grad_norm": 0.09557753056287766, + "learning_rate": 0.0001956457268646184, + "loss": 2.5607, + "step": 23986 + }, + { + "epoch": 0.7112949618954423, + "grad_norm": 0.10411106050014496, + "learning_rate": 0.00019560839826618333, + "loss": 2.6067, + "step": 23987 + }, + { + "epoch": 0.7113246152477538, + "grad_norm": 0.0970454141497612, + "learning_rate": 0.00019557107236318805, + "loss": 2.6138, + "step": 23988 + }, + { + "epoch": 0.7113542686000652, + "grad_norm": 0.08541272580623627, + "learning_rate": 0.00019553374915596327, + "loss": 2.5703, + "step": 23989 + }, + { + "epoch": 0.7113839219523768, + "grad_norm": 0.09417303651571274, + "learning_rate": 0.00019549642864483935, + "loss": 2.6001, + "step": 23990 + }, + { + "epoch": 0.7114135753046882, + "grad_norm": 0.09339223057031631, + "learning_rate": 0.00019545911083014683, + "loss": 2.5993, + "step": 23991 + }, + { + "epoch": 0.7114432286569997, + "grad_norm": 0.0955456793308258, + "learning_rate": 0.00019542179571221618, + "loss": 2.5767, + "step": 23992 + }, + { + "epoch": 0.7114728820093111, + "grad_norm": 0.09856688231229782, + "learning_rate": 0.00019538448329137775, + "loss": 2.5976, + "step": 23993 + }, + { + "epoch": 0.7115025353616227, + "grad_norm": 0.10217808932065964, + "learning_rate": 0.00019534717356796204, + "loss": 2.5839, + "step": 23994 + }, + { + "epoch": 0.7115321887139341, + "grad_norm": 0.1134113073348999, + "learning_rate": 0.00019530986654229943, + "loss": 2.5871, + "step": 23995 + }, + { + "epoch": 0.7115618420662456, + "grad_norm": 0.09580771625041962, + "learning_rate": 0.00019527256221472022, + "loss": 2.6051, + "step": 23996 + }, + { + "epoch": 0.711591495418557, + "grad_norm": 0.10966707020998001, + "learning_rate": 0.0001952352605855548, + "loss": 2.6022, + "step": 23997 + }, + { + "epoch": 0.7116211487708686, + "grad_norm": 0.09307246655225754, + "learning_rate": 0.0001951979616551335, + "loss": 2.5796, + "step": 23998 + }, + { + "epoch": 0.71165080212318, + "grad_norm": 0.10591083019971848, + "learning_rate": 0.00019516066542378646, + "loss": 2.5706, + "step": 23999 + }, + { + "epoch": 0.7116804554754915, + "grad_norm": 0.10242300480604172, + "learning_rate": 0.00019512337189184415, + "loss": 2.5893, + "step": 24000 + }, + { + "epoch": 0.711710108827803, + "grad_norm": 0.10389336943626404, + "learning_rate": 0.00019508608105963665, + "loss": 2.5763, + "step": 24001 + }, + { + "epoch": 0.7117397621801145, + "grad_norm": 0.11191091686487198, + "learning_rate": 0.00019504879292749427, + "loss": 2.6053, + "step": 24002 + }, + { + "epoch": 0.7117694155324259, + "grad_norm": 0.1027936264872551, + "learning_rate": 0.00019501150749574715, + "loss": 2.5991, + "step": 24003 + }, + { + "epoch": 0.7117990688847374, + "grad_norm": 0.09849509596824646, + "learning_rate": 0.0001949742247647255, + "loss": 2.5811, + "step": 24004 + }, + { + "epoch": 0.7118287222370488, + "grad_norm": 0.10268864035606384, + "learning_rate": 0.00019493694473475965, + "loss": 2.5516, + "step": 24005 + }, + { + "epoch": 0.7118583755893604, + "grad_norm": 0.0973135232925415, + "learning_rate": 0.00019489966740617926, + "loss": 2.5927, + "step": 24006 + }, + { + "epoch": 0.7118880289416718, + "grad_norm": 0.11039972305297852, + "learning_rate": 0.0001948623927793148, + "loss": 2.5833, + "step": 24007 + }, + { + "epoch": 0.7119176822939833, + "grad_norm": 0.08665554225444794, + "learning_rate": 0.00019482512085449593, + "loss": 2.6033, + "step": 24008 + }, + { + "epoch": 0.7119473356462949, + "grad_norm": 0.10239994525909424, + "learning_rate": 0.00019478785163205327, + "loss": 2.567, + "step": 24009 + }, + { + "epoch": 0.7119769889986063, + "grad_norm": 0.0870954692363739, + "learning_rate": 0.00019475058511231653, + "loss": 2.5549, + "step": 24010 + }, + { + "epoch": 0.7120066423509178, + "grad_norm": 0.11503688991069794, + "learning_rate": 0.00019471332129561574, + "loss": 2.5984, + "step": 24011 + }, + { + "epoch": 0.7120362957032292, + "grad_norm": 0.10584399104118347, + "learning_rate": 0.0001946760601822809, + "loss": 2.6028, + "step": 24012 + }, + { + "epoch": 0.7120659490555408, + "grad_norm": 0.1160084530711174, + "learning_rate": 0.00019463880177264197, + "loss": 2.5649, + "step": 24013 + }, + { + "epoch": 0.7120956024078522, + "grad_norm": 0.11174843460321426, + "learning_rate": 0.00019460154606702884, + "loss": 2.6173, + "step": 24014 + }, + { + "epoch": 0.7121252557601637, + "grad_norm": 0.09832976013422012, + "learning_rate": 0.0001945642930657715, + "loss": 2.5612, + "step": 24015 + }, + { + "epoch": 0.7121549091124751, + "grad_norm": 0.1003103107213974, + "learning_rate": 0.00019452704276919991, + "loss": 2.5831, + "step": 24016 + }, + { + "epoch": 0.7121845624647867, + "grad_norm": 0.09825487434864044, + "learning_rate": 0.0001944897951776436, + "loss": 2.6184, + "step": 24017 + }, + { + "epoch": 0.7122142158170981, + "grad_norm": 0.11443822085857391, + "learning_rate": 0.0001944525502914326, + "loss": 2.5759, + "step": 24018 + }, + { + "epoch": 0.7122438691694096, + "grad_norm": 0.09676118195056915, + "learning_rate": 0.00019441530811089674, + "loss": 2.5998, + "step": 24019 + }, + { + "epoch": 0.712273522521721, + "grad_norm": 0.09790373593568802, + "learning_rate": 0.00019437806863636576, + "loss": 2.622, + "step": 24020 + }, + { + "epoch": 0.7123031758740326, + "grad_norm": 0.10179377347230911, + "learning_rate": 0.00019434083186816925, + "loss": 2.5875, + "step": 24021 + }, + { + "epoch": 0.712332829226344, + "grad_norm": 0.0906488448381424, + "learning_rate": 0.0001943035978066373, + "loss": 2.5718, + "step": 24022 + }, + { + "epoch": 0.7123624825786555, + "grad_norm": 0.10116793215274811, + "learning_rate": 0.00019426636645209955, + "loss": 2.5796, + "step": 24023 + }, + { + "epoch": 0.712392135930967, + "grad_norm": 0.0951228141784668, + "learning_rate": 0.00019422913780488556, + "loss": 2.5775, + "step": 24024 + }, + { + "epoch": 0.7124217892832785, + "grad_norm": 0.09105668216943741, + "learning_rate": 0.00019419191186532497, + "loss": 2.578, + "step": 24025 + }, + { + "epoch": 0.7124514426355899, + "grad_norm": 0.0949062630534172, + "learning_rate": 0.00019415468863374775, + "loss": 2.5614, + "step": 24026 + }, + { + "epoch": 0.7124810959879014, + "grad_norm": 0.08864811807870865, + "learning_rate": 0.00019411746811048302, + "loss": 2.5536, + "step": 24027 + }, + { + "epoch": 0.7125107493402129, + "grad_norm": 0.10169842094182968, + "learning_rate": 0.00019408025029586068, + "loss": 2.6083, + "step": 24028 + }, + { + "epoch": 0.7125404026925244, + "grad_norm": 0.09092549234628677, + "learning_rate": 0.00019404303519021023, + "loss": 2.5988, + "step": 24029 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 0.09364277124404907, + "learning_rate": 0.0001940058227938612, + "loss": 2.5938, + "step": 24030 + }, + { + "epoch": 0.7125997093971473, + "grad_norm": 0.08760188519954681, + "learning_rate": 0.00019396861310714308, + "loss": 2.6001, + "step": 24031 + }, + { + "epoch": 0.7126293627494589, + "grad_norm": 0.09285072237253189, + "learning_rate": 0.0001939314061303855, + "loss": 2.5955, + "step": 24032 + }, + { + "epoch": 0.7126590161017703, + "grad_norm": 0.10047908872365952, + "learning_rate": 0.00019389420186391782, + "loss": 2.5923, + "step": 24033 + }, + { + "epoch": 0.7126886694540818, + "grad_norm": 0.09739526361227036, + "learning_rate": 0.00019385700030806935, + "loss": 2.6019, + "step": 24034 + }, + { + "epoch": 0.7127183228063932, + "grad_norm": 0.08544164896011353, + "learning_rate": 0.00019381980146316991, + "loss": 2.5714, + "step": 24035 + }, + { + "epoch": 0.7127479761587048, + "grad_norm": 0.0847022607922554, + "learning_rate": 0.00019378260532954862, + "loss": 2.5963, + "step": 24036 + }, + { + "epoch": 0.7127776295110162, + "grad_norm": 0.09115390479564667, + "learning_rate": 0.00019374541190753515, + "loss": 2.6186, + "step": 24037 + }, + { + "epoch": 0.7128072828633277, + "grad_norm": 0.0963975340127945, + "learning_rate": 0.0001937082211974584, + "loss": 2.5586, + "step": 24038 + }, + { + "epoch": 0.7128369362156391, + "grad_norm": 0.08853769302368164, + "learning_rate": 0.00019367103319964797, + "loss": 2.6245, + "step": 24039 + }, + { + "epoch": 0.7128665895679507, + "grad_norm": 0.0894276350736618, + "learning_rate": 0.00019363384791443305, + "loss": 2.5702, + "step": 24040 + }, + { + "epoch": 0.7128962429202621, + "grad_norm": 0.09838451445102692, + "learning_rate": 0.00019359666534214304, + "loss": 2.5938, + "step": 24041 + }, + { + "epoch": 0.7129258962725736, + "grad_norm": 0.09314123541116714, + "learning_rate": 0.00019355948548310713, + "loss": 2.5827, + "step": 24042 + }, + { + "epoch": 0.712955549624885, + "grad_norm": 0.09195670485496521, + "learning_rate": 0.00019352230833765465, + "loss": 2.5424, + "step": 24043 + }, + { + "epoch": 0.7129852029771966, + "grad_norm": 0.09195847064256668, + "learning_rate": 0.00019348513390611465, + "loss": 2.5752, + "step": 24044 + }, + { + "epoch": 0.713014856329508, + "grad_norm": 0.08840177953243256, + "learning_rate": 0.00019344796218881644, + "loss": 2.5725, + "step": 24045 + }, + { + "epoch": 0.7130445096818195, + "grad_norm": 0.10346398502588272, + "learning_rate": 0.000193410793186089, + "loss": 2.6148, + "step": 24046 + }, + { + "epoch": 0.713074163034131, + "grad_norm": 0.11602174490690231, + "learning_rate": 0.00019337362689826195, + "loss": 2.5803, + "step": 24047 + }, + { + "epoch": 0.7131038163864425, + "grad_norm": 0.08899535983800888, + "learning_rate": 0.00019333646332566384, + "loss": 2.6339, + "step": 24048 + }, + { + "epoch": 0.7131334697387539, + "grad_norm": 0.10102701187133789, + "learning_rate": 0.00019329930246862403, + "loss": 2.5581, + "step": 24049 + }, + { + "epoch": 0.7131631230910654, + "grad_norm": 0.09383491426706314, + "learning_rate": 0.00019326214432747153, + "loss": 2.5685, + "step": 24050 + }, + { + "epoch": 0.713192776443377, + "grad_norm": 0.11055481433868408, + "learning_rate": 0.00019322498890253536, + "loss": 2.5704, + "step": 24051 + }, + { + "epoch": 0.7132224297956884, + "grad_norm": 0.0894390121102333, + "learning_rate": 0.0001931878361941446, + "loss": 2.5837, + "step": 24052 + }, + { + "epoch": 0.7132520831479999, + "grad_norm": 0.10037453472614288, + "learning_rate": 0.0001931506862026282, + "loss": 2.6025, + "step": 24053 + }, + { + "epoch": 0.7132817365003113, + "grad_norm": 0.0884433463215828, + "learning_rate": 0.0001931135389283152, + "loss": 2.5607, + "step": 24054 + }, + { + "epoch": 0.7133113898526229, + "grad_norm": 0.10521341860294342, + "learning_rate": 0.0001930763943715344, + "loss": 2.6047, + "step": 24055 + }, + { + "epoch": 0.7133410432049343, + "grad_norm": 0.10103083401918411, + "learning_rate": 0.0001930392525326149, + "loss": 2.6027, + "step": 24056 + }, + { + "epoch": 0.7133706965572458, + "grad_norm": 0.10675763338804245, + "learning_rate": 0.00019300211341188544, + "loss": 2.5867, + "step": 24057 + }, + { + "epoch": 0.7134003499095573, + "grad_norm": 0.10135170817375183, + "learning_rate": 0.00019296497700967496, + "loss": 2.6165, + "step": 24058 + }, + { + "epoch": 0.7134300032618688, + "grad_norm": 0.0944296196103096, + "learning_rate": 0.00019292784332631237, + "loss": 2.5762, + "step": 24059 + }, + { + "epoch": 0.7134596566141802, + "grad_norm": 0.10438272356987, + "learning_rate": 0.0001928907123621264, + "loss": 2.5725, + "step": 24060 + }, + { + "epoch": 0.7134893099664917, + "grad_norm": 0.10035500675439835, + "learning_rate": 0.00019285358411744586, + "loss": 2.5682, + "step": 24061 + }, + { + "epoch": 0.7135189633188032, + "grad_norm": 0.09260185062885284, + "learning_rate": 0.0001928164585925996, + "loss": 2.5323, + "step": 24062 + }, + { + "epoch": 0.7135486166711147, + "grad_norm": 0.10585113614797592, + "learning_rate": 0.0001927793357879163, + "loss": 2.5768, + "step": 24063 + }, + { + "epoch": 0.7135782700234261, + "grad_norm": 0.10375959426164627, + "learning_rate": 0.00019274221570372474, + "loss": 2.6076, + "step": 24064 + }, + { + "epoch": 0.7136079233757376, + "grad_norm": 0.0964650884270668, + "learning_rate": 0.00019270509834035354, + "loss": 2.5949, + "step": 24065 + }, + { + "epoch": 0.7136375767280491, + "grad_norm": 0.09904460608959198, + "learning_rate": 0.00019266798369813154, + "loss": 2.5755, + "step": 24066 + }, + { + "epoch": 0.7136672300803606, + "grad_norm": 0.09317681938409805, + "learning_rate": 0.0001926308717773872, + "loss": 2.5888, + "step": 24067 + }, + { + "epoch": 0.713696883432672, + "grad_norm": 0.10857663303613663, + "learning_rate": 0.0001925937625784493, + "loss": 2.5915, + "step": 24068 + }, + { + "epoch": 0.7137265367849835, + "grad_norm": 0.08813334256410599, + "learning_rate": 0.0001925566561016464, + "loss": 2.5819, + "step": 24069 + }, + { + "epoch": 0.713756190137295, + "grad_norm": 0.09956547617912292, + "learning_rate": 0.00019251955234730707, + "loss": 2.5959, + "step": 24070 + }, + { + "epoch": 0.7137858434896065, + "grad_norm": 0.10135690122842789, + "learning_rate": 0.00019248245131576002, + "loss": 2.592, + "step": 24071 + }, + { + "epoch": 0.713815496841918, + "grad_norm": 0.09852079302072525, + "learning_rate": 0.0001924453530073334, + "loss": 2.5831, + "step": 24072 + }, + { + "epoch": 0.7138451501942294, + "grad_norm": 0.09952999651432037, + "learning_rate": 0.00019240825742235606, + "loss": 2.5768, + "step": 24073 + }, + { + "epoch": 0.713874803546541, + "grad_norm": 0.0961129292845726, + "learning_rate": 0.00019237116456115644, + "loss": 2.5341, + "step": 24074 + }, + { + "epoch": 0.7139044568988524, + "grad_norm": 0.10518576949834824, + "learning_rate": 0.00019233407442406292, + "loss": 2.595, + "step": 24075 + }, + { + "epoch": 0.7139341102511639, + "grad_norm": 0.08626144379377365, + "learning_rate": 0.00019229698701140403, + "loss": 2.5945, + "step": 24076 + }, + { + "epoch": 0.7139637636034754, + "grad_norm": 0.10447391122579575, + "learning_rate": 0.00019225990232350808, + "loss": 2.5612, + "step": 24077 + }, + { + "epoch": 0.7139934169557869, + "grad_norm": 0.10199365764856339, + "learning_rate": 0.00019222282036070355, + "loss": 2.5596, + "step": 24078 + }, + { + "epoch": 0.7140230703080983, + "grad_norm": 0.10287552326917648, + "learning_rate": 0.00019218574112331877, + "loss": 2.597, + "step": 24079 + }, + { + "epoch": 0.7140527236604098, + "grad_norm": 0.09943700581789017, + "learning_rate": 0.00019214866461168207, + "loss": 2.5772, + "step": 24080 + }, + { + "epoch": 0.7140823770127213, + "grad_norm": 0.10598381608724594, + "learning_rate": 0.000192111590826122, + "loss": 2.6031, + "step": 24081 + }, + { + "epoch": 0.7141120303650328, + "grad_norm": 0.10560090839862823, + "learning_rate": 0.00019207451976696645, + "loss": 2.581, + "step": 24082 + }, + { + "epoch": 0.7141416837173442, + "grad_norm": 0.09630879014730453, + "learning_rate": 0.00019203745143454387, + "loss": 2.6004, + "step": 24083 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 0.09378387778997421, + "learning_rate": 0.00019200038582918255, + "loss": 2.5769, + "step": 24084 + }, + { + "epoch": 0.7142009904219672, + "grad_norm": 0.0924936830997467, + "learning_rate": 0.0001919633229512105, + "loss": 2.5682, + "step": 24085 + }, + { + "epoch": 0.7142306437742787, + "grad_norm": 0.10669771581888199, + "learning_rate": 0.0001919262628009562, + "loss": 2.5744, + "step": 24086 + }, + { + "epoch": 0.7142602971265901, + "grad_norm": 0.09305890649557114, + "learning_rate": 0.00019188920537874783, + "loss": 2.5794, + "step": 24087 + }, + { + "epoch": 0.7142899504789016, + "grad_norm": 0.0975121259689331, + "learning_rate": 0.00019185215068491336, + "loss": 2.5695, + "step": 24088 + }, + { + "epoch": 0.7143196038312131, + "grad_norm": 0.0930948480963707, + "learning_rate": 0.00019181509871978104, + "loss": 2.5966, + "step": 24089 + }, + { + "epoch": 0.7143492571835246, + "grad_norm": 0.09396103769540787, + "learning_rate": 0.00019177804948367888, + "loss": 2.5977, + "step": 24090 + }, + { + "epoch": 0.7143789105358361, + "grad_norm": 0.10012126713991165, + "learning_rate": 0.00019174100297693504, + "loss": 2.6084, + "step": 24091 + }, + { + "epoch": 0.7144085638881476, + "grad_norm": 0.09113192558288574, + "learning_rate": 0.00019170395919987767, + "loss": 2.5862, + "step": 24092 + }, + { + "epoch": 0.7144382172404591, + "grad_norm": 0.09150926023721695, + "learning_rate": 0.0001916669181528345, + "loss": 2.5979, + "step": 24093 + }, + { + "epoch": 0.7144678705927705, + "grad_norm": 0.11941002309322357, + "learning_rate": 0.00019162987983613368, + "loss": 2.5777, + "step": 24094 + }, + { + "epoch": 0.714497523945082, + "grad_norm": 0.09864534437656403, + "learning_rate": 0.0001915928442501032, + "loss": 2.5688, + "step": 24095 + }, + { + "epoch": 0.7145271772973935, + "grad_norm": 0.09457148611545563, + "learning_rate": 0.000191555811395071, + "loss": 2.551, + "step": 24096 + }, + { + "epoch": 0.714556830649705, + "grad_norm": 0.09369859844446182, + "learning_rate": 0.00019151878127136507, + "loss": 2.5997, + "step": 24097 + }, + { + "epoch": 0.7145864840020164, + "grad_norm": 0.09738969802856445, + "learning_rate": 0.0001914817538793131, + "loss": 2.5873, + "step": 24098 + }, + { + "epoch": 0.7146161373543279, + "grad_norm": 0.08979079872369766, + "learning_rate": 0.00019144472921924332, + "loss": 2.5844, + "step": 24099 + }, + { + "epoch": 0.7146457907066394, + "grad_norm": 0.08985293656587601, + "learning_rate": 0.00019140770729148348, + "loss": 2.5643, + "step": 24100 + }, + { + "epoch": 0.7146754440589509, + "grad_norm": 0.08386661112308502, + "learning_rate": 0.00019137068809636134, + "loss": 2.5584, + "step": 24101 + }, + { + "epoch": 0.7147050974112623, + "grad_norm": 0.08990713208913803, + "learning_rate": 0.00019133367163420484, + "loss": 2.5852, + "step": 24102 + }, + { + "epoch": 0.7147347507635738, + "grad_norm": 0.09307534247636795, + "learning_rate": 0.00019129665790534157, + "loss": 2.5851, + "step": 24103 + }, + { + "epoch": 0.7147644041158853, + "grad_norm": 0.087392657995224, + "learning_rate": 0.00019125964691009935, + "loss": 2.5669, + "step": 24104 + }, + { + "epoch": 0.7147940574681968, + "grad_norm": 0.07887833565473557, + "learning_rate": 0.0001912226386488059, + "loss": 2.5553, + "step": 24105 + }, + { + "epoch": 0.7148237108205082, + "grad_norm": 0.093849778175354, + "learning_rate": 0.00019118563312178904, + "loss": 2.6017, + "step": 24106 + }, + { + "epoch": 0.7148533641728197, + "grad_norm": 0.07985146343708038, + "learning_rate": 0.0001911486303293764, + "loss": 2.5549, + "step": 24107 + }, + { + "epoch": 0.7148830175251312, + "grad_norm": 0.09113595634698868, + "learning_rate": 0.00019111163027189565, + "loss": 2.591, + "step": 24108 + }, + { + "epoch": 0.7149126708774427, + "grad_norm": 0.0875493735074997, + "learning_rate": 0.00019107463294967448, + "loss": 2.5786, + "step": 24109 + }, + { + "epoch": 0.7149423242297541, + "grad_norm": 0.08701243996620178, + "learning_rate": 0.0001910376383630404, + "loss": 2.5453, + "step": 24110 + }, + { + "epoch": 0.7149719775820657, + "grad_norm": 0.09682294726371765, + "learning_rate": 0.0001910006465123209, + "loss": 2.5958, + "step": 24111 + }, + { + "epoch": 0.7150016309343772, + "grad_norm": 0.08708126842975616, + "learning_rate": 0.0001909636573978439, + "loss": 2.5564, + "step": 24112 + }, + { + "epoch": 0.7150312842866886, + "grad_norm": 0.09803709387779236, + "learning_rate": 0.00019092667101993694, + "loss": 2.5996, + "step": 24113 + }, + { + "epoch": 0.7150609376390001, + "grad_norm": 0.09188202023506165, + "learning_rate": 0.00019088968737892716, + "loss": 2.5713, + "step": 24114 + }, + { + "epoch": 0.7150905909913116, + "grad_norm": 0.09245344251394272, + "learning_rate": 0.00019085270647514226, + "loss": 2.5391, + "step": 24115 + }, + { + "epoch": 0.7151202443436231, + "grad_norm": 0.09230075776576996, + "learning_rate": 0.0001908157283089097, + "loss": 2.5429, + "step": 24116 + }, + { + "epoch": 0.7151498976959345, + "grad_norm": 0.08967839181423187, + "learning_rate": 0.00019077875288055695, + "loss": 2.6059, + "step": 24117 + }, + { + "epoch": 0.715179551048246, + "grad_norm": 0.09464582800865173, + "learning_rate": 0.00019074178019041143, + "loss": 2.5594, + "step": 24118 + }, + { + "epoch": 0.7152092044005575, + "grad_norm": 0.09634601324796677, + "learning_rate": 0.00019070481023880054, + "loss": 2.6191, + "step": 24119 + }, + { + "epoch": 0.715238857752869, + "grad_norm": 0.1035241112112999, + "learning_rate": 0.00019066784302605166, + "loss": 2.566, + "step": 24120 + }, + { + "epoch": 0.7152685111051804, + "grad_norm": 0.0991215631365776, + "learning_rate": 0.00019063087855249205, + "loss": 2.554, + "step": 24121 + }, + { + "epoch": 0.715298164457492, + "grad_norm": 0.0916445180773735, + "learning_rate": 0.00019059391681844917, + "loss": 2.5864, + "step": 24122 + }, + { + "epoch": 0.7153278178098034, + "grad_norm": 0.10617654025554657, + "learning_rate": 0.00019055695782425026, + "loss": 2.5975, + "step": 24123 + }, + { + "epoch": 0.7153574711621149, + "grad_norm": 0.08932468295097351, + "learning_rate": 0.0001905200015702226, + "loss": 2.5773, + "step": 24124 + }, + { + "epoch": 0.7153871245144263, + "grad_norm": 0.0950942113995552, + "learning_rate": 0.00019048304805669347, + "loss": 2.5845, + "step": 24125 + }, + { + "epoch": 0.7154167778667379, + "grad_norm": 0.10699629783630371, + "learning_rate": 0.00019044609728399004, + "loss": 2.5484, + "step": 24126 + }, + { + "epoch": 0.7154464312190493, + "grad_norm": 0.08427827060222626, + "learning_rate": 0.00019040914925243956, + "loss": 2.5784, + "step": 24127 + }, + { + "epoch": 0.7154760845713608, + "grad_norm": 0.11861709505319595, + "learning_rate": 0.0001903722039623692, + "loss": 2.5931, + "step": 24128 + }, + { + "epoch": 0.7155057379236722, + "grad_norm": 0.10782604664564133, + "learning_rate": 0.00019033526141410618, + "loss": 2.5681, + "step": 24129 + }, + { + "epoch": 0.7155353912759838, + "grad_norm": 0.09048110246658325, + "learning_rate": 0.00019029832160797749, + "loss": 2.5801, + "step": 24130 + }, + { + "epoch": 0.7155650446282952, + "grad_norm": 0.09930681437253952, + "learning_rate": 0.00019026138454431035, + "loss": 2.5774, + "step": 24131 + }, + { + "epoch": 0.7155946979806067, + "grad_norm": 0.08959951251745224, + "learning_rate": 0.00019022445022343182, + "loss": 2.5636, + "step": 24132 + }, + { + "epoch": 0.7156243513329182, + "grad_norm": 0.10219694674015045, + "learning_rate": 0.00019018751864566897, + "loss": 2.5307, + "step": 24133 + }, + { + "epoch": 0.7156540046852297, + "grad_norm": 0.09910104423761368, + "learning_rate": 0.0001901505898113488, + "loss": 2.6022, + "step": 24134 + }, + { + "epoch": 0.7156836580375412, + "grad_norm": 0.10143091529607773, + "learning_rate": 0.00019011366372079835, + "loss": 2.5664, + "step": 24135 + }, + { + "epoch": 0.7157133113898526, + "grad_norm": 0.09152460098266602, + "learning_rate": 0.0001900767403743448, + "loss": 2.5572, + "step": 24136 + }, + { + "epoch": 0.7157429647421641, + "grad_norm": 0.09382349997758865, + "learning_rate": 0.00019003981977231454, + "loss": 2.5873, + "step": 24137 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 0.08971969038248062, + "learning_rate": 0.00019000290191503505, + "loss": 2.549, + "step": 24138 + }, + { + "epoch": 0.7158022714467871, + "grad_norm": 0.09821221977472305, + "learning_rate": 0.00018996598680283305, + "loss": 2.5841, + "step": 24139 + }, + { + "epoch": 0.7158319247990985, + "grad_norm": 0.0976821705698967, + "learning_rate": 0.00018992907443603552, + "loss": 2.5515, + "step": 24140 + }, + { + "epoch": 0.71586157815141, + "grad_norm": 0.09901082515716553, + "learning_rate": 0.00018989216481496917, + "loss": 2.5793, + "step": 24141 + }, + { + "epoch": 0.7158912315037215, + "grad_norm": 0.09083867818117142, + "learning_rate": 0.000189855257939961, + "loss": 2.6118, + "step": 24142 + }, + { + "epoch": 0.715920884856033, + "grad_norm": 0.11293840408325195, + "learning_rate": 0.00018981835381133778, + "loss": 2.5998, + "step": 24143 + }, + { + "epoch": 0.7159505382083444, + "grad_norm": 0.08706536144018173, + "learning_rate": 0.00018978145242942618, + "loss": 2.5664, + "step": 24144 + }, + { + "epoch": 0.715980191560656, + "grad_norm": 0.10966600477695465, + "learning_rate": 0.00018974455379455312, + "loss": 2.6082, + "step": 24145 + }, + { + "epoch": 0.7160098449129674, + "grad_norm": 0.0815955400466919, + "learning_rate": 0.0001897076579070453, + "loss": 2.5912, + "step": 24146 + }, + { + "epoch": 0.7160394982652789, + "grad_norm": 0.09632836282253265, + "learning_rate": 0.0001896707647672296, + "loss": 2.5588, + "step": 24147 + }, + { + "epoch": 0.7160691516175903, + "grad_norm": 0.09375607967376709, + "learning_rate": 0.0001896338743754324, + "loss": 2.5943, + "step": 24148 + }, + { + "epoch": 0.7160988049699019, + "grad_norm": 0.09197146445512772, + "learning_rate": 0.00018959698673198046, + "loss": 2.5886, + "step": 24149 + }, + { + "epoch": 0.7161284583222133, + "grad_norm": 0.08783485740423203, + "learning_rate": 0.00018956010183720034, + "loss": 2.5732, + "step": 24150 + }, + { + "epoch": 0.7161581116745248, + "grad_norm": 0.10004745423793793, + "learning_rate": 0.00018952321969141895, + "loss": 2.5743, + "step": 24151 + }, + { + "epoch": 0.7161877650268362, + "grad_norm": 0.08947291970252991, + "learning_rate": 0.00018948634029496275, + "loss": 2.6071, + "step": 24152 + }, + { + "epoch": 0.7162174183791478, + "grad_norm": 0.08828181028366089, + "learning_rate": 0.00018944946364815834, + "loss": 2.5781, + "step": 24153 + }, + { + "epoch": 0.7162470717314593, + "grad_norm": 0.09111503511667252, + "learning_rate": 0.00018941258975133224, + "loss": 2.5647, + "step": 24154 + }, + { + "epoch": 0.7162767250837707, + "grad_norm": 0.09345591813325882, + "learning_rate": 0.0001893757186048109, + "loss": 2.5843, + "step": 24155 + }, + { + "epoch": 0.7163063784360822, + "grad_norm": 0.08998535573482513, + "learning_rate": 0.00018933885020892095, + "loss": 2.5686, + "step": 24156 + }, + { + "epoch": 0.7163360317883937, + "grad_norm": 0.08844228833913803, + "learning_rate": 0.00018930198456398894, + "loss": 2.5576, + "step": 24157 + }, + { + "epoch": 0.7163656851407052, + "grad_norm": 0.09096261858940125, + "learning_rate": 0.00018926512167034105, + "loss": 2.5742, + "step": 24158 + }, + { + "epoch": 0.7163953384930166, + "grad_norm": 0.09837278723716736, + "learning_rate": 0.00018922826152830387, + "loss": 2.5446, + "step": 24159 + }, + { + "epoch": 0.7164249918453282, + "grad_norm": 0.1014869436621666, + "learning_rate": 0.00018919140413820368, + "loss": 2.6182, + "step": 24160 + }, + { + "epoch": 0.7164546451976396, + "grad_norm": 0.09150553494691849, + "learning_rate": 0.00018915454950036703, + "loss": 2.6018, + "step": 24161 + }, + { + "epoch": 0.7164842985499511, + "grad_norm": 0.09742655605077744, + "learning_rate": 0.0001891176976151202, + "loss": 2.5629, + "step": 24162 + }, + { + "epoch": 0.7165139519022625, + "grad_norm": 0.08717647194862366, + "learning_rate": 0.00018908084848278934, + "loss": 2.5548, + "step": 24163 + }, + { + "epoch": 0.7165436052545741, + "grad_norm": 0.09452077001333237, + "learning_rate": 0.00018904400210370109, + "loss": 2.5916, + "step": 24164 + }, + { + "epoch": 0.7165732586068855, + "grad_norm": 0.09829765558242798, + "learning_rate": 0.00018900715847818157, + "loss": 2.6014, + "step": 24165 + }, + { + "epoch": 0.716602911959197, + "grad_norm": 0.09182540327310562, + "learning_rate": 0.00018897031760655708, + "loss": 2.5428, + "step": 24166 + }, + { + "epoch": 0.7166325653115084, + "grad_norm": 0.09437672793865204, + "learning_rate": 0.0001889334794891538, + "loss": 2.5864, + "step": 24167 + }, + { + "epoch": 0.71666221866382, + "grad_norm": 0.09942343086004257, + "learning_rate": 0.0001888966441262981, + "loss": 2.6037, + "step": 24168 + }, + { + "epoch": 0.7166918720161314, + "grad_norm": 0.0967651754617691, + "learning_rate": 0.0001888598115183159, + "loss": 2.6077, + "step": 24169 + }, + { + "epoch": 0.7167215253684429, + "grad_norm": 0.0973377674818039, + "learning_rate": 0.00018882298166553342, + "loss": 2.5615, + "step": 24170 + }, + { + "epoch": 0.7167511787207543, + "grad_norm": 0.091453418135643, + "learning_rate": 0.00018878615456827686, + "loss": 2.5681, + "step": 24171 + }, + { + "epoch": 0.7167808320730659, + "grad_norm": 0.10362106561660767, + "learning_rate": 0.0001887493302268723, + "loss": 2.5748, + "step": 24172 + }, + { + "epoch": 0.7168104854253773, + "grad_norm": 0.0832783579826355, + "learning_rate": 0.0001887125086416459, + "loss": 2.5902, + "step": 24173 + }, + { + "epoch": 0.7168401387776888, + "grad_norm": 0.10449688881635666, + "learning_rate": 0.00018867568981292365, + "loss": 2.5962, + "step": 24174 + }, + { + "epoch": 0.7168697921300003, + "grad_norm": 0.09784967452287674, + "learning_rate": 0.00018863887374103154, + "loss": 2.6002, + "step": 24175 + }, + { + "epoch": 0.7168994454823118, + "grad_norm": 0.09932158887386322, + "learning_rate": 0.00018860206042629557, + "loss": 2.5664, + "step": 24176 + }, + { + "epoch": 0.7169290988346233, + "grad_norm": 0.10214050859212875, + "learning_rate": 0.00018856524986904196, + "loss": 2.5756, + "step": 24177 + }, + { + "epoch": 0.7169587521869347, + "grad_norm": 0.11035699397325516, + "learning_rate": 0.00018852844206959662, + "loss": 2.562, + "step": 24178 + }, + { + "epoch": 0.7169884055392463, + "grad_norm": 0.09587273001670837, + "learning_rate": 0.00018849163702828531, + "loss": 2.5817, + "step": 24179 + }, + { + "epoch": 0.7170180588915577, + "grad_norm": 0.10232454538345337, + "learning_rate": 0.00018845483474543394, + "loss": 2.5712, + "step": 24180 + }, + { + "epoch": 0.7170477122438692, + "grad_norm": 0.10085956752300262, + "learning_rate": 0.00018841803522136858, + "loss": 2.6149, + "step": 24181 + }, + { + "epoch": 0.7170773655961806, + "grad_norm": 0.09636856615543365, + "learning_rate": 0.0001883812384564149, + "loss": 2.5609, + "step": 24182 + }, + { + "epoch": 0.7171070189484922, + "grad_norm": 0.11444531381130219, + "learning_rate": 0.00018834444445089892, + "loss": 2.5802, + "step": 24183 + }, + { + "epoch": 0.7171366723008036, + "grad_norm": 0.0959121361374855, + "learning_rate": 0.00018830765320514636, + "loss": 2.5705, + "step": 24184 + }, + { + "epoch": 0.7171663256531151, + "grad_norm": 0.099732905626297, + "learning_rate": 0.00018827086471948301, + "loss": 2.5903, + "step": 24185 + }, + { + "epoch": 0.7171959790054265, + "grad_norm": 0.10728079825639725, + "learning_rate": 0.00018823407899423467, + "loss": 2.5835, + "step": 24186 + }, + { + "epoch": 0.7172256323577381, + "grad_norm": 0.09556905925273895, + "learning_rate": 0.00018819729602972707, + "loss": 2.5401, + "step": 24187 + }, + { + "epoch": 0.7172552857100495, + "grad_norm": 0.09901108592748642, + "learning_rate": 0.00018816051582628597, + "loss": 2.5459, + "step": 24188 + }, + { + "epoch": 0.717284939062361, + "grad_norm": 0.08676318824291229, + "learning_rate": 0.00018812373838423697, + "loss": 2.5936, + "step": 24189 + }, + { + "epoch": 0.7173145924146724, + "grad_norm": 0.09762434661388397, + "learning_rate": 0.00018808696370390584, + "loss": 2.5694, + "step": 24190 + }, + { + "epoch": 0.717344245766984, + "grad_norm": 0.09364702552556992, + "learning_rate": 0.00018805019178561817, + "loss": 2.5959, + "step": 24191 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 0.09825102984905243, + "learning_rate": 0.0001880134226296996, + "loss": 2.5926, + "step": 24192 + }, + { + "epoch": 0.7174035524716069, + "grad_norm": 0.10279158502817154, + "learning_rate": 0.00018797665623647574, + "loss": 2.5396, + "step": 24193 + }, + { + "epoch": 0.7174332058239183, + "grad_norm": 0.08936543762683868, + "learning_rate": 0.00018793989260627216, + "loss": 2.552, + "step": 24194 + }, + { + "epoch": 0.7174628591762299, + "grad_norm": 0.09704455733299255, + "learning_rate": 0.0001879031317394143, + "loss": 2.6109, + "step": 24195 + }, + { + "epoch": 0.7174925125285414, + "grad_norm": 0.09086133539676666, + "learning_rate": 0.00018786637363622788, + "loss": 2.5679, + "step": 24196 + }, + { + "epoch": 0.7175221658808528, + "grad_norm": 0.10047496110200882, + "learning_rate": 0.00018782961829703825, + "loss": 2.5679, + "step": 24197 + }, + { + "epoch": 0.7175518192331644, + "grad_norm": 0.10423603653907776, + "learning_rate": 0.00018779286572217097, + "loss": 2.5964, + "step": 24198 + }, + { + "epoch": 0.7175814725854758, + "grad_norm": 0.0984969511628151, + "learning_rate": 0.00018775611591195142, + "loss": 2.5872, + "step": 24199 + }, + { + "epoch": 0.7176111259377873, + "grad_norm": 0.09071653336286545, + "learning_rate": 0.0001877193688667051, + "loss": 2.5611, + "step": 24200 + }, + { + "epoch": 0.7176407792900987, + "grad_norm": 0.10922589153051376, + "learning_rate": 0.00018768262458675734, + "loss": 2.616, + "step": 24201 + }, + { + "epoch": 0.7176704326424103, + "grad_norm": 0.09677505493164062, + "learning_rate": 0.00018764588307243352, + "loss": 2.5651, + "step": 24202 + }, + { + "epoch": 0.7177000859947217, + "grad_norm": 0.10593487322330475, + "learning_rate": 0.00018760914432405906, + "loss": 2.5919, + "step": 24203 + }, + { + "epoch": 0.7177297393470332, + "grad_norm": 0.09567160904407501, + "learning_rate": 0.0001875724083419592, + "loss": 2.593, + "step": 24204 + }, + { + "epoch": 0.7177593926993446, + "grad_norm": 0.1129777580499649, + "learning_rate": 0.00018753567512645936, + "loss": 2.5808, + "step": 24205 + }, + { + "epoch": 0.7177890460516562, + "grad_norm": 0.09529983997344971, + "learning_rate": 0.00018749894467788475, + "loss": 2.5649, + "step": 24206 + }, + { + "epoch": 0.7178186994039676, + "grad_norm": 0.10189582407474518, + "learning_rate": 0.0001874622169965606, + "loss": 2.5455, + "step": 24207 + }, + { + "epoch": 0.7178483527562791, + "grad_norm": 0.09489158540964127, + "learning_rate": 0.00018742549208281212, + "loss": 2.5802, + "step": 24208 + }, + { + "epoch": 0.7178780061085905, + "grad_norm": 0.10517782717943192, + "learning_rate": 0.00018738876993696464, + "loss": 2.5865, + "step": 24209 + }, + { + "epoch": 0.7179076594609021, + "grad_norm": 0.09907590597867966, + "learning_rate": 0.00018735205055934318, + "loss": 2.5478, + "step": 24210 + }, + { + "epoch": 0.7179373128132135, + "grad_norm": 0.09623079746961594, + "learning_rate": 0.00018731533395027305, + "loss": 2.557, + "step": 24211 + }, + { + "epoch": 0.717966966165525, + "grad_norm": 0.09884712845087051, + "learning_rate": 0.00018727862011007945, + "loss": 2.568, + "step": 24212 + }, + { + "epoch": 0.7179966195178364, + "grad_norm": 0.09216509014368057, + "learning_rate": 0.00018724190903908716, + "loss": 2.5849, + "step": 24213 + }, + { + "epoch": 0.718026272870148, + "grad_norm": 0.08855892717838287, + "learning_rate": 0.00018720520073762148, + "loss": 2.5677, + "step": 24214 + }, + { + "epoch": 0.7180559262224594, + "grad_norm": 0.0896959900856018, + "learning_rate": 0.00018716849520600726, + "loss": 2.5986, + "step": 24215 + }, + { + "epoch": 0.7180855795747709, + "grad_norm": 0.09483519196510315, + "learning_rate": 0.00018713179244456984, + "loss": 2.5447, + "step": 24216 + }, + { + "epoch": 0.7181152329270825, + "grad_norm": 0.09196630865335464, + "learning_rate": 0.00018709509245363415, + "loss": 2.5812, + "step": 24217 + }, + { + "epoch": 0.7181448862793939, + "grad_norm": 0.09980040043592453, + "learning_rate": 0.00018705839523352514, + "loss": 2.5502, + "step": 24218 + }, + { + "epoch": 0.7181745396317054, + "grad_norm": 0.09694071114063263, + "learning_rate": 0.00018702170078456772, + "loss": 2.5805, + "step": 24219 + }, + { + "epoch": 0.7182041929840168, + "grad_norm": 0.10426467657089233, + "learning_rate": 0.00018698500910708688, + "loss": 2.5901, + "step": 24220 + }, + { + "epoch": 0.7182338463363284, + "grad_norm": 0.11882145702838898, + "learning_rate": 0.0001869483202014075, + "loss": 2.5878, + "step": 24221 + }, + { + "epoch": 0.7182634996886398, + "grad_norm": 0.0879133865237236, + "learning_rate": 0.0001869116340678545, + "loss": 2.5745, + "step": 24222 + }, + { + "epoch": 0.7182931530409513, + "grad_norm": 0.100529745221138, + "learning_rate": 0.00018687495070675287, + "loss": 2.5624, + "step": 24223 + }, + { + "epoch": 0.7183228063932627, + "grad_norm": 0.09295038878917694, + "learning_rate": 0.00018683827011842713, + "loss": 2.5269, + "step": 24224 + }, + { + "epoch": 0.7183524597455743, + "grad_norm": 0.107003353536129, + "learning_rate": 0.00018680159230320226, + "loss": 2.5612, + "step": 24225 + }, + { + "epoch": 0.7183821130978857, + "grad_norm": 0.10296996682882309, + "learning_rate": 0.00018676491726140305, + "loss": 2.5554, + "step": 24226 + }, + { + "epoch": 0.7184117664501972, + "grad_norm": 0.09363125264644623, + "learning_rate": 0.0001867282449933541, + "loss": 2.5944, + "step": 24227 + }, + { + "epoch": 0.7184414198025086, + "grad_norm": 0.10302037745714188, + "learning_rate": 0.0001866915754993805, + "loss": 2.6076, + "step": 24228 + }, + { + "epoch": 0.7184710731548202, + "grad_norm": 0.09169220179319382, + "learning_rate": 0.00018665490877980674, + "loss": 2.6114, + "step": 24229 + }, + { + "epoch": 0.7185007265071316, + "grad_norm": 0.09406156837940216, + "learning_rate": 0.00018661824483495753, + "loss": 2.56, + "step": 24230 + }, + { + "epoch": 0.7185303798594431, + "grad_norm": 0.10937537997961044, + "learning_rate": 0.00018658158366515766, + "loss": 2.6003, + "step": 24231 + }, + { + "epoch": 0.7185600332117545, + "grad_norm": 0.08994447439908981, + "learning_rate": 0.00018654492527073158, + "loss": 2.5836, + "step": 24232 + }, + { + "epoch": 0.7185896865640661, + "grad_norm": 0.09852439910173416, + "learning_rate": 0.00018650826965200417, + "loss": 2.5643, + "step": 24233 + }, + { + "epoch": 0.7186193399163775, + "grad_norm": 0.10062140971422195, + "learning_rate": 0.00018647161680929964, + "loss": 2.5621, + "step": 24234 + }, + { + "epoch": 0.718648993268689, + "grad_norm": 0.09020077437162399, + "learning_rate": 0.0001864349667429428, + "loss": 2.5451, + "step": 24235 + }, + { + "epoch": 0.7186786466210004, + "grad_norm": 0.10281848162412643, + "learning_rate": 0.0001863983194532582, + "loss": 2.5638, + "step": 24236 + }, + { + "epoch": 0.718708299973312, + "grad_norm": 0.09201578795909882, + "learning_rate": 0.00018636167494057022, + "loss": 2.5726, + "step": 24237 + }, + { + "epoch": 0.7187379533256235, + "grad_norm": 0.0976083055138588, + "learning_rate": 0.0001863250332052035, + "loss": 2.6144, + "step": 24238 + }, + { + "epoch": 0.7187676066779349, + "grad_norm": 0.095350481569767, + "learning_rate": 0.00018628839424748245, + "loss": 2.5627, + "step": 24239 + }, + { + "epoch": 0.7187972600302465, + "grad_norm": 0.0985487625002861, + "learning_rate": 0.00018625175806773133, + "loss": 2.5785, + "step": 24240 + }, + { + "epoch": 0.7188269133825579, + "grad_norm": 0.10484208166599274, + "learning_rate": 0.00018621512466627488, + "loss": 2.5768, + "step": 24241 + }, + { + "epoch": 0.7188565667348694, + "grad_norm": 0.0997522845864296, + "learning_rate": 0.0001861784940434374, + "loss": 2.5631, + "step": 24242 + }, + { + "epoch": 0.7188862200871808, + "grad_norm": 0.09484266489744186, + "learning_rate": 0.00018614186619954326, + "loss": 2.5812, + "step": 24243 + }, + { + "epoch": 0.7189158734394924, + "grad_norm": 0.12356328964233398, + "learning_rate": 0.00018610524113491688, + "loss": 2.566, + "step": 24244 + }, + { + "epoch": 0.7189455267918038, + "grad_norm": 0.09321930259466171, + "learning_rate": 0.0001860686188498823, + "loss": 2.5733, + "step": 24245 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 0.10135997086763382, + "learning_rate": 0.000186031999344764, + "loss": 2.5688, + "step": 24246 + }, + { + "epoch": 0.7190048334964267, + "grad_norm": 0.10415129363536835, + "learning_rate": 0.00018599538261988624, + "loss": 2.5899, + "step": 24247 + }, + { + "epoch": 0.7190344868487383, + "grad_norm": 0.10172244161367416, + "learning_rate": 0.00018595876867557332, + "loss": 2.5798, + "step": 24248 + }, + { + "epoch": 0.7190641402010497, + "grad_norm": 0.0979413390159607, + "learning_rate": 0.0001859221575121493, + "loss": 2.5502, + "step": 24249 + }, + { + "epoch": 0.7190937935533612, + "grad_norm": 0.09390201419591904, + "learning_rate": 0.00018588554912993854, + "loss": 2.5907, + "step": 24250 + }, + { + "epoch": 0.7191234469056726, + "grad_norm": 0.09894738346338272, + "learning_rate": 0.00018584894352926518, + "loss": 2.5947, + "step": 24251 + }, + { + "epoch": 0.7191531002579842, + "grad_norm": 0.1009925976395607, + "learning_rate": 0.00018581234071045333, + "loss": 2.5772, + "step": 24252 + }, + { + "epoch": 0.7191827536102956, + "grad_norm": 0.0878354012966156, + "learning_rate": 0.00018577574067382696, + "loss": 2.5544, + "step": 24253 + }, + { + "epoch": 0.7192124069626071, + "grad_norm": 0.0928303599357605, + "learning_rate": 0.0001857391434197107, + "loss": 2.5753, + "step": 24254 + }, + { + "epoch": 0.7192420603149186, + "grad_norm": 0.09696637094020844, + "learning_rate": 0.00018570254894842807, + "loss": 2.5656, + "step": 24255 + }, + { + "epoch": 0.7192717136672301, + "grad_norm": 0.08688871562480927, + "learning_rate": 0.00018566595726030334, + "loss": 2.5527, + "step": 24256 + }, + { + "epoch": 0.7193013670195415, + "grad_norm": 0.09188397973775864, + "learning_rate": 0.00018562936835566052, + "loss": 2.5827, + "step": 24257 + }, + { + "epoch": 0.719331020371853, + "grad_norm": 0.10138531774282455, + "learning_rate": 0.00018559278223482357, + "loss": 2.5471, + "step": 24258 + }, + { + "epoch": 0.7193606737241646, + "grad_norm": 0.09276106208562851, + "learning_rate": 0.00018555619889811653, + "loss": 2.5593, + "step": 24259 + }, + { + "epoch": 0.719390327076476, + "grad_norm": 0.11453940719366074, + "learning_rate": 0.0001855196183458633, + "loss": 2.593, + "step": 24260 + }, + { + "epoch": 0.7194199804287875, + "grad_norm": 0.10156845301389694, + "learning_rate": 0.00018548304057838783, + "loss": 2.5729, + "step": 24261 + }, + { + "epoch": 0.7194496337810989, + "grad_norm": 0.09658528864383698, + "learning_rate": 0.00018544646559601403, + "loss": 2.5601, + "step": 24262 + }, + { + "epoch": 0.7194792871334105, + "grad_norm": 0.1008337140083313, + "learning_rate": 0.00018540989339906579, + "loss": 2.5469, + "step": 24263 + }, + { + "epoch": 0.7195089404857219, + "grad_norm": 0.10229950398206711, + "learning_rate": 0.00018537332398786688, + "loss": 2.5517, + "step": 24264 + }, + { + "epoch": 0.7195385938380334, + "grad_norm": 0.09297489374876022, + "learning_rate": 0.00018533675736274124, + "loss": 2.5811, + "step": 24265 + }, + { + "epoch": 0.7195682471903448, + "grad_norm": 0.10034853219985962, + "learning_rate": 0.00018530019352401263, + "loss": 2.5883, + "step": 24266 + }, + { + "epoch": 0.7195979005426564, + "grad_norm": 0.08863996714353561, + "learning_rate": 0.00018526363247200483, + "loss": 2.5674, + "step": 24267 + }, + { + "epoch": 0.7196275538949678, + "grad_norm": 0.1010880395770073, + "learning_rate": 0.00018522707420704155, + "loss": 2.5878, + "step": 24268 + }, + { + "epoch": 0.7196572072472793, + "grad_norm": 0.10056035220623016, + "learning_rate": 0.00018519051872944658, + "loss": 2.5624, + "step": 24269 + }, + { + "epoch": 0.7196868605995907, + "grad_norm": 0.09089533984661102, + "learning_rate": 0.0001851539660395436, + "loss": 2.5154, + "step": 24270 + }, + { + "epoch": 0.7197165139519023, + "grad_norm": 0.0972265973687172, + "learning_rate": 0.0001851174161376563, + "loss": 2.5774, + "step": 24271 + }, + { + "epoch": 0.7197461673042137, + "grad_norm": 0.09179060161113739, + "learning_rate": 0.0001850808690241083, + "loss": 2.5683, + "step": 24272 + }, + { + "epoch": 0.7197758206565252, + "grad_norm": 0.09106186777353287, + "learning_rate": 0.00018504432469922333, + "loss": 2.5311, + "step": 24273 + }, + { + "epoch": 0.7198054740088367, + "grad_norm": 0.09684545546770096, + "learning_rate": 0.0001850077831633249, + "loss": 2.5604, + "step": 24274 + }, + { + "epoch": 0.7198351273611482, + "grad_norm": 0.08971159160137177, + "learning_rate": 0.00018497124441673658, + "loss": 2.5534, + "step": 24275 + }, + { + "epoch": 0.7198647807134596, + "grad_norm": 0.09369959682226181, + "learning_rate": 0.00018493470845978193, + "loss": 2.5181, + "step": 24276 + }, + { + "epoch": 0.7198944340657711, + "grad_norm": 0.09538354724645615, + "learning_rate": 0.00018489817529278462, + "loss": 2.5898, + "step": 24277 + }, + { + "epoch": 0.7199240874180826, + "grad_norm": 0.09184041619300842, + "learning_rate": 0.00018486164491606817, + "loss": 2.5667, + "step": 24278 + }, + { + "epoch": 0.7199537407703941, + "grad_norm": 0.09798431396484375, + "learning_rate": 0.0001848251173299556, + "loss": 2.5893, + "step": 24279 + }, + { + "epoch": 0.7199833941227056, + "grad_norm": 0.08736277371644974, + "learning_rate": 0.00018478859253477092, + "loss": 2.5612, + "step": 24280 + }, + { + "epoch": 0.720013047475017, + "grad_norm": 0.09303391724824905, + "learning_rate": 0.00018475207053083732, + "loss": 2.5651, + "step": 24281 + }, + { + "epoch": 0.7200427008273286, + "grad_norm": 0.09764011949300766, + "learning_rate": 0.0001847155513184783, + "loss": 2.5627, + "step": 24282 + }, + { + "epoch": 0.72007235417964, + "grad_norm": 0.0883607342839241, + "learning_rate": 0.00018467903489801713, + "loss": 2.6029, + "step": 24283 + }, + { + "epoch": 0.7201020075319515, + "grad_norm": 0.09508037567138672, + "learning_rate": 0.0001846425212697772, + "loss": 2.5991, + "step": 24284 + }, + { + "epoch": 0.720131660884263, + "grad_norm": 0.09071222692728043, + "learning_rate": 0.00018460601043408198, + "loss": 2.5618, + "step": 24285 + }, + { + "epoch": 0.7201613142365745, + "grad_norm": 0.08532823622226715, + "learning_rate": 0.0001845695023912546, + "loss": 2.5505, + "step": 24286 + }, + { + "epoch": 0.7201909675888859, + "grad_norm": 0.09442143887281418, + "learning_rate": 0.0001845329971416184, + "loss": 2.5746, + "step": 24287 + }, + { + "epoch": 0.7202206209411974, + "grad_norm": 0.09077398478984833, + "learning_rate": 0.0001844964946854969, + "loss": 2.6057, + "step": 24288 + }, + { + "epoch": 0.7202502742935089, + "grad_norm": 0.09922532737255096, + "learning_rate": 0.00018445999502321293, + "loss": 2.6196, + "step": 24289 + }, + { + "epoch": 0.7202799276458204, + "grad_norm": 0.08562544733285904, + "learning_rate": 0.0001844234981550898, + "loss": 2.5846, + "step": 24290 + }, + { + "epoch": 0.7203095809981318, + "grad_norm": 0.09192413091659546, + "learning_rate": 0.0001843870040814508, + "loss": 2.5772, + "step": 24291 + }, + { + "epoch": 0.7203392343504433, + "grad_norm": 0.09265727549791336, + "learning_rate": 0.0001843505128026189, + "loss": 2.5445, + "step": 24292 + }, + { + "epoch": 0.7203688877027548, + "grad_norm": 0.09178806096315384, + "learning_rate": 0.00018431402431891752, + "loss": 2.5795, + "step": 24293 + }, + { + "epoch": 0.7203985410550663, + "grad_norm": 0.10188091546297073, + "learning_rate": 0.00018427753863066966, + "loss": 2.6004, + "step": 24294 + }, + { + "epoch": 0.7204281944073777, + "grad_norm": 0.10228810459375381, + "learning_rate": 0.00018424105573819837, + "loss": 2.6002, + "step": 24295 + }, + { + "epoch": 0.7204578477596892, + "grad_norm": 0.08963926136493683, + "learning_rate": 0.00018420457564182675, + "loss": 2.5726, + "step": 24296 + }, + { + "epoch": 0.7204875011120007, + "grad_norm": 0.09821773320436478, + "learning_rate": 0.00018416809834187782, + "loss": 2.5803, + "step": 24297 + }, + { + "epoch": 0.7205171544643122, + "grad_norm": 0.09694049507379532, + "learning_rate": 0.0001841316238386746, + "loss": 2.5986, + "step": 24298 + }, + { + "epoch": 0.7205468078166237, + "grad_norm": 0.0891648530960083, + "learning_rate": 0.00018409515213254019, + "loss": 2.5694, + "step": 24299 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 0.1062096580862999, + "learning_rate": 0.00018405868322379733, + "loss": 2.5828, + "step": 24300 + }, + { + "epoch": 0.7206061145212467, + "grad_norm": 0.09137453883886337, + "learning_rate": 0.000184022217112769, + "loss": 2.6011, + "step": 24301 + }, + { + "epoch": 0.7206357678735581, + "grad_norm": 0.10597343742847443, + "learning_rate": 0.00018398575379977822, + "loss": 2.6098, + "step": 24302 + }, + { + "epoch": 0.7206654212258696, + "grad_norm": 0.09720353782176971, + "learning_rate": 0.0001839492932851478, + "loss": 2.5746, + "step": 24303 + }, + { + "epoch": 0.720695074578181, + "grad_norm": 0.0966491848230362, + "learning_rate": 0.0001839128355692007, + "loss": 2.5447, + "step": 24304 + }, + { + "epoch": 0.7207247279304926, + "grad_norm": 0.08998814970254898, + "learning_rate": 0.00018387638065225941, + "loss": 2.5591, + "step": 24305 + }, + { + "epoch": 0.720754381282804, + "grad_norm": 0.10523995757102966, + "learning_rate": 0.00018383992853464732, + "loss": 2.5671, + "step": 24306 + }, + { + "epoch": 0.7207840346351155, + "grad_norm": 0.0992591455578804, + "learning_rate": 0.00018380347921668688, + "loss": 2.6109, + "step": 24307 + }, + { + "epoch": 0.720813687987427, + "grad_norm": 0.08874604851007462, + "learning_rate": 0.0001837670326987009, + "loss": 2.5785, + "step": 24308 + }, + { + "epoch": 0.7208433413397385, + "grad_norm": 0.09733795374631882, + "learning_rate": 0.0001837305889810123, + "loss": 2.581, + "step": 24309 + }, + { + "epoch": 0.7208729946920499, + "grad_norm": 0.0896851047873497, + "learning_rate": 0.00018369414806394346, + "loss": 2.5504, + "step": 24310 + }, + { + "epoch": 0.7209026480443614, + "grad_norm": 0.1038779690861702, + "learning_rate": 0.00018365770994781722, + "loss": 2.5895, + "step": 24311 + }, + { + "epoch": 0.7209323013966729, + "grad_norm": 0.0903674066066742, + "learning_rate": 0.00018362127463295624, + "loss": 2.5392, + "step": 24312 + }, + { + "epoch": 0.7209619547489844, + "grad_norm": 0.10109342634677887, + "learning_rate": 0.00018358484211968324, + "loss": 2.6046, + "step": 24313 + }, + { + "epoch": 0.7209916081012958, + "grad_norm": 0.09337317198514938, + "learning_rate": 0.00018354841240832072, + "loss": 2.5622, + "step": 24314 + }, + { + "epoch": 0.7210212614536073, + "grad_norm": 0.0991205945611, + "learning_rate": 0.00018351198549919134, + "loss": 2.5788, + "step": 24315 + }, + { + "epoch": 0.7210509148059188, + "grad_norm": 0.08975955098867416, + "learning_rate": 0.00018347556139261767, + "loss": 2.5744, + "step": 24316 + }, + { + "epoch": 0.7210805681582303, + "grad_norm": 0.1010240837931633, + "learning_rate": 0.0001834391400889222, + "loss": 2.5796, + "step": 24317 + }, + { + "epoch": 0.7211102215105417, + "grad_norm": 0.08703029900789261, + "learning_rate": 0.00018340272158842735, + "loss": 2.5559, + "step": 24318 + }, + { + "epoch": 0.7211398748628532, + "grad_norm": 0.1009942889213562, + "learning_rate": 0.00018336630589145593, + "loss": 2.5616, + "step": 24319 + }, + { + "epoch": 0.7211695282151648, + "grad_norm": 0.10153903812170029, + "learning_rate": 0.00018332989299833037, + "loss": 2.5986, + "step": 24320 + }, + { + "epoch": 0.7211991815674762, + "grad_norm": 0.1466921865940094, + "learning_rate": 0.00018329348290937276, + "loss": 2.5593, + "step": 24321 + }, + { + "epoch": 0.7212288349197877, + "grad_norm": 0.10755665600299835, + "learning_rate": 0.00018325707562490574, + "loss": 2.5683, + "step": 24322 + }, + { + "epoch": 0.7212584882720992, + "grad_norm": 0.09180381894111633, + "learning_rate": 0.00018322067114525165, + "loss": 2.5652, + "step": 24323 + }, + { + "epoch": 0.7212881416244107, + "grad_norm": 0.09761485457420349, + "learning_rate": 0.00018318426947073297, + "loss": 2.5781, + "step": 24324 + }, + { + "epoch": 0.7213177949767221, + "grad_norm": 0.09423840790987015, + "learning_rate": 0.00018314787060167193, + "loss": 2.5884, + "step": 24325 + }, + { + "epoch": 0.7213474483290336, + "grad_norm": 0.08959083259105682, + "learning_rate": 0.00018311147453839083, + "loss": 2.5575, + "step": 24326 + }, + { + "epoch": 0.7213771016813451, + "grad_norm": 0.09358945488929749, + "learning_rate": 0.00018307508128121209, + "loss": 2.6324, + "step": 24327 + }, + { + "epoch": 0.7214067550336566, + "grad_norm": 0.09322862327098846, + "learning_rate": 0.00018303869083045787, + "loss": 2.592, + "step": 24328 + }, + { + "epoch": 0.721436408385968, + "grad_norm": 0.09772379696369171, + "learning_rate": 0.00018300230318645045, + "loss": 2.573, + "step": 24329 + }, + { + "epoch": 0.7214660617382795, + "grad_norm": 0.09717334806919098, + "learning_rate": 0.00018296591834951204, + "loss": 2.589, + "step": 24330 + }, + { + "epoch": 0.721495715090591, + "grad_norm": 0.09296609461307526, + "learning_rate": 0.00018292953631996484, + "loss": 2.5679, + "step": 24331 + }, + { + "epoch": 0.7215253684429025, + "grad_norm": 0.08894210308790207, + "learning_rate": 0.00018289315709813103, + "loss": 2.5828, + "step": 24332 + }, + { + "epoch": 0.7215550217952139, + "grad_norm": 0.08993779122829437, + "learning_rate": 0.00018285678068433276, + "loss": 2.5863, + "step": 24333 + }, + { + "epoch": 0.7215846751475254, + "grad_norm": 0.09747573733329773, + "learning_rate": 0.00018282040707889215, + "loss": 2.579, + "step": 24334 + }, + { + "epoch": 0.7216143284998369, + "grad_norm": 0.08905665576457977, + "learning_rate": 0.00018278403628213126, + "loss": 2.5604, + "step": 24335 + }, + { + "epoch": 0.7216439818521484, + "grad_norm": 0.11084390431642532, + "learning_rate": 0.00018274766829437218, + "loss": 2.5678, + "step": 24336 + }, + { + "epoch": 0.7216736352044598, + "grad_norm": 0.08775997161865234, + "learning_rate": 0.00018271130311593692, + "loss": 2.5839, + "step": 24337 + }, + { + "epoch": 0.7217032885567713, + "grad_norm": 0.10507688671350479, + "learning_rate": 0.00018267494074714752, + "loss": 2.5885, + "step": 24338 + }, + { + "epoch": 0.7217329419090828, + "grad_norm": 0.12792782485485077, + "learning_rate": 0.00018263858118832606, + "loss": 2.5743, + "step": 24339 + }, + { + "epoch": 0.7217625952613943, + "grad_norm": 0.10000531375408173, + "learning_rate": 0.0001826022244397944, + "loss": 2.5571, + "step": 24340 + }, + { + "epoch": 0.7217922486137058, + "grad_norm": 0.116970494389534, + "learning_rate": 0.0001825658705018745, + "loss": 2.5585, + "step": 24341 + }, + { + "epoch": 0.7218219019660173, + "grad_norm": 0.1176702156662941, + "learning_rate": 0.00018252951937488833, + "loss": 2.5743, + "step": 24342 + }, + { + "epoch": 0.7218515553183288, + "grad_norm": 0.10017912834882736, + "learning_rate": 0.00018249317105915774, + "loss": 2.566, + "step": 24343 + }, + { + "epoch": 0.7218812086706402, + "grad_norm": 0.11122431606054306, + "learning_rate": 0.00018245682555500465, + "loss": 2.5369, + "step": 24344 + }, + { + "epoch": 0.7219108620229517, + "grad_norm": 0.09608883410692215, + "learning_rate": 0.00018242048286275087, + "loss": 2.5687, + "step": 24345 + }, + { + "epoch": 0.7219405153752632, + "grad_norm": 0.10734357684850693, + "learning_rate": 0.0001823841429827182, + "loss": 2.5703, + "step": 24346 + }, + { + "epoch": 0.7219701687275747, + "grad_norm": 0.11521279066801071, + "learning_rate": 0.00018234780591522848, + "loss": 2.5706, + "step": 24347 + }, + { + "epoch": 0.7219998220798861, + "grad_norm": 0.10770974308252335, + "learning_rate": 0.00018231147166060347, + "loss": 2.6115, + "step": 24348 + }, + { + "epoch": 0.7220294754321976, + "grad_norm": 0.10571078211069107, + "learning_rate": 0.00018227514021916492, + "loss": 2.5863, + "step": 24349 + }, + { + "epoch": 0.7220591287845091, + "grad_norm": 0.10008440911769867, + "learning_rate": 0.0001822388115912345, + "loss": 2.5691, + "step": 24350 + }, + { + "epoch": 0.7220887821368206, + "grad_norm": 0.1122562363743782, + "learning_rate": 0.00018220248577713395, + "loss": 2.5688, + "step": 24351 + }, + { + "epoch": 0.722118435489132, + "grad_norm": 0.1034122109413147, + "learning_rate": 0.00018216616277718495, + "loss": 2.5704, + "step": 24352 + }, + { + "epoch": 0.7221480888414435, + "grad_norm": 0.09165801852941513, + "learning_rate": 0.00018212984259170916, + "loss": 2.5596, + "step": 24353 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 0.10252638906240463, + "learning_rate": 0.00018209352522102835, + "loss": 2.5543, + "step": 24354 + }, + { + "epoch": 0.7222073955460665, + "grad_norm": 0.09491319209337234, + "learning_rate": 0.0001820572106654637, + "loss": 2.5876, + "step": 24355 + }, + { + "epoch": 0.7222370488983779, + "grad_norm": 0.10465428233146667, + "learning_rate": 0.00018202089892533708, + "loss": 2.5933, + "step": 24356 + }, + { + "epoch": 0.7222667022506895, + "grad_norm": 0.10110486298799515, + "learning_rate": 0.0001819845900009698, + "loss": 2.5717, + "step": 24357 + }, + { + "epoch": 0.7222963556030009, + "grad_norm": 0.09167299419641495, + "learning_rate": 0.00018194828389268375, + "loss": 2.6054, + "step": 24358 + }, + { + "epoch": 0.7223260089553124, + "grad_norm": 0.09414970874786377, + "learning_rate": 0.00018191198060080023, + "loss": 2.5342, + "step": 24359 + }, + { + "epoch": 0.7223556623076238, + "grad_norm": 0.08917652070522308, + "learning_rate": 0.00018187568012564072, + "loss": 2.5643, + "step": 24360 + }, + { + "epoch": 0.7223853156599354, + "grad_norm": 0.09078095853328705, + "learning_rate": 0.00018183938246752664, + "loss": 2.5931, + "step": 24361 + }, + { + "epoch": 0.7224149690122469, + "grad_norm": 0.08544937521219254, + "learning_rate": 0.00018180308762677944, + "loss": 2.581, + "step": 24362 + }, + { + "epoch": 0.7224446223645583, + "grad_norm": 0.09476158767938614, + "learning_rate": 0.00018176679560372055, + "loss": 2.5736, + "step": 24363 + }, + { + "epoch": 0.7224742757168698, + "grad_norm": 0.08617313951253891, + "learning_rate": 0.00018173050639867146, + "loss": 2.5638, + "step": 24364 + }, + { + "epoch": 0.7225039290691813, + "grad_norm": 0.09412593394517899, + "learning_rate": 0.0001816942200119532, + "loss": 2.5351, + "step": 24365 + }, + { + "epoch": 0.7225335824214928, + "grad_norm": 0.08401152491569519, + "learning_rate": 0.00018165793644388728, + "loss": 2.5634, + "step": 24366 + }, + { + "epoch": 0.7225632357738042, + "grad_norm": 0.08736812323331833, + "learning_rate": 0.00018162165569479493, + "loss": 2.5924, + "step": 24367 + }, + { + "epoch": 0.7225928891261157, + "grad_norm": 0.09432218223810196, + "learning_rate": 0.0001815853777649975, + "loss": 2.5688, + "step": 24368 + }, + { + "epoch": 0.7226225424784272, + "grad_norm": 0.10033419728279114, + "learning_rate": 0.0001815491026548162, + "loss": 2.5813, + "step": 24369 + }, + { + "epoch": 0.7226521958307387, + "grad_norm": 0.10222146660089493, + "learning_rate": 0.00018151283036457213, + "loss": 2.5778, + "step": 24370 + }, + { + "epoch": 0.7226818491830501, + "grad_norm": 0.08669904619455338, + "learning_rate": 0.00018147656089458669, + "loss": 2.5624, + "step": 24371 + }, + { + "epoch": 0.7227115025353616, + "grad_norm": 0.0990624725818634, + "learning_rate": 0.00018144029424518106, + "loss": 2.5943, + "step": 24372 + }, + { + "epoch": 0.7227411558876731, + "grad_norm": 0.08606772124767303, + "learning_rate": 0.00018140403041667626, + "loss": 2.542, + "step": 24373 + }, + { + "epoch": 0.7227708092399846, + "grad_norm": 0.10405131429433823, + "learning_rate": 0.00018136776940939347, + "loss": 2.565, + "step": 24374 + }, + { + "epoch": 0.722800462592296, + "grad_norm": 0.09611360728740692, + "learning_rate": 0.00018133151122365392, + "loss": 2.5634, + "step": 24375 + }, + { + "epoch": 0.7228301159446076, + "grad_norm": 0.10275009274482727, + "learning_rate": 0.0001812952558597784, + "loss": 2.5995, + "step": 24376 + }, + { + "epoch": 0.722859769296919, + "grad_norm": 0.09244877845048904, + "learning_rate": 0.0001812590033180881, + "loss": 2.6127, + "step": 24377 + }, + { + "epoch": 0.7228894226492305, + "grad_norm": 0.10167467594146729, + "learning_rate": 0.000181222753598904, + "loss": 2.5226, + "step": 24378 + }, + { + "epoch": 0.7229190760015419, + "grad_norm": 0.09913390874862671, + "learning_rate": 0.00018118650670254717, + "loss": 2.5716, + "step": 24379 + }, + { + "epoch": 0.7229487293538535, + "grad_norm": 0.09849604219198227, + "learning_rate": 0.00018115026262933854, + "loss": 2.5439, + "step": 24380 + }, + { + "epoch": 0.7229783827061649, + "grad_norm": 0.10131588578224182, + "learning_rate": 0.00018111402137959903, + "loss": 2.6019, + "step": 24381 + }, + { + "epoch": 0.7230080360584764, + "grad_norm": 0.09559682011604309, + "learning_rate": 0.00018107778295364961, + "loss": 2.6074, + "step": 24382 + }, + { + "epoch": 0.7230376894107879, + "grad_norm": 0.09926934540271759, + "learning_rate": 0.00018104154735181104, + "loss": 2.5679, + "step": 24383 + }, + { + "epoch": 0.7230673427630994, + "grad_norm": 0.0924333930015564, + "learning_rate": 0.00018100531457440445, + "loss": 2.5385, + "step": 24384 + }, + { + "epoch": 0.7230969961154109, + "grad_norm": 0.10996689647436142, + "learning_rate": 0.00018096908462175072, + "loss": 2.5939, + "step": 24385 + }, + { + "epoch": 0.7231266494677223, + "grad_norm": 0.09090391546487808, + "learning_rate": 0.00018093285749417036, + "loss": 2.606, + "step": 24386 + }, + { + "epoch": 0.7231563028200338, + "grad_norm": 0.1024734228849411, + "learning_rate": 0.0001808966331919843, + "loss": 2.5828, + "step": 24387 + }, + { + "epoch": 0.7231859561723453, + "grad_norm": 0.09865554422140121, + "learning_rate": 0.00018086041171551333, + "loss": 2.548, + "step": 24388 + }, + { + "epoch": 0.7232156095246568, + "grad_norm": 0.08840057253837585, + "learning_rate": 0.0001808241930650782, + "loss": 2.5397, + "step": 24389 + }, + { + "epoch": 0.7232452628769682, + "grad_norm": 0.09699207544326782, + "learning_rate": 0.0001807879772409996, + "loss": 2.5804, + "step": 24390 + }, + { + "epoch": 0.7232749162292798, + "grad_norm": 0.09184335917234421, + "learning_rate": 0.00018075176424359825, + "loss": 2.5793, + "step": 24391 + }, + { + "epoch": 0.7233045695815912, + "grad_norm": 0.09577745199203491, + "learning_rate": 0.00018071555407319484, + "loss": 2.5877, + "step": 24392 + }, + { + "epoch": 0.7233342229339027, + "grad_norm": 0.09639756381511688, + "learning_rate": 0.00018067934673011006, + "loss": 2.5908, + "step": 24393 + }, + { + "epoch": 0.7233638762862141, + "grad_norm": 0.09821784496307373, + "learning_rate": 0.0001806431422146644, + "loss": 2.5471, + "step": 24394 + }, + { + "epoch": 0.7233935296385257, + "grad_norm": 0.09624285995960236, + "learning_rate": 0.00018060694052717858, + "loss": 2.5292, + "step": 24395 + }, + { + "epoch": 0.7234231829908371, + "grad_norm": 0.10128328204154968, + "learning_rate": 0.00018057074166797304, + "loss": 2.566, + "step": 24396 + }, + { + "epoch": 0.7234528363431486, + "grad_norm": 0.09573085606098175, + "learning_rate": 0.00018053454563736847, + "loss": 2.5477, + "step": 24397 + }, + { + "epoch": 0.72348248969546, + "grad_norm": 0.11461953073740005, + "learning_rate": 0.00018049835243568536, + "loss": 2.5961, + "step": 24398 + }, + { + "epoch": 0.7235121430477716, + "grad_norm": 0.10785475373268127, + "learning_rate": 0.00018046216206324417, + "loss": 2.5373, + "step": 24399 + }, + { + "epoch": 0.723541796400083, + "grad_norm": 0.09729284793138504, + "learning_rate": 0.00018042597452036535, + "loss": 2.5956, + "step": 24400 + }, + { + "epoch": 0.7235714497523945, + "grad_norm": 0.10627344995737076, + "learning_rate": 0.00018038978980736942, + "loss": 2.533, + "step": 24401 + }, + { + "epoch": 0.7236011031047059, + "grad_norm": 0.1009926050901413, + "learning_rate": 0.0001803536079245767, + "loss": 2.5587, + "step": 24402 + }, + { + "epoch": 0.7236307564570175, + "grad_norm": 0.09409985691308975, + "learning_rate": 0.00018031742887230772, + "loss": 2.5686, + "step": 24403 + }, + { + "epoch": 0.723660409809329, + "grad_norm": 0.09577731788158417, + "learning_rate": 0.00018028125265088274, + "loss": 2.5595, + "step": 24404 + }, + { + "epoch": 0.7236900631616404, + "grad_norm": 0.08438064157962799, + "learning_rate": 0.00018024507926062217, + "loss": 2.5307, + "step": 24405 + }, + { + "epoch": 0.723719716513952, + "grad_norm": 0.10494817793369293, + "learning_rate": 0.0001802089087018463, + "loss": 2.5822, + "step": 24406 + }, + { + "epoch": 0.7237493698662634, + "grad_norm": 0.0940534695982933, + "learning_rate": 0.00018017274097487546, + "loss": 2.5602, + "step": 24407 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 0.09133113920688629, + "learning_rate": 0.00018013657608002985, + "loss": 2.5956, + "step": 24408 + }, + { + "epoch": 0.7238086765708863, + "grad_norm": 0.09655573964118958, + "learning_rate": 0.00018010041401762976, + "loss": 2.5601, + "step": 24409 + }, + { + "epoch": 0.7238383299231979, + "grad_norm": 0.10128404200077057, + "learning_rate": 0.00018006425478799543, + "loss": 2.5503, + "step": 24410 + }, + { + "epoch": 0.7238679832755093, + "grad_norm": 0.10505004972219467, + "learning_rate": 0.00018002809839144708, + "loss": 2.609, + "step": 24411 + }, + { + "epoch": 0.7238976366278208, + "grad_norm": 0.10476556420326233, + "learning_rate": 0.00017999194482830476, + "loss": 2.6009, + "step": 24412 + }, + { + "epoch": 0.7239272899801322, + "grad_norm": 0.09808140993118286, + "learning_rate": 0.00017995579409888879, + "loss": 2.596, + "step": 24413 + }, + { + "epoch": 0.7239569433324438, + "grad_norm": 0.09177403897047043, + "learning_rate": 0.00017991964620351914, + "loss": 2.574, + "step": 24414 + }, + { + "epoch": 0.7239865966847552, + "grad_norm": 0.09309031814336777, + "learning_rate": 0.00017988350114251595, + "loss": 2.5576, + "step": 24415 + }, + { + "epoch": 0.7240162500370667, + "grad_norm": 0.0960768312215805, + "learning_rate": 0.00017984735891619935, + "loss": 2.6108, + "step": 24416 + }, + { + "epoch": 0.7240459033893781, + "grad_norm": 0.086490698158741, + "learning_rate": 0.00017981121952488933, + "loss": 2.5722, + "step": 24417 + }, + { + "epoch": 0.7240755567416897, + "grad_norm": 0.0950045958161354, + "learning_rate": 0.00017977508296890588, + "loss": 2.5545, + "step": 24418 + }, + { + "epoch": 0.7241052100940011, + "grad_norm": 0.09834517538547516, + "learning_rate": 0.0001797389492485691, + "loss": 2.5489, + "step": 24419 + }, + { + "epoch": 0.7241348634463126, + "grad_norm": 0.09650188684463501, + "learning_rate": 0.000179702818364199, + "loss": 2.5893, + "step": 24420 + }, + { + "epoch": 0.724164516798624, + "grad_norm": 0.0955955758690834, + "learning_rate": 0.0001796666903161151, + "loss": 2.5489, + "step": 24421 + }, + { + "epoch": 0.7241941701509356, + "grad_norm": 0.09754876792430878, + "learning_rate": 0.00017963056510463782, + "loss": 2.5595, + "step": 24422 + }, + { + "epoch": 0.724223823503247, + "grad_norm": 0.09686246514320374, + "learning_rate": 0.0001795944427300869, + "loss": 2.5941, + "step": 24423 + }, + { + "epoch": 0.7242534768555585, + "grad_norm": 0.10220953077077866, + "learning_rate": 0.0001795583231927822, + "loss": 2.5528, + "step": 24424 + }, + { + "epoch": 0.72428313020787, + "grad_norm": 0.10136080533266068, + "learning_rate": 0.00017952220649304352, + "loss": 2.5579, + "step": 24425 + }, + { + "epoch": 0.7243127835601815, + "grad_norm": 0.09458354115486145, + "learning_rate": 0.0001794860926311907, + "loss": 2.5976, + "step": 24426 + }, + { + "epoch": 0.724342436912493, + "grad_norm": 0.09994368255138397, + "learning_rate": 0.00017944998160754355, + "loss": 2.5727, + "step": 24427 + }, + { + "epoch": 0.7243720902648044, + "grad_norm": 0.0918688029050827, + "learning_rate": 0.00017941387342242183, + "loss": 2.6077, + "step": 24428 + }, + { + "epoch": 0.724401743617116, + "grad_norm": 0.09313163161277771, + "learning_rate": 0.00017937776807614532, + "loss": 2.5934, + "step": 24429 + }, + { + "epoch": 0.7244313969694274, + "grad_norm": 0.09840631484985352, + "learning_rate": 0.00017934166556903385, + "loss": 2.5582, + "step": 24430 + }, + { + "epoch": 0.7244610503217389, + "grad_norm": 0.08890368789434433, + "learning_rate": 0.00017930556590140678, + "loss": 2.5741, + "step": 24431 + }, + { + "epoch": 0.7244907036740503, + "grad_norm": 0.10058718919754028, + "learning_rate": 0.00017926946907358403, + "loss": 2.6381, + "step": 24432 + }, + { + "epoch": 0.7245203570263619, + "grad_norm": 0.0999298244714737, + "learning_rate": 0.00017923337508588517, + "loss": 2.6009, + "step": 24433 + }, + { + "epoch": 0.7245500103786733, + "grad_norm": 0.09325554221868515, + "learning_rate": 0.0001791972839386296, + "loss": 2.5466, + "step": 24434 + }, + { + "epoch": 0.7245796637309848, + "grad_norm": 0.10018952935934067, + "learning_rate": 0.0001791611956321374, + "loss": 2.5768, + "step": 24435 + }, + { + "epoch": 0.7246093170832962, + "grad_norm": 0.09639937430620193, + "learning_rate": 0.00017912511016672782, + "loss": 2.5674, + "step": 24436 + }, + { + "epoch": 0.7246389704356078, + "grad_norm": 0.08763343095779419, + "learning_rate": 0.0001790890275427205, + "loss": 2.5651, + "step": 24437 + }, + { + "epoch": 0.7246686237879192, + "grad_norm": 0.10853439569473267, + "learning_rate": 0.0001790529477604349, + "loss": 2.5499, + "step": 24438 + }, + { + "epoch": 0.7246982771402307, + "grad_norm": 0.10556577891111374, + "learning_rate": 0.00017901687082019058, + "loss": 2.59, + "step": 24439 + }, + { + "epoch": 0.7247279304925421, + "grad_norm": 0.09358900785446167, + "learning_rate": 0.00017898079672230694, + "loss": 2.573, + "step": 24440 + }, + { + "epoch": 0.7247575838448537, + "grad_norm": 0.1069972813129425, + "learning_rate": 0.0001789447254671036, + "loss": 2.5864, + "step": 24441 + }, + { + "epoch": 0.7247872371971651, + "grad_norm": 0.09256917983293533, + "learning_rate": 0.00017890865705489967, + "loss": 2.5826, + "step": 24442 + }, + { + "epoch": 0.7248168905494766, + "grad_norm": 0.08990142494440079, + "learning_rate": 0.0001788725914860147, + "loss": 2.5758, + "step": 24443 + }, + { + "epoch": 0.724846543901788, + "grad_norm": 0.09299186617136002, + "learning_rate": 0.00017883652876076806, + "loss": 2.5615, + "step": 24444 + }, + { + "epoch": 0.7248761972540996, + "grad_norm": 0.09859929978847504, + "learning_rate": 0.0001788004688794791, + "loss": 2.5911, + "step": 24445 + }, + { + "epoch": 0.7249058506064111, + "grad_norm": 0.09272847324609756, + "learning_rate": 0.00017876441184246707, + "loss": 2.5992, + "step": 24446 + }, + { + "epoch": 0.7249355039587225, + "grad_norm": 0.0906764566898346, + "learning_rate": 0.0001787283576500512, + "loss": 2.588, + "step": 24447 + }, + { + "epoch": 0.7249651573110341, + "grad_norm": 0.0966259315609932, + "learning_rate": 0.00017869230630255102, + "loss": 2.5798, + "step": 24448 + }, + { + "epoch": 0.7249948106633455, + "grad_norm": 0.09198171645402908, + "learning_rate": 0.00017865625780028561, + "loss": 2.5747, + "step": 24449 + }, + { + "epoch": 0.725024464015657, + "grad_norm": 0.08917607367038727, + "learning_rate": 0.00017862021214357416, + "loss": 2.5537, + "step": 24450 + }, + { + "epoch": 0.7250541173679684, + "grad_norm": 0.09550958126783371, + "learning_rate": 0.0001785841693327361, + "loss": 2.5842, + "step": 24451 + }, + { + "epoch": 0.72508377072028, + "grad_norm": 0.08898281306028366, + "learning_rate": 0.00017854812936809024, + "loss": 2.5714, + "step": 24452 + }, + { + "epoch": 0.7251134240725914, + "grad_norm": 0.09601332992315292, + "learning_rate": 0.00017851209224995586, + "loss": 2.6107, + "step": 24453 + }, + { + "epoch": 0.7251430774249029, + "grad_norm": 0.09389817714691162, + "learning_rate": 0.00017847605797865207, + "loss": 2.5466, + "step": 24454 + }, + { + "epoch": 0.7251727307772143, + "grad_norm": 0.08829618245363235, + "learning_rate": 0.00017844002655449797, + "loss": 2.5567, + "step": 24455 + }, + { + "epoch": 0.7252023841295259, + "grad_norm": 0.09509865194559097, + "learning_rate": 0.00017840399797781266, + "loss": 2.5862, + "step": 24456 + }, + { + "epoch": 0.7252320374818373, + "grad_norm": 0.08810383826494217, + "learning_rate": 0.0001783679722489151, + "loss": 2.5679, + "step": 24457 + }, + { + "epoch": 0.7252616908341488, + "grad_norm": 0.10000546276569366, + "learning_rate": 0.00017833194936812437, + "loss": 2.5925, + "step": 24458 + }, + { + "epoch": 0.7252913441864602, + "grad_norm": 0.0934114083647728, + "learning_rate": 0.00017829592933575944, + "loss": 2.5717, + "step": 24459 + }, + { + "epoch": 0.7253209975387718, + "grad_norm": 0.0864827111363411, + "learning_rate": 0.00017825991215213917, + "loss": 2.5214, + "step": 24460 + }, + { + "epoch": 0.7253506508910832, + "grad_norm": 0.09436112642288208, + "learning_rate": 0.00017822389781758287, + "loss": 2.5735, + "step": 24461 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 0.09817546606063843, + "learning_rate": 0.00017818788633240906, + "loss": 2.5614, + "step": 24462 + }, + { + "epoch": 0.7254099575957061, + "grad_norm": 0.08370589464902878, + "learning_rate": 0.00017815187769693676, + "loss": 2.6026, + "step": 24463 + }, + { + "epoch": 0.7254396109480177, + "grad_norm": 0.10759888589382172, + "learning_rate": 0.00017811587191148476, + "loss": 2.5555, + "step": 24464 + }, + { + "epoch": 0.7254692643003291, + "grad_norm": 0.09783735126256943, + "learning_rate": 0.00017807986897637202, + "loss": 2.6053, + "step": 24465 + }, + { + "epoch": 0.7254989176526406, + "grad_norm": 0.08930220454931259, + "learning_rate": 0.00017804386889191725, + "loss": 2.5554, + "step": 24466 + }, + { + "epoch": 0.7255285710049522, + "grad_norm": 0.10076712816953659, + "learning_rate": 0.00017800787165843935, + "loss": 2.5905, + "step": 24467 + }, + { + "epoch": 0.7255582243572636, + "grad_norm": 0.09271082282066345, + "learning_rate": 0.00017797187727625698, + "loss": 2.5941, + "step": 24468 + }, + { + "epoch": 0.7255878777095751, + "grad_norm": 0.10517735034227371, + "learning_rate": 0.00017793588574568892, + "loss": 2.5863, + "step": 24469 + }, + { + "epoch": 0.7256175310618865, + "grad_norm": 0.08985581248998642, + "learning_rate": 0.00017789989706705389, + "loss": 2.5812, + "step": 24470 + }, + { + "epoch": 0.7256471844141981, + "grad_norm": 0.10208804905414581, + "learning_rate": 0.00017786391124067054, + "loss": 2.5597, + "step": 24471 + }, + { + "epoch": 0.7256768377665095, + "grad_norm": 0.10128721594810486, + "learning_rate": 0.00017782792826685756, + "loss": 2.5337, + "step": 24472 + }, + { + "epoch": 0.725706491118821, + "grad_norm": 0.0915597453713417, + "learning_rate": 0.00017779194814593352, + "loss": 2.5719, + "step": 24473 + }, + { + "epoch": 0.7257361444711324, + "grad_norm": 0.10306135565042496, + "learning_rate": 0.00017775597087821716, + "loss": 2.5988, + "step": 24474 + }, + { + "epoch": 0.725765797823444, + "grad_norm": 0.09244661778211594, + "learning_rate": 0.00017771999646402697, + "loss": 2.6075, + "step": 24475 + }, + { + "epoch": 0.7257954511757554, + "grad_norm": 0.10258881747722626, + "learning_rate": 0.00017768402490368152, + "loss": 2.6017, + "step": 24476 + }, + { + "epoch": 0.7258251045280669, + "grad_norm": 0.10684552043676376, + "learning_rate": 0.00017764805619749935, + "loss": 2.5492, + "step": 24477 + }, + { + "epoch": 0.7258547578803783, + "grad_norm": 0.09644953906536102, + "learning_rate": 0.00017761209034579902, + "loss": 2.572, + "step": 24478 + }, + { + "epoch": 0.7258844112326899, + "grad_norm": 0.10871312767267227, + "learning_rate": 0.00017757612734889888, + "loss": 2.57, + "step": 24479 + }, + { + "epoch": 0.7259140645850013, + "grad_norm": 0.10301565378904343, + "learning_rate": 0.00017754016720711752, + "loss": 2.5929, + "step": 24480 + }, + { + "epoch": 0.7259437179373128, + "grad_norm": 0.0974961668252945, + "learning_rate": 0.00017750420992077332, + "loss": 2.5793, + "step": 24481 + }, + { + "epoch": 0.7259733712896242, + "grad_norm": 0.08628825843334198, + "learning_rate": 0.00017746825549018474, + "loss": 2.6184, + "step": 24482 + }, + { + "epoch": 0.7260030246419358, + "grad_norm": 0.09829387068748474, + "learning_rate": 0.00017743230391567005, + "loss": 2.53, + "step": 24483 + }, + { + "epoch": 0.7260326779942472, + "grad_norm": 0.09428495168685913, + "learning_rate": 0.00017739635519754777, + "loss": 2.5794, + "step": 24484 + }, + { + "epoch": 0.7260623313465587, + "grad_norm": 0.09462111443281174, + "learning_rate": 0.00017736040933613622, + "loss": 2.5605, + "step": 24485 + }, + { + "epoch": 0.7260919846988702, + "grad_norm": 0.09177643805742264, + "learning_rate": 0.00017732446633175332, + "loss": 2.5901, + "step": 24486 + }, + { + "epoch": 0.7261216380511817, + "grad_norm": 0.09737976640462875, + "learning_rate": 0.00017728852618471787, + "loss": 2.5926, + "step": 24487 + }, + { + "epoch": 0.7261512914034932, + "grad_norm": 0.09340983629226685, + "learning_rate": 0.00017725258889534785, + "loss": 2.5956, + "step": 24488 + }, + { + "epoch": 0.7261809447558046, + "grad_norm": 0.0933784618973732, + "learning_rate": 0.00017721665446396157, + "loss": 2.5666, + "step": 24489 + }, + { + "epoch": 0.7262105981081162, + "grad_norm": 0.09433053433895111, + "learning_rate": 0.0001771807228908772, + "loss": 2.5794, + "step": 24490 + }, + { + "epoch": 0.7262402514604276, + "grad_norm": 0.09523706138134003, + "learning_rate": 0.00017714479417641298, + "loss": 2.5733, + "step": 24491 + }, + { + "epoch": 0.7262699048127391, + "grad_norm": 0.09371936321258545, + "learning_rate": 0.00017710886832088702, + "loss": 2.558, + "step": 24492 + }, + { + "epoch": 0.7262995581650505, + "grad_norm": 0.10220544785261154, + "learning_rate": 0.00017707294532461743, + "loss": 2.5896, + "step": 24493 + }, + { + "epoch": 0.7263292115173621, + "grad_norm": 0.10374341160058975, + "learning_rate": 0.00017703702518792236, + "loss": 2.558, + "step": 24494 + }, + { + "epoch": 0.7263588648696735, + "grad_norm": 0.10109341144561768, + "learning_rate": 0.00017700110791111985, + "loss": 2.6035, + "step": 24495 + }, + { + "epoch": 0.726388518221985, + "grad_norm": 0.10289797931909561, + "learning_rate": 0.00017696519349452816, + "loss": 2.5351, + "step": 24496 + }, + { + "epoch": 0.7264181715742964, + "grad_norm": 0.09627220034599304, + "learning_rate": 0.0001769292819384649, + "loss": 2.5494, + "step": 24497 + }, + { + "epoch": 0.726447824926608, + "grad_norm": 0.09690912812948227, + "learning_rate": 0.0001768933732432484, + "loss": 2.5705, + "step": 24498 + }, + { + "epoch": 0.7264774782789194, + "grad_norm": 0.09635964781045914, + "learning_rate": 0.00017685746740919633, + "loss": 2.5705, + "step": 24499 + }, + { + "epoch": 0.7265071316312309, + "grad_norm": 0.10042890906333923, + "learning_rate": 0.00017682156443662702, + "loss": 2.5342, + "step": 24500 + }, + { + "epoch": 0.7265367849835423, + "grad_norm": 0.09088363498449326, + "learning_rate": 0.00017678566432585818, + "loss": 2.5926, + "step": 24501 + }, + { + "epoch": 0.7265664383358539, + "grad_norm": 0.10369963198900223, + "learning_rate": 0.00017674976707720785, + "loss": 2.558, + "step": 24502 + }, + { + "epoch": 0.7265960916881653, + "grad_norm": 0.09032462537288666, + "learning_rate": 0.00017671387269099377, + "loss": 2.615, + "step": 24503 + }, + { + "epoch": 0.7266257450404768, + "grad_norm": 0.09048431366682053, + "learning_rate": 0.00017667798116753386, + "loss": 2.5202, + "step": 24504 + }, + { + "epoch": 0.7266553983927883, + "grad_norm": 0.08964448422193527, + "learning_rate": 0.00017664209250714593, + "loss": 2.5959, + "step": 24505 + }, + { + "epoch": 0.7266850517450998, + "grad_norm": 0.08571581542491913, + "learning_rate": 0.00017660620671014788, + "loss": 2.5779, + "step": 24506 + }, + { + "epoch": 0.7267147050974113, + "grad_norm": 0.09178799390792847, + "learning_rate": 0.00017657032377685727, + "loss": 2.5739, + "step": 24507 + }, + { + "epoch": 0.7267443584497227, + "grad_norm": 0.08432910591363907, + "learning_rate": 0.0001765344437075919, + "loss": 2.5588, + "step": 24508 + }, + { + "epoch": 0.7267740118020343, + "grad_norm": 0.10316547006368637, + "learning_rate": 0.0001764985665026696, + "loss": 2.562, + "step": 24509 + }, + { + "epoch": 0.7268036651543457, + "grad_norm": 0.09300816804170609, + "learning_rate": 0.00017646269216240802, + "loss": 2.5881, + "step": 24510 + }, + { + "epoch": 0.7268333185066572, + "grad_norm": 0.1080256924033165, + "learning_rate": 0.00017642682068712485, + "loss": 2.5437, + "step": 24511 + }, + { + "epoch": 0.7268629718589686, + "grad_norm": 0.09235815703868866, + "learning_rate": 0.00017639095207713752, + "loss": 2.5578, + "step": 24512 + }, + { + "epoch": 0.7268926252112802, + "grad_norm": 0.10073887556791306, + "learning_rate": 0.00017635508633276405, + "loss": 2.6007, + "step": 24513 + }, + { + "epoch": 0.7269222785635916, + "grad_norm": 0.09764822572469711, + "learning_rate": 0.00017631922345432184, + "loss": 2.5419, + "step": 24514 + }, + { + "epoch": 0.7269519319159031, + "grad_norm": 0.10500986129045486, + "learning_rate": 0.00017628336344212848, + "loss": 2.5798, + "step": 24515 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 0.10428416728973389, + "learning_rate": 0.0001762475062965015, + "loss": 2.5663, + "step": 24516 + }, + { + "epoch": 0.7270112386205261, + "grad_norm": 0.09681106358766556, + "learning_rate": 0.0001762116520177585, + "loss": 2.565, + "step": 24517 + }, + { + "epoch": 0.7270408919728375, + "grad_norm": 0.1028619110584259, + "learning_rate": 0.00017617580060621686, + "loss": 2.5993, + "step": 24518 + }, + { + "epoch": 0.727070545325149, + "grad_norm": 0.11277034878730774, + "learning_rate": 0.00017613995206219402, + "loss": 2.5547, + "step": 24519 + }, + { + "epoch": 0.7271001986774605, + "grad_norm": 0.09805940091609955, + "learning_rate": 0.00017610410638600749, + "loss": 2.6111, + "step": 24520 + }, + { + "epoch": 0.727129852029772, + "grad_norm": 0.09443474560976028, + "learning_rate": 0.00017606826357797472, + "loss": 2.5432, + "step": 24521 + }, + { + "epoch": 0.7271595053820834, + "grad_norm": 0.1057438924908638, + "learning_rate": 0.0001760324236384131, + "loss": 2.5684, + "step": 24522 + }, + { + "epoch": 0.7271891587343949, + "grad_norm": 0.09320404380559921, + "learning_rate": 0.00017599658656763996, + "loss": 2.5373, + "step": 24523 + }, + { + "epoch": 0.7272188120867064, + "grad_norm": 0.09861130267381668, + "learning_rate": 0.0001759607523659726, + "loss": 2.5702, + "step": 24524 + }, + { + "epoch": 0.7272484654390179, + "grad_norm": 0.10162463039159775, + "learning_rate": 0.0001759249210337283, + "loss": 2.5489, + "step": 24525 + }, + { + "epoch": 0.7272781187913293, + "grad_norm": 0.09510039538145065, + "learning_rate": 0.0001758890925712246, + "loss": 2.6112, + "step": 24526 + }, + { + "epoch": 0.7273077721436408, + "grad_norm": 0.0974045917391777, + "learning_rate": 0.0001758532669787788, + "loss": 2.5766, + "step": 24527 + }, + { + "epoch": 0.7273374254959524, + "grad_norm": 0.08987756073474884, + "learning_rate": 0.00017581744425670777, + "loss": 2.56, + "step": 24528 + }, + { + "epoch": 0.7273670788482638, + "grad_norm": 0.10053342580795288, + "learning_rate": 0.00017578162440532892, + "loss": 2.5887, + "step": 24529 + }, + { + "epoch": 0.7273967322005753, + "grad_norm": 0.09316544234752655, + "learning_rate": 0.00017574580742495943, + "loss": 2.5803, + "step": 24530 + }, + { + "epoch": 0.7274263855528867, + "grad_norm": 0.09362924098968506, + "learning_rate": 0.00017570999331591647, + "loss": 2.5909, + "step": 24531 + }, + { + "epoch": 0.7274560389051983, + "grad_norm": 0.08971063792705536, + "learning_rate": 0.0001756741820785172, + "loss": 2.6068, + "step": 24532 + }, + { + "epoch": 0.7274856922575097, + "grad_norm": 0.10448398441076279, + "learning_rate": 0.00017563837371307873, + "loss": 2.565, + "step": 24533 + }, + { + "epoch": 0.7275153456098212, + "grad_norm": 0.09779147058725357, + "learning_rate": 0.0001756025682199181, + "loss": 2.5667, + "step": 24534 + }, + { + "epoch": 0.7275449989621326, + "grad_norm": 0.09591459482908249, + "learning_rate": 0.0001755667655993524, + "loss": 2.6109, + "step": 24535 + }, + { + "epoch": 0.7275746523144442, + "grad_norm": 0.1059061661362648, + "learning_rate": 0.00017553096585169874, + "loss": 2.575, + "step": 24536 + }, + { + "epoch": 0.7276043056667556, + "grad_norm": 0.09063009172677994, + "learning_rate": 0.00017549516897727403, + "loss": 2.5966, + "step": 24537 + }, + { + "epoch": 0.7276339590190671, + "grad_norm": 0.09463508427143097, + "learning_rate": 0.0001754593749763953, + "loss": 2.5834, + "step": 24538 + }, + { + "epoch": 0.7276636123713786, + "grad_norm": 0.10478776693344116, + "learning_rate": 0.0001754235838493795, + "loss": 2.5514, + "step": 24539 + }, + { + "epoch": 0.7276932657236901, + "grad_norm": 0.09454843401908875, + "learning_rate": 0.0001753877955965436, + "loss": 2.5615, + "step": 24540 + }, + { + "epoch": 0.7277229190760015, + "grad_norm": 0.10171014070510864, + "learning_rate": 0.00017535201021820452, + "loss": 2.5757, + "step": 24541 + }, + { + "epoch": 0.727752572428313, + "grad_norm": 0.08943581581115723, + "learning_rate": 0.00017531622771467908, + "loss": 2.6051, + "step": 24542 + }, + { + "epoch": 0.7277822257806245, + "grad_norm": 0.0984322801232338, + "learning_rate": 0.00017528044808628418, + "loss": 2.5452, + "step": 24543 + }, + { + "epoch": 0.727811879132936, + "grad_norm": 0.10145758837461472, + "learning_rate": 0.00017524467133333665, + "loss": 2.5794, + "step": 24544 + }, + { + "epoch": 0.7278415324852474, + "grad_norm": 0.08699916303157806, + "learning_rate": 0.0001752088974561533, + "loss": 2.5671, + "step": 24545 + }, + { + "epoch": 0.7278711858375589, + "grad_norm": 0.11739496886730194, + "learning_rate": 0.0001751731264550509, + "loss": 2.5758, + "step": 24546 + }, + { + "epoch": 0.7279008391898704, + "grad_norm": 0.08820030093193054, + "learning_rate": 0.00017513735833034623, + "loss": 2.538, + "step": 24547 + }, + { + "epoch": 0.7279304925421819, + "grad_norm": 0.10482251644134521, + "learning_rate": 0.000175101593082356, + "loss": 2.5622, + "step": 24548 + }, + { + "epoch": 0.7279601458944934, + "grad_norm": 0.09894511103630066, + "learning_rate": 0.000175065830711397, + "loss": 2.5638, + "step": 24549 + }, + { + "epoch": 0.7279897992468048, + "grad_norm": 0.10039312392473221, + "learning_rate": 0.00017503007121778575, + "loss": 2.5904, + "step": 24550 + }, + { + "epoch": 0.7280194525991164, + "grad_norm": 0.11208341270685196, + "learning_rate": 0.000174994314601839, + "loss": 2.5592, + "step": 24551 + }, + { + "epoch": 0.7280491059514278, + "grad_norm": 0.09452793747186661, + "learning_rate": 0.00017495856086387345, + "loss": 2.5463, + "step": 24552 + }, + { + "epoch": 0.7280787593037393, + "grad_norm": 0.10916910320520401, + "learning_rate": 0.00017492281000420563, + "loss": 2.5722, + "step": 24553 + }, + { + "epoch": 0.7281084126560508, + "grad_norm": 0.09928523749113083, + "learning_rate": 0.00017488706202315208, + "loss": 2.5835, + "step": 24554 + }, + { + "epoch": 0.7281380660083623, + "grad_norm": 0.10145337879657745, + "learning_rate": 0.00017485131692102941, + "loss": 2.5757, + "step": 24555 + }, + { + "epoch": 0.7281677193606737, + "grad_norm": 0.10300998389720917, + "learning_rate": 0.00017481557469815412, + "loss": 2.5451, + "step": 24556 + }, + { + "epoch": 0.7281973727129852, + "grad_norm": 0.09476769715547562, + "learning_rate": 0.00017477983535484282, + "loss": 2.5972, + "step": 24557 + }, + { + "epoch": 0.7282270260652967, + "grad_norm": 0.09006931632757187, + "learning_rate": 0.0001747440988914118, + "loss": 2.542, + "step": 24558 + }, + { + "epoch": 0.7282566794176082, + "grad_norm": 0.0824347659945488, + "learning_rate": 0.00017470836530817768, + "loss": 2.5889, + "step": 24559 + }, + { + "epoch": 0.7282863327699196, + "grad_norm": 0.09173749387264252, + "learning_rate": 0.0001746726346054568, + "loss": 2.5937, + "step": 24560 + }, + { + "epoch": 0.7283159861222311, + "grad_norm": 0.08951692283153534, + "learning_rate": 0.00017463690678356576, + "loss": 2.5996, + "step": 24561 + }, + { + "epoch": 0.7283456394745426, + "grad_norm": 0.09460795670747757, + "learning_rate": 0.0001746011818428206, + "loss": 2.5928, + "step": 24562 + }, + { + "epoch": 0.7283752928268541, + "grad_norm": 0.08606746047735214, + "learning_rate": 0.00017456545978353777, + "loss": 2.5926, + "step": 24563 + }, + { + "epoch": 0.7284049461791655, + "grad_norm": 0.09157159924507141, + "learning_rate": 0.00017452974060603354, + "loss": 2.556, + "step": 24564 + }, + { + "epoch": 0.728434599531477, + "grad_norm": 0.09016478061676025, + "learning_rate": 0.00017449402431062445, + "loss": 2.5822, + "step": 24565 + }, + { + "epoch": 0.7284642528837885, + "grad_norm": 0.08959230035543442, + "learning_rate": 0.0001744583108976267, + "loss": 2.5558, + "step": 24566 + }, + { + "epoch": 0.7284939062361, + "grad_norm": 0.0979032889008522, + "learning_rate": 0.00017442260036735647, + "loss": 2.5898, + "step": 24567 + }, + { + "epoch": 0.7285235595884114, + "grad_norm": 0.09797414392232895, + "learning_rate": 0.00017438689272012998, + "loss": 2.5613, + "step": 24568 + }, + { + "epoch": 0.728553212940723, + "grad_norm": 0.09290548413991928, + "learning_rate": 0.00017435118795626343, + "loss": 2.5811, + "step": 24569 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 0.09990852326154709, + "learning_rate": 0.00017431548607607306, + "loss": 2.6098, + "step": 24570 + }, + { + "epoch": 0.7286125196453459, + "grad_norm": 0.09027566760778427, + "learning_rate": 0.0001742797870798749, + "loss": 2.5971, + "step": 24571 + }, + { + "epoch": 0.7286421729976574, + "grad_norm": 0.09548061341047287, + "learning_rate": 0.00017424409096798534, + "loss": 2.5852, + "step": 24572 + }, + { + "epoch": 0.7286718263499689, + "grad_norm": 0.09033896028995514, + "learning_rate": 0.0001742083977407201, + "loss": 2.581, + "step": 24573 + }, + { + "epoch": 0.7287014797022804, + "grad_norm": 0.10268162935972214, + "learning_rate": 0.00017417270739839542, + "loss": 2.5342, + "step": 24574 + }, + { + "epoch": 0.7287311330545918, + "grad_norm": 0.10019934177398682, + "learning_rate": 0.00017413701994132736, + "loss": 2.5772, + "step": 24575 + }, + { + "epoch": 0.7287607864069033, + "grad_norm": 0.08795538544654846, + "learning_rate": 0.00017410133536983191, + "loss": 2.5751, + "step": 24576 + }, + { + "epoch": 0.7287904397592148, + "grad_norm": 0.10031815618276596, + "learning_rate": 0.00017406565368422488, + "loss": 2.5661, + "step": 24577 + }, + { + "epoch": 0.7288200931115263, + "grad_norm": 0.10079458355903625, + "learning_rate": 0.00017402997488482263, + "loss": 2.5776, + "step": 24578 + }, + { + "epoch": 0.7288497464638377, + "grad_norm": 0.09681402891874313, + "learning_rate": 0.00017399429897194091, + "loss": 2.5637, + "step": 24579 + }, + { + "epoch": 0.7288793998161492, + "grad_norm": 0.09415807574987411, + "learning_rate": 0.00017395862594589556, + "loss": 2.5426, + "step": 24580 + }, + { + "epoch": 0.7289090531684607, + "grad_norm": 0.09349611401557922, + "learning_rate": 0.00017392295580700263, + "loss": 2.5747, + "step": 24581 + }, + { + "epoch": 0.7289387065207722, + "grad_norm": 0.0930216982960701, + "learning_rate": 0.00017388728855557802, + "loss": 2.5761, + "step": 24582 + }, + { + "epoch": 0.7289683598730836, + "grad_norm": 0.09473974257707596, + "learning_rate": 0.00017385162419193727, + "loss": 2.5651, + "step": 24583 + }, + { + "epoch": 0.7289980132253951, + "grad_norm": 0.09253726154565811, + "learning_rate": 0.00017381596271639645, + "loss": 2.5736, + "step": 24584 + }, + { + "epoch": 0.7290276665777066, + "grad_norm": 0.11023834347724915, + "learning_rate": 0.0001737803041292712, + "loss": 2.5501, + "step": 24585 + }, + { + "epoch": 0.7290573199300181, + "grad_norm": 0.0878814086318016, + "learning_rate": 0.00017374464843087733, + "loss": 2.5819, + "step": 24586 + }, + { + "epoch": 0.7290869732823295, + "grad_norm": 0.10384301841259003, + "learning_rate": 0.00017370899562153065, + "loss": 2.5892, + "step": 24587 + }, + { + "epoch": 0.729116626634641, + "grad_norm": 0.08963975310325623, + "learning_rate": 0.0001736733457015468, + "loss": 2.5807, + "step": 24588 + }, + { + "epoch": 0.7291462799869525, + "grad_norm": 0.09497340023517609, + "learning_rate": 0.00017363769867124147, + "loss": 2.5982, + "step": 24589 + }, + { + "epoch": 0.729175933339264, + "grad_norm": 0.10628913342952728, + "learning_rate": 0.0001736020545309302, + "loss": 2.5518, + "step": 24590 + }, + { + "epoch": 0.7292055866915755, + "grad_norm": 0.09168495237827301, + "learning_rate": 0.00017356641328092893, + "loss": 2.5444, + "step": 24591 + }, + { + "epoch": 0.729235240043887, + "grad_norm": 0.09895313531160355, + "learning_rate": 0.00017353077492155306, + "loss": 2.5539, + "step": 24592 + }, + { + "epoch": 0.7292648933961985, + "grad_norm": 0.09400757402181625, + "learning_rate": 0.00017349513945311845, + "loss": 2.5885, + "step": 24593 + }, + { + "epoch": 0.7292945467485099, + "grad_norm": 0.08941123634576797, + "learning_rate": 0.0001734595068759402, + "loss": 2.5805, + "step": 24594 + }, + { + "epoch": 0.7293242001008214, + "grad_norm": 0.08802780508995056, + "learning_rate": 0.0001734238771903341, + "loss": 2.618, + "step": 24595 + }, + { + "epoch": 0.7293538534531329, + "grad_norm": 0.08460243791341782, + "learning_rate": 0.0001733882503966156, + "loss": 2.5466, + "step": 24596 + }, + { + "epoch": 0.7293835068054444, + "grad_norm": 0.0947071835398674, + "learning_rate": 0.0001733526264951002, + "loss": 2.5829, + "step": 24597 + }, + { + "epoch": 0.7294131601577558, + "grad_norm": 0.08192633837461472, + "learning_rate": 0.00017331700548610341, + "loss": 2.5665, + "step": 24598 + }, + { + "epoch": 0.7294428135100673, + "grad_norm": 0.08196747303009033, + "learning_rate": 0.00017328138736994058, + "loss": 2.5733, + "step": 24599 + }, + { + "epoch": 0.7294724668623788, + "grad_norm": 0.08645687252283096, + "learning_rate": 0.0001732457721469271, + "loss": 2.569, + "step": 24600 + }, + { + "epoch": 0.7295021202146903, + "grad_norm": 0.09705953299999237, + "learning_rate": 0.00017321015981737846, + "loss": 2.5458, + "step": 24601 + }, + { + "epoch": 0.7295317735670017, + "grad_norm": 0.08981680124998093, + "learning_rate": 0.0001731745503816098, + "loss": 2.637, + "step": 24602 + }, + { + "epoch": 0.7295614269193132, + "grad_norm": 0.09962907433509827, + "learning_rate": 0.00017313894383993682, + "loss": 2.521, + "step": 24603 + }, + { + "epoch": 0.7295910802716247, + "grad_norm": 0.09016749262809753, + "learning_rate": 0.00017310334019267454, + "loss": 2.5591, + "step": 24604 + }, + { + "epoch": 0.7296207336239362, + "grad_norm": 0.09642597287893295, + "learning_rate": 0.00017306773944013827, + "loss": 2.6056, + "step": 24605 + }, + { + "epoch": 0.7296503869762476, + "grad_norm": 0.0896136686205864, + "learning_rate": 0.00017303214158264325, + "loss": 2.5919, + "step": 24606 + }, + { + "epoch": 0.7296800403285592, + "grad_norm": 0.09082039445638657, + "learning_rate": 0.00017299654662050474, + "loss": 2.5501, + "step": 24607 + }, + { + "epoch": 0.7297096936808706, + "grad_norm": 0.0930744931101799, + "learning_rate": 0.00017296095455403794, + "loss": 2.5865, + "step": 24608 + }, + { + "epoch": 0.7297393470331821, + "grad_norm": 0.08264564722776413, + "learning_rate": 0.0001729253653835581, + "loss": 2.5907, + "step": 24609 + }, + { + "epoch": 0.7297690003854935, + "grad_norm": 0.09169461578130722, + "learning_rate": 0.0001728897791093802, + "loss": 2.5769, + "step": 24610 + }, + { + "epoch": 0.7297986537378051, + "grad_norm": 0.08206798881292343, + "learning_rate": 0.0001728541957318195, + "loss": 2.6135, + "step": 24611 + }, + { + "epoch": 0.7298283070901166, + "grad_norm": 0.09454493969678879, + "learning_rate": 0.00017281861525119096, + "loss": 2.5619, + "step": 24612 + }, + { + "epoch": 0.729857960442428, + "grad_norm": 0.09978517144918442, + "learning_rate": 0.00017278303766780985, + "loss": 2.5763, + "step": 24613 + }, + { + "epoch": 0.7298876137947395, + "grad_norm": 0.0935186892747879, + "learning_rate": 0.00017274746298199107, + "loss": 2.5619, + "step": 24614 + }, + { + "epoch": 0.729917267147051, + "grad_norm": 0.09494666755199432, + "learning_rate": 0.00017271189119404966, + "loss": 2.5705, + "step": 24615 + }, + { + "epoch": 0.7299469204993625, + "grad_norm": 0.10518702119588852, + "learning_rate": 0.0001726763223043007, + "loss": 2.5729, + "step": 24616 + }, + { + "epoch": 0.7299765738516739, + "grad_norm": 0.10021805763244629, + "learning_rate": 0.00017264075631305903, + "loss": 2.5731, + "step": 24617 + }, + { + "epoch": 0.7300062272039854, + "grad_norm": 0.09330686926841736, + "learning_rate": 0.00017260519322063968, + "loss": 2.5929, + "step": 24618 + }, + { + "epoch": 0.7300358805562969, + "grad_norm": 0.11681533604860306, + "learning_rate": 0.0001725696330273575, + "loss": 2.5541, + "step": 24619 + }, + { + "epoch": 0.7300655339086084, + "grad_norm": 0.08485321700572968, + "learning_rate": 0.00017253407573352743, + "loss": 2.5663, + "step": 24620 + }, + { + "epoch": 0.7300951872609198, + "grad_norm": 0.10765177011489868, + "learning_rate": 0.00017249852133946437, + "loss": 2.5915, + "step": 24621 + }, + { + "epoch": 0.7301248406132314, + "grad_norm": 0.10281240195035934, + "learning_rate": 0.0001724629698454831, + "loss": 2.57, + "step": 24622 + }, + { + "epoch": 0.7301544939655428, + "grad_norm": 0.1008811965584755, + "learning_rate": 0.00017242742125189842, + "loss": 2.5425, + "step": 24623 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 0.10246524214744568, + "learning_rate": 0.0001723918755590252, + "loss": 2.5673, + "step": 24624 + }, + { + "epoch": 0.7302138006701657, + "grad_norm": 0.09307770431041718, + "learning_rate": 0.0001723563327671781, + "loss": 2.5547, + "step": 24625 + }, + { + "epoch": 0.7302434540224773, + "grad_norm": 0.10290734469890594, + "learning_rate": 0.00017232079287667195, + "loss": 2.5886, + "step": 24626 + }, + { + "epoch": 0.7302731073747887, + "grad_norm": 0.09019174426794052, + "learning_rate": 0.00017228525588782161, + "loss": 2.5698, + "step": 24627 + }, + { + "epoch": 0.7303027607271002, + "grad_norm": 0.10157759487628937, + "learning_rate": 0.00017224972180094123, + "loss": 2.5844, + "step": 24628 + }, + { + "epoch": 0.7303324140794116, + "grad_norm": 0.09477215260267258, + "learning_rate": 0.00017221419061634596, + "loss": 2.5942, + "step": 24629 + }, + { + "epoch": 0.7303620674317232, + "grad_norm": 0.09932681173086166, + "learning_rate": 0.0001721786623343503, + "loss": 2.5618, + "step": 24630 + }, + { + "epoch": 0.7303917207840346, + "grad_norm": 0.0944458618760109, + "learning_rate": 0.00017214313695526888, + "loss": 2.5731, + "step": 24631 + }, + { + "epoch": 0.7304213741363461, + "grad_norm": 0.09337391704320908, + "learning_rate": 0.00017210761447941625, + "loss": 2.5893, + "step": 24632 + }, + { + "epoch": 0.7304510274886576, + "grad_norm": 0.09041985124349594, + "learning_rate": 0.00017207209490710702, + "loss": 2.5489, + "step": 24633 + }, + { + "epoch": 0.7304806808409691, + "grad_norm": 0.09441231936216354, + "learning_rate": 0.00017203657823865558, + "loss": 2.5653, + "step": 24634 + }, + { + "epoch": 0.7305103341932806, + "grad_norm": 0.09360511600971222, + "learning_rate": 0.00017200106447437662, + "loss": 2.558, + "step": 24635 + }, + { + "epoch": 0.730539987545592, + "grad_norm": 0.09669002890586853, + "learning_rate": 0.00017196555361458448, + "loss": 2.5847, + "step": 24636 + }, + { + "epoch": 0.7305696408979035, + "grad_norm": 0.0960163027048111, + "learning_rate": 0.0001719300456595939, + "loss": 2.5982, + "step": 24637 + }, + { + "epoch": 0.730599294250215, + "grad_norm": 0.08931968361139297, + "learning_rate": 0.00017189454060971887, + "loss": 2.5495, + "step": 24638 + }, + { + "epoch": 0.7306289476025265, + "grad_norm": 0.0925423800945282, + "learning_rate": 0.000171859038465274, + "loss": 2.573, + "step": 24639 + }, + { + "epoch": 0.7306586009548379, + "grad_norm": 0.08879998326301575, + "learning_rate": 0.00017182353922657367, + "loss": 2.5611, + "step": 24640 + }, + { + "epoch": 0.7306882543071495, + "grad_norm": 0.09588305652141571, + "learning_rate": 0.00017178804289393206, + "loss": 2.5988, + "step": 24641 + }, + { + "epoch": 0.7307179076594609, + "grad_norm": 0.100335992872715, + "learning_rate": 0.00017175254946766384, + "loss": 2.5707, + "step": 24642 + }, + { + "epoch": 0.7307475610117724, + "grad_norm": 0.08977913856506348, + "learning_rate": 0.00017171705894808315, + "loss": 2.5777, + "step": 24643 + }, + { + "epoch": 0.7307772143640838, + "grad_norm": 0.09917417168617249, + "learning_rate": 0.00017168157133550421, + "loss": 2.602, + "step": 24644 + }, + { + "epoch": 0.7308068677163954, + "grad_norm": 0.10147546976804733, + "learning_rate": 0.00017164608663024133, + "loss": 2.5728, + "step": 24645 + }, + { + "epoch": 0.7308365210687068, + "grad_norm": 0.10011301934719086, + "learning_rate": 0.00017161060483260877, + "loss": 2.5726, + "step": 24646 + }, + { + "epoch": 0.7308661744210183, + "grad_norm": 0.09387979656457901, + "learning_rate": 0.00017157512594292063, + "loss": 2.5755, + "step": 24647 + }, + { + "epoch": 0.7308958277733297, + "grad_norm": 0.10128596425056458, + "learning_rate": 0.00017153964996149124, + "loss": 2.5552, + "step": 24648 + }, + { + "epoch": 0.7309254811256413, + "grad_norm": 0.09411542117595673, + "learning_rate": 0.00017150417688863452, + "loss": 2.6091, + "step": 24649 + }, + { + "epoch": 0.7309551344779527, + "grad_norm": 0.09545474499464035, + "learning_rate": 0.0001714687067246647, + "loss": 2.5815, + "step": 24650 + }, + { + "epoch": 0.7309847878302642, + "grad_norm": 0.08413807302713394, + "learning_rate": 0.00017143323946989587, + "loss": 2.5918, + "step": 24651 + }, + { + "epoch": 0.7310144411825756, + "grad_norm": 0.09718109667301178, + "learning_rate": 0.00017139777512464204, + "loss": 2.5593, + "step": 24652 + }, + { + "epoch": 0.7310440945348872, + "grad_norm": 0.09517770260572433, + "learning_rate": 0.0001713623136892174, + "loss": 2.5782, + "step": 24653 + }, + { + "epoch": 0.7310737478871987, + "grad_norm": 0.09292353689670563, + "learning_rate": 0.0001713268551639357, + "loss": 2.5758, + "step": 24654 + }, + { + "epoch": 0.7311034012395101, + "grad_norm": 0.08697438985109329, + "learning_rate": 0.00017129139954911128, + "loss": 2.5842, + "step": 24655 + }, + { + "epoch": 0.7311330545918217, + "grad_norm": 0.09213054180145264, + "learning_rate": 0.0001712559468450579, + "loss": 2.6057, + "step": 24656 + }, + { + "epoch": 0.7311627079441331, + "grad_norm": 0.09821134060621262, + "learning_rate": 0.00017122049705208954, + "loss": 2.5921, + "step": 24657 + }, + { + "epoch": 0.7311923612964446, + "grad_norm": 0.09652266651391983, + "learning_rate": 0.00017118505017052032, + "loss": 2.5797, + "step": 24658 + }, + { + "epoch": 0.731222014648756, + "grad_norm": 0.0866498276591301, + "learning_rate": 0.00017114960620066372, + "loss": 2.5664, + "step": 24659 + }, + { + "epoch": 0.7312516680010676, + "grad_norm": 0.09580888599157333, + "learning_rate": 0.00017111416514283383, + "loss": 2.5875, + "step": 24660 + }, + { + "epoch": 0.731281321353379, + "grad_norm": 0.08862292021512985, + "learning_rate": 0.00017107872699734445, + "loss": 2.5776, + "step": 24661 + }, + { + "epoch": 0.7313109747056905, + "grad_norm": 0.0935295969247818, + "learning_rate": 0.0001710432917645094, + "loss": 2.6145, + "step": 24662 + }, + { + "epoch": 0.7313406280580019, + "grad_norm": 0.09502524882555008, + "learning_rate": 0.00017100785944464247, + "loss": 2.5814, + "step": 24663 + }, + { + "epoch": 0.7313702814103135, + "grad_norm": 0.09683801978826523, + "learning_rate": 0.0001709724300380574, + "loss": 2.6027, + "step": 24664 + }, + { + "epoch": 0.7313999347626249, + "grad_norm": 0.08808355778455734, + "learning_rate": 0.00017093700354506796, + "loss": 2.5744, + "step": 24665 + }, + { + "epoch": 0.7314295881149364, + "grad_norm": 0.09064767509698868, + "learning_rate": 0.00017090157996598783, + "loss": 2.5811, + "step": 24666 + }, + { + "epoch": 0.7314592414672478, + "grad_norm": 0.08573977649211884, + "learning_rate": 0.00017086615930113054, + "loss": 2.5712, + "step": 24667 + }, + { + "epoch": 0.7314888948195594, + "grad_norm": 0.08657412230968475, + "learning_rate": 0.00017083074155081003, + "loss": 2.5696, + "step": 24668 + }, + { + "epoch": 0.7315185481718708, + "grad_norm": 0.08690953999757767, + "learning_rate": 0.00017079532671534004, + "loss": 2.562, + "step": 24669 + }, + { + "epoch": 0.7315482015241823, + "grad_norm": 0.08559669554233551, + "learning_rate": 0.00017075991479503373, + "loss": 2.569, + "step": 24670 + }, + { + "epoch": 0.7315778548764937, + "grad_norm": 0.08880592882633209, + "learning_rate": 0.00017072450579020489, + "loss": 2.6177, + "step": 24671 + }, + { + "epoch": 0.7316075082288053, + "grad_norm": 0.08723742514848709, + "learning_rate": 0.00017068909970116702, + "loss": 2.5739, + "step": 24672 + }, + { + "epoch": 0.7316371615811167, + "grad_norm": 0.09692255407571793, + "learning_rate": 0.00017065369652823377, + "loss": 2.5651, + "step": 24673 + }, + { + "epoch": 0.7316668149334282, + "grad_norm": 0.08340545743703842, + "learning_rate": 0.00017061829627171854, + "loss": 2.5501, + "step": 24674 + }, + { + "epoch": 0.7316964682857398, + "grad_norm": 0.09558690339326859, + "learning_rate": 0.00017058289893193484, + "loss": 2.5677, + "step": 24675 + }, + { + "epoch": 0.7317261216380512, + "grad_norm": 0.0877445712685585, + "learning_rate": 0.00017054750450919614, + "loss": 2.6165, + "step": 24676 + }, + { + "epoch": 0.7317557749903627, + "grad_norm": 0.09808623790740967, + "learning_rate": 0.0001705121130038158, + "loss": 2.6016, + "step": 24677 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 0.09618285298347473, + "learning_rate": 0.00017047672441610729, + "loss": 2.6299, + "step": 24678 + }, + { + "epoch": 0.7318150816949857, + "grad_norm": 0.09703842550516129, + "learning_rate": 0.0001704413387463839, + "loss": 2.5698, + "step": 24679 + }, + { + "epoch": 0.7318447350472971, + "grad_norm": 0.09870964288711548, + "learning_rate": 0.00017040595599495905, + "loss": 2.5787, + "step": 24680 + }, + { + "epoch": 0.7318743883996086, + "grad_norm": 0.09189296513795853, + "learning_rate": 0.00017037057616214608, + "loss": 2.5999, + "step": 24681 + }, + { + "epoch": 0.73190404175192, + "grad_norm": 0.10152653604745865, + "learning_rate": 0.0001703351992482582, + "loss": 2.558, + "step": 24682 + }, + { + "epoch": 0.7319336951042316, + "grad_norm": 0.09335426241159439, + "learning_rate": 0.00017029982525360864, + "loss": 2.541, + "step": 24683 + }, + { + "epoch": 0.731963348456543, + "grad_norm": 0.1027706190943718, + "learning_rate": 0.00017026445417851082, + "loss": 2.6022, + "step": 24684 + }, + { + "epoch": 0.7319930018088545, + "grad_norm": 0.10237842053174973, + "learning_rate": 0.0001702290860232778, + "loss": 2.5964, + "step": 24685 + }, + { + "epoch": 0.7320226551611659, + "grad_norm": 0.09336123615503311, + "learning_rate": 0.00017019372078822288, + "loss": 2.5926, + "step": 24686 + }, + { + "epoch": 0.7320523085134775, + "grad_norm": 0.11386232823133469, + "learning_rate": 0.00017015835847365913, + "loss": 2.5915, + "step": 24687 + }, + { + "epoch": 0.7320819618657889, + "grad_norm": 0.09624017030000687, + "learning_rate": 0.00017012299907989977, + "loss": 2.5931, + "step": 24688 + }, + { + "epoch": 0.7321116152181004, + "grad_norm": 0.12707266211509705, + "learning_rate": 0.00017008764260725785, + "loss": 2.56, + "step": 24689 + }, + { + "epoch": 0.7321412685704118, + "grad_norm": 0.09592996537685394, + "learning_rate": 0.00017005228905604648, + "loss": 2.5927, + "step": 24690 + }, + { + "epoch": 0.7321709219227234, + "grad_norm": 0.10676422715187073, + "learning_rate": 0.00017001693842657873, + "loss": 2.5681, + "step": 24691 + }, + { + "epoch": 0.7322005752750348, + "grad_norm": 0.10024562478065491, + "learning_rate": 0.0001699815907191678, + "loss": 2.5633, + "step": 24692 + }, + { + "epoch": 0.7322302286273463, + "grad_norm": 0.11020427942276001, + "learning_rate": 0.00016994624593412623, + "loss": 2.5773, + "step": 24693 + }, + { + "epoch": 0.7322598819796577, + "grad_norm": 0.08549479395151138, + "learning_rate": 0.0001699109040717674, + "loss": 2.5775, + "step": 24694 + }, + { + "epoch": 0.7322895353319693, + "grad_norm": 0.11865522712469101, + "learning_rate": 0.00016987556513240422, + "loss": 2.6008, + "step": 24695 + }, + { + "epoch": 0.7323191886842808, + "grad_norm": 0.08187784999608994, + "learning_rate": 0.00016984022911634955, + "loss": 2.5639, + "step": 24696 + }, + { + "epoch": 0.7323488420365922, + "grad_norm": 0.10190749168395996, + "learning_rate": 0.00016980489602391635, + "loss": 2.5928, + "step": 24697 + }, + { + "epoch": 0.7323784953889038, + "grad_norm": 0.09339004755020142, + "learning_rate": 0.00016976956585541746, + "loss": 2.5916, + "step": 24698 + }, + { + "epoch": 0.7324081487412152, + "grad_norm": 0.09735428541898727, + "learning_rate": 0.00016973423861116576, + "loss": 2.5589, + "step": 24699 + }, + { + "epoch": 0.7324378020935267, + "grad_norm": 0.09382592886686325, + "learning_rate": 0.00016969891429147405, + "loss": 2.5509, + "step": 24700 + }, + { + "epoch": 0.7324674554458381, + "grad_norm": 0.09334315359592438, + "learning_rate": 0.00016966359289665518, + "loss": 2.5561, + "step": 24701 + }, + { + "epoch": 0.7324971087981497, + "grad_norm": 0.0884193480014801, + "learning_rate": 0.0001696282744270219, + "loss": 2.5456, + "step": 24702 + }, + { + "epoch": 0.7325267621504611, + "grad_norm": 0.10862288624048233, + "learning_rate": 0.00016959295888288706, + "loss": 2.5855, + "step": 24703 + }, + { + "epoch": 0.7325564155027726, + "grad_norm": 0.09073721617460251, + "learning_rate": 0.00016955764626456316, + "loss": 2.6074, + "step": 24704 + }, + { + "epoch": 0.732586068855084, + "grad_norm": 0.10853813588619232, + "learning_rate": 0.00016952233657236305, + "loss": 2.5875, + "step": 24705 + }, + { + "epoch": 0.7326157222073956, + "grad_norm": 0.10101667046546936, + "learning_rate": 0.00016948702980659912, + "loss": 2.5896, + "step": 24706 + }, + { + "epoch": 0.732645375559707, + "grad_norm": 0.10802321881055832, + "learning_rate": 0.0001694517259675845, + "loss": 2.5795, + "step": 24707 + }, + { + "epoch": 0.7326750289120185, + "grad_norm": 0.09060391783714294, + "learning_rate": 0.0001694164250556316, + "loss": 2.5746, + "step": 24708 + }, + { + "epoch": 0.7327046822643299, + "grad_norm": 0.10066544264554977, + "learning_rate": 0.00016938112707105298, + "loss": 2.5779, + "step": 24709 + }, + { + "epoch": 0.7327343356166415, + "grad_norm": 0.09160707145929337, + "learning_rate": 0.00016934583201416126, + "loss": 2.5607, + "step": 24710 + }, + { + "epoch": 0.7327639889689529, + "grad_norm": 0.1010475680232048, + "learning_rate": 0.0001693105398852689, + "loss": 2.594, + "step": 24711 + }, + { + "epoch": 0.7327936423212644, + "grad_norm": 0.09160172939300537, + "learning_rate": 0.00016927525068468846, + "loss": 2.5638, + "step": 24712 + }, + { + "epoch": 0.7328232956735758, + "grad_norm": 0.10486746579408646, + "learning_rate": 0.00016923996441273265, + "loss": 2.6075, + "step": 24713 + }, + { + "epoch": 0.7328529490258874, + "grad_norm": 0.09519980847835541, + "learning_rate": 0.0001692046810697136, + "loss": 2.5735, + "step": 24714 + }, + { + "epoch": 0.7328826023781989, + "grad_norm": 0.10322505980730057, + "learning_rate": 0.00016916940065594383, + "loss": 2.5604, + "step": 24715 + }, + { + "epoch": 0.7329122557305103, + "grad_norm": 0.10186421126127243, + "learning_rate": 0.00016913412317173582, + "loss": 2.5675, + "step": 24716 + }, + { + "epoch": 0.7329419090828219, + "grad_norm": 0.10012496262788773, + "learning_rate": 0.00016909884861740192, + "loss": 2.5242, + "step": 24717 + }, + { + "epoch": 0.7329715624351333, + "grad_norm": 0.10174525529146194, + "learning_rate": 0.00016906357699325453, + "loss": 2.607, + "step": 24718 + }, + { + "epoch": 0.7330012157874448, + "grad_norm": 0.09258424490690231, + "learning_rate": 0.0001690283082996058, + "loss": 2.5701, + "step": 24719 + }, + { + "epoch": 0.7330308691397562, + "grad_norm": 0.10503055155277252, + "learning_rate": 0.00016899304253676834, + "loss": 2.5678, + "step": 24720 + }, + { + "epoch": 0.7330605224920678, + "grad_norm": 0.0953482910990715, + "learning_rate": 0.0001689577797050544, + "loss": 2.538, + "step": 24721 + }, + { + "epoch": 0.7330901758443792, + "grad_norm": 0.10168388485908508, + "learning_rate": 0.00016892251980477602, + "loss": 2.557, + "step": 24722 + }, + { + "epoch": 0.7331198291966907, + "grad_norm": 0.09827740490436554, + "learning_rate": 0.00016888726283624566, + "loss": 2.5793, + "step": 24723 + }, + { + "epoch": 0.7331494825490021, + "grad_norm": 0.09416986256837845, + "learning_rate": 0.0001688520087997755, + "loss": 2.5605, + "step": 24724 + }, + { + "epoch": 0.7331791359013137, + "grad_norm": 0.09269137680530548, + "learning_rate": 0.00016881675769567757, + "loss": 2.534, + "step": 24725 + }, + { + "epoch": 0.7332087892536251, + "grad_norm": 0.09267488121986389, + "learning_rate": 0.00016878150952426402, + "loss": 2.5998, + "step": 24726 + }, + { + "epoch": 0.7332384426059366, + "grad_norm": 0.0865868628025055, + "learning_rate": 0.0001687462642858471, + "loss": 2.5898, + "step": 24727 + }, + { + "epoch": 0.733268095958248, + "grad_norm": 0.09174535423517227, + "learning_rate": 0.00016871102198073883, + "loss": 2.5827, + "step": 24728 + }, + { + "epoch": 0.7332977493105596, + "grad_norm": 0.09617854654788971, + "learning_rate": 0.00016867578260925138, + "loss": 2.5548, + "step": 24729 + }, + { + "epoch": 0.733327402662871, + "grad_norm": 0.08753971755504608, + "learning_rate": 0.00016864054617169672, + "loss": 2.5836, + "step": 24730 + }, + { + "epoch": 0.7333570560151825, + "grad_norm": 0.09198182076215744, + "learning_rate": 0.00016860531266838693, + "loss": 2.6013, + "step": 24731 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 0.0958978459239006, + "learning_rate": 0.00016857008209963375, + "loss": 2.5608, + "step": 24732 + }, + { + "epoch": 0.7334163627198055, + "grad_norm": 0.10017776489257812, + "learning_rate": 0.00016853485446574962, + "loss": 2.5448, + "step": 24733 + }, + { + "epoch": 0.7334460160721169, + "grad_norm": 0.08883854746818542, + "learning_rate": 0.00016849962976704636, + "loss": 2.6, + "step": 24734 + }, + { + "epoch": 0.7334756694244284, + "grad_norm": 0.09749910235404968, + "learning_rate": 0.0001684644080038356, + "loss": 2.5376, + "step": 24735 + }, + { + "epoch": 0.73350532277674, + "grad_norm": 0.10722161084413528, + "learning_rate": 0.00016842918917642945, + "loss": 2.5681, + "step": 24736 + }, + { + "epoch": 0.7335349761290514, + "grad_norm": 0.08538971096277237, + "learning_rate": 0.0001683939732851398, + "loss": 2.5796, + "step": 24737 + }, + { + "epoch": 0.7335646294813629, + "grad_norm": 0.10097865760326385, + "learning_rate": 0.00016835876033027836, + "loss": 2.5639, + "step": 24738 + }, + { + "epoch": 0.7335942828336743, + "grad_norm": 0.08749880641698837, + "learning_rate": 0.00016832355031215702, + "loss": 2.5622, + "step": 24739 + }, + { + "epoch": 0.7336239361859859, + "grad_norm": 0.10317711532115936, + "learning_rate": 0.0001682883432310876, + "loss": 2.5599, + "step": 24740 + }, + { + "epoch": 0.7336535895382973, + "grad_norm": 0.09201346337795258, + "learning_rate": 0.00016825313908738182, + "loss": 2.6057, + "step": 24741 + }, + { + "epoch": 0.7336832428906088, + "grad_norm": 0.10808859765529633, + "learning_rate": 0.00016821793788135143, + "loss": 2.5781, + "step": 24742 + }, + { + "epoch": 0.7337128962429202, + "grad_norm": 0.1116173267364502, + "learning_rate": 0.00016818273961330816, + "loss": 2.5486, + "step": 24743 + }, + { + "epoch": 0.7337425495952318, + "grad_norm": 0.09907671064138412, + "learning_rate": 0.00016814754428356372, + "loss": 2.548, + "step": 24744 + }, + { + "epoch": 0.7337722029475432, + "grad_norm": 0.10590211302042007, + "learning_rate": 0.00016811235189242968, + "loss": 2.5557, + "step": 24745 + }, + { + "epoch": 0.7338018562998547, + "grad_norm": 0.09997522830963135, + "learning_rate": 0.00016807716244021775, + "loss": 2.559, + "step": 24746 + }, + { + "epoch": 0.7338315096521661, + "grad_norm": 0.09629660844802856, + "learning_rate": 0.0001680419759272396, + "loss": 2.5622, + "step": 24747 + }, + { + "epoch": 0.7338611630044777, + "grad_norm": 0.11703693866729736, + "learning_rate": 0.00016800679235380662, + "loss": 2.5648, + "step": 24748 + }, + { + "epoch": 0.7338908163567891, + "grad_norm": 0.08808862417936325, + "learning_rate": 0.0001679716117202305, + "loss": 2.5594, + "step": 24749 + }, + { + "epoch": 0.7339204697091006, + "grad_norm": 0.10354103147983551, + "learning_rate": 0.0001679364340268228, + "loss": 2.564, + "step": 24750 + }, + { + "epoch": 0.733950123061412, + "grad_norm": 0.08717931807041168, + "learning_rate": 0.00016790125927389495, + "loss": 2.5696, + "step": 24751 + }, + { + "epoch": 0.7339797764137236, + "grad_norm": 0.1075376346707344, + "learning_rate": 0.00016786608746175845, + "loss": 2.5415, + "step": 24752 + }, + { + "epoch": 0.734009429766035, + "grad_norm": 0.09125621616840363, + "learning_rate": 0.00016783091859072475, + "loss": 2.5785, + "step": 24753 + }, + { + "epoch": 0.7340390831183465, + "grad_norm": 0.10584009438753128, + "learning_rate": 0.0001677957526611053, + "loss": 2.5541, + "step": 24754 + }, + { + "epoch": 0.734068736470658, + "grad_norm": 0.09454667568206787, + "learning_rate": 0.00016776058967321144, + "loss": 2.5706, + "step": 24755 + }, + { + "epoch": 0.7340983898229695, + "grad_norm": 0.10432330518960953, + "learning_rate": 0.00016772542962735466, + "loss": 2.5342, + "step": 24756 + }, + { + "epoch": 0.734128043175281, + "grad_norm": 0.09566438943147659, + "learning_rate": 0.00016769027252384617, + "loss": 2.6024, + "step": 24757 + }, + { + "epoch": 0.7341576965275924, + "grad_norm": 0.10023903846740723, + "learning_rate": 0.00016765511836299742, + "loss": 2.5721, + "step": 24758 + }, + { + "epoch": 0.734187349879904, + "grad_norm": 0.08393390476703644, + "learning_rate": 0.00016761996714511956, + "loss": 2.5585, + "step": 24759 + }, + { + "epoch": 0.7342170032322154, + "grad_norm": 0.09530974924564362, + "learning_rate": 0.00016758481887052401, + "loss": 2.6033, + "step": 24760 + }, + { + "epoch": 0.7342466565845269, + "grad_norm": 0.08503635227680206, + "learning_rate": 0.0001675496735395219, + "loss": 2.5656, + "step": 24761 + }, + { + "epoch": 0.7342763099368383, + "grad_norm": 0.10298274457454681, + "learning_rate": 0.00016751453115242454, + "loss": 2.6152, + "step": 24762 + }, + { + "epoch": 0.7343059632891499, + "grad_norm": 0.08589009195566177, + "learning_rate": 0.000167479391709543, + "loss": 2.6004, + "step": 24763 + }, + { + "epoch": 0.7343356166414613, + "grad_norm": 0.09084353595972061, + "learning_rate": 0.0001674442552111886, + "loss": 2.5801, + "step": 24764 + }, + { + "epoch": 0.7343652699937728, + "grad_norm": 0.09711235761642456, + "learning_rate": 0.00016740912165767242, + "loss": 2.5957, + "step": 24765 + }, + { + "epoch": 0.7343949233460842, + "grad_norm": 0.08379948139190674, + "learning_rate": 0.0001673739910493055, + "loss": 2.5937, + "step": 24766 + }, + { + "epoch": 0.7344245766983958, + "grad_norm": 0.08910349756479263, + "learning_rate": 0.00016733886338639897, + "loss": 2.6015, + "step": 24767 + }, + { + "epoch": 0.7344542300507072, + "grad_norm": 0.09106245636940002, + "learning_rate": 0.00016730373866926412, + "loss": 2.5475, + "step": 24768 + }, + { + "epoch": 0.7344838834030187, + "grad_norm": 0.09994182735681534, + "learning_rate": 0.0001672686168982116, + "loss": 2.5858, + "step": 24769 + }, + { + "epoch": 0.7345135367553302, + "grad_norm": 0.08935445547103882, + "learning_rate": 0.0001672334980735526, + "loss": 2.5568, + "step": 24770 + }, + { + "epoch": 0.7345431901076417, + "grad_norm": 0.08769361674785614, + "learning_rate": 0.00016719838219559786, + "loss": 2.5431, + "step": 24771 + }, + { + "epoch": 0.7345728434599531, + "grad_norm": 0.0905238687992096, + "learning_rate": 0.0001671632692646588, + "loss": 2.5917, + "step": 24772 + }, + { + "epoch": 0.7346024968122646, + "grad_norm": 0.08869127184152603, + "learning_rate": 0.00016712815928104607, + "loss": 2.6202, + "step": 24773 + }, + { + "epoch": 0.7346321501645761, + "grad_norm": 0.0892433300614357, + "learning_rate": 0.0001670930522450707, + "loss": 2.5545, + "step": 24774 + }, + { + "epoch": 0.7346618035168876, + "grad_norm": 0.08761309087276459, + "learning_rate": 0.00016705794815704346, + "loss": 2.5844, + "step": 24775 + }, + { + "epoch": 0.734691456869199, + "grad_norm": 0.08975216746330261, + "learning_rate": 0.0001670228470172752, + "loss": 2.5697, + "step": 24776 + }, + { + "epoch": 0.7347211102215105, + "grad_norm": 0.09422388672828674, + "learning_rate": 0.0001669877488260768, + "loss": 2.5858, + "step": 24777 + }, + { + "epoch": 0.7347507635738221, + "grad_norm": 0.0873052254319191, + "learning_rate": 0.0001669526535837591, + "loss": 2.5864, + "step": 24778 + }, + { + "epoch": 0.7347804169261335, + "grad_norm": 0.08606941252946854, + "learning_rate": 0.00016691756129063296, + "loss": 2.5938, + "step": 24779 + }, + { + "epoch": 0.734810070278445, + "grad_norm": 0.08116025477647781, + "learning_rate": 0.0001668824719470088, + "loss": 2.5929, + "step": 24780 + }, + { + "epoch": 0.7348397236307564, + "grad_norm": 0.08970856666564941, + "learning_rate": 0.00016684738555319752, + "loss": 2.5573, + "step": 24781 + }, + { + "epoch": 0.734869376983068, + "grad_norm": 0.09828050434589386, + "learning_rate": 0.00016681230210950981, + "loss": 2.5724, + "step": 24782 + }, + { + "epoch": 0.7348990303353794, + "grad_norm": 0.09366854280233383, + "learning_rate": 0.00016677722161625625, + "loss": 2.5762, + "step": 24783 + }, + { + "epoch": 0.7349286836876909, + "grad_norm": 0.09243689477443695, + "learning_rate": 0.00016674214407374772, + "loss": 2.5848, + "step": 24784 + }, + { + "epoch": 0.7349583370400024, + "grad_norm": 0.10251379758119583, + "learning_rate": 0.00016670706948229474, + "loss": 2.5752, + "step": 24785 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 0.09128424525260925, + "learning_rate": 0.00016667199784220783, + "loss": 2.5976, + "step": 24786 + }, + { + "epoch": 0.7350176437446253, + "grad_norm": 0.0926082655787468, + "learning_rate": 0.00016663692915379764, + "loss": 2.5745, + "step": 24787 + }, + { + "epoch": 0.7350472970969368, + "grad_norm": 0.09671445935964584, + "learning_rate": 0.0001666018634173746, + "loss": 2.5839, + "step": 24788 + }, + { + "epoch": 0.7350769504492483, + "grad_norm": 0.09968671947717667, + "learning_rate": 0.00016656680063324947, + "loss": 2.5711, + "step": 24789 + }, + { + "epoch": 0.7351066038015598, + "grad_norm": 0.08573520928621292, + "learning_rate": 0.00016653174080173238, + "loss": 2.5837, + "step": 24790 + }, + { + "epoch": 0.7351362571538712, + "grad_norm": 0.09463147073984146, + "learning_rate": 0.000166496683923134, + "loss": 2.5607, + "step": 24791 + }, + { + "epoch": 0.7351659105061827, + "grad_norm": 0.09418828785419464, + "learning_rate": 0.00016646162999776478, + "loss": 2.5804, + "step": 24792 + }, + { + "epoch": 0.7351955638584942, + "grad_norm": 0.09070559591054916, + "learning_rate": 0.00016642657902593496, + "loss": 2.5589, + "step": 24793 + }, + { + "epoch": 0.7352252172108057, + "grad_norm": 0.08937786519527435, + "learning_rate": 0.0001663915310079551, + "loss": 2.5836, + "step": 24794 + }, + { + "epoch": 0.7352548705631171, + "grad_norm": 0.09626930207014084, + "learning_rate": 0.00016635648594413548, + "loss": 2.5698, + "step": 24795 + }, + { + "epoch": 0.7352845239154286, + "grad_norm": 0.08845847845077515, + "learning_rate": 0.00016632144383478632, + "loss": 2.5842, + "step": 24796 + }, + { + "epoch": 0.7353141772677401, + "grad_norm": 0.08860532939434052, + "learning_rate": 0.00016628640468021816, + "loss": 2.5701, + "step": 24797 + }, + { + "epoch": 0.7353438306200516, + "grad_norm": 0.08669018745422363, + "learning_rate": 0.0001662513684807412, + "loss": 2.584, + "step": 24798 + }, + { + "epoch": 0.7353734839723631, + "grad_norm": 0.08302785456180573, + "learning_rate": 0.00016621633523666564, + "loss": 2.5769, + "step": 24799 + }, + { + "epoch": 0.7354031373246745, + "grad_norm": 0.10100145637989044, + "learning_rate": 0.0001661813049483019, + "loss": 2.5863, + "step": 24800 + }, + { + "epoch": 0.7354327906769861, + "grad_norm": 0.09405197203159332, + "learning_rate": 0.00016614627761595986, + "loss": 2.5928, + "step": 24801 + }, + { + "epoch": 0.7354624440292975, + "grad_norm": 0.09099353849887848, + "learning_rate": 0.0001661112532399498, + "loss": 2.5921, + "step": 24802 + }, + { + "epoch": 0.735492097381609, + "grad_norm": 0.09166771173477173, + "learning_rate": 0.0001660762318205819, + "loss": 2.573, + "step": 24803 + }, + { + "epoch": 0.7355217507339205, + "grad_norm": 0.09666094183921814, + "learning_rate": 0.00016604121335816636, + "loss": 2.5529, + "step": 24804 + }, + { + "epoch": 0.735551404086232, + "grad_norm": 0.09134252369403839, + "learning_rate": 0.00016600619785301312, + "loss": 2.5594, + "step": 24805 + }, + { + "epoch": 0.7355810574385434, + "grad_norm": 0.11079829186201096, + "learning_rate": 0.00016597118530543238, + "loss": 2.6004, + "step": 24806 + }, + { + "epoch": 0.7356107107908549, + "grad_norm": 0.09531692415475845, + "learning_rate": 0.00016593617571573406, + "loss": 2.5551, + "step": 24807 + }, + { + "epoch": 0.7356403641431664, + "grad_norm": 0.10608600080013275, + "learning_rate": 0.00016590116908422832, + "loss": 2.5858, + "step": 24808 + }, + { + "epoch": 0.7356700174954779, + "grad_norm": 0.08575908839702606, + "learning_rate": 0.00016586616541122484, + "loss": 2.5612, + "step": 24809 + }, + { + "epoch": 0.7356996708477893, + "grad_norm": 0.09951663017272949, + "learning_rate": 0.0001658311646970342, + "loss": 2.5605, + "step": 24810 + }, + { + "epoch": 0.7357293242001008, + "grad_norm": 0.09417211264371872, + "learning_rate": 0.00016579616694196574, + "loss": 2.5599, + "step": 24811 + }, + { + "epoch": 0.7357589775524123, + "grad_norm": 0.10958236455917358, + "learning_rate": 0.00016576117214632964, + "loss": 2.6128, + "step": 24812 + }, + { + "epoch": 0.7357886309047238, + "grad_norm": 0.09143257141113281, + "learning_rate": 0.00016572618031043574, + "loss": 2.5672, + "step": 24813 + }, + { + "epoch": 0.7358182842570352, + "grad_norm": 0.11211270093917847, + "learning_rate": 0.00016569119143459388, + "loss": 2.5502, + "step": 24814 + }, + { + "epoch": 0.7358479376093467, + "grad_norm": 0.09437967836856842, + "learning_rate": 0.00016565620551911385, + "loss": 2.5811, + "step": 24815 + }, + { + "epoch": 0.7358775909616582, + "grad_norm": 0.10110466927289963, + "learning_rate": 0.00016562122256430557, + "loss": 2.5751, + "step": 24816 + }, + { + "epoch": 0.7359072443139697, + "grad_norm": 0.09949912130832672, + "learning_rate": 0.00016558624257047871, + "loss": 2.5226, + "step": 24817 + }, + { + "epoch": 0.7359368976662811, + "grad_norm": 0.10009856522083282, + "learning_rate": 0.00016555126553794314, + "loss": 2.5554, + "step": 24818 + }, + { + "epoch": 0.7359665510185927, + "grad_norm": 0.10557720810174942, + "learning_rate": 0.00016551629146700848, + "loss": 2.5917, + "step": 24819 + }, + { + "epoch": 0.7359962043709042, + "grad_norm": 0.1005324274301529, + "learning_rate": 0.00016548132035798445, + "loss": 2.56, + "step": 24820 + }, + { + "epoch": 0.7360258577232156, + "grad_norm": 0.10167765617370605, + "learning_rate": 0.00016544635221118077, + "loss": 2.5758, + "step": 24821 + }, + { + "epoch": 0.7360555110755271, + "grad_norm": 0.09969808161258698, + "learning_rate": 0.00016541138702690707, + "loss": 2.5648, + "step": 24822 + }, + { + "epoch": 0.7360851644278386, + "grad_norm": 0.09661008417606354, + "learning_rate": 0.00016537642480547298, + "loss": 2.5781, + "step": 24823 + }, + { + "epoch": 0.7361148177801501, + "grad_norm": 0.09610693901777267, + "learning_rate": 0.0001653414655471881, + "loss": 2.5749, + "step": 24824 + }, + { + "epoch": 0.7361444711324615, + "grad_norm": 0.10060760378837585, + "learning_rate": 0.00016530650925236195, + "loss": 2.5699, + "step": 24825 + }, + { + "epoch": 0.736174124484773, + "grad_norm": 0.09278273582458496, + "learning_rate": 0.00016527155592130412, + "loss": 2.5545, + "step": 24826 + }, + { + "epoch": 0.7362037778370845, + "grad_norm": 0.11386117339134216, + "learning_rate": 0.00016523660555432413, + "loss": 2.5617, + "step": 24827 + }, + { + "epoch": 0.736233431189396, + "grad_norm": 0.09215433895587921, + "learning_rate": 0.00016520165815173143, + "loss": 2.5737, + "step": 24828 + }, + { + "epoch": 0.7362630845417074, + "grad_norm": 0.10615059733390808, + "learning_rate": 0.00016516671371383552, + "loss": 2.5712, + "step": 24829 + }, + { + "epoch": 0.736292737894019, + "grad_norm": 0.08993511646986008, + "learning_rate": 0.00016513177224094583, + "loss": 2.581, + "step": 24830 + }, + { + "epoch": 0.7363223912463304, + "grad_norm": 0.10058861970901489, + "learning_rate": 0.0001650968337333718, + "loss": 2.5556, + "step": 24831 + }, + { + "epoch": 0.7363520445986419, + "grad_norm": 0.11178920418024063, + "learning_rate": 0.0001650618981914228, + "loss": 2.569, + "step": 24832 + }, + { + "epoch": 0.7363816979509533, + "grad_norm": 0.0942600667476654, + "learning_rate": 0.00016502696561540814, + "loss": 2.5807, + "step": 24833 + }, + { + "epoch": 0.7364113513032648, + "grad_norm": 0.12269163876771927, + "learning_rate": 0.0001649920360056374, + "loss": 2.5602, + "step": 24834 + }, + { + "epoch": 0.7364410046555763, + "grad_norm": 0.10510348528623581, + "learning_rate": 0.00016495710936241938, + "loss": 2.5665, + "step": 24835 + }, + { + "epoch": 0.7364706580078878, + "grad_norm": 0.10369518399238586, + "learning_rate": 0.00016492218568606377, + "loss": 2.589, + "step": 24836 + }, + { + "epoch": 0.7365003113601992, + "grad_norm": 0.09952307492494583, + "learning_rate": 0.00016488726497687972, + "loss": 2.5828, + "step": 24837 + }, + { + "epoch": 0.7365299647125108, + "grad_norm": 0.0987284854054451, + "learning_rate": 0.0001648523472351765, + "loss": 2.5812, + "step": 24838 + }, + { + "epoch": 0.7365596180648222, + "grad_norm": 0.10590137541294098, + "learning_rate": 0.00016481743246126324, + "loss": 2.5904, + "step": 24839 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 0.11460810899734497, + "learning_rate": 0.00016478252065544918, + "loss": 2.5493, + "step": 24840 + }, + { + "epoch": 0.7366189247694452, + "grad_norm": 0.08811608701944351, + "learning_rate": 0.00016474761181804344, + "loss": 2.5856, + "step": 24841 + }, + { + "epoch": 0.7366485781217567, + "grad_norm": 0.11542785167694092, + "learning_rate": 0.00016471270594935512, + "loss": 2.5764, + "step": 24842 + }, + { + "epoch": 0.7366782314740682, + "grad_norm": 0.08735831081867218, + "learning_rate": 0.00016467780304969338, + "loss": 2.5884, + "step": 24843 + }, + { + "epoch": 0.7367078848263796, + "grad_norm": 0.10591204464435577, + "learning_rate": 0.0001646429031193672, + "loss": 2.5868, + "step": 24844 + }, + { + "epoch": 0.7367375381786911, + "grad_norm": 0.09887993335723877, + "learning_rate": 0.00016460800615868587, + "loss": 2.5929, + "step": 24845 + }, + { + "epoch": 0.7367671915310026, + "grad_norm": 0.09267313778400421, + "learning_rate": 0.000164573112167958, + "loss": 2.5602, + "step": 24846 + }, + { + "epoch": 0.7367968448833141, + "grad_norm": 0.0953531265258789, + "learning_rate": 0.00016453822114749283, + "loss": 2.5523, + "step": 24847 + }, + { + "epoch": 0.7368264982356255, + "grad_norm": 0.08678004890680313, + "learning_rate": 0.00016450333309759918, + "loss": 2.5844, + "step": 24848 + }, + { + "epoch": 0.736856151587937, + "grad_norm": 0.10166020691394806, + "learning_rate": 0.00016446844801858623, + "loss": 2.613, + "step": 24849 + }, + { + "epoch": 0.7368858049402485, + "grad_norm": 0.08877718448638916, + "learning_rate": 0.00016443356591076274, + "loss": 2.5825, + "step": 24850 + }, + { + "epoch": 0.73691545829256, + "grad_norm": 0.09852387011051178, + "learning_rate": 0.00016439868677443765, + "loss": 2.5511, + "step": 24851 + }, + { + "epoch": 0.7369451116448714, + "grad_norm": 0.08025772124528885, + "learning_rate": 0.0001643638106099198, + "loss": 2.5568, + "step": 24852 + }, + { + "epoch": 0.736974764997183, + "grad_norm": 0.09383831918239594, + "learning_rate": 0.00016432893741751798, + "loss": 2.5459, + "step": 24853 + }, + { + "epoch": 0.7370044183494944, + "grad_norm": 0.08263319730758667, + "learning_rate": 0.0001642940671975411, + "loss": 2.5706, + "step": 24854 + }, + { + "epoch": 0.7370340717018059, + "grad_norm": 0.10134933143854141, + "learning_rate": 0.00016425919995029798, + "loss": 2.5473, + "step": 24855 + }, + { + "epoch": 0.7370637250541173, + "grad_norm": 0.1014585793018341, + "learning_rate": 0.0001642243356760971, + "loss": 2.575, + "step": 24856 + }, + { + "epoch": 0.7370933784064289, + "grad_norm": 0.09263614565134048, + "learning_rate": 0.00016418947437524739, + "loss": 2.6008, + "step": 24857 + }, + { + "epoch": 0.7371230317587403, + "grad_norm": 0.09919878095388412, + "learning_rate": 0.00016415461604805752, + "loss": 2.5823, + "step": 24858 + }, + { + "epoch": 0.7371526851110518, + "grad_norm": 0.10007704794406891, + "learning_rate": 0.0001641197606948362, + "loss": 2.6225, + "step": 24859 + }, + { + "epoch": 0.7371823384633632, + "grad_norm": 0.10116594284772873, + "learning_rate": 0.00016408490831589206, + "loss": 2.6143, + "step": 24860 + }, + { + "epoch": 0.7372119918156748, + "grad_norm": 0.1080680713057518, + "learning_rate": 0.0001640500589115335, + "loss": 2.5816, + "step": 24861 + }, + { + "epoch": 0.7372416451679863, + "grad_norm": 0.09228731691837311, + "learning_rate": 0.00016401521248206953, + "loss": 2.574, + "step": 24862 + }, + { + "epoch": 0.7372712985202977, + "grad_norm": 0.1030651107430458, + "learning_rate": 0.00016398036902780854, + "loss": 2.5806, + "step": 24863 + }, + { + "epoch": 0.7373009518726092, + "grad_norm": 0.09326156228780746, + "learning_rate": 0.00016394552854905904, + "loss": 2.5657, + "step": 24864 + }, + { + "epoch": 0.7373306052249207, + "grad_norm": 0.0867149606347084, + "learning_rate": 0.0001639106910461297, + "loss": 2.5257, + "step": 24865 + }, + { + "epoch": 0.7373602585772322, + "grad_norm": 0.09269218146800995, + "learning_rate": 0.00016387585651932878, + "loss": 2.6038, + "step": 24866 + }, + { + "epoch": 0.7373899119295436, + "grad_norm": 0.09007222205400467, + "learning_rate": 0.0001638410249689648, + "loss": 2.55, + "step": 24867 + }, + { + "epoch": 0.7374195652818551, + "grad_norm": 0.09244261682033539, + "learning_rate": 0.00016380619639534629, + "loss": 2.5934, + "step": 24868 + }, + { + "epoch": 0.7374492186341666, + "grad_norm": 0.09059617668390274, + "learning_rate": 0.00016377137079878157, + "loss": 2.5665, + "step": 24869 + }, + { + "epoch": 0.7374788719864781, + "grad_norm": 0.10083039849996567, + "learning_rate": 0.00016373654817957906, + "loss": 2.578, + "step": 24870 + }, + { + "epoch": 0.7375085253387895, + "grad_norm": 0.08995784819126129, + "learning_rate": 0.00016370172853804715, + "loss": 2.5895, + "step": 24871 + }, + { + "epoch": 0.737538178691101, + "grad_norm": 0.10137518495321274, + "learning_rate": 0.00016366691187449418, + "loss": 2.5815, + "step": 24872 + }, + { + "epoch": 0.7375678320434125, + "grad_norm": 0.09482985734939575, + "learning_rate": 0.00016363209818922843, + "loss": 2.5634, + "step": 24873 + }, + { + "epoch": 0.737597485395724, + "grad_norm": 0.09404876828193665, + "learning_rate": 0.00016359728748255802, + "loss": 2.5911, + "step": 24874 + }, + { + "epoch": 0.7376271387480354, + "grad_norm": 0.09012073278427124, + "learning_rate": 0.00016356247975479155, + "loss": 2.59, + "step": 24875 + }, + { + "epoch": 0.737656792100347, + "grad_norm": 0.10298417508602142, + "learning_rate": 0.00016352767500623722, + "loss": 2.554, + "step": 24876 + }, + { + "epoch": 0.7376864454526584, + "grad_norm": 0.09294494241476059, + "learning_rate": 0.0001634928732372029, + "loss": 2.5706, + "step": 24877 + }, + { + "epoch": 0.7377160988049699, + "grad_norm": 0.09760834276676178, + "learning_rate": 0.00016345807444799698, + "loss": 2.5958, + "step": 24878 + }, + { + "epoch": 0.7377457521572813, + "grad_norm": 0.08608300983905792, + "learning_rate": 0.00016342327863892757, + "loss": 2.5739, + "step": 24879 + }, + { + "epoch": 0.7377754055095929, + "grad_norm": 0.09733811765909195, + "learning_rate": 0.0001633884858103028, + "loss": 2.5891, + "step": 24880 + }, + { + "epoch": 0.7378050588619043, + "grad_norm": 0.09215877950191498, + "learning_rate": 0.00016335369596243076, + "loss": 2.5481, + "step": 24881 + }, + { + "epoch": 0.7378347122142158, + "grad_norm": 0.10523447394371033, + "learning_rate": 0.00016331890909561953, + "loss": 2.6097, + "step": 24882 + }, + { + "epoch": 0.7378643655665273, + "grad_norm": 0.09422142058610916, + "learning_rate": 0.00016328412521017716, + "loss": 2.5655, + "step": 24883 + }, + { + "epoch": 0.7378940189188388, + "grad_norm": 0.09808259457349777, + "learning_rate": 0.00016324934430641164, + "loss": 2.5504, + "step": 24884 + }, + { + "epoch": 0.7379236722711503, + "grad_norm": 0.09041299670934677, + "learning_rate": 0.000163214566384631, + "loss": 2.5385, + "step": 24885 + }, + { + "epoch": 0.7379533256234617, + "grad_norm": 0.08826347440481186, + "learning_rate": 0.00016317979144514318, + "loss": 2.5646, + "step": 24886 + }, + { + "epoch": 0.7379829789757733, + "grad_norm": 0.09287165105342865, + "learning_rate": 0.0001631450194882561, + "loss": 2.5584, + "step": 24887 + }, + { + "epoch": 0.7380126323280847, + "grad_norm": 0.09213591367006302, + "learning_rate": 0.0001631102505142777, + "loss": 2.578, + "step": 24888 + }, + { + "epoch": 0.7380422856803962, + "grad_norm": 0.09102215617895126, + "learning_rate": 0.00016307548452351584, + "loss": 2.5959, + "step": 24889 + }, + { + "epoch": 0.7380719390327076, + "grad_norm": 0.09185915440320969, + "learning_rate": 0.0001630407215162784, + "loss": 2.6061, + "step": 24890 + }, + { + "epoch": 0.7381015923850192, + "grad_norm": 0.09971626102924347, + "learning_rate": 0.00016300596149287328, + "loss": 2.5687, + "step": 24891 + }, + { + "epoch": 0.7381312457373306, + "grad_norm": 0.09144304692745209, + "learning_rate": 0.00016297120445360813, + "loss": 2.5491, + "step": 24892 + }, + { + "epoch": 0.7381608990896421, + "grad_norm": 0.0917033851146698, + "learning_rate": 0.00016293645039879085, + "loss": 2.5487, + "step": 24893 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 0.09217037260532379, + "learning_rate": 0.00016290169932872917, + "loss": 2.5755, + "step": 24894 + }, + { + "epoch": 0.7382202057942651, + "grad_norm": 0.08649494498968124, + "learning_rate": 0.0001628669512437308, + "loss": 2.604, + "step": 24895 + }, + { + "epoch": 0.7382498591465765, + "grad_norm": 0.09310468286275864, + "learning_rate": 0.00016283220614410343, + "loss": 2.5901, + "step": 24896 + }, + { + "epoch": 0.738279512498888, + "grad_norm": 0.0843835398554802, + "learning_rate": 0.00016279746403015478, + "loss": 2.6072, + "step": 24897 + }, + { + "epoch": 0.7383091658511994, + "grad_norm": 0.08969996869564056, + "learning_rate": 0.00016276272490219242, + "loss": 2.5744, + "step": 24898 + }, + { + "epoch": 0.738338819203511, + "grad_norm": 0.08175529539585114, + "learning_rate": 0.00016272798876052402, + "loss": 2.5934, + "step": 24899 + }, + { + "epoch": 0.7383684725558224, + "grad_norm": 0.08994422107934952, + "learning_rate": 0.00016269325560545718, + "loss": 2.5625, + "step": 24900 + }, + { + "epoch": 0.7383981259081339, + "grad_norm": 0.08741804957389832, + "learning_rate": 0.00016265852543729948, + "loss": 2.5518, + "step": 24901 + }, + { + "epoch": 0.7384277792604453, + "grad_norm": 0.08627067506313324, + "learning_rate": 0.00016262379825635843, + "loss": 2.5937, + "step": 24902 + }, + { + "epoch": 0.7384574326127569, + "grad_norm": 0.17850597202777863, + "learning_rate": 0.00016258907406294156, + "loss": 2.5927, + "step": 24903 + }, + { + "epoch": 0.7384870859650684, + "grad_norm": 0.08915096521377563, + "learning_rate": 0.00016255435285735637, + "loss": 2.546, + "step": 24904 + }, + { + "epoch": 0.7385167393173798, + "grad_norm": 0.08529753983020782, + "learning_rate": 0.00016251963463991025, + "loss": 2.5464, + "step": 24905 + }, + { + "epoch": 0.7385463926696914, + "grad_norm": 0.09411364048719406, + "learning_rate": 0.00016248491941091075, + "loss": 2.5528, + "step": 24906 + }, + { + "epoch": 0.7385760460220028, + "grad_norm": 0.09613151848316193, + "learning_rate": 0.00016245020717066522, + "loss": 2.5621, + "step": 24907 + }, + { + "epoch": 0.7386056993743143, + "grad_norm": 0.09936888515949249, + "learning_rate": 0.00016241549791948102, + "loss": 2.6, + "step": 24908 + }, + { + "epoch": 0.7386353527266257, + "grad_norm": 0.0950029194355011, + "learning_rate": 0.0001623807916576655, + "loss": 2.5809, + "step": 24909 + }, + { + "epoch": 0.7386650060789373, + "grad_norm": 0.09846807271242142, + "learning_rate": 0.00016234608838552628, + "loss": 2.5956, + "step": 24910 + }, + { + "epoch": 0.7386946594312487, + "grad_norm": 0.09114626049995422, + "learning_rate": 0.00016231138810337016, + "loss": 2.5881, + "step": 24911 + }, + { + "epoch": 0.7387243127835602, + "grad_norm": 0.08742465823888779, + "learning_rate": 0.0001622766908115047, + "loss": 2.562, + "step": 24912 + }, + { + "epoch": 0.7387539661358716, + "grad_norm": 0.09038158506155014, + "learning_rate": 0.0001622419965102369, + "loss": 2.5635, + "step": 24913 + }, + { + "epoch": 0.7387836194881832, + "grad_norm": 0.08973494172096252, + "learning_rate": 0.00016220730519987442, + "loss": 2.6091, + "step": 24914 + }, + { + "epoch": 0.7388132728404946, + "grad_norm": 0.10307091474533081, + "learning_rate": 0.00016217261688072421, + "loss": 2.5953, + "step": 24915 + }, + { + "epoch": 0.7388429261928061, + "grad_norm": 0.08907864987850189, + "learning_rate": 0.00016213793155309348, + "loss": 2.5606, + "step": 24916 + }, + { + "epoch": 0.7388725795451175, + "grad_norm": 0.08402951806783676, + "learning_rate": 0.00016210324921728936, + "loss": 2.5568, + "step": 24917 + }, + { + "epoch": 0.7389022328974291, + "grad_norm": 0.08798940479755402, + "learning_rate": 0.00016206856987361896, + "loss": 2.5917, + "step": 24918 + }, + { + "epoch": 0.7389318862497405, + "grad_norm": 0.08947069942951202, + "learning_rate": 0.00016203389352238934, + "loss": 2.5558, + "step": 24919 + }, + { + "epoch": 0.738961539602052, + "grad_norm": 0.09998274594545364, + "learning_rate": 0.0001619992201639076, + "loss": 2.6129, + "step": 24920 + }, + { + "epoch": 0.7389911929543634, + "grad_norm": 0.09496821463108063, + "learning_rate": 0.00016196454979848103, + "loss": 2.5551, + "step": 24921 + }, + { + "epoch": 0.739020846306675, + "grad_norm": 0.09197413176298141, + "learning_rate": 0.00016192988242641615, + "loss": 2.5788, + "step": 24922 + }, + { + "epoch": 0.7390504996589865, + "grad_norm": 0.09479240328073502, + "learning_rate": 0.00016189521804802027, + "loss": 2.5625, + "step": 24923 + }, + { + "epoch": 0.7390801530112979, + "grad_norm": 0.09485135227441788, + "learning_rate": 0.00016186055666360022, + "loss": 2.5731, + "step": 24924 + }, + { + "epoch": 0.7391098063636095, + "grad_norm": 0.09279855340719223, + "learning_rate": 0.00016182589827346295, + "loss": 2.5713, + "step": 24925 + }, + { + "epoch": 0.7391394597159209, + "grad_norm": 0.09018396586179733, + "learning_rate": 0.0001617912428779153, + "loss": 2.569, + "step": 24926 + }, + { + "epoch": 0.7391691130682324, + "grad_norm": 0.09027333557605743, + "learning_rate": 0.00016175659047726437, + "loss": 2.5845, + "step": 24927 + }, + { + "epoch": 0.7391987664205438, + "grad_norm": 0.09467794001102448, + "learning_rate": 0.00016172194107181686, + "loss": 2.5732, + "step": 24928 + }, + { + "epoch": 0.7392284197728554, + "grad_norm": 0.08991916477680206, + "learning_rate": 0.0001616872946618797, + "loss": 2.5539, + "step": 24929 + }, + { + "epoch": 0.7392580731251668, + "grad_norm": 0.08378464728593826, + "learning_rate": 0.00016165265124775958, + "loss": 2.5608, + "step": 24930 + }, + { + "epoch": 0.7392877264774783, + "grad_norm": 0.08750876039266586, + "learning_rate": 0.00016161801082976347, + "loss": 2.5516, + "step": 24931 + }, + { + "epoch": 0.7393173798297897, + "grad_norm": 0.08255935460329056, + "learning_rate": 0.00016158337340819778, + "loss": 2.5773, + "step": 24932 + }, + { + "epoch": 0.7393470331821013, + "grad_norm": 0.09016817808151245, + "learning_rate": 0.00016154873898336942, + "loss": 2.5678, + "step": 24933 + }, + { + "epoch": 0.7393766865344127, + "grad_norm": 0.0933835580945015, + "learning_rate": 0.0001615141075555851, + "loss": 2.558, + "step": 24934 + }, + { + "epoch": 0.7394063398867242, + "grad_norm": 0.10294836014509201, + "learning_rate": 0.00016147947912515144, + "loss": 2.5713, + "step": 24935 + }, + { + "epoch": 0.7394359932390356, + "grad_norm": 0.09059703350067139, + "learning_rate": 0.00016144485369237505, + "loss": 2.5427, + "step": 24936 + }, + { + "epoch": 0.7394656465913472, + "grad_norm": 0.09706032276153564, + "learning_rate": 0.00016141023125756265, + "loss": 2.5627, + "step": 24937 + }, + { + "epoch": 0.7394952999436586, + "grad_norm": 0.0999925285577774, + "learning_rate": 0.00016137561182102077, + "loss": 2.5937, + "step": 24938 + }, + { + "epoch": 0.7395249532959701, + "grad_norm": 0.09445463865995407, + "learning_rate": 0.0001613409953830558, + "loss": 2.5991, + "step": 24939 + }, + { + "epoch": 0.7395546066482815, + "grad_norm": 0.09390103071928024, + "learning_rate": 0.00016130638194397458, + "loss": 2.5446, + "step": 24940 + }, + { + "epoch": 0.7395842600005931, + "grad_norm": 0.09477244317531586, + "learning_rate": 0.00016127177150408368, + "loss": 2.5718, + "step": 24941 + }, + { + "epoch": 0.7396139133529045, + "grad_norm": 0.09147021174430847, + "learning_rate": 0.0001612371640636892, + "loss": 2.5528, + "step": 24942 + }, + { + "epoch": 0.739643566705216, + "grad_norm": 0.09461136162281036, + "learning_rate": 0.00016120255962309783, + "loss": 2.5772, + "step": 24943 + }, + { + "epoch": 0.7396732200575276, + "grad_norm": 0.09491890668869019, + "learning_rate": 0.00016116795818261586, + "loss": 2.5615, + "step": 24944 + }, + { + "epoch": 0.739702873409839, + "grad_norm": 0.09963645786046982, + "learning_rate": 0.00016113335974254984, + "loss": 2.5593, + "step": 24945 + }, + { + "epoch": 0.7397325267621505, + "grad_norm": 0.10024194419384003, + "learning_rate": 0.00016109876430320607, + "loss": 2.5664, + "step": 24946 + }, + { + "epoch": 0.7397621801144619, + "grad_norm": 0.09267354756593704, + "learning_rate": 0.0001610641718648909, + "loss": 2.6062, + "step": 24947 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 0.09224142879247665, + "learning_rate": 0.0001610295824279107, + "loss": 2.5851, + "step": 24948 + }, + { + "epoch": 0.7398214868190849, + "grad_norm": 0.08967829495668411, + "learning_rate": 0.00016099499599257173, + "loss": 2.5548, + "step": 24949 + }, + { + "epoch": 0.7398511401713964, + "grad_norm": 0.09045720845460892, + "learning_rate": 0.00016096041255918026, + "loss": 2.5788, + "step": 24950 + }, + { + "epoch": 0.7398807935237078, + "grad_norm": 0.08718360215425491, + "learning_rate": 0.00016092583212804252, + "loss": 2.5583, + "step": 24951 + }, + { + "epoch": 0.7399104468760194, + "grad_norm": 0.08439405262470245, + "learning_rate": 0.00016089125469946475, + "loss": 2.5247, + "step": 24952 + }, + { + "epoch": 0.7399401002283308, + "grad_norm": 0.08929471671581268, + "learning_rate": 0.00016085668027375312, + "loss": 2.5586, + "step": 24953 + }, + { + "epoch": 0.7399697535806423, + "grad_norm": 0.09273568540811539, + "learning_rate": 0.00016082210885121374, + "loss": 2.5827, + "step": 24954 + }, + { + "epoch": 0.7399994069329537, + "grad_norm": 0.08818985521793365, + "learning_rate": 0.00016078754043215288, + "loss": 2.5772, + "step": 24955 + }, + { + "epoch": 0.7400290602852653, + "grad_norm": 0.08751045167446136, + "learning_rate": 0.00016075297501687652, + "loss": 2.545, + "step": 24956 + }, + { + "epoch": 0.7400587136375767, + "grad_norm": 0.0909309834241867, + "learning_rate": 0.00016071841260569087, + "loss": 2.6212, + "step": 24957 + }, + { + "epoch": 0.7400883669898882, + "grad_norm": 0.09948183596134186, + "learning_rate": 0.00016068385319890183, + "loss": 2.5458, + "step": 24958 + }, + { + "epoch": 0.7401180203421996, + "grad_norm": 0.08653932809829712, + "learning_rate": 0.00016064929679681557, + "loss": 2.5569, + "step": 24959 + }, + { + "epoch": 0.7401476736945112, + "grad_norm": 0.0845438539981842, + "learning_rate": 0.00016061474339973804, + "loss": 2.5794, + "step": 24960 + }, + { + "epoch": 0.7401773270468226, + "grad_norm": 0.09820644557476044, + "learning_rate": 0.00016058019300797517, + "loss": 2.5569, + "step": 24961 + }, + { + "epoch": 0.7402069803991341, + "grad_norm": 0.08794185519218445, + "learning_rate": 0.000160545645621833, + "loss": 2.5533, + "step": 24962 + }, + { + "epoch": 0.7402366337514455, + "grad_norm": 0.09908231347799301, + "learning_rate": 0.0001605111012416174, + "loss": 2.5638, + "step": 24963 + }, + { + "epoch": 0.7402662871037571, + "grad_norm": 0.09312506765127182, + "learning_rate": 0.00016047655986763421, + "loss": 2.5519, + "step": 24964 + }, + { + "epoch": 0.7402959404560686, + "grad_norm": 0.10284704715013504, + "learning_rate": 0.00016044202150018938, + "loss": 2.5678, + "step": 24965 + }, + { + "epoch": 0.74032559380838, + "grad_norm": 0.09096011519432068, + "learning_rate": 0.0001604074861395888, + "loss": 2.5882, + "step": 24966 + }, + { + "epoch": 0.7403552471606916, + "grad_norm": 0.09750779718160629, + "learning_rate": 0.00016037295378613814, + "loss": 2.5828, + "step": 24967 + }, + { + "epoch": 0.740384900513003, + "grad_norm": 0.09007971733808517, + "learning_rate": 0.00016033842444014335, + "loss": 2.5912, + "step": 24968 + }, + { + "epoch": 0.7404145538653145, + "grad_norm": 0.09318007528781891, + "learning_rate": 0.00016030389810191005, + "loss": 2.5758, + "step": 24969 + }, + { + "epoch": 0.7404442072176259, + "grad_norm": 0.092923603951931, + "learning_rate": 0.000160269374771744, + "loss": 2.5808, + "step": 24970 + }, + { + "epoch": 0.7404738605699375, + "grad_norm": 0.09562777727842331, + "learning_rate": 0.00016023485444995107, + "loss": 2.5818, + "step": 24971 + }, + { + "epoch": 0.7405035139222489, + "grad_norm": 0.08864714205265045, + "learning_rate": 0.00016020033713683675, + "loss": 2.5585, + "step": 24972 + }, + { + "epoch": 0.7405331672745604, + "grad_norm": 0.08957725018262863, + "learning_rate": 0.00016016582283270682, + "loss": 2.5366, + "step": 24973 + }, + { + "epoch": 0.7405628206268718, + "grad_norm": 0.09218752384185791, + "learning_rate": 0.0001601313115378668, + "loss": 2.5808, + "step": 24974 + }, + { + "epoch": 0.7405924739791834, + "grad_norm": 0.1027810201048851, + "learning_rate": 0.00016009680325262238, + "loss": 2.5915, + "step": 24975 + }, + { + "epoch": 0.7406221273314948, + "grad_norm": 0.08291416615247726, + "learning_rate": 0.00016006229797727928, + "loss": 2.5403, + "step": 24976 + }, + { + "epoch": 0.7406517806838063, + "grad_norm": 0.09869394451379776, + "learning_rate": 0.00016002779571214256, + "loss": 2.5838, + "step": 24977 + }, + { + "epoch": 0.7406814340361177, + "grad_norm": 0.08083615452051163, + "learning_rate": 0.0001599932964575182, + "loss": 2.5815, + "step": 24978 + }, + { + "epoch": 0.7407110873884293, + "grad_norm": 0.10384989529848099, + "learning_rate": 0.00015995880021371157, + "loss": 2.5821, + "step": 24979 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.09421253949403763, + "learning_rate": 0.00015992430698102812, + "loss": 2.5556, + "step": 24980 + }, + { + "epoch": 0.7407703940930522, + "grad_norm": 0.10029540956020355, + "learning_rate": 0.0001598898167597733, + "loss": 2.5969, + "step": 24981 + }, + { + "epoch": 0.7408000474453637, + "grad_norm": 0.1006820797920227, + "learning_rate": 0.00015985532955025263, + "loss": 2.5726, + "step": 24982 + }, + { + "epoch": 0.7408297007976752, + "grad_norm": 0.0972985029220581, + "learning_rate": 0.00015982084535277132, + "loss": 2.5295, + "step": 24983 + }, + { + "epoch": 0.7408593541499866, + "grad_norm": 0.09263800084590912, + "learning_rate": 0.0001597863641676348, + "loss": 2.6102, + "step": 24984 + }, + { + "epoch": 0.7408890075022981, + "grad_norm": 0.09688430279493332, + "learning_rate": 0.00015975188599514845, + "loss": 2.606, + "step": 24985 + }, + { + "epoch": 0.7409186608546097, + "grad_norm": 0.09539183229207993, + "learning_rate": 0.0001597174108356177, + "loss": 2.5648, + "step": 24986 + }, + { + "epoch": 0.7409483142069211, + "grad_norm": 0.09220855683088303, + "learning_rate": 0.00015968293868934752, + "loss": 2.5989, + "step": 24987 + }, + { + "epoch": 0.7409779675592326, + "grad_norm": 0.0980222076177597, + "learning_rate": 0.00015964846955664335, + "loss": 2.5476, + "step": 24988 + }, + { + "epoch": 0.741007620911544, + "grad_norm": 0.08931516110897064, + "learning_rate": 0.0001596140034378104, + "loss": 2.5713, + "step": 24989 + }, + { + "epoch": 0.7410372742638556, + "grad_norm": 0.09321557730436325, + "learning_rate": 0.0001595795403331537, + "loss": 2.5712, + "step": 24990 + }, + { + "epoch": 0.741066927616167, + "grad_norm": 0.08307991921901703, + "learning_rate": 0.00015954508024297877, + "loss": 2.597, + "step": 24991 + }, + { + "epoch": 0.7410965809684785, + "grad_norm": 0.0967833623290062, + "learning_rate": 0.00015951062316759063, + "loss": 2.5753, + "step": 24992 + }, + { + "epoch": 0.74112623432079, + "grad_norm": 0.09486737847328186, + "learning_rate": 0.00015947616910729433, + "loss": 2.5897, + "step": 24993 + }, + { + "epoch": 0.7411558876731015, + "grad_norm": 0.09045965224504471, + "learning_rate": 0.000159441718062395, + "loss": 2.5706, + "step": 24994 + }, + { + "epoch": 0.7411855410254129, + "grad_norm": 0.0952165424823761, + "learning_rate": 0.0001594072700331977, + "loss": 2.5673, + "step": 24995 + }, + { + "epoch": 0.7412151943777244, + "grad_norm": 0.08423153311014175, + "learning_rate": 0.0001593728250200075, + "loss": 2.5556, + "step": 24996 + }, + { + "epoch": 0.7412448477300358, + "grad_norm": 0.09937245398759842, + "learning_rate": 0.0001593383830231296, + "loss": 2.5886, + "step": 24997 + }, + { + "epoch": 0.7412745010823474, + "grad_norm": 0.09776092320680618, + "learning_rate": 0.00015930394404286858, + "loss": 2.5735, + "step": 24998 + }, + { + "epoch": 0.7413041544346588, + "grad_norm": 0.09060558676719666, + "learning_rate": 0.00015926950807952967, + "loss": 2.5543, + "step": 24999 + }, + { + "epoch": 0.7413338077869703, + "grad_norm": 0.10395728796720505, + "learning_rate": 0.00015923507513341766, + "loss": 2.562, + "step": 25000 + }, + { + "epoch": 0.7413634611392818, + "grad_norm": 0.0944821834564209, + "learning_rate": 0.00015920064520483763, + "loss": 2.5654, + "step": 25001 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 0.09859602153301239, + "learning_rate": 0.0001591662182940944, + "loss": 2.5837, + "step": 25002 + }, + { + "epoch": 0.7414227678439047, + "grad_norm": 0.09187237918376923, + "learning_rate": 0.0001591317944014926, + "loss": 2.6068, + "step": 25003 + }, + { + "epoch": 0.7414524211962162, + "grad_norm": 0.09819495677947998, + "learning_rate": 0.00015909737352733745, + "loss": 2.5652, + "step": 25004 + }, + { + "epoch": 0.7414820745485277, + "grad_norm": 0.08946523815393448, + "learning_rate": 0.0001590629556719336, + "loss": 2.5464, + "step": 25005 + }, + { + "epoch": 0.7415117279008392, + "grad_norm": 0.09439093619585037, + "learning_rate": 0.0001590285408355857, + "loss": 2.5643, + "step": 25006 + }, + { + "epoch": 0.7415413812531507, + "grad_norm": 0.10562554746866226, + "learning_rate": 0.00015899412901859884, + "loss": 2.5491, + "step": 25007 + }, + { + "epoch": 0.7415710346054621, + "grad_norm": 0.10238240659236908, + "learning_rate": 0.0001589597202212773, + "loss": 2.5496, + "step": 25008 + }, + { + "epoch": 0.7416006879577737, + "grad_norm": 0.10450764745473862, + "learning_rate": 0.000158925314443926, + "loss": 2.5887, + "step": 25009 + }, + { + "epoch": 0.7416303413100851, + "grad_norm": 0.09844780713319778, + "learning_rate": 0.0001588909116868496, + "loss": 2.5845, + "step": 25010 + }, + { + "epoch": 0.7416599946623966, + "grad_norm": 0.09315329045057297, + "learning_rate": 0.00015885651195035271, + "loss": 2.5937, + "step": 25011 + }, + { + "epoch": 0.741689648014708, + "grad_norm": 0.08941412717103958, + "learning_rate": 0.00015882211523474, + "loss": 2.5758, + "step": 25012 + }, + { + "epoch": 0.7417193013670196, + "grad_norm": 0.09649555385112762, + "learning_rate": 0.00015878772154031596, + "loss": 2.5931, + "step": 25013 + }, + { + "epoch": 0.741748954719331, + "grad_norm": 0.09708001464605331, + "learning_rate": 0.00015875333086738525, + "loss": 2.5603, + "step": 25014 + }, + { + "epoch": 0.7417786080716425, + "grad_norm": 0.09299816191196442, + "learning_rate": 0.00015871894321625236, + "loss": 2.5805, + "step": 25015 + }, + { + "epoch": 0.741808261423954, + "grad_norm": 0.10868250578641891, + "learning_rate": 0.00015868455858722164, + "loss": 2.5946, + "step": 25016 + }, + { + "epoch": 0.7418379147762655, + "grad_norm": 0.08591669797897339, + "learning_rate": 0.0001586501769805979, + "loss": 2.5541, + "step": 25017 + }, + { + "epoch": 0.7418675681285769, + "grad_norm": 0.11326808482408524, + "learning_rate": 0.0001586157983966856, + "loss": 2.5527, + "step": 25018 + }, + { + "epoch": 0.7418972214808884, + "grad_norm": 0.09365080296993256, + "learning_rate": 0.00015858142283578885, + "loss": 2.5571, + "step": 25019 + }, + { + "epoch": 0.7419268748331999, + "grad_norm": 0.1018889769911766, + "learning_rate": 0.0001585470502982122, + "loss": 2.6107, + "step": 25020 + }, + { + "epoch": 0.7419565281855114, + "grad_norm": 0.10117486119270325, + "learning_rate": 0.00015851268078426002, + "loss": 2.5821, + "step": 25021 + }, + { + "epoch": 0.7419861815378228, + "grad_norm": 0.09492931514978409, + "learning_rate": 0.0001584783142942367, + "loss": 2.6073, + "step": 25022 + }, + { + "epoch": 0.7420158348901343, + "grad_norm": 0.08956070989370346, + "learning_rate": 0.00015844395082844647, + "loss": 2.6019, + "step": 25023 + }, + { + "epoch": 0.7420454882424458, + "grad_norm": 0.09478660672903061, + "learning_rate": 0.00015840959038719376, + "loss": 2.5808, + "step": 25024 + }, + { + "epoch": 0.7420751415947573, + "grad_norm": 0.09380228817462921, + "learning_rate": 0.0001583752329707827, + "loss": 2.5676, + "step": 25025 + }, + { + "epoch": 0.7421047949470687, + "grad_norm": 0.09423765540122986, + "learning_rate": 0.00015834087857951763, + "loss": 2.5663, + "step": 25026 + }, + { + "epoch": 0.7421344482993802, + "grad_norm": 0.10163948684930801, + "learning_rate": 0.0001583065272137027, + "loss": 2.5824, + "step": 25027 + }, + { + "epoch": 0.7421641016516918, + "grad_norm": 0.09013311564922333, + "learning_rate": 0.0001582721788736421, + "loss": 2.579, + "step": 25028 + }, + { + "epoch": 0.7421937550040032, + "grad_norm": 0.09886351972818375, + "learning_rate": 0.00015823783355964005, + "loss": 2.5361, + "step": 25029 + }, + { + "epoch": 0.7422234083563147, + "grad_norm": 0.08667219430208206, + "learning_rate": 0.0001582034912720006, + "loss": 2.5892, + "step": 25030 + }, + { + "epoch": 0.7422530617086261, + "grad_norm": 0.08918917924165726, + "learning_rate": 0.000158169152011028, + "loss": 2.5893, + "step": 25031 + }, + { + "epoch": 0.7422827150609377, + "grad_norm": 0.08732754737138748, + "learning_rate": 0.00015813481577702616, + "loss": 2.553, + "step": 25032 + }, + { + "epoch": 0.7423123684132491, + "grad_norm": 0.08872071653604507, + "learning_rate": 0.0001581004825702992, + "loss": 2.5843, + "step": 25033 + }, + { + "epoch": 0.7423420217655606, + "grad_norm": 0.0914207249879837, + "learning_rate": 0.00015806615239115119, + "loss": 2.5537, + "step": 25034 + }, + { + "epoch": 0.742371675117872, + "grad_norm": 0.09178359806537628, + "learning_rate": 0.00015803182523988608, + "loss": 2.5649, + "step": 25035 + }, + { + "epoch": 0.7424013284701836, + "grad_norm": 0.0867404043674469, + "learning_rate": 0.00015799750111680782, + "loss": 2.5849, + "step": 25036 + }, + { + "epoch": 0.742430981822495, + "grad_norm": 0.08959529548883438, + "learning_rate": 0.00015796318002222044, + "loss": 2.5853, + "step": 25037 + }, + { + "epoch": 0.7424606351748065, + "grad_norm": 0.09064183384180069, + "learning_rate": 0.0001579288619564278, + "loss": 2.5545, + "step": 25038 + }, + { + "epoch": 0.742490288527118, + "grad_norm": 0.08390074968338013, + "learning_rate": 0.00015789454691973382, + "loss": 2.5737, + "step": 25039 + }, + { + "epoch": 0.7425199418794295, + "grad_norm": 0.08836262673139572, + "learning_rate": 0.0001578602349124423, + "loss": 2.5804, + "step": 25040 + }, + { + "epoch": 0.7425495952317409, + "grad_norm": 0.08875438570976257, + "learning_rate": 0.00015782592593485735, + "loss": 2.5746, + "step": 25041 + }, + { + "epoch": 0.7425792485840524, + "grad_norm": 0.09040074050426483, + "learning_rate": 0.0001577916199872822, + "loss": 2.5748, + "step": 25042 + }, + { + "epoch": 0.7426089019363639, + "grad_norm": 0.10037349164485931, + "learning_rate": 0.0001577573170700211, + "loss": 2.5691, + "step": 25043 + }, + { + "epoch": 0.7426385552886754, + "grad_norm": 0.09150364249944687, + "learning_rate": 0.0001577230171833778, + "loss": 2.5597, + "step": 25044 + }, + { + "epoch": 0.7426682086409868, + "grad_norm": 0.08815857768058777, + "learning_rate": 0.00015768872032765585, + "loss": 2.5747, + "step": 25045 + }, + { + "epoch": 0.7426978619932983, + "grad_norm": 0.08479725569486618, + "learning_rate": 0.0001576544265031591, + "loss": 2.5965, + "step": 25046 + }, + { + "epoch": 0.7427275153456098, + "grad_norm": 0.0862225890159607, + "learning_rate": 0.00015762013571019118, + "loss": 2.5949, + "step": 25047 + }, + { + "epoch": 0.7427571686979213, + "grad_norm": 0.08365841954946518, + "learning_rate": 0.00015758584794905566, + "loss": 2.5798, + "step": 25048 + }, + { + "epoch": 0.7427868220502328, + "grad_norm": 0.08708936721086502, + "learning_rate": 0.00015755156322005626, + "loss": 2.5632, + "step": 25049 + }, + { + "epoch": 0.7428164754025443, + "grad_norm": 0.08710894733667374, + "learning_rate": 0.0001575172815234966, + "loss": 2.607, + "step": 25050 + }, + { + "epoch": 0.7428461287548558, + "grad_norm": 0.09536939859390259, + "learning_rate": 0.00015748300285968014, + "loss": 2.6072, + "step": 25051 + }, + { + "epoch": 0.7428757821071672, + "grad_norm": 0.08948720246553421, + "learning_rate": 0.00015744872722891064, + "loss": 2.5725, + "step": 25052 + }, + { + "epoch": 0.7429054354594787, + "grad_norm": 0.09288237988948822, + "learning_rate": 0.0001574144546314913, + "loss": 2.5183, + "step": 25053 + }, + { + "epoch": 0.7429350888117902, + "grad_norm": 0.08872045576572418, + "learning_rate": 0.00015738018506772578, + "loss": 2.6209, + "step": 25054 + }, + { + "epoch": 0.7429647421641017, + "grad_norm": 0.09796442836523056, + "learning_rate": 0.00015734591853791742, + "loss": 2.6022, + "step": 25055 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 0.08810804039239883, + "learning_rate": 0.00015731165504236983, + "loss": 2.5579, + "step": 25056 + }, + { + "epoch": 0.7430240488687246, + "grad_norm": 0.09444756805896759, + "learning_rate": 0.0001572773945813864, + "loss": 2.573, + "step": 25057 + }, + { + "epoch": 0.7430537022210361, + "grad_norm": 0.08891808241605759, + "learning_rate": 0.0001572431371552705, + "loss": 2.5638, + "step": 25058 + }, + { + "epoch": 0.7430833555733476, + "grad_norm": 0.09118819236755371, + "learning_rate": 0.00015720888276432544, + "loss": 2.5527, + "step": 25059 + }, + { + "epoch": 0.743113008925659, + "grad_norm": 0.08300653845071793, + "learning_rate": 0.0001571746314088545, + "loss": 2.5656, + "step": 25060 + }, + { + "epoch": 0.7431426622779705, + "grad_norm": 0.08630294352769852, + "learning_rate": 0.00015714038308916112, + "loss": 2.5804, + "step": 25061 + }, + { + "epoch": 0.743172315630282, + "grad_norm": 0.09360714256763458, + "learning_rate": 0.00015710613780554867, + "loss": 2.5656, + "step": 25062 + }, + { + "epoch": 0.7432019689825935, + "grad_norm": 0.0921703577041626, + "learning_rate": 0.00015707189555832002, + "loss": 2.5614, + "step": 25063 + }, + { + "epoch": 0.7432316223349049, + "grad_norm": 0.08953355252742767, + "learning_rate": 0.00015703765634777862, + "loss": 2.563, + "step": 25064 + }, + { + "epoch": 0.7432612756872164, + "grad_norm": 0.09043686091899872, + "learning_rate": 0.0001570034201742276, + "loss": 2.5628, + "step": 25065 + }, + { + "epoch": 0.7432909290395279, + "grad_norm": 0.08593855798244476, + "learning_rate": 0.0001569691870379702, + "loss": 2.5586, + "step": 25066 + }, + { + "epoch": 0.7433205823918394, + "grad_norm": 0.08863288164138794, + "learning_rate": 0.00015693495693930954, + "loss": 2.6042, + "step": 25067 + }, + { + "epoch": 0.7433502357441508, + "grad_norm": 0.08788643032312393, + "learning_rate": 0.00015690072987854854, + "loss": 2.5763, + "step": 25068 + }, + { + "epoch": 0.7433798890964624, + "grad_norm": 0.08218631893396378, + "learning_rate": 0.00015686650585599067, + "loss": 2.5453, + "step": 25069 + }, + { + "epoch": 0.7434095424487739, + "grad_norm": 0.08764003217220306, + "learning_rate": 0.00015683228487193873, + "loss": 2.5755, + "step": 25070 + }, + { + "epoch": 0.7434391958010853, + "grad_norm": 0.09293445199728012, + "learning_rate": 0.00015679806692669586, + "loss": 2.5665, + "step": 25071 + }, + { + "epoch": 0.7434688491533968, + "grad_norm": 0.09817536920309067, + "learning_rate": 0.00015676385202056497, + "loss": 2.5975, + "step": 25072 + }, + { + "epoch": 0.7434985025057083, + "grad_norm": 0.08282992243766785, + "learning_rate": 0.00015672964015384926, + "loss": 2.5646, + "step": 25073 + }, + { + "epoch": 0.7435281558580198, + "grad_norm": 0.09278018027544022, + "learning_rate": 0.00015669543132685137, + "loss": 2.5524, + "step": 25074 + }, + { + "epoch": 0.7435578092103312, + "grad_norm": 0.09547078609466553, + "learning_rate": 0.0001566612255398744, + "loss": 2.5983, + "step": 25075 + }, + { + "epoch": 0.7435874625626427, + "grad_norm": 0.08511430025100708, + "learning_rate": 0.00015662702279322112, + "loss": 2.5543, + "step": 25076 + }, + { + "epoch": 0.7436171159149542, + "grad_norm": 0.09054911881685257, + "learning_rate": 0.0001565928230871946, + "loss": 2.5623, + "step": 25077 + }, + { + "epoch": 0.7436467692672657, + "grad_norm": 0.0855051577091217, + "learning_rate": 0.00015655862642209746, + "loss": 2.5622, + "step": 25078 + }, + { + "epoch": 0.7436764226195771, + "grad_norm": 0.08800968527793884, + "learning_rate": 0.0001565244327982327, + "loss": 2.5693, + "step": 25079 + }, + { + "epoch": 0.7437060759718886, + "grad_norm": 0.09280723333358765, + "learning_rate": 0.00015649024221590303, + "loss": 2.5736, + "step": 25080 + }, + { + "epoch": 0.7437357293242001, + "grad_norm": 0.09127414226531982, + "learning_rate": 0.00015645605467541108, + "loss": 2.5464, + "step": 25081 + }, + { + "epoch": 0.7437653826765116, + "grad_norm": 0.0833897814154625, + "learning_rate": 0.0001564218701770599, + "loss": 2.5735, + "step": 25082 + }, + { + "epoch": 0.743795036028823, + "grad_norm": 0.09067206084728241, + "learning_rate": 0.00015638768872115218, + "loss": 2.5872, + "step": 25083 + }, + { + "epoch": 0.7438246893811346, + "grad_norm": 0.09279035776853561, + "learning_rate": 0.00015635351030799028, + "loss": 2.5866, + "step": 25084 + }, + { + "epoch": 0.743854342733446, + "grad_norm": 0.08757786452770233, + "learning_rate": 0.00015631933493787703, + "loss": 2.5494, + "step": 25085 + }, + { + "epoch": 0.7438839960857575, + "grad_norm": 0.0936763659119606, + "learning_rate": 0.0001562851626111151, + "loss": 2.5854, + "step": 25086 + }, + { + "epoch": 0.7439136494380689, + "grad_norm": 0.08831340074539185, + "learning_rate": 0.000156250993328007, + "loss": 2.5613, + "step": 25087 + }, + { + "epoch": 0.7439433027903805, + "grad_norm": 0.09647643566131592, + "learning_rate": 0.00015621682708885542, + "loss": 2.5752, + "step": 25088 + }, + { + "epoch": 0.7439729561426919, + "grad_norm": 0.08843548595905304, + "learning_rate": 0.0001561826638939628, + "loss": 2.5527, + "step": 25089 + }, + { + "epoch": 0.7440026094950034, + "grad_norm": 0.09737826883792877, + "learning_rate": 0.0001561485037436317, + "loss": 2.589, + "step": 25090 + }, + { + "epoch": 0.7440322628473149, + "grad_norm": 0.0959489569067955, + "learning_rate": 0.00015611434663816465, + "loss": 2.58, + "step": 25091 + }, + { + "epoch": 0.7440619161996264, + "grad_norm": 0.09061334282159805, + "learning_rate": 0.0001560801925778641, + "loss": 2.5774, + "step": 25092 + }, + { + "epoch": 0.7440915695519379, + "grad_norm": 0.10732052475214005, + "learning_rate": 0.0001560460415630325, + "loss": 2.5717, + "step": 25093 + }, + { + "epoch": 0.7441212229042493, + "grad_norm": 0.0865950882434845, + "learning_rate": 0.00015601189359397215, + "loss": 2.551, + "step": 25094 + }, + { + "epoch": 0.7441508762565608, + "grad_norm": 0.10074262320995331, + "learning_rate": 0.00015597774867098557, + "loss": 2.6017, + "step": 25095 + }, + { + "epoch": 0.7441805296088723, + "grad_norm": 0.09148409217596054, + "learning_rate": 0.00015594360679437508, + "loss": 2.5825, + "step": 25096 + }, + { + "epoch": 0.7442101829611838, + "grad_norm": 0.09913896024227142, + "learning_rate": 0.00015590946796444305, + "loss": 2.5726, + "step": 25097 + }, + { + "epoch": 0.7442398363134952, + "grad_norm": 0.08554334193468094, + "learning_rate": 0.00015587533218149169, + "loss": 2.5547, + "step": 25098 + }, + { + "epoch": 0.7442694896658067, + "grad_norm": 0.100807785987854, + "learning_rate": 0.00015584119944582337, + "loss": 2.5767, + "step": 25099 + }, + { + "epoch": 0.7442991430181182, + "grad_norm": 0.09330208599567413, + "learning_rate": 0.0001558070697577403, + "loss": 2.5845, + "step": 25100 + }, + { + "epoch": 0.7443287963704297, + "grad_norm": 0.10357160866260529, + "learning_rate": 0.00015577294311754463, + "loss": 2.5936, + "step": 25101 + }, + { + "epoch": 0.7443584497227411, + "grad_norm": 0.09739737957715988, + "learning_rate": 0.0001557388195255387, + "loss": 2.5611, + "step": 25102 + }, + { + "epoch": 0.7443881030750527, + "grad_norm": 0.09965075552463531, + "learning_rate": 0.0001557046989820246, + "loss": 2.5973, + "step": 25103 + }, + { + "epoch": 0.7444177564273641, + "grad_norm": 0.0966934785246849, + "learning_rate": 0.00015567058148730452, + "loss": 2.548, + "step": 25104 + }, + { + "epoch": 0.7444474097796756, + "grad_norm": 0.09919574111700058, + "learning_rate": 0.0001556364670416805, + "loss": 2.5944, + "step": 25105 + }, + { + "epoch": 0.744477063131987, + "grad_norm": 0.10062228888273239, + "learning_rate": 0.00015560235564545473, + "loss": 2.5723, + "step": 25106 + }, + { + "epoch": 0.7445067164842986, + "grad_norm": 0.10338286310434341, + "learning_rate": 0.00015556824729892916, + "loss": 2.5668, + "step": 25107 + }, + { + "epoch": 0.74453636983661, + "grad_norm": 0.10597493499517441, + "learning_rate": 0.0001555341420024059, + "loss": 2.6048, + "step": 25108 + }, + { + "epoch": 0.7445660231889215, + "grad_norm": 0.09353963285684586, + "learning_rate": 0.00015550003975618693, + "loss": 2.5646, + "step": 25109 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 0.10083208978176117, + "learning_rate": 0.00015546594056057423, + "loss": 2.577, + "step": 25110 + }, + { + "epoch": 0.7446253298935445, + "grad_norm": 0.10193759948015213, + "learning_rate": 0.00015543184441586978, + "loss": 2.5909, + "step": 25111 + }, + { + "epoch": 0.744654983245856, + "grad_norm": 0.11018706858158112, + "learning_rate": 0.00015539775132237548, + "loss": 2.5606, + "step": 25112 + }, + { + "epoch": 0.7446846365981674, + "grad_norm": 0.0903049111366272, + "learning_rate": 0.00015536366128039331, + "loss": 2.5709, + "step": 25113 + }, + { + "epoch": 0.744714289950479, + "grad_norm": 0.10358871519565582, + "learning_rate": 0.000155329574290225, + "loss": 2.5619, + "step": 25114 + }, + { + "epoch": 0.7447439433027904, + "grad_norm": 0.09792207926511765, + "learning_rate": 0.0001552954903521725, + "loss": 2.5824, + "step": 25115 + }, + { + "epoch": 0.7447735966551019, + "grad_norm": 0.09915774315595627, + "learning_rate": 0.0001552614094665376, + "loss": 2.5562, + "step": 25116 + }, + { + "epoch": 0.7448032500074133, + "grad_norm": 0.09424766898155212, + "learning_rate": 0.00015522733163362223, + "loss": 2.5878, + "step": 25117 + }, + { + "epoch": 0.7448329033597249, + "grad_norm": 0.09808513522148132, + "learning_rate": 0.00015519325685372788, + "loss": 2.5306, + "step": 25118 + }, + { + "epoch": 0.7448625567120363, + "grad_norm": 0.09238338470458984, + "learning_rate": 0.00015515918512715643, + "loss": 2.55, + "step": 25119 + }, + { + "epoch": 0.7448922100643478, + "grad_norm": 0.09605137258768082, + "learning_rate": 0.0001551251164542094, + "loss": 2.5982, + "step": 25120 + }, + { + "epoch": 0.7449218634166592, + "grad_norm": 0.08991704136133194, + "learning_rate": 0.00015509105083518886, + "loss": 2.5533, + "step": 25121 + }, + { + "epoch": 0.7449515167689708, + "grad_norm": 0.09663949906826019, + "learning_rate": 0.00015505698827039628, + "loss": 2.579, + "step": 25122 + }, + { + "epoch": 0.7449811701212822, + "grad_norm": 0.09048686921596527, + "learning_rate": 0.0001550229287601332, + "loss": 2.5211, + "step": 25123 + }, + { + "epoch": 0.7450108234735937, + "grad_norm": 0.09756293147802353, + "learning_rate": 0.00015498887230470137, + "loss": 2.56, + "step": 25124 + }, + { + "epoch": 0.7450404768259051, + "grad_norm": 0.09592749178409576, + "learning_rate": 0.00015495481890440228, + "loss": 2.5657, + "step": 25125 + }, + { + "epoch": 0.7450701301782167, + "grad_norm": 0.0963432788848877, + "learning_rate": 0.00015492076855953745, + "loss": 2.5903, + "step": 25126 + }, + { + "epoch": 0.7450997835305281, + "grad_norm": 0.10398980230093002, + "learning_rate": 0.0001548867212704085, + "loss": 2.5923, + "step": 25127 + }, + { + "epoch": 0.7451294368828396, + "grad_norm": 0.10526735335588455, + "learning_rate": 0.00015485267703731703, + "loss": 2.5403, + "step": 25128 + }, + { + "epoch": 0.745159090235151, + "grad_norm": 0.09272037446498871, + "learning_rate": 0.00015481863586056416, + "loss": 2.5379, + "step": 25129 + }, + { + "epoch": 0.7451887435874626, + "grad_norm": 0.1303655058145523, + "learning_rate": 0.0001547845977404515, + "loss": 2.5854, + "step": 25130 + }, + { + "epoch": 0.745218396939774, + "grad_norm": 0.10380282998085022, + "learning_rate": 0.00015475056267728055, + "loss": 2.5948, + "step": 25131 + }, + { + "epoch": 0.7452480502920855, + "grad_norm": 0.10607368499040604, + "learning_rate": 0.00015471653067135262, + "loss": 2.5565, + "step": 25132 + }, + { + "epoch": 0.745277703644397, + "grad_norm": 0.09646763652563095, + "learning_rate": 0.00015468250172296887, + "loss": 2.5976, + "step": 25133 + }, + { + "epoch": 0.7453073569967085, + "grad_norm": 0.09460461884737015, + "learning_rate": 0.00015464847583243103, + "loss": 2.573, + "step": 25134 + }, + { + "epoch": 0.74533701034902, + "grad_norm": 0.09814580529928207, + "learning_rate": 0.00015461445300004017, + "loss": 2.576, + "step": 25135 + }, + { + "epoch": 0.7453666637013314, + "grad_norm": 0.08897152543067932, + "learning_rate": 0.00015458043322609766, + "loss": 2.5981, + "step": 25136 + }, + { + "epoch": 0.745396317053643, + "grad_norm": 0.09117718040943146, + "learning_rate": 0.00015454641651090469, + "loss": 2.5906, + "step": 25137 + }, + { + "epoch": 0.7454259704059544, + "grad_norm": 0.09765353798866272, + "learning_rate": 0.00015451240285476264, + "loss": 2.5878, + "step": 25138 + }, + { + "epoch": 0.7454556237582659, + "grad_norm": 0.0905885398387909, + "learning_rate": 0.00015447839225797244, + "loss": 2.5314, + "step": 25139 + }, + { + "epoch": 0.7454852771105773, + "grad_norm": 0.09294018894433975, + "learning_rate": 0.00015444438472083538, + "loss": 2.5717, + "step": 25140 + }, + { + "epoch": 0.7455149304628889, + "grad_norm": 0.09316756576299667, + "learning_rate": 0.00015441038024365263, + "loss": 2.5783, + "step": 25141 + }, + { + "epoch": 0.7455445838152003, + "grad_norm": 0.10303393006324768, + "learning_rate": 0.00015437637882672524, + "loss": 2.5778, + "step": 25142 + }, + { + "epoch": 0.7455742371675118, + "grad_norm": 0.09064820408821106, + "learning_rate": 0.00015434238047035438, + "loss": 2.6159, + "step": 25143 + }, + { + "epoch": 0.7456038905198232, + "grad_norm": 0.09924057126045227, + "learning_rate": 0.00015430838517484102, + "loss": 2.5783, + "step": 25144 + }, + { + "epoch": 0.7456335438721348, + "grad_norm": 0.09939868003129959, + "learning_rate": 0.00015427439294048628, + "loss": 2.5411, + "step": 25145 + }, + { + "epoch": 0.7456631972244462, + "grad_norm": 0.10653850436210632, + "learning_rate": 0.00015424040376759096, + "loss": 2.566, + "step": 25146 + }, + { + "epoch": 0.7456928505767577, + "grad_norm": 0.0920669436454773, + "learning_rate": 0.00015420641765645637, + "loss": 2.5496, + "step": 25147 + }, + { + "epoch": 0.7457225039290691, + "grad_norm": 0.09870568662881851, + "learning_rate": 0.00015417243460738334, + "loss": 2.5782, + "step": 25148 + }, + { + "epoch": 0.7457521572813807, + "grad_norm": 0.09621523320674896, + "learning_rate": 0.00015413845462067287, + "loss": 2.5614, + "step": 25149 + }, + { + "epoch": 0.7457818106336921, + "grad_norm": 0.08567361533641815, + "learning_rate": 0.00015410447769662562, + "loss": 2.5636, + "step": 25150 + }, + { + "epoch": 0.7458114639860036, + "grad_norm": 0.10243336111307144, + "learning_rate": 0.00015407050383554262, + "loss": 2.5553, + "step": 25151 + }, + { + "epoch": 0.7458411173383152, + "grad_norm": 0.08890386670827866, + "learning_rate": 0.00015403653303772464, + "loss": 2.5774, + "step": 25152 + }, + { + "epoch": 0.7458707706906266, + "grad_norm": 0.10073116421699524, + "learning_rate": 0.00015400256530347255, + "loss": 2.5749, + "step": 25153 + }, + { + "epoch": 0.7459004240429381, + "grad_norm": 0.09177733957767487, + "learning_rate": 0.00015396860063308714, + "loss": 2.5447, + "step": 25154 + }, + { + "epoch": 0.7459300773952495, + "grad_norm": 0.0885041356086731, + "learning_rate": 0.00015393463902686917, + "loss": 2.5428, + "step": 25155 + }, + { + "epoch": 0.7459597307475611, + "grad_norm": 0.09735162556171417, + "learning_rate": 0.00015390068048511934, + "loss": 2.5697, + "step": 25156 + }, + { + "epoch": 0.7459893840998725, + "grad_norm": 0.08896680921316147, + "learning_rate": 0.00015386672500813847, + "loss": 2.5407, + "step": 25157 + }, + { + "epoch": 0.746019037452184, + "grad_norm": 0.10664574801921844, + "learning_rate": 0.00015383277259622697, + "loss": 2.5623, + "step": 25158 + }, + { + "epoch": 0.7460486908044954, + "grad_norm": 0.09787242114543915, + "learning_rate": 0.00015379882324968597, + "loss": 2.5758, + "step": 25159 + }, + { + "epoch": 0.746078344156807, + "grad_norm": 0.10420882701873779, + "learning_rate": 0.00015376487696881568, + "loss": 2.5907, + "step": 25160 + }, + { + "epoch": 0.7461079975091184, + "grad_norm": 0.09965839236974716, + "learning_rate": 0.00015373093375391683, + "loss": 2.5689, + "step": 25161 + }, + { + "epoch": 0.7461376508614299, + "grad_norm": 0.08931740373373032, + "learning_rate": 0.00015369699360529, + "loss": 2.5985, + "step": 25162 + }, + { + "epoch": 0.7461673042137413, + "grad_norm": 0.09790758043527603, + "learning_rate": 0.00015366305652323575, + "loss": 2.552, + "step": 25163 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 0.1051851436495781, + "learning_rate": 0.00015362912250805455, + "loss": 2.5877, + "step": 25164 + }, + { + "epoch": 0.7462266109183643, + "grad_norm": 0.09421271830797195, + "learning_rate": 0.00015359519156004698, + "loss": 2.5677, + "step": 25165 + }, + { + "epoch": 0.7462562642706758, + "grad_norm": 0.10397399961948395, + "learning_rate": 0.0001535612636795134, + "loss": 2.5368, + "step": 25166 + }, + { + "epoch": 0.7462859176229872, + "grad_norm": 0.1010514348745346, + "learning_rate": 0.0001535273388667543, + "loss": 2.5812, + "step": 25167 + }, + { + "epoch": 0.7463155709752988, + "grad_norm": 0.0958387702703476, + "learning_rate": 0.0001534934171220701, + "loss": 2.5274, + "step": 25168 + }, + { + "epoch": 0.7463452243276102, + "grad_norm": 0.08634031563997269, + "learning_rate": 0.00015345949844576117, + "loss": 2.5715, + "step": 25169 + }, + { + "epoch": 0.7463748776799217, + "grad_norm": 0.1057339459657669, + "learning_rate": 0.00015342558283812786, + "loss": 2.5473, + "step": 25170 + }, + { + "epoch": 0.7464045310322331, + "grad_norm": 0.0856524258852005, + "learning_rate": 0.00015339167029947048, + "loss": 2.5247, + "step": 25171 + }, + { + "epoch": 0.7464341843845447, + "grad_norm": 0.09756694734096527, + "learning_rate": 0.0001533577608300894, + "loss": 2.5577, + "step": 25172 + }, + { + "epoch": 0.7464638377368562, + "grad_norm": 0.08444198966026306, + "learning_rate": 0.0001533238544302848, + "loss": 2.5407, + "step": 25173 + }, + { + "epoch": 0.7464934910891676, + "grad_norm": 0.09104163199663162, + "learning_rate": 0.00015328995110035698, + "loss": 2.537, + "step": 25174 + }, + { + "epoch": 0.7465231444414792, + "grad_norm": 0.08913594484329224, + "learning_rate": 0.00015325605084060616, + "loss": 2.5789, + "step": 25175 + }, + { + "epoch": 0.7465527977937906, + "grad_norm": 0.09293409436941147, + "learning_rate": 0.00015322215365133257, + "loss": 2.5729, + "step": 25176 + }, + { + "epoch": 0.7465824511461021, + "grad_norm": 0.0928325280547142, + "learning_rate": 0.00015318825953283626, + "loss": 2.5571, + "step": 25177 + }, + { + "epoch": 0.7466121044984135, + "grad_norm": 0.08378943055868149, + "learning_rate": 0.00015315436848541752, + "loss": 2.566, + "step": 25178 + }, + { + "epoch": 0.7466417578507251, + "grad_norm": 0.08355612307786942, + "learning_rate": 0.00015312048050937632, + "loss": 2.5479, + "step": 25179 + }, + { + "epoch": 0.7466714112030365, + "grad_norm": 0.08469705283641815, + "learning_rate": 0.00015308659560501288, + "loss": 2.5775, + "step": 25180 + }, + { + "epoch": 0.746701064555348, + "grad_norm": 0.09147009253501892, + "learning_rate": 0.0001530527137726271, + "loss": 2.5827, + "step": 25181 + }, + { + "epoch": 0.7467307179076594, + "grad_norm": 0.08256295323371887, + "learning_rate": 0.0001530188350125191, + "loss": 2.5714, + "step": 25182 + }, + { + "epoch": 0.746760371259971, + "grad_norm": 0.09187406301498413, + "learning_rate": 0.00015298495932498907, + "loss": 2.5668, + "step": 25183 + }, + { + "epoch": 0.7467900246122824, + "grad_norm": 0.08742745965719223, + "learning_rate": 0.00015295108671033647, + "loss": 2.5779, + "step": 25184 + }, + { + "epoch": 0.7468196779645939, + "grad_norm": 0.09447824954986572, + "learning_rate": 0.00015291721716886175, + "loss": 2.5493, + "step": 25185 + }, + { + "epoch": 0.7468493313169053, + "grad_norm": 0.0813845619559288, + "learning_rate": 0.00015288335070086462, + "loss": 2.5552, + "step": 25186 + }, + { + "epoch": 0.7468789846692169, + "grad_norm": 0.08941180258989334, + "learning_rate": 0.000152849487306645, + "loss": 2.5065, + "step": 25187 + }, + { + "epoch": 0.7469086380215283, + "grad_norm": 0.08850917220115662, + "learning_rate": 0.00015281562698650275, + "loss": 2.5881, + "step": 25188 + }, + { + "epoch": 0.7469382913738398, + "grad_norm": 0.09279845654964447, + "learning_rate": 0.0001527817697407377, + "loss": 2.5757, + "step": 25189 + }, + { + "epoch": 0.7469679447261512, + "grad_norm": 0.0944613665342331, + "learning_rate": 0.00015274791556964967, + "loss": 2.5696, + "step": 25190 + }, + { + "epoch": 0.7469975980784628, + "grad_norm": 0.09126916527748108, + "learning_rate": 0.00015271406447353852, + "loss": 2.5659, + "step": 25191 + }, + { + "epoch": 0.7470272514307742, + "grad_norm": 0.098137266933918, + "learning_rate": 0.00015268021645270391, + "loss": 2.5557, + "step": 25192 + }, + { + "epoch": 0.7470569047830857, + "grad_norm": 0.0908157005906105, + "learning_rate": 0.00015264637150744575, + "loss": 2.5957, + "step": 25193 + }, + { + "epoch": 0.7470865581353973, + "grad_norm": 0.08908168971538544, + "learning_rate": 0.00015261252963806344, + "loss": 2.5468, + "step": 25194 + }, + { + "epoch": 0.7471162114877087, + "grad_norm": 0.09047228842973709, + "learning_rate": 0.00015257869084485677, + "loss": 2.567, + "step": 25195 + }, + { + "epoch": 0.7471458648400202, + "grad_norm": 0.08603572100400925, + "learning_rate": 0.0001525448551281255, + "loss": 2.5673, + "step": 25196 + }, + { + "epoch": 0.7471755181923316, + "grad_norm": 0.09015582501888275, + "learning_rate": 0.00015251102248816894, + "loss": 2.5298, + "step": 25197 + }, + { + "epoch": 0.7472051715446432, + "grad_norm": 0.0857517272233963, + "learning_rate": 0.0001524771929252871, + "loss": 2.5721, + "step": 25198 + }, + { + "epoch": 0.7472348248969546, + "grad_norm": 0.09565363079309464, + "learning_rate": 0.00015244336643977942, + "loss": 2.5502, + "step": 25199 + }, + { + "epoch": 0.7472644782492661, + "grad_norm": 0.08997635543346405, + "learning_rate": 0.00015240954303194536, + "loss": 2.5759, + "step": 25200 + }, + { + "epoch": 0.7472941316015775, + "grad_norm": 0.09374440461397171, + "learning_rate": 0.00015237572270208443, + "loss": 2.5436, + "step": 25201 + }, + { + "epoch": 0.7473237849538891, + "grad_norm": 0.09367942810058594, + "learning_rate": 0.0001523419054504962, + "loss": 2.5657, + "step": 25202 + }, + { + "epoch": 0.7473534383062005, + "grad_norm": 0.08978674560785294, + "learning_rate": 0.00015230809127748002, + "loss": 2.5668, + "step": 25203 + }, + { + "epoch": 0.747383091658512, + "grad_norm": 0.09455209225416183, + "learning_rate": 0.00015227428018333556, + "loss": 2.5598, + "step": 25204 + }, + { + "epoch": 0.7474127450108234, + "grad_norm": 0.09246640652418137, + "learning_rate": 0.00015224047216836186, + "loss": 2.5663, + "step": 25205 + }, + { + "epoch": 0.747442398363135, + "grad_norm": 0.09126963466405869, + "learning_rate": 0.00015220666723285848, + "loss": 2.5657, + "step": 25206 + }, + { + "epoch": 0.7474720517154464, + "grad_norm": 0.09952037036418915, + "learning_rate": 0.00015217286537712478, + "loss": 2.5601, + "step": 25207 + }, + { + "epoch": 0.7475017050677579, + "grad_norm": 0.09005614370107651, + "learning_rate": 0.00015213906660146005, + "loss": 2.5672, + "step": 25208 + }, + { + "epoch": 0.7475313584200693, + "grad_norm": 0.09050817042589188, + "learning_rate": 0.00015210527090616356, + "loss": 2.5647, + "step": 25209 + }, + { + "epoch": 0.7475610117723809, + "grad_norm": 0.09550534933805466, + "learning_rate": 0.00015207147829153446, + "loss": 2.5856, + "step": 25210 + }, + { + "epoch": 0.7475906651246923, + "grad_norm": 0.09154480695724487, + "learning_rate": 0.00015203768875787232, + "loss": 2.5596, + "step": 25211 + }, + { + "epoch": 0.7476203184770038, + "grad_norm": 0.09435640275478363, + "learning_rate": 0.00015200390230547616, + "loss": 2.5416, + "step": 25212 + }, + { + "epoch": 0.7476499718293153, + "grad_norm": 0.0929797813296318, + "learning_rate": 0.0001519701189346452, + "loss": 2.5478, + "step": 25213 + }, + { + "epoch": 0.7476796251816268, + "grad_norm": 0.09483825415372849, + "learning_rate": 0.00015193633864567875, + "loss": 2.541, + "step": 25214 + }, + { + "epoch": 0.7477092785339383, + "grad_norm": 0.08924626559019089, + "learning_rate": 0.00015190256143887554, + "loss": 2.5411, + "step": 25215 + }, + { + "epoch": 0.7477389318862497, + "grad_norm": 0.09631895273923874, + "learning_rate": 0.00015186878731453495, + "loss": 2.5796, + "step": 25216 + }, + { + "epoch": 0.7477685852385613, + "grad_norm": 0.08599863946437836, + "learning_rate": 0.00015183501627295598, + "loss": 2.5729, + "step": 25217 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 0.08986403048038483, + "learning_rate": 0.00015180124831443774, + "loss": 2.5832, + "step": 25218 + }, + { + "epoch": 0.7478278919431842, + "grad_norm": 0.08737631887197495, + "learning_rate": 0.00015176748343927922, + "loss": 2.5667, + "step": 25219 + }, + { + "epoch": 0.7478575452954956, + "grad_norm": 0.09083583950996399, + "learning_rate": 0.00015173372164777933, + "loss": 2.533, + "step": 25220 + }, + { + "epoch": 0.7478871986478072, + "grad_norm": 0.08241330832242966, + "learning_rate": 0.0001516999629402372, + "loss": 2.5511, + "step": 25221 + }, + { + "epoch": 0.7479168520001186, + "grad_norm": 0.08816716820001602, + "learning_rate": 0.00015166620731695165, + "loss": 2.583, + "step": 25222 + }, + { + "epoch": 0.7479465053524301, + "grad_norm": 0.0942271277308464, + "learning_rate": 0.00015163245477822142, + "loss": 2.5561, + "step": 25223 + }, + { + "epoch": 0.7479761587047415, + "grad_norm": 0.0862448662519455, + "learning_rate": 0.00015159870532434577, + "loss": 2.5452, + "step": 25224 + }, + { + "epoch": 0.7480058120570531, + "grad_norm": 0.08606749773025513, + "learning_rate": 0.00015156495895562362, + "loss": 2.5591, + "step": 25225 + }, + { + "epoch": 0.7480354654093645, + "grad_norm": 0.08615954220294952, + "learning_rate": 0.00015153121567235335, + "loss": 2.5498, + "step": 25226 + }, + { + "epoch": 0.748065118761676, + "grad_norm": 0.09214746206998825, + "learning_rate": 0.00015149747547483401, + "loss": 2.5785, + "step": 25227 + }, + { + "epoch": 0.7480947721139874, + "grad_norm": 0.09294681251049042, + "learning_rate": 0.0001514637383633643, + "loss": 2.5757, + "step": 25228 + }, + { + "epoch": 0.748124425466299, + "grad_norm": 0.08698210120201111, + "learning_rate": 0.00015143000433824307, + "loss": 2.5769, + "step": 25229 + }, + { + "epoch": 0.7481540788186104, + "grad_norm": 0.09032482653856277, + "learning_rate": 0.0001513962733997689, + "loss": 2.5724, + "step": 25230 + }, + { + "epoch": 0.7481837321709219, + "grad_norm": 0.09095783531665802, + "learning_rate": 0.00015136254554824063, + "loss": 2.5825, + "step": 25231 + }, + { + "epoch": 0.7482133855232334, + "grad_norm": 0.08680888265371323, + "learning_rate": 0.00015132882078395683, + "loss": 2.5884, + "step": 25232 + }, + { + "epoch": 0.7482430388755449, + "grad_norm": 0.0901801809668541, + "learning_rate": 0.00015129509910721616, + "loss": 2.5758, + "step": 25233 + }, + { + "epoch": 0.7482726922278563, + "grad_norm": 0.0936204195022583, + "learning_rate": 0.00015126138051831727, + "loss": 2.5827, + "step": 25234 + }, + { + "epoch": 0.7483023455801678, + "grad_norm": 0.0880909115076065, + "learning_rate": 0.00015122766501755874, + "loss": 2.5992, + "step": 25235 + }, + { + "epoch": 0.7483319989324794, + "grad_norm": 0.09319623559713364, + "learning_rate": 0.00015119395260523912, + "loss": 2.5712, + "step": 25236 + }, + { + "epoch": 0.7483616522847908, + "grad_norm": 0.09318771213293076, + "learning_rate": 0.00015116024328165685, + "loss": 2.5335, + "step": 25237 + }, + { + "epoch": 0.7483913056371023, + "grad_norm": 0.09847023338079453, + "learning_rate": 0.00015112653704711055, + "loss": 2.5822, + "step": 25238 + }, + { + "epoch": 0.7484209589894137, + "grad_norm": 0.0901772752404213, + "learning_rate": 0.0001510928339018986, + "loss": 2.5698, + "step": 25239 + }, + { + "epoch": 0.7484506123417253, + "grad_norm": 0.09016859531402588, + "learning_rate": 0.00015105913384631952, + "loss": 2.5779, + "step": 25240 + }, + { + "epoch": 0.7484802656940367, + "grad_norm": 0.08768399059772491, + "learning_rate": 0.00015102543688067172, + "loss": 2.5668, + "step": 25241 + }, + { + "epoch": 0.7485099190463482, + "grad_norm": 0.08781547844409943, + "learning_rate": 0.0001509917430052536, + "loss": 2.5615, + "step": 25242 + }, + { + "epoch": 0.7485395723986596, + "grad_norm": 0.09979520738124847, + "learning_rate": 0.00015095805222036345, + "loss": 2.603, + "step": 25243 + }, + { + "epoch": 0.7485692257509712, + "grad_norm": 0.08438620716333389, + "learning_rate": 0.00015092436452629964, + "loss": 2.5746, + "step": 25244 + }, + { + "epoch": 0.7485988791032826, + "grad_norm": 0.09199617803096771, + "learning_rate": 0.00015089067992336057, + "loss": 2.5672, + "step": 25245 + }, + { + "epoch": 0.7486285324555941, + "grad_norm": 0.09052908420562744, + "learning_rate": 0.0001508569984118444, + "loss": 2.5589, + "step": 25246 + }, + { + "epoch": 0.7486581858079056, + "grad_norm": 0.08744636178016663, + "learning_rate": 0.00015082331999204945, + "loss": 2.5867, + "step": 25247 + }, + { + "epoch": 0.7486878391602171, + "grad_norm": 0.08401916921138763, + "learning_rate": 0.00015078964466427397, + "loss": 2.5575, + "step": 25248 + }, + { + "epoch": 0.7487174925125285, + "grad_norm": 0.08768844604492188, + "learning_rate": 0.0001507559724288161, + "loss": 2.5648, + "step": 25249 + }, + { + "epoch": 0.74874714586484, + "grad_norm": 0.08251766115427017, + "learning_rate": 0.00015072230328597408, + "loss": 2.5802, + "step": 25250 + }, + { + "epoch": 0.7487767992171515, + "grad_norm": 0.08380818367004395, + "learning_rate": 0.00015068863723604597, + "loss": 2.5609, + "step": 25251 + }, + { + "epoch": 0.748806452569463, + "grad_norm": 0.0967903584241867, + "learning_rate": 0.00015065497427932994, + "loss": 2.5753, + "step": 25252 + }, + { + "epoch": 0.7488361059217744, + "grad_norm": 0.09127770364284515, + "learning_rate": 0.00015062131441612408, + "loss": 2.5895, + "step": 25253 + }, + { + "epoch": 0.7488657592740859, + "grad_norm": 0.09612437337636948, + "learning_rate": 0.00015058765764672645, + "loss": 2.5596, + "step": 25254 + }, + { + "epoch": 0.7488954126263974, + "grad_norm": 0.09260635077953339, + "learning_rate": 0.0001505540039714351, + "loss": 2.5355, + "step": 25255 + }, + { + "epoch": 0.7489250659787089, + "grad_norm": 0.101584292948246, + "learning_rate": 0.00015052035339054804, + "loss": 2.59, + "step": 25256 + }, + { + "epoch": 0.7489547193310204, + "grad_norm": 0.09092529863119125, + "learning_rate": 0.0001504867059043632, + "loss": 2.5786, + "step": 25257 + }, + { + "epoch": 0.7489843726833318, + "grad_norm": 0.09011238068342209, + "learning_rate": 0.0001504530615131786, + "loss": 2.562, + "step": 25258 + }, + { + "epoch": 0.7490140260356434, + "grad_norm": 0.09708767384290695, + "learning_rate": 0.00015041942021729228, + "loss": 2.5651, + "step": 25259 + }, + { + "epoch": 0.7490436793879548, + "grad_norm": 0.08906316757202148, + "learning_rate": 0.00015038578201700186, + "loss": 2.5826, + "step": 25260 + }, + { + "epoch": 0.7490733327402663, + "grad_norm": 0.09684649854898453, + "learning_rate": 0.00015035214691260534, + "loss": 2.5292, + "step": 25261 + }, + { + "epoch": 0.7491029860925777, + "grad_norm": 0.08920856565237045, + "learning_rate": 0.0001503185149044004, + "loss": 2.5701, + "step": 25262 + }, + { + "epoch": 0.7491326394448893, + "grad_norm": 0.09118612110614777, + "learning_rate": 0.00015028488599268524, + "loss": 2.5862, + "step": 25263 + }, + { + "epoch": 0.7491622927972007, + "grad_norm": 0.089706189930439, + "learning_rate": 0.0001502512601777574, + "loss": 2.5355, + "step": 25264 + }, + { + "epoch": 0.7491919461495122, + "grad_norm": 0.09193895012140274, + "learning_rate": 0.00015021763745991468, + "loss": 2.5899, + "step": 25265 + }, + { + "epoch": 0.7492215995018237, + "grad_norm": 0.09898421168327332, + "learning_rate": 0.00015018401783945483, + "loss": 2.5346, + "step": 25266 + }, + { + "epoch": 0.7492512528541352, + "grad_norm": 0.08214376866817474, + "learning_rate": 0.00015015040131667557, + "loss": 2.5372, + "step": 25267 + }, + { + "epoch": 0.7492809062064466, + "grad_norm": 0.08772140741348267, + "learning_rate": 0.00015011678789187454, + "loss": 2.5448, + "step": 25268 + }, + { + "epoch": 0.7493105595587581, + "grad_norm": 0.08980577439069748, + "learning_rate": 0.00015008317756534957, + "loss": 2.5871, + "step": 25269 + }, + { + "epoch": 0.7493402129110696, + "grad_norm": 0.0889432430267334, + "learning_rate": 0.00015004957033739797, + "loss": 2.565, + "step": 25270 + }, + { + "epoch": 0.7493698662633811, + "grad_norm": 0.08998427540063858, + "learning_rate": 0.0001500159662083175, + "loss": 2.6043, + "step": 25271 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 0.09745314717292786, + "learning_rate": 0.0001499823651784057, + "loss": 2.5575, + "step": 25272 + }, + { + "epoch": 0.749429172968004, + "grad_norm": 0.08520591259002686, + "learning_rate": 0.00014994876724796013, + "loss": 2.5246, + "step": 25273 + }, + { + "epoch": 0.7494588263203155, + "grad_norm": 0.09310107678174973, + "learning_rate": 0.00014991517241727832, + "loss": 2.5528, + "step": 25274 + }, + { + "epoch": 0.749488479672627, + "grad_norm": 0.09944286197423935, + "learning_rate": 0.00014988158068665757, + "loss": 2.57, + "step": 25275 + }, + { + "epoch": 0.7495181330249384, + "grad_norm": 0.09860619902610779, + "learning_rate": 0.00014984799205639576, + "loss": 2.5696, + "step": 25276 + }, + { + "epoch": 0.74954778637725, + "grad_norm": 0.09095018357038498, + "learning_rate": 0.00014981440652679003, + "loss": 2.6218, + "step": 25277 + }, + { + "epoch": 0.7495774397295615, + "grad_norm": 0.0962536484003067, + "learning_rate": 0.0001497808240981378, + "loss": 2.5653, + "step": 25278 + }, + { + "epoch": 0.7496070930818729, + "grad_norm": 0.09464981406927109, + "learning_rate": 0.00014974724477073654, + "loss": 2.5896, + "step": 25279 + }, + { + "epoch": 0.7496367464341844, + "grad_norm": 0.08686719089746475, + "learning_rate": 0.00014971366854488373, + "loss": 2.5588, + "step": 25280 + }, + { + "epoch": 0.7496663997864959, + "grad_norm": 0.0852646753191948, + "learning_rate": 0.0001496800954208763, + "loss": 2.5726, + "step": 25281 + }, + { + "epoch": 0.7496960531388074, + "grad_norm": 0.08772248774766922, + "learning_rate": 0.00014964652539901176, + "loss": 2.585, + "step": 25282 + }, + { + "epoch": 0.7497257064911188, + "grad_norm": 0.0870722085237503, + "learning_rate": 0.0001496129584795874, + "loss": 2.577, + "step": 25283 + }, + { + "epoch": 0.7497553598434303, + "grad_norm": 0.08710190653800964, + "learning_rate": 0.00014957939466290045, + "loss": 2.5456, + "step": 25284 + }, + { + "epoch": 0.7497850131957418, + "grad_norm": 0.08849480748176575, + "learning_rate": 0.00014954583394924804, + "loss": 2.5794, + "step": 25285 + }, + { + "epoch": 0.7498146665480533, + "grad_norm": 0.09439188987016678, + "learning_rate": 0.00014951227633892744, + "loss": 2.5743, + "step": 25286 + }, + { + "epoch": 0.7498443199003647, + "grad_norm": 0.09382644295692444, + "learning_rate": 0.00014947872183223586, + "loss": 2.5566, + "step": 25287 + }, + { + "epoch": 0.7498739732526762, + "grad_norm": 0.08661787956953049, + "learning_rate": 0.00014944517042947015, + "loss": 2.5652, + "step": 25288 + }, + { + "epoch": 0.7499036266049877, + "grad_norm": 0.08597606420516968, + "learning_rate": 0.00014941162213092778, + "loss": 2.586, + "step": 25289 + }, + { + "epoch": 0.7499332799572992, + "grad_norm": 0.08547376841306686, + "learning_rate": 0.00014937807693690585, + "loss": 2.5459, + "step": 25290 + }, + { + "epoch": 0.7499629333096106, + "grad_norm": 0.0962742418050766, + "learning_rate": 0.0001493445348477011, + "loss": 2.5629, + "step": 25291 + }, + { + "epoch": 0.7499925866619221, + "grad_norm": 0.08715225011110306, + "learning_rate": 0.0001493109958636107, + "loss": 2.562, + "step": 25292 + }, + { + "epoch": 0.7500222400142336, + "grad_norm": 0.09329838305711746, + "learning_rate": 0.0001492774599849316, + "loss": 2.5626, + "step": 25293 + }, + { + "epoch": 0.7500518933665451, + "grad_norm": 0.09572100639343262, + "learning_rate": 0.00014924392721196071, + "loss": 2.54, + "step": 25294 + }, + { + "epoch": 0.7500815467188565, + "grad_norm": 0.10814044624567032, + "learning_rate": 0.00014921039754499515, + "loss": 2.6394, + "step": 25295 + }, + { + "epoch": 0.750111200071168, + "grad_norm": 0.09001032263040543, + "learning_rate": 0.0001491768709843317, + "loss": 2.5574, + "step": 25296 + }, + { + "epoch": 0.7501408534234795, + "grad_norm": 0.09607087075710297, + "learning_rate": 0.00014914334753026725, + "loss": 2.5945, + "step": 25297 + }, + { + "epoch": 0.750170506775791, + "grad_norm": 0.08272363245487213, + "learning_rate": 0.00014910982718309874, + "loss": 2.5562, + "step": 25298 + }, + { + "epoch": 0.7502001601281025, + "grad_norm": 0.09165684878826141, + "learning_rate": 0.00014907630994312295, + "loss": 2.5297, + "step": 25299 + }, + { + "epoch": 0.750229813480414, + "grad_norm": 0.09103981405496597, + "learning_rate": 0.00014904279581063663, + "loss": 2.5451, + "step": 25300 + }, + { + "epoch": 0.7502594668327255, + "grad_norm": 0.09229321777820587, + "learning_rate": 0.00014900928478593656, + "loss": 2.5907, + "step": 25301 + }, + { + "epoch": 0.7502891201850369, + "grad_norm": 0.09049136936664581, + "learning_rate": 0.00014897577686931956, + "loss": 2.5694, + "step": 25302 + }, + { + "epoch": 0.7503187735373484, + "grad_norm": 0.09455950558185577, + "learning_rate": 0.0001489422720610823, + "loss": 2.5635, + "step": 25303 + }, + { + "epoch": 0.7503484268896599, + "grad_norm": 0.08668268471956253, + "learning_rate": 0.0001489087703615215, + "loss": 2.5371, + "step": 25304 + }, + { + "epoch": 0.7503780802419714, + "grad_norm": 0.10043493658304214, + "learning_rate": 0.0001488752717709338, + "loss": 2.545, + "step": 25305 + }, + { + "epoch": 0.7504077335942828, + "grad_norm": 0.09021655470132828, + "learning_rate": 0.00014884177628961582, + "loss": 2.541, + "step": 25306 + }, + { + "epoch": 0.7504373869465943, + "grad_norm": 0.10194132477045059, + "learning_rate": 0.00014880828391786416, + "loss": 2.5643, + "step": 25307 + }, + { + "epoch": 0.7504670402989058, + "grad_norm": 0.09845372289419174, + "learning_rate": 0.00014877479465597544, + "loss": 2.5534, + "step": 25308 + }, + { + "epoch": 0.7504966936512173, + "grad_norm": 0.08990102261304855, + "learning_rate": 0.00014874130850424622, + "loss": 2.5293, + "step": 25309 + }, + { + "epoch": 0.7505263470035287, + "grad_norm": 0.09406294673681259, + "learning_rate": 0.00014870782546297302, + "loss": 2.5617, + "step": 25310 + }, + { + "epoch": 0.7505560003558402, + "grad_norm": 0.08861745893955231, + "learning_rate": 0.0001486743455324523, + "loss": 2.6284, + "step": 25311 + }, + { + "epoch": 0.7505856537081517, + "grad_norm": 0.10224927961826324, + "learning_rate": 0.00014864086871298055, + "loss": 2.6009, + "step": 25312 + }, + { + "epoch": 0.7506153070604632, + "grad_norm": 0.0779908075928688, + "learning_rate": 0.00014860739500485427, + "loss": 2.5875, + "step": 25313 + }, + { + "epoch": 0.7506449604127746, + "grad_norm": 0.10322535037994385, + "learning_rate": 0.00014857392440836975, + "loss": 2.6048, + "step": 25314 + }, + { + "epoch": 0.7506746137650862, + "grad_norm": 0.0983007624745369, + "learning_rate": 0.0001485404569238235, + "loss": 2.564, + "step": 25315 + }, + { + "epoch": 0.7507042671173976, + "grad_norm": 0.0955294817686081, + "learning_rate": 0.00014850699255151178, + "loss": 2.5726, + "step": 25316 + }, + { + "epoch": 0.7507339204697091, + "grad_norm": 0.09443164616823196, + "learning_rate": 0.00014847353129173103, + "loss": 2.5784, + "step": 25317 + }, + { + "epoch": 0.7507635738220205, + "grad_norm": 0.09728144854307175, + "learning_rate": 0.0001484400731447775, + "loss": 2.5654, + "step": 25318 + }, + { + "epoch": 0.7507932271743321, + "grad_norm": 0.09444284439086914, + "learning_rate": 0.00014840661811094742, + "loss": 2.5234, + "step": 25319 + }, + { + "epoch": 0.7508228805266436, + "grad_norm": 0.09843206405639648, + "learning_rate": 0.00014837316619053715, + "loss": 2.5595, + "step": 25320 + }, + { + "epoch": 0.750852533878955, + "grad_norm": 0.09360764175653458, + "learning_rate": 0.00014833971738384278, + "loss": 2.5909, + "step": 25321 + }, + { + "epoch": 0.7508821872312665, + "grad_norm": 0.09278211742639542, + "learning_rate": 0.00014830627169116063, + "loss": 2.5713, + "step": 25322 + }, + { + "epoch": 0.750911840583578, + "grad_norm": 0.08919660747051239, + "learning_rate": 0.00014827282911278677, + "loss": 2.5544, + "step": 25323 + }, + { + "epoch": 0.7509414939358895, + "grad_norm": 0.09845125675201416, + "learning_rate": 0.00014823938964901739, + "loss": 2.5666, + "step": 25324 + }, + { + "epoch": 0.7509711472882009, + "grad_norm": 0.10145247727632523, + "learning_rate": 0.00014820595330014875, + "loss": 2.5817, + "step": 25325 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 0.08408518135547638, + "learning_rate": 0.00014817252006647664, + "loss": 2.5525, + "step": 25326 + }, + { + "epoch": 0.7510304539928239, + "grad_norm": 0.10416785627603531, + "learning_rate": 0.00014813908994829711, + "loss": 2.5596, + "step": 25327 + }, + { + "epoch": 0.7510601073451354, + "grad_norm": 0.0837511494755745, + "learning_rate": 0.00014810566294590644, + "loss": 2.5881, + "step": 25328 + }, + { + "epoch": 0.7510897606974468, + "grad_norm": 0.10884897410869598, + "learning_rate": 0.00014807223905960055, + "loss": 2.5896, + "step": 25329 + }, + { + "epoch": 0.7511194140497583, + "grad_norm": 0.08870620280504227, + "learning_rate": 0.0001480388182896754, + "loss": 2.5947, + "step": 25330 + }, + { + "epoch": 0.7511490674020698, + "grad_norm": 0.08706735819578171, + "learning_rate": 0.0001480054006364269, + "loss": 2.542, + "step": 25331 + }, + { + "epoch": 0.7511787207543813, + "grad_norm": 0.0974942222237587, + "learning_rate": 0.00014797198610015105, + "loss": 2.5523, + "step": 25332 + }, + { + "epoch": 0.7512083741066927, + "grad_norm": 0.08871432393789291, + "learning_rate": 0.00014793857468114362, + "loss": 2.5431, + "step": 25333 + }, + { + "epoch": 0.7512380274590043, + "grad_norm": 0.08702301234006882, + "learning_rate": 0.00014790516637970064, + "loss": 2.5746, + "step": 25334 + }, + { + "epoch": 0.7512676808113157, + "grad_norm": 0.09486591815948486, + "learning_rate": 0.00014787176119611796, + "loss": 2.5484, + "step": 25335 + }, + { + "epoch": 0.7512973341636272, + "grad_norm": 0.09321312606334686, + "learning_rate": 0.00014783835913069105, + "loss": 2.5999, + "step": 25336 + }, + { + "epoch": 0.7513269875159386, + "grad_norm": 0.08764717727899551, + "learning_rate": 0.00014780496018371598, + "loss": 2.5733, + "step": 25337 + }, + { + "epoch": 0.7513566408682502, + "grad_norm": 0.09282597154378891, + "learning_rate": 0.00014777156435548844, + "loss": 2.5872, + "step": 25338 + }, + { + "epoch": 0.7513862942205616, + "grad_norm": 0.0800769031047821, + "learning_rate": 0.00014773817164630398, + "loss": 2.5731, + "step": 25339 + }, + { + "epoch": 0.7514159475728731, + "grad_norm": 0.10158071666955948, + "learning_rate": 0.00014770478205645865, + "loss": 2.5602, + "step": 25340 + }, + { + "epoch": 0.7514456009251846, + "grad_norm": 0.08173182606697083, + "learning_rate": 0.000147671395586248, + "loss": 2.5787, + "step": 25341 + }, + { + "epoch": 0.7514752542774961, + "grad_norm": 0.09349414706230164, + "learning_rate": 0.00014763801223596751, + "loss": 2.5767, + "step": 25342 + }, + { + "epoch": 0.7515049076298076, + "grad_norm": 0.08484809845685959, + "learning_rate": 0.00014760463200591295, + "loss": 2.6125, + "step": 25343 + }, + { + "epoch": 0.751534560982119, + "grad_norm": 0.09346452355384827, + "learning_rate": 0.0001475712548963799, + "loss": 2.6019, + "step": 25344 + }, + { + "epoch": 0.7515642143344305, + "grad_norm": 0.08973809331655502, + "learning_rate": 0.00014753788090766395, + "loss": 2.5742, + "step": 25345 + }, + { + "epoch": 0.751593867686742, + "grad_norm": 0.08862320333719254, + "learning_rate": 0.0001475045100400605, + "loss": 2.5359, + "step": 25346 + }, + { + "epoch": 0.7516235210390535, + "grad_norm": 0.09036560356616974, + "learning_rate": 0.00014747114229386504, + "loss": 2.5686, + "step": 25347 + }, + { + "epoch": 0.7516531743913649, + "grad_norm": 0.09806708991527557, + "learning_rate": 0.00014743777766937322, + "loss": 2.5789, + "step": 25348 + }, + { + "epoch": 0.7516828277436765, + "grad_norm": 0.08420413732528687, + "learning_rate": 0.00014740441616688032, + "loss": 2.5726, + "step": 25349 + }, + { + "epoch": 0.7517124810959879, + "grad_norm": 0.09208254516124725, + "learning_rate": 0.00014737105778668186, + "loss": 2.5683, + "step": 25350 + }, + { + "epoch": 0.7517421344482994, + "grad_norm": 0.07830687612295151, + "learning_rate": 0.00014733770252907318, + "loss": 2.593, + "step": 25351 + }, + { + "epoch": 0.7517717878006108, + "grad_norm": 0.09266764670610428, + "learning_rate": 0.0001473043503943496, + "loss": 2.6095, + "step": 25352 + }, + { + "epoch": 0.7518014411529224, + "grad_norm": 0.0863770917057991, + "learning_rate": 0.00014727100138280664, + "loss": 2.5578, + "step": 25353 + }, + { + "epoch": 0.7518310945052338, + "grad_norm": 0.08817211538553238, + "learning_rate": 0.0001472376554947395, + "loss": 2.5385, + "step": 25354 + }, + { + "epoch": 0.7518607478575453, + "grad_norm": 0.0909515768289566, + "learning_rate": 0.00014720431273044344, + "loss": 2.596, + "step": 25355 + }, + { + "epoch": 0.7518904012098567, + "grad_norm": 0.0843234583735466, + "learning_rate": 0.000147170973090214, + "loss": 2.5554, + "step": 25356 + }, + { + "epoch": 0.7519200545621683, + "grad_norm": 0.09680227190256119, + "learning_rate": 0.00014713763657434593, + "loss": 2.5663, + "step": 25357 + }, + { + "epoch": 0.7519497079144797, + "grad_norm": 0.07957462221384048, + "learning_rate": 0.00014710430318313466, + "loss": 2.5842, + "step": 25358 + }, + { + "epoch": 0.7519793612667912, + "grad_norm": 0.10047710686922073, + "learning_rate": 0.00014707097291687539, + "loss": 2.5797, + "step": 25359 + }, + { + "epoch": 0.7520090146191027, + "grad_norm": 0.08420411497354507, + "learning_rate": 0.0001470376457758632, + "loss": 2.5761, + "step": 25360 + }, + { + "epoch": 0.7520386679714142, + "grad_norm": 0.08570285141468048, + "learning_rate": 0.00014700432176039324, + "loss": 2.6004, + "step": 25361 + }, + { + "epoch": 0.7520683213237257, + "grad_norm": 0.09257245808839798, + "learning_rate": 0.00014697100087076066, + "loss": 2.5688, + "step": 25362 + }, + { + "epoch": 0.7520979746760371, + "grad_norm": 0.08878915756940842, + "learning_rate": 0.0001469376831072604, + "loss": 2.5785, + "step": 25363 + }, + { + "epoch": 0.7521276280283486, + "grad_norm": 0.09103982895612717, + "learning_rate": 0.00014690436847018758, + "loss": 2.5973, + "step": 25364 + }, + { + "epoch": 0.7521572813806601, + "grad_norm": 0.09231465309858322, + "learning_rate": 0.000146871056959837, + "loss": 2.6047, + "step": 25365 + }, + { + "epoch": 0.7521869347329716, + "grad_norm": 0.08679304271936417, + "learning_rate": 0.00014683774857650417, + "loss": 2.5961, + "step": 25366 + }, + { + "epoch": 0.752216588085283, + "grad_norm": 0.0936623364686966, + "learning_rate": 0.00014680444332048354, + "loss": 2.5932, + "step": 25367 + }, + { + "epoch": 0.7522462414375946, + "grad_norm": 0.09451843798160553, + "learning_rate": 0.00014677114119207018, + "loss": 2.55, + "step": 25368 + }, + { + "epoch": 0.752275894789906, + "grad_norm": 0.08794569224119186, + "learning_rate": 0.000146737842191559, + "loss": 2.5835, + "step": 25369 + }, + { + "epoch": 0.7523055481422175, + "grad_norm": 0.09632856398820877, + "learning_rate": 0.0001467045463192449, + "loss": 2.5811, + "step": 25370 + }, + { + "epoch": 0.7523352014945289, + "grad_norm": 0.08629002422094345, + "learning_rate": 0.00014667125357542267, + "loss": 2.5661, + "step": 25371 + }, + { + "epoch": 0.7523648548468405, + "grad_norm": 0.0827236995100975, + "learning_rate": 0.0001466379639603871, + "loss": 2.5796, + "step": 25372 + }, + { + "epoch": 0.7523945081991519, + "grad_norm": 0.0924588069319725, + "learning_rate": 0.00014660467747443302, + "loss": 2.6, + "step": 25373 + }, + { + "epoch": 0.7524241615514634, + "grad_norm": 0.08441494405269623, + "learning_rate": 0.0001465713941178552, + "loss": 2.5595, + "step": 25374 + }, + { + "epoch": 0.7524538149037748, + "grad_norm": 0.09377045929431915, + "learning_rate": 0.00014653811389094833, + "loss": 2.5602, + "step": 25375 + }, + { + "epoch": 0.7524834682560864, + "grad_norm": 0.08249148726463318, + "learning_rate": 0.00014650483679400713, + "loss": 2.5949, + "step": 25376 + }, + { + "epoch": 0.7525131216083978, + "grad_norm": 0.08586692810058594, + "learning_rate": 0.00014647156282732632, + "loss": 2.5791, + "step": 25377 + }, + { + "epoch": 0.7525427749607093, + "grad_norm": 0.08606737852096558, + "learning_rate": 0.00014643829199120046, + "loss": 2.578, + "step": 25378 + }, + { + "epoch": 0.7525724283130207, + "grad_norm": 0.08763427287340164, + "learning_rate": 0.00014640502428592422, + "loss": 2.5602, + "step": 25379 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 0.08786497265100479, + "learning_rate": 0.00014637175971179219, + "loss": 2.5705, + "step": 25380 + }, + { + "epoch": 0.7526317350176438, + "grad_norm": 0.09732969105243683, + "learning_rate": 0.00014633849826909895, + "loss": 2.6105, + "step": 25381 + }, + { + "epoch": 0.7526613883699552, + "grad_norm": 0.09599167853593826, + "learning_rate": 0.000146305239958139, + "loss": 2.5514, + "step": 25382 + }, + { + "epoch": 0.7526910417222668, + "grad_norm": 0.09450509399175644, + "learning_rate": 0.00014627198477920684, + "loss": 2.5667, + "step": 25383 + }, + { + "epoch": 0.7527206950745782, + "grad_norm": 0.09110178053379059, + "learning_rate": 0.00014623873273259696, + "loss": 2.5667, + "step": 25384 + }, + { + "epoch": 0.7527503484268897, + "grad_norm": 0.08973738551139832, + "learning_rate": 0.0001462054838186039, + "loss": 2.5794, + "step": 25385 + }, + { + "epoch": 0.7527800017792011, + "grad_norm": 0.09426719695329666, + "learning_rate": 0.0001461722380375219, + "loss": 2.5822, + "step": 25386 + }, + { + "epoch": 0.7528096551315127, + "grad_norm": 0.08237846195697784, + "learning_rate": 0.00014613899538964555, + "loss": 2.5914, + "step": 25387 + }, + { + "epoch": 0.7528393084838241, + "grad_norm": 0.0958094671368599, + "learning_rate": 0.00014610575587526909, + "loss": 2.5472, + "step": 25388 + }, + { + "epoch": 0.7528689618361356, + "grad_norm": 0.10221284627914429, + "learning_rate": 0.00014607251949468693, + "loss": 2.5733, + "step": 25389 + }, + { + "epoch": 0.752898615188447, + "grad_norm": 0.09923453629016876, + "learning_rate": 0.0001460392862481935, + "loss": 2.5638, + "step": 25390 + }, + { + "epoch": 0.7529282685407586, + "grad_norm": 0.089885413646698, + "learning_rate": 0.00014600605613608264, + "loss": 2.5702, + "step": 25391 + }, + { + "epoch": 0.75295792189307, + "grad_norm": 0.08966071903705597, + "learning_rate": 0.00014597282915864907, + "loss": 2.565, + "step": 25392 + }, + { + "epoch": 0.7529875752453815, + "grad_norm": 0.0891144871711731, + "learning_rate": 0.0001459396053161869, + "loss": 2.5455, + "step": 25393 + }, + { + "epoch": 0.7530172285976929, + "grad_norm": 0.08848172426223755, + "learning_rate": 0.00014590638460899025, + "loss": 2.5756, + "step": 25394 + }, + { + "epoch": 0.7530468819500045, + "grad_norm": 0.08773213624954224, + "learning_rate": 0.00014587316703735338, + "loss": 2.5612, + "step": 25395 + }, + { + "epoch": 0.7530765353023159, + "grad_norm": 0.08846253901720047, + "learning_rate": 0.0001458399526015704, + "loss": 2.5893, + "step": 25396 + }, + { + "epoch": 0.7531061886546274, + "grad_norm": 0.08678675442934036, + "learning_rate": 0.0001458067413019354, + "loss": 2.5691, + "step": 25397 + }, + { + "epoch": 0.7531358420069388, + "grad_norm": 0.09165330231189728, + "learning_rate": 0.00014577353313874252, + "loss": 2.6121, + "step": 25398 + }, + { + "epoch": 0.7531654953592504, + "grad_norm": 0.09678750485181808, + "learning_rate": 0.00014574032811228582, + "loss": 2.5552, + "step": 25399 + }, + { + "epoch": 0.7531951487115618, + "grad_norm": 0.0846298560500145, + "learning_rate": 0.0001457071262228593, + "loss": 2.5514, + "step": 25400 + }, + { + "epoch": 0.7532248020638733, + "grad_norm": 0.08662626892328262, + "learning_rate": 0.00014567392747075715, + "loss": 2.5987, + "step": 25401 + }, + { + "epoch": 0.7532544554161849, + "grad_norm": 0.09129731357097626, + "learning_rate": 0.00014564073185627303, + "loss": 2.5506, + "step": 25402 + }, + { + "epoch": 0.7532841087684963, + "grad_norm": 0.0911235585808754, + "learning_rate": 0.00014560753937970105, + "loss": 2.6025, + "step": 25403 + }, + { + "epoch": 0.7533137621208078, + "grad_norm": 0.08694247901439667, + "learning_rate": 0.00014557435004133501, + "loss": 2.5759, + "step": 25404 + }, + { + "epoch": 0.7533434154731192, + "grad_norm": 0.09601300209760666, + "learning_rate": 0.0001455411638414691, + "loss": 2.6119, + "step": 25405 + }, + { + "epoch": 0.7533730688254308, + "grad_norm": 0.1019536480307579, + "learning_rate": 0.00014550798078039702, + "loss": 2.5566, + "step": 25406 + }, + { + "epoch": 0.7534027221777422, + "grad_norm": 0.09058752655982971, + "learning_rate": 0.00014547480085841258, + "loss": 2.5394, + "step": 25407 + }, + { + "epoch": 0.7534323755300537, + "grad_norm": 0.08619669079780579, + "learning_rate": 0.0001454416240758097, + "loss": 2.5905, + "step": 25408 + }, + { + "epoch": 0.7534620288823651, + "grad_norm": 0.08585811406373978, + "learning_rate": 0.00014540845043288202, + "loss": 2.553, + "step": 25409 + }, + { + "epoch": 0.7534916822346767, + "grad_norm": 0.08720019459724426, + "learning_rate": 0.0001453752799299234, + "loss": 2.5615, + "step": 25410 + }, + { + "epoch": 0.7535213355869881, + "grad_norm": 0.08589904755353928, + "learning_rate": 0.00014534211256722775, + "loss": 2.5819, + "step": 25411 + }, + { + "epoch": 0.7535509889392996, + "grad_norm": 0.08344399929046631, + "learning_rate": 0.00014530894834508836, + "loss": 2.5563, + "step": 25412 + }, + { + "epoch": 0.753580642291611, + "grad_norm": 0.09640645235776901, + "learning_rate": 0.00014527578726379915, + "loss": 2.6118, + "step": 25413 + }, + { + "epoch": 0.7536102956439226, + "grad_norm": 0.08907470852136612, + "learning_rate": 0.00014524262932365367, + "loss": 2.5564, + "step": 25414 + }, + { + "epoch": 0.753639948996234, + "grad_norm": 0.08585163205862045, + "learning_rate": 0.00014520947452494566, + "loss": 2.5778, + "step": 25415 + }, + { + "epoch": 0.7536696023485455, + "grad_norm": 0.09102233499288559, + "learning_rate": 0.00014517632286796861, + "loss": 2.5717, + "step": 25416 + }, + { + "epoch": 0.7536992557008569, + "grad_norm": 0.08153894543647766, + "learning_rate": 0.000145143174353016, + "loss": 2.5648, + "step": 25417 + }, + { + "epoch": 0.7537289090531685, + "grad_norm": 0.09450431168079376, + "learning_rate": 0.00014511002898038168, + "loss": 2.5856, + "step": 25418 + }, + { + "epoch": 0.7537585624054799, + "grad_norm": 0.08820819854736328, + "learning_rate": 0.0001450768867503589, + "loss": 2.5732, + "step": 25419 + }, + { + "epoch": 0.7537882157577914, + "grad_norm": 0.09340522438287735, + "learning_rate": 0.00014504374766324124, + "loss": 2.5733, + "step": 25420 + }, + { + "epoch": 0.7538178691101028, + "grad_norm": 0.08340941369533539, + "learning_rate": 0.00014501061171932227, + "loss": 2.5734, + "step": 25421 + }, + { + "epoch": 0.7538475224624144, + "grad_norm": 0.09544214606285095, + "learning_rate": 0.00014497747891889512, + "loss": 2.57, + "step": 25422 + }, + { + "epoch": 0.7538771758147259, + "grad_norm": 0.08729323744773865, + "learning_rate": 0.00014494434926225336, + "loss": 2.5793, + "step": 25423 + }, + { + "epoch": 0.7539068291670373, + "grad_norm": 0.10483945906162262, + "learning_rate": 0.0001449112227496903, + "loss": 2.5791, + "step": 25424 + }, + { + "epoch": 0.7539364825193489, + "grad_norm": 0.09127698093652725, + "learning_rate": 0.00014487809938149932, + "loss": 2.554, + "step": 25425 + }, + { + "epoch": 0.7539661358716603, + "grad_norm": 0.09822620451450348, + "learning_rate": 0.00014484497915797373, + "loss": 2.5574, + "step": 25426 + }, + { + "epoch": 0.7539957892239718, + "grad_norm": 0.08824162930250168, + "learning_rate": 0.00014481186207940678, + "loss": 2.561, + "step": 25427 + }, + { + "epoch": 0.7540254425762832, + "grad_norm": 0.09525518119335175, + "learning_rate": 0.00014477874814609182, + "loss": 2.5553, + "step": 25428 + }, + { + "epoch": 0.7540550959285948, + "grad_norm": 0.08617842942476273, + "learning_rate": 0.00014474563735832196, + "loss": 2.5633, + "step": 25429 + }, + { + "epoch": 0.7540847492809062, + "grad_norm": 0.09617955982685089, + "learning_rate": 0.00014471252971639032, + "loss": 2.5845, + "step": 25430 + }, + { + "epoch": 0.7541144026332177, + "grad_norm": 0.08313864469528198, + "learning_rate": 0.00014467942522059036, + "loss": 2.5664, + "step": 25431 + }, + { + "epoch": 0.7541440559855291, + "grad_norm": 0.09226255118846893, + "learning_rate": 0.00014464632387121528, + "loss": 2.5514, + "step": 25432 + }, + { + "epoch": 0.7541737093378407, + "grad_norm": 0.09527426958084106, + "learning_rate": 0.00014461322566855777, + "loss": 2.5916, + "step": 25433 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 0.09739340096712112, + "learning_rate": 0.0001445801306129112, + "loss": 2.5939, + "step": 25434 + }, + { + "epoch": 0.7542330160424636, + "grad_norm": 0.08744863420724869, + "learning_rate": 0.0001445470387045686, + "loss": 2.5487, + "step": 25435 + }, + { + "epoch": 0.754262669394775, + "grad_norm": 0.0947585478425026, + "learning_rate": 0.00014451394994382293, + "loss": 2.5819, + "step": 25436 + }, + { + "epoch": 0.7542923227470866, + "grad_norm": 0.09092039614915848, + "learning_rate": 0.0001444808643309673, + "loss": 2.5631, + "step": 25437 + }, + { + "epoch": 0.754321976099398, + "grad_norm": 0.09173016995191574, + "learning_rate": 0.0001444477818662946, + "loss": 2.5557, + "step": 25438 + }, + { + "epoch": 0.7543516294517095, + "grad_norm": 0.09247619658708572, + "learning_rate": 0.00014441470255009787, + "loss": 2.5553, + "step": 25439 + }, + { + "epoch": 0.754381282804021, + "grad_norm": 0.0817669928073883, + "learning_rate": 0.00014438162638266995, + "loss": 2.5413, + "step": 25440 + }, + { + "epoch": 0.7544109361563325, + "grad_norm": 0.08565358072519302, + "learning_rate": 0.00014434855336430374, + "loss": 2.5515, + "step": 25441 + }, + { + "epoch": 0.7544405895086439, + "grad_norm": 0.09036694467067719, + "learning_rate": 0.00014431548349529217, + "loss": 2.5566, + "step": 25442 + }, + { + "epoch": 0.7544702428609554, + "grad_norm": 0.09425380825996399, + "learning_rate": 0.00014428241677592806, + "loss": 2.561, + "step": 25443 + }, + { + "epoch": 0.754499896213267, + "grad_norm": 0.08773627132177353, + "learning_rate": 0.00014424935320650419, + "loss": 2.5646, + "step": 25444 + }, + { + "epoch": 0.7545295495655784, + "grad_norm": 0.09287062287330627, + "learning_rate": 0.00014421629278731334, + "loss": 2.5809, + "step": 25445 + }, + { + "epoch": 0.7545592029178899, + "grad_norm": 0.08586204797029495, + "learning_rate": 0.00014418323551864832, + "loss": 2.5643, + "step": 25446 + }, + { + "epoch": 0.7545888562702013, + "grad_norm": 0.09930936992168427, + "learning_rate": 0.00014415018140080183, + "loss": 2.5363, + "step": 25447 + }, + { + "epoch": 0.7546185096225129, + "grad_norm": 0.08725957572460175, + "learning_rate": 0.00014411713043406655, + "loss": 2.5277, + "step": 25448 + }, + { + "epoch": 0.7546481629748243, + "grad_norm": 0.09614089876413345, + "learning_rate": 0.00014408408261873517, + "loss": 2.5712, + "step": 25449 + }, + { + "epoch": 0.7546778163271358, + "grad_norm": 0.0925142914056778, + "learning_rate": 0.00014405103795510032, + "loss": 2.5793, + "step": 25450 + }, + { + "epoch": 0.7547074696794472, + "grad_norm": 0.09480351209640503, + "learning_rate": 0.00014401799644345466, + "loss": 2.5569, + "step": 25451 + }, + { + "epoch": 0.7547371230317588, + "grad_norm": 0.08965631574392319, + "learning_rate": 0.00014398495808409068, + "loss": 2.5724, + "step": 25452 + }, + { + "epoch": 0.7547667763840702, + "grad_norm": 0.08787599205970764, + "learning_rate": 0.00014395192287730107, + "loss": 2.5741, + "step": 25453 + }, + { + "epoch": 0.7547964297363817, + "grad_norm": 0.09282270818948746, + "learning_rate": 0.00014391889082337827, + "loss": 2.5837, + "step": 25454 + }, + { + "epoch": 0.7548260830886931, + "grad_norm": 0.09366228431463242, + "learning_rate": 0.00014388586192261483, + "loss": 2.5747, + "step": 25455 + }, + { + "epoch": 0.7548557364410047, + "grad_norm": 0.08528000861406326, + "learning_rate": 0.0001438528361753032, + "loss": 2.5496, + "step": 25456 + }, + { + "epoch": 0.7548853897933161, + "grad_norm": 0.09123648703098297, + "learning_rate": 0.00014381981358173578, + "loss": 2.5675, + "step": 25457 + }, + { + "epoch": 0.7549150431456276, + "grad_norm": 0.08776766806840897, + "learning_rate": 0.00014378679414220514, + "loss": 2.5636, + "step": 25458 + }, + { + "epoch": 0.754944696497939, + "grad_norm": 0.08550653606653214, + "learning_rate": 0.00014375377785700354, + "loss": 2.5754, + "step": 25459 + }, + { + "epoch": 0.7549743498502506, + "grad_norm": 0.10034479200839996, + "learning_rate": 0.00014372076472642337, + "loss": 2.5931, + "step": 25460 + }, + { + "epoch": 0.755004003202562, + "grad_norm": 0.09070680290460587, + "learning_rate": 0.00014368775475075702, + "loss": 2.5502, + "step": 25461 + }, + { + "epoch": 0.7550336565548735, + "grad_norm": 0.08855140209197998, + "learning_rate": 0.0001436547479302967, + "loss": 2.5538, + "step": 25462 + }, + { + "epoch": 0.755063309907185, + "grad_norm": 0.08428311347961426, + "learning_rate": 0.0001436217442653348, + "loss": 2.5598, + "step": 25463 + }, + { + "epoch": 0.7550929632594965, + "grad_norm": 0.08424544334411621, + "learning_rate": 0.00014358874375616353, + "loss": 2.5507, + "step": 25464 + }, + { + "epoch": 0.755122616611808, + "grad_norm": 0.08202875405550003, + "learning_rate": 0.0001435557464030751, + "loss": 2.5559, + "step": 25465 + }, + { + "epoch": 0.7551522699641194, + "grad_norm": 0.08420991897583008, + "learning_rate": 0.0001435227522063619, + "loss": 2.541, + "step": 25466 + }, + { + "epoch": 0.755181923316431, + "grad_norm": 0.09036539494991302, + "learning_rate": 0.00014348976116631575, + "loss": 2.5534, + "step": 25467 + }, + { + "epoch": 0.7552115766687424, + "grad_norm": 0.09378470480442047, + "learning_rate": 0.00014345677328322893, + "loss": 2.5839, + "step": 25468 + }, + { + "epoch": 0.7552412300210539, + "grad_norm": 0.09287139028310776, + "learning_rate": 0.0001434237885573934, + "loss": 2.569, + "step": 25469 + }, + { + "epoch": 0.7552708833733653, + "grad_norm": 0.10024403035640717, + "learning_rate": 0.00014339080698910168, + "loss": 2.5526, + "step": 25470 + }, + { + "epoch": 0.7553005367256769, + "grad_norm": 0.08884060382843018, + "learning_rate": 0.0001433578285786455, + "loss": 2.5837, + "step": 25471 + }, + { + "epoch": 0.7553301900779883, + "grad_norm": 0.09549760818481445, + "learning_rate": 0.000143324853326317, + "loss": 2.5624, + "step": 25472 + }, + { + "epoch": 0.7553598434302998, + "grad_norm": 0.0994444340467453, + "learning_rate": 0.0001432918812324081, + "loss": 2.5558, + "step": 25473 + }, + { + "epoch": 0.7553894967826112, + "grad_norm": 0.10180135071277618, + "learning_rate": 0.0001432589122972109, + "loss": 2.568, + "step": 25474 + }, + { + "epoch": 0.7554191501349228, + "grad_norm": 0.08628183603286743, + "learning_rate": 0.00014322594652101716, + "loss": 2.5762, + "step": 25475 + }, + { + "epoch": 0.7554488034872342, + "grad_norm": 0.09959670156240463, + "learning_rate": 0.000143192983904119, + "loss": 2.5721, + "step": 25476 + }, + { + "epoch": 0.7554784568395457, + "grad_norm": 0.0935790166258812, + "learning_rate": 0.00014316002444680833, + "loss": 2.5866, + "step": 25477 + }, + { + "epoch": 0.7555081101918572, + "grad_norm": 0.10363283008337021, + "learning_rate": 0.00014312706814937677, + "loss": 2.559, + "step": 25478 + }, + { + "epoch": 0.7555377635441687, + "grad_norm": 0.08769497275352478, + "learning_rate": 0.00014309411501211623, + "loss": 2.5251, + "step": 25479 + }, + { + "epoch": 0.7555674168964801, + "grad_norm": 0.10422062873840332, + "learning_rate": 0.00014306116503531857, + "loss": 2.6115, + "step": 25480 + }, + { + "epoch": 0.7555970702487916, + "grad_norm": 0.09646188467741013, + "learning_rate": 0.0001430282182192756, + "loss": 2.5817, + "step": 25481 + }, + { + "epoch": 0.7556267236011031, + "grad_norm": 0.08777303248643875, + "learning_rate": 0.0001429952745642788, + "loss": 2.5647, + "step": 25482 + }, + { + "epoch": 0.7556563769534146, + "grad_norm": 0.09381183236837387, + "learning_rate": 0.00014296233407062032, + "loss": 2.5866, + "step": 25483 + }, + { + "epoch": 0.755686030305726, + "grad_norm": 0.09392789751291275, + "learning_rate": 0.00014292939673859169, + "loss": 2.5624, + "step": 25484 + }, + { + "epoch": 0.7557156836580375, + "grad_norm": 0.09600425511598587, + "learning_rate": 0.0001428964625684845, + "loss": 2.5776, + "step": 25485 + }, + { + "epoch": 0.7557453370103491, + "grad_norm": 0.09044045209884644, + "learning_rate": 0.00014286353156059046, + "loss": 2.5409, + "step": 25486 + }, + { + "epoch": 0.7557749903626605, + "grad_norm": 0.09455616772174835, + "learning_rate": 0.0001428306037152013, + "loss": 2.5646, + "step": 25487 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 0.08907735347747803, + "learning_rate": 0.00014279767903260825, + "loss": 2.5658, + "step": 25488 + }, + { + "epoch": 0.7558342970672834, + "grad_norm": 0.08864708244800568, + "learning_rate": 0.00014276475751310314, + "loss": 2.5527, + "step": 25489 + }, + { + "epoch": 0.755863950419595, + "grad_norm": 0.09039973467588425, + "learning_rate": 0.00014273183915697736, + "loss": 2.5755, + "step": 25490 + }, + { + "epoch": 0.7558936037719064, + "grad_norm": 0.0885106772184372, + "learning_rate": 0.0001426989239645225, + "loss": 2.5536, + "step": 25491 + }, + { + "epoch": 0.7559232571242179, + "grad_norm": 0.08859504759311676, + "learning_rate": 0.00014266601193602997, + "loss": 2.5113, + "step": 25492 + }, + { + "epoch": 0.7559529104765293, + "grad_norm": 0.08623352646827698, + "learning_rate": 0.00014263310307179128, + "loss": 2.5643, + "step": 25493 + }, + { + "epoch": 0.7559825638288409, + "grad_norm": 0.10067540407180786, + "learning_rate": 0.00014260019737209777, + "loss": 2.5892, + "step": 25494 + }, + { + "epoch": 0.7560122171811523, + "grad_norm": 0.08779127895832062, + "learning_rate": 0.0001425672948372407, + "loss": 2.5944, + "step": 25495 + }, + { + "epoch": 0.7560418705334638, + "grad_norm": 0.09526750445365906, + "learning_rate": 0.00014253439546751178, + "loss": 2.5689, + "step": 25496 + }, + { + "epoch": 0.7560715238857753, + "grad_norm": 0.09027960896492004, + "learning_rate": 0.0001425014992632021, + "loss": 2.5606, + "step": 25497 + }, + { + "epoch": 0.7561011772380868, + "grad_norm": 0.10214763134717941, + "learning_rate": 0.00014246860622460318, + "loss": 2.577, + "step": 25498 + }, + { + "epoch": 0.7561308305903982, + "grad_norm": 0.08823956549167633, + "learning_rate": 0.00014243571635200598, + "loss": 2.5626, + "step": 25499 + }, + { + "epoch": 0.7561604839427097, + "grad_norm": 0.09733106195926666, + "learning_rate": 0.00014240282964570188, + "loss": 2.5757, + "step": 25500 + }, + { + "epoch": 0.7561901372950212, + "grad_norm": 0.10100214928388596, + "learning_rate": 0.0001423699461059821, + "loss": 2.5818, + "step": 25501 + }, + { + "epoch": 0.7562197906473327, + "grad_norm": 0.09592583775520325, + "learning_rate": 0.00014233706573313788, + "loss": 2.5883, + "step": 25502 + }, + { + "epoch": 0.7562494439996441, + "grad_norm": 0.1144130527973175, + "learning_rate": 0.00014230418852746024, + "loss": 2.5364, + "step": 25503 + }, + { + "epoch": 0.7562790973519556, + "grad_norm": 0.09425187110900879, + "learning_rate": 0.00014227131448924047, + "loss": 2.6059, + "step": 25504 + }, + { + "epoch": 0.7563087507042671, + "grad_norm": 0.10156761109828949, + "learning_rate": 0.00014223844361876964, + "loss": 2.5719, + "step": 25505 + }, + { + "epoch": 0.7563384040565786, + "grad_norm": 0.11169085651636124, + "learning_rate": 0.00014220557591633875, + "loss": 2.571, + "step": 25506 + }, + { + "epoch": 0.7563680574088901, + "grad_norm": 0.09363849461078644, + "learning_rate": 0.00014217271138223893, + "loss": 2.5788, + "step": 25507 + }, + { + "epoch": 0.7563977107612015, + "grad_norm": 0.1088932678103447, + "learning_rate": 0.0001421398500167611, + "loss": 2.5668, + "step": 25508 + }, + { + "epoch": 0.7564273641135131, + "grad_norm": 0.09016379714012146, + "learning_rate": 0.00014210699182019642, + "loss": 2.5521, + "step": 25509 + }, + { + "epoch": 0.7564570174658245, + "grad_norm": 0.10988801717758179, + "learning_rate": 0.0001420741367928357, + "loss": 2.571, + "step": 25510 + }, + { + "epoch": 0.756486670818136, + "grad_norm": 0.09140834212303162, + "learning_rate": 0.00014204128493497, + "loss": 2.5333, + "step": 25511 + }, + { + "epoch": 0.7565163241704475, + "grad_norm": 0.10098235309123993, + "learning_rate": 0.0001420084362468901, + "loss": 2.5579, + "step": 25512 + }, + { + "epoch": 0.756545977522759, + "grad_norm": 0.0931902676820755, + "learning_rate": 0.00014197559072888694, + "loss": 2.5489, + "step": 25513 + }, + { + "epoch": 0.7565756308750704, + "grad_norm": 0.09914635866880417, + "learning_rate": 0.00014194274838125144, + "loss": 2.5545, + "step": 25514 + }, + { + "epoch": 0.7566052842273819, + "grad_norm": 0.08535052090883255, + "learning_rate": 0.00014190990920427433, + "loss": 2.5626, + "step": 25515 + }, + { + "epoch": 0.7566349375796934, + "grad_norm": 0.09906047582626343, + "learning_rate": 0.0001418770731982464, + "loss": 2.5623, + "step": 25516 + }, + { + "epoch": 0.7566645909320049, + "grad_norm": 0.09749215841293335, + "learning_rate": 0.00014184424036345849, + "loss": 2.6176, + "step": 25517 + }, + { + "epoch": 0.7566942442843163, + "grad_norm": 0.08615130186080933, + "learning_rate": 0.00014181141070020132, + "loss": 2.5704, + "step": 25518 + }, + { + "epoch": 0.7567238976366278, + "grad_norm": 0.09884075075387955, + "learning_rate": 0.0001417785842087656, + "loss": 2.5586, + "step": 25519 + }, + { + "epoch": 0.7567535509889393, + "grad_norm": 0.09166882187128067, + "learning_rate": 0.00014174576088944195, + "loss": 2.5628, + "step": 25520 + }, + { + "epoch": 0.7567832043412508, + "grad_norm": 0.10130736231803894, + "learning_rate": 0.0001417129407425211, + "loss": 2.5539, + "step": 25521 + }, + { + "epoch": 0.7568128576935622, + "grad_norm": 0.09641489386558533, + "learning_rate": 0.0001416801237682937, + "loss": 2.6035, + "step": 25522 + }, + { + "epoch": 0.7568425110458737, + "grad_norm": 0.0975048616528511, + "learning_rate": 0.00014164730996705027, + "loss": 2.5292, + "step": 25523 + }, + { + "epoch": 0.7568721643981852, + "grad_norm": 0.08987338095903397, + "learning_rate": 0.00014161449933908143, + "loss": 2.5614, + "step": 25524 + }, + { + "epoch": 0.7569018177504967, + "grad_norm": 0.10325123369693756, + "learning_rate": 0.00014158169188467772, + "loss": 2.5856, + "step": 25525 + }, + { + "epoch": 0.7569314711028081, + "grad_norm": 0.088132344186306, + "learning_rate": 0.0001415488876041297, + "loss": 2.5791, + "step": 25526 + }, + { + "epoch": 0.7569611244551196, + "grad_norm": 0.10294613987207413, + "learning_rate": 0.00014151608649772774, + "loss": 2.587, + "step": 25527 + }, + { + "epoch": 0.7569907778074312, + "grad_norm": 0.0943792536854744, + "learning_rate": 0.00014148328856576238, + "loss": 2.5781, + "step": 25528 + }, + { + "epoch": 0.7570204311597426, + "grad_norm": 0.09689389914274216, + "learning_rate": 0.00014145049380852404, + "loss": 2.564, + "step": 25529 + }, + { + "epoch": 0.7570500845120541, + "grad_norm": 0.0956917256116867, + "learning_rate": 0.0001414177022263031, + "loss": 2.5602, + "step": 25530 + }, + { + "epoch": 0.7570797378643656, + "grad_norm": 0.09236034005880356, + "learning_rate": 0.00014138491381939, + "loss": 2.5271, + "step": 25531 + }, + { + "epoch": 0.7571093912166771, + "grad_norm": 0.09551854431629181, + "learning_rate": 0.00014135212858807516, + "loss": 2.5858, + "step": 25532 + }, + { + "epoch": 0.7571390445689885, + "grad_norm": 0.09561334550380707, + "learning_rate": 0.00014131934653264854, + "loss": 2.5684, + "step": 25533 + }, + { + "epoch": 0.7571686979213, + "grad_norm": 0.09169834107160568, + "learning_rate": 0.00014128656765340075, + "loss": 2.5701, + "step": 25534 + }, + { + "epoch": 0.7571983512736115, + "grad_norm": 0.09663017094135284, + "learning_rate": 0.00014125379195062204, + "loss": 2.5626, + "step": 25535 + }, + { + "epoch": 0.757228004625923, + "grad_norm": 0.08923555910587311, + "learning_rate": 0.00014122101942460252, + "loss": 2.5544, + "step": 25536 + }, + { + "epoch": 0.7572576579782344, + "grad_norm": 0.0916975662112236, + "learning_rate": 0.00014118825007563253, + "loss": 2.5728, + "step": 25537 + }, + { + "epoch": 0.7572873113305459, + "grad_norm": 0.08636520802974701, + "learning_rate": 0.00014115548390400206, + "loss": 2.5381, + "step": 25538 + }, + { + "epoch": 0.7573169646828574, + "grad_norm": 0.09765438735485077, + "learning_rate": 0.0001411227209100015, + "loss": 2.565, + "step": 25539 + }, + { + "epoch": 0.7573466180351689, + "grad_norm": 0.0924375131726265, + "learning_rate": 0.00014108996109392076, + "loss": 2.574, + "step": 25540 + }, + { + "epoch": 0.7573762713874803, + "grad_norm": 0.09427458792924881, + "learning_rate": 0.0001410572044560501, + "loss": 2.5581, + "step": 25541 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 0.08379855006933212, + "learning_rate": 0.00014102445099667955, + "loss": 2.5529, + "step": 25542 + }, + { + "epoch": 0.7574355780921033, + "grad_norm": 0.08383574336767197, + "learning_rate": 0.000140991700716099, + "loss": 2.5519, + "step": 25543 + }, + { + "epoch": 0.7574652314444148, + "grad_norm": 0.09807726740837097, + "learning_rate": 0.00014095895361459858, + "loss": 2.5819, + "step": 25544 + }, + { + "epoch": 0.7574948847967262, + "grad_norm": 0.0887477844953537, + "learning_rate": 0.0001409262096924683, + "loss": 2.5598, + "step": 25545 + }, + { + "epoch": 0.7575245381490378, + "grad_norm": 0.08999871462583542, + "learning_rate": 0.00014089346894999782, + "loss": 2.5935, + "step": 25546 + }, + { + "epoch": 0.7575541915013492, + "grad_norm": 0.09171886742115021, + "learning_rate": 0.00014086073138747752, + "loss": 2.5336, + "step": 25547 + }, + { + "epoch": 0.7575838448536607, + "grad_norm": 0.0966498851776123, + "learning_rate": 0.00014082799700519704, + "loss": 2.5729, + "step": 25548 + }, + { + "epoch": 0.7576134982059722, + "grad_norm": 0.0966595783829689, + "learning_rate": 0.00014079526580344637, + "loss": 2.5788, + "step": 25549 + }, + { + "epoch": 0.7576431515582837, + "grad_norm": 0.1001284122467041, + "learning_rate": 0.00014076253778251525, + "loss": 2.5313, + "step": 25550 + }, + { + "epoch": 0.7576728049105952, + "grad_norm": 0.08914686739444733, + "learning_rate": 0.0001407298129426935, + "loss": 2.5688, + "step": 25551 + }, + { + "epoch": 0.7577024582629066, + "grad_norm": 0.08517652004957199, + "learning_rate": 0.00014069709128427095, + "loss": 2.5518, + "step": 25552 + }, + { + "epoch": 0.7577321116152181, + "grad_norm": 0.09105037152767181, + "learning_rate": 0.00014066437280753748, + "loss": 2.5903, + "step": 25553 + }, + { + "epoch": 0.7577617649675296, + "grad_norm": 0.0961245745420456, + "learning_rate": 0.0001406316575127825, + "loss": 2.5707, + "step": 25554 + }, + { + "epoch": 0.7577914183198411, + "grad_norm": 0.08665893226861954, + "learning_rate": 0.0001405989454002959, + "loss": 2.5316, + "step": 25555 + }, + { + "epoch": 0.7578210716721525, + "grad_norm": 0.09557072818279266, + "learning_rate": 0.00014056623647036725, + "loss": 2.557, + "step": 25556 + }, + { + "epoch": 0.757850725024464, + "grad_norm": 0.09012343734502792, + "learning_rate": 0.00014053353072328634, + "loss": 2.515, + "step": 25557 + }, + { + "epoch": 0.7578803783767755, + "grad_norm": 0.08253118395805359, + "learning_rate": 0.00014050082815934272, + "loss": 2.5693, + "step": 25558 + }, + { + "epoch": 0.757910031729087, + "grad_norm": 0.07986029237508774, + "learning_rate": 0.0001404681287788258, + "loss": 2.603, + "step": 25559 + }, + { + "epoch": 0.7579396850813984, + "grad_norm": 0.08808750659227371, + "learning_rate": 0.00014043543258202552, + "loss": 2.5662, + "step": 25560 + }, + { + "epoch": 0.75796933843371, + "grad_norm": 0.08535830676555634, + "learning_rate": 0.00014040273956923116, + "loss": 2.5735, + "step": 25561 + }, + { + "epoch": 0.7579989917860214, + "grad_norm": 0.09771475195884705, + "learning_rate": 0.00014037004974073224, + "loss": 2.5584, + "step": 25562 + }, + { + "epoch": 0.7580286451383329, + "grad_norm": 0.08650662004947662, + "learning_rate": 0.00014033736309681844, + "loss": 2.5439, + "step": 25563 + }, + { + "epoch": 0.7580582984906443, + "grad_norm": 0.10132185369729996, + "learning_rate": 0.00014030467963777887, + "loss": 2.5833, + "step": 25564 + }, + { + "epoch": 0.7580879518429559, + "grad_norm": 0.08414842933416367, + "learning_rate": 0.0001402719993639031, + "loss": 2.616, + "step": 25565 + }, + { + "epoch": 0.7581176051952673, + "grad_norm": 0.08907831460237503, + "learning_rate": 0.00014023932227548054, + "loss": 2.5693, + "step": 25566 + }, + { + "epoch": 0.7581472585475788, + "grad_norm": 0.08893833309412003, + "learning_rate": 0.0001402066483728005, + "loss": 2.5704, + "step": 25567 + }, + { + "epoch": 0.7581769118998903, + "grad_norm": 0.09621698409318924, + "learning_rate": 0.00014017397765615235, + "loss": 2.5909, + "step": 25568 + }, + { + "epoch": 0.7582065652522018, + "grad_norm": 0.08882877230644226, + "learning_rate": 0.00014014131012582542, + "loss": 2.5837, + "step": 25569 + }, + { + "epoch": 0.7582362186045133, + "grad_norm": 0.09343786537647247, + "learning_rate": 0.00014010864578210897, + "loss": 2.6251, + "step": 25570 + }, + { + "epoch": 0.7582658719568247, + "grad_norm": 0.07829154282808304, + "learning_rate": 0.0001400759846252922, + "loss": 2.5341, + "step": 25571 + }, + { + "epoch": 0.7582955253091362, + "grad_norm": 0.0989801287651062, + "learning_rate": 0.00014004332665566423, + "loss": 2.5964, + "step": 25572 + }, + { + "epoch": 0.7583251786614477, + "grad_norm": 0.081424281001091, + "learning_rate": 0.00014001067187351452, + "loss": 2.5539, + "step": 25573 + }, + { + "epoch": 0.7583548320137592, + "grad_norm": 0.10001399368047714, + "learning_rate": 0.00013997802027913226, + "loss": 2.5684, + "step": 25574 + }, + { + "epoch": 0.7583844853660706, + "grad_norm": 0.08826921135187149, + "learning_rate": 0.00013994537187280633, + "loss": 2.5877, + "step": 25575 + }, + { + "epoch": 0.7584141387183821, + "grad_norm": 0.10725343972444534, + "learning_rate": 0.00013991272665482584, + "loss": 2.5552, + "step": 25576 + }, + { + "epoch": 0.7584437920706936, + "grad_norm": 0.09678076207637787, + "learning_rate": 0.00013988008462548, + "loss": 2.5849, + "step": 25577 + }, + { + "epoch": 0.7584734454230051, + "grad_norm": 0.09853863716125488, + "learning_rate": 0.00013984744578505787, + "loss": 2.5678, + "step": 25578 + }, + { + "epoch": 0.7585030987753165, + "grad_norm": 0.10003583133220673, + "learning_rate": 0.0001398148101338484, + "loss": 2.5208, + "step": 25579 + }, + { + "epoch": 0.758532752127628, + "grad_norm": 0.08871673047542572, + "learning_rate": 0.0001397821776721406, + "loss": 2.5439, + "step": 25580 + }, + { + "epoch": 0.7585624054799395, + "grad_norm": 0.09583177417516708, + "learning_rate": 0.00013974954840022342, + "loss": 2.5584, + "step": 25581 + }, + { + "epoch": 0.758592058832251, + "grad_norm": 0.09397635608911514, + "learning_rate": 0.00013971692231838585, + "loss": 2.5827, + "step": 25582 + }, + { + "epoch": 0.7586217121845624, + "grad_norm": 0.08952168375253677, + "learning_rate": 0.0001396842994269168, + "loss": 2.6086, + "step": 25583 + }, + { + "epoch": 0.758651365536874, + "grad_norm": 0.08639267832040787, + "learning_rate": 0.0001396516797261051, + "loss": 2.5603, + "step": 25584 + }, + { + "epoch": 0.7586810188891854, + "grad_norm": 0.09525668621063232, + "learning_rate": 0.00013961906321623962, + "loss": 2.5657, + "step": 25585 + }, + { + "epoch": 0.7587106722414969, + "grad_norm": 0.10562140494585037, + "learning_rate": 0.0001395864498976092, + "loss": 2.5836, + "step": 25586 + }, + { + "epoch": 0.7587403255938083, + "grad_norm": 0.09234318882226944, + "learning_rate": 0.00013955383977050267, + "loss": 2.5655, + "step": 25587 + }, + { + "epoch": 0.7587699789461199, + "grad_norm": 0.10820984840393066, + "learning_rate": 0.00013952123283520872, + "loss": 2.5367, + "step": 25588 + }, + { + "epoch": 0.7587996322984314, + "grad_norm": 0.08654128015041351, + "learning_rate": 0.00013948862909201614, + "loss": 2.5755, + "step": 25589 + }, + { + "epoch": 0.7588292856507428, + "grad_norm": 0.10945013165473938, + "learning_rate": 0.00013945602854121365, + "loss": 2.5532, + "step": 25590 + }, + { + "epoch": 0.7588589390030543, + "grad_norm": 0.09516768902540207, + "learning_rate": 0.00013942343118308987, + "loss": 2.5443, + "step": 25591 + }, + { + "epoch": 0.7588885923553658, + "grad_norm": 0.11325950175523758, + "learning_rate": 0.00013939083701793354, + "loss": 2.575, + "step": 25592 + }, + { + "epoch": 0.7589182457076773, + "grad_norm": 0.0931411162018776, + "learning_rate": 0.0001393582460460332, + "loss": 2.5703, + "step": 25593 + }, + { + "epoch": 0.7589478990599887, + "grad_norm": 0.11131563037633896, + "learning_rate": 0.00013932565826767752, + "loss": 2.556, + "step": 25594 + }, + { + "epoch": 0.7589775524123002, + "grad_norm": 0.10804551094770432, + "learning_rate": 0.0001392930736831551, + "loss": 2.5736, + "step": 25595 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 0.09101960062980652, + "learning_rate": 0.00013926049229275435, + "loss": 2.5765, + "step": 25596 + }, + { + "epoch": 0.7590368591169232, + "grad_norm": 0.10418585687875748, + "learning_rate": 0.000139227914096764, + "loss": 2.582, + "step": 25597 + }, + { + "epoch": 0.7590665124692346, + "grad_norm": 0.0940190851688385, + "learning_rate": 0.00013919533909547216, + "loss": 2.5403, + "step": 25598 + }, + { + "epoch": 0.7590961658215462, + "grad_norm": 0.09795084595680237, + "learning_rate": 0.00013916276728916766, + "loss": 2.519, + "step": 25599 + }, + { + "epoch": 0.7591258191738576, + "grad_norm": 0.09945471584796906, + "learning_rate": 0.00013913019867813875, + "loss": 2.564, + "step": 25600 + }, + { + "epoch": 0.7591554725261691, + "grad_norm": 0.09586169570684433, + "learning_rate": 0.00013909763326267388, + "loss": 2.6001, + "step": 25601 + }, + { + "epoch": 0.7591851258784805, + "grad_norm": 0.09949810057878494, + "learning_rate": 0.00013906507104306142, + "loss": 2.5452, + "step": 25602 + }, + { + "epoch": 0.7592147792307921, + "grad_norm": 0.09787974506616592, + "learning_rate": 0.00013903251201958976, + "loss": 2.5309, + "step": 25603 + }, + { + "epoch": 0.7592444325831035, + "grad_norm": 0.10853186994791031, + "learning_rate": 0.00013899995619254713, + "loss": 2.5473, + "step": 25604 + }, + { + "epoch": 0.759274085935415, + "grad_norm": 0.10183630883693695, + "learning_rate": 0.00013896740356222187, + "loss": 2.5625, + "step": 25605 + }, + { + "epoch": 0.7593037392877264, + "grad_norm": 0.08857406675815582, + "learning_rate": 0.00013893485412890216, + "loss": 2.5126, + "step": 25606 + }, + { + "epoch": 0.759333392640038, + "grad_norm": 0.10398297011852264, + "learning_rate": 0.0001389023078928764, + "loss": 2.5874, + "step": 25607 + }, + { + "epoch": 0.7593630459923494, + "grad_norm": 0.0806921124458313, + "learning_rate": 0.00013886976485443276, + "loss": 2.6055, + "step": 25608 + }, + { + "epoch": 0.7593926993446609, + "grad_norm": 0.10730860382318497, + "learning_rate": 0.00013883722501385922, + "loss": 2.6104, + "step": 25609 + }, + { + "epoch": 0.7594223526969724, + "grad_norm": 0.08785761147737503, + "learning_rate": 0.000138804688371444, + "loss": 2.5761, + "step": 25610 + }, + { + "epoch": 0.7594520060492839, + "grad_norm": 0.09739288687705994, + "learning_rate": 0.00013877215492747512, + "loss": 2.5791, + "step": 25611 + }, + { + "epoch": 0.7594816594015954, + "grad_norm": 0.1022578775882721, + "learning_rate": 0.000138739624682241, + "loss": 2.5792, + "step": 25612 + }, + { + "epoch": 0.7595113127539068, + "grad_norm": 0.0835883766412735, + "learning_rate": 0.0001387070976360295, + "loss": 2.5253, + "step": 25613 + }, + { + "epoch": 0.7595409661062184, + "grad_norm": 0.09684355556964874, + "learning_rate": 0.00013867457378912863, + "loss": 2.5632, + "step": 25614 + }, + { + "epoch": 0.7595706194585298, + "grad_norm": 0.08521497249603271, + "learning_rate": 0.0001386420531418265, + "loss": 2.6107, + "step": 25615 + }, + { + "epoch": 0.7596002728108413, + "grad_norm": 0.09250960499048233, + "learning_rate": 0.00013860953569441094, + "loss": 2.5532, + "step": 25616 + }, + { + "epoch": 0.7596299261631527, + "grad_norm": 0.08040913194417953, + "learning_rate": 0.00013857702144717005, + "loss": 2.5668, + "step": 25617 + }, + { + "epoch": 0.7596595795154643, + "grad_norm": 0.0920446589589119, + "learning_rate": 0.00013854451040039173, + "loss": 2.6062, + "step": 25618 + }, + { + "epoch": 0.7596892328677757, + "grad_norm": 0.0857977569103241, + "learning_rate": 0.00013851200255436373, + "loss": 2.5779, + "step": 25619 + }, + { + "epoch": 0.7597188862200872, + "grad_norm": 0.09246466308832169, + "learning_rate": 0.00013847949790937397, + "loss": 2.5616, + "step": 25620 + }, + { + "epoch": 0.7597485395723986, + "grad_norm": 0.08348837494850159, + "learning_rate": 0.00013844699646571034, + "loss": 2.5766, + "step": 25621 + }, + { + "epoch": 0.7597781929247102, + "grad_norm": 0.09160906821489334, + "learning_rate": 0.00013841449822366058, + "loss": 2.598, + "step": 25622 + }, + { + "epoch": 0.7598078462770216, + "grad_norm": 0.08204980939626694, + "learning_rate": 0.00013838200318351258, + "loss": 2.561, + "step": 25623 + }, + { + "epoch": 0.7598374996293331, + "grad_norm": 0.09468288719654083, + "learning_rate": 0.0001383495113455538, + "loss": 2.5542, + "step": 25624 + }, + { + "epoch": 0.7598671529816445, + "grad_norm": 0.07998447865247726, + "learning_rate": 0.0001383170227100723, + "loss": 2.5705, + "step": 25625 + }, + { + "epoch": 0.7598968063339561, + "grad_norm": 0.08924650400876999, + "learning_rate": 0.00013828453727735568, + "loss": 2.5727, + "step": 25626 + }, + { + "epoch": 0.7599264596862675, + "grad_norm": 0.08244927227497101, + "learning_rate": 0.00013825205504769156, + "loss": 2.5677, + "step": 25627 + }, + { + "epoch": 0.759956113038579, + "grad_norm": 0.08345863968133926, + "learning_rate": 0.0001382195760213676, + "loss": 2.5532, + "step": 25628 + }, + { + "epoch": 0.7599857663908904, + "grad_norm": 0.08954034000635147, + "learning_rate": 0.00013818710019867153, + "loss": 2.5663, + "step": 25629 + }, + { + "epoch": 0.760015419743202, + "grad_norm": 0.08555182814598083, + "learning_rate": 0.00013815462757989062, + "loss": 2.5608, + "step": 25630 + }, + { + "epoch": 0.7600450730955135, + "grad_norm": 0.08042138069868088, + "learning_rate": 0.00013812215816531265, + "loss": 2.5469, + "step": 25631 + }, + { + "epoch": 0.7600747264478249, + "grad_norm": 0.08712290972471237, + "learning_rate": 0.00013808969195522504, + "loss": 2.5448, + "step": 25632 + }, + { + "epoch": 0.7601043798001365, + "grad_norm": 0.08826493471860886, + "learning_rate": 0.00013805722894991534, + "loss": 2.5453, + "step": 25633 + }, + { + "epoch": 0.7601340331524479, + "grad_norm": 0.08757968991994858, + "learning_rate": 0.000138024769149671, + "loss": 2.5768, + "step": 25634 + }, + { + "epoch": 0.7601636865047594, + "grad_norm": 0.081837497651577, + "learning_rate": 0.00013799231255477945, + "loss": 2.5713, + "step": 25635 + }, + { + "epoch": 0.7601933398570708, + "grad_norm": 0.08674206584692001, + "learning_rate": 0.00013795985916552816, + "loss": 2.588, + "step": 25636 + }, + { + "epoch": 0.7602229932093824, + "grad_norm": 0.07891632616519928, + "learning_rate": 0.00013792740898220423, + "loss": 2.5877, + "step": 25637 + }, + { + "epoch": 0.7602526465616938, + "grad_norm": 0.0939774215221405, + "learning_rate": 0.0001378949620050955, + "loss": 2.587, + "step": 25638 + }, + { + "epoch": 0.7602822999140053, + "grad_norm": 0.0878245085477829, + "learning_rate": 0.00013786251823448908, + "loss": 2.5442, + "step": 25639 + }, + { + "epoch": 0.7603119532663167, + "grad_norm": 0.0843610167503357, + "learning_rate": 0.00013783007767067214, + "loss": 2.5602, + "step": 25640 + }, + { + "epoch": 0.7603416066186283, + "grad_norm": 0.09931682795286179, + "learning_rate": 0.000137797640313932, + "loss": 2.5944, + "step": 25641 + }, + { + "epoch": 0.7603712599709397, + "grad_norm": 0.07886800915002823, + "learning_rate": 0.00013776520616455595, + "loss": 2.573, + "step": 25642 + }, + { + "epoch": 0.7604009133232512, + "grad_norm": 0.09233488142490387, + "learning_rate": 0.0001377327752228311, + "loss": 2.5755, + "step": 25643 + }, + { + "epoch": 0.7604305666755626, + "grad_norm": 0.08917662501335144, + "learning_rate": 0.00013770034748904482, + "loss": 2.5817, + "step": 25644 + }, + { + "epoch": 0.7604602200278742, + "grad_norm": 0.08962207287549973, + "learning_rate": 0.00013766792296348408, + "loss": 2.608, + "step": 25645 + }, + { + "epoch": 0.7604898733801856, + "grad_norm": 0.10666897147893906, + "learning_rate": 0.00013763550164643613, + "loss": 2.562, + "step": 25646 + }, + { + "epoch": 0.7605195267324971, + "grad_norm": 0.08372034132480621, + "learning_rate": 0.00013760308353818795, + "loss": 2.5594, + "step": 25647 + }, + { + "epoch": 0.7605491800848085, + "grad_norm": 0.10126123577356339, + "learning_rate": 0.0001375706686390267, + "loss": 2.5884, + "step": 25648 + }, + { + "epoch": 0.7605788334371201, + "grad_norm": 0.08463625609874725, + "learning_rate": 0.00013753825694923938, + "loss": 2.5972, + "step": 25649 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 0.09881262481212616, + "learning_rate": 0.000137505848469113, + "loss": 2.5413, + "step": 25650 + }, + { + "epoch": 0.760638140141743, + "grad_norm": 0.08485953509807587, + "learning_rate": 0.00013747344319893457, + "loss": 2.5743, + "step": 25651 + }, + { + "epoch": 0.7606677934940546, + "grad_norm": 0.09267386794090271, + "learning_rate": 0.00013744104113899103, + "loss": 2.5269, + "step": 25652 + }, + { + "epoch": 0.760697446846366, + "grad_norm": 0.09529761224985123, + "learning_rate": 0.0001374086422895693, + "loss": 2.5867, + "step": 25653 + }, + { + "epoch": 0.7607271001986775, + "grad_norm": 0.09219734370708466, + "learning_rate": 0.00013737624665095626, + "loss": 2.5544, + "step": 25654 + }, + { + "epoch": 0.7607567535509889, + "grad_norm": 0.08953993767499924, + "learning_rate": 0.0001373438542234388, + "loss": 2.5772, + "step": 25655 + }, + { + "epoch": 0.7607864069033005, + "grad_norm": 0.0941392034292221, + "learning_rate": 0.00013731146500730378, + "loss": 2.5628, + "step": 25656 + }, + { + "epoch": 0.7608160602556119, + "grad_norm": 0.08944816142320633, + "learning_rate": 0.00013727907900283804, + "loss": 2.5466, + "step": 25657 + }, + { + "epoch": 0.7608457136079234, + "grad_norm": 0.09397432953119278, + "learning_rate": 0.00013724669621032826, + "loss": 2.5658, + "step": 25658 + }, + { + "epoch": 0.7608753669602348, + "grad_norm": 0.08749762177467346, + "learning_rate": 0.00013721431663006123, + "loss": 2.5896, + "step": 25659 + }, + { + "epoch": 0.7609050203125464, + "grad_norm": 0.09369570761919022, + "learning_rate": 0.00013718194026232373, + "loss": 2.5566, + "step": 25660 + }, + { + "epoch": 0.7609346736648578, + "grad_norm": 0.09347591549158096, + "learning_rate": 0.0001371495671074024, + "loss": 2.567, + "step": 25661 + }, + { + "epoch": 0.7609643270171693, + "grad_norm": 0.08612444996833801, + "learning_rate": 0.00013711719716558396, + "loss": 2.5585, + "step": 25662 + }, + { + "epoch": 0.7609939803694807, + "grad_norm": 0.0898943766951561, + "learning_rate": 0.00013708483043715504, + "loss": 2.5887, + "step": 25663 + }, + { + "epoch": 0.7610236337217923, + "grad_norm": 0.09002792090177536, + "learning_rate": 0.0001370524669224022, + "loss": 2.5692, + "step": 25664 + }, + { + "epoch": 0.7610532870741037, + "grad_norm": 0.0880824476480484, + "learning_rate": 0.00013702010662161213, + "loss": 2.561, + "step": 25665 + }, + { + "epoch": 0.7610829404264152, + "grad_norm": 0.09188634902238846, + "learning_rate": 0.00013698774953507125, + "loss": 2.5333, + "step": 25666 + }, + { + "epoch": 0.7611125937787266, + "grad_norm": 0.0972060039639473, + "learning_rate": 0.00013695539566306619, + "loss": 2.5469, + "step": 25667 + }, + { + "epoch": 0.7611422471310382, + "grad_norm": 0.08973333239555359, + "learning_rate": 0.00013692304500588344, + "loss": 2.5825, + "step": 25668 + }, + { + "epoch": 0.7611719004833496, + "grad_norm": 0.09779096394777298, + "learning_rate": 0.0001368906975638094, + "loss": 2.5777, + "step": 25669 + }, + { + "epoch": 0.7612015538356611, + "grad_norm": 0.08676570653915405, + "learning_rate": 0.0001368583533371306, + "loss": 2.5581, + "step": 25670 + }, + { + "epoch": 0.7612312071879725, + "grad_norm": 0.09888298809528351, + "learning_rate": 0.00013682601232613335, + "loss": 2.5614, + "step": 25671 + }, + { + "epoch": 0.7612608605402841, + "grad_norm": 0.08944930881261826, + "learning_rate": 0.00013679367453110414, + "loss": 2.5643, + "step": 25672 + }, + { + "epoch": 0.7612905138925956, + "grad_norm": 0.09748662263154984, + "learning_rate": 0.00013676133995232947, + "loss": 2.5707, + "step": 25673 + }, + { + "epoch": 0.761320167244907, + "grad_norm": 0.09221930801868439, + "learning_rate": 0.00013672900859009528, + "loss": 2.5966, + "step": 25674 + }, + { + "epoch": 0.7613498205972186, + "grad_norm": 0.09667699784040451, + "learning_rate": 0.00013669668044468807, + "loss": 2.5784, + "step": 25675 + }, + { + "epoch": 0.76137947394953, + "grad_norm": 0.09098245948553085, + "learning_rate": 0.000136664355516394, + "loss": 2.5758, + "step": 25676 + }, + { + "epoch": 0.7614091273018415, + "grad_norm": 0.08856663107872009, + "learning_rate": 0.0001366320338054996, + "loss": 2.5466, + "step": 25677 + }, + { + "epoch": 0.7614387806541529, + "grad_norm": 0.08897209167480469, + "learning_rate": 0.00013659971531229087, + "loss": 2.5848, + "step": 25678 + }, + { + "epoch": 0.7614684340064645, + "grad_norm": 0.09785108268260956, + "learning_rate": 0.00013656740003705403, + "loss": 2.5347, + "step": 25679 + }, + { + "epoch": 0.7614980873587759, + "grad_norm": 0.0900207981467247, + "learning_rate": 0.0001365350879800753, + "loss": 2.5575, + "step": 25680 + }, + { + "epoch": 0.7615277407110874, + "grad_norm": 0.0907519981265068, + "learning_rate": 0.00013650277914164073, + "loss": 2.5681, + "step": 25681 + }, + { + "epoch": 0.7615573940633988, + "grad_norm": 0.088749460875988, + "learning_rate": 0.0001364704735220364, + "loss": 2.5749, + "step": 25682 + }, + { + "epoch": 0.7615870474157104, + "grad_norm": 0.08250778168439865, + "learning_rate": 0.00013643817112154845, + "loss": 2.5378, + "step": 25683 + }, + { + "epoch": 0.7616167007680218, + "grad_norm": 0.09070972353219986, + "learning_rate": 0.00013640587194046306, + "loss": 2.6013, + "step": 25684 + }, + { + "epoch": 0.7616463541203333, + "grad_norm": 0.08978298306465149, + "learning_rate": 0.00013637357597906592, + "loss": 2.5635, + "step": 25685 + }, + { + "epoch": 0.7616760074726447, + "grad_norm": 0.08441660553216934, + "learning_rate": 0.00013634128323764322, + "loss": 2.5569, + "step": 25686 + }, + { + "epoch": 0.7617056608249563, + "grad_norm": 0.09731577336788177, + "learning_rate": 0.00013630899371648087, + "loss": 2.5748, + "step": 25687 + }, + { + "epoch": 0.7617353141772677, + "grad_norm": 0.09467022120952606, + "learning_rate": 0.00013627670741586472, + "loss": 2.5928, + "step": 25688 + }, + { + "epoch": 0.7617649675295792, + "grad_norm": 0.09381284564733505, + "learning_rate": 0.0001362444243360807, + "loss": 2.5842, + "step": 25689 + }, + { + "epoch": 0.7617946208818906, + "grad_norm": 0.08853580057621002, + "learning_rate": 0.00013621214447741487, + "loss": 2.5506, + "step": 25690 + }, + { + "epoch": 0.7618242742342022, + "grad_norm": 0.10401172935962677, + "learning_rate": 0.00013617986784015296, + "loss": 2.5303, + "step": 25691 + }, + { + "epoch": 0.7618539275865136, + "grad_norm": 0.07873507589101791, + "learning_rate": 0.00013614759442458075, + "loss": 2.5104, + "step": 25692 + }, + { + "epoch": 0.7618835809388251, + "grad_norm": 0.09707264602184296, + "learning_rate": 0.00013611532423098404, + "loss": 2.5806, + "step": 25693 + }, + { + "epoch": 0.7619132342911367, + "grad_norm": 0.08256098628044128, + "learning_rate": 0.00013608305725964877, + "loss": 2.5856, + "step": 25694 + }, + { + "epoch": 0.7619428876434481, + "grad_norm": 0.08679872006177902, + "learning_rate": 0.0001360507935108603, + "loss": 2.5666, + "step": 25695 + }, + { + "epoch": 0.7619725409957596, + "grad_norm": 0.0891696959733963, + "learning_rate": 0.0001360185329849045, + "loss": 2.5562, + "step": 25696 + }, + { + "epoch": 0.762002194348071, + "grad_norm": 0.09416995942592621, + "learning_rate": 0.00013598627568206718, + "loss": 2.5455, + "step": 25697 + }, + { + "epoch": 0.7620318477003826, + "grad_norm": 0.0935443639755249, + "learning_rate": 0.00013595402160263375, + "loss": 2.5521, + "step": 25698 + }, + { + "epoch": 0.762061501052694, + "grad_norm": 0.09248464554548264, + "learning_rate": 0.00013592177074689, + "loss": 2.5807, + "step": 25699 + }, + { + "epoch": 0.7620911544050055, + "grad_norm": 0.10283084213733673, + "learning_rate": 0.0001358895231151215, + "loss": 2.5837, + "step": 25700 + }, + { + "epoch": 0.7621208077573169, + "grad_norm": 0.09166460484266281, + "learning_rate": 0.00013585727870761354, + "loss": 2.5792, + "step": 25701 + }, + { + "epoch": 0.7621504611096285, + "grad_norm": 0.10236074030399323, + "learning_rate": 0.0001358250375246521, + "loss": 2.5863, + "step": 25702 + }, + { + "epoch": 0.7621801144619399, + "grad_norm": 0.09575864672660828, + "learning_rate": 0.00013579279956652245, + "loss": 2.5666, + "step": 25703 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 0.10476900637149811, + "learning_rate": 0.00013576056483351006, + "loss": 2.5755, + "step": 25704 + }, + { + "epoch": 0.7622394211665628, + "grad_norm": 0.10578981041908264, + "learning_rate": 0.00013572833332590057, + "loss": 2.5718, + "step": 25705 + }, + { + "epoch": 0.7622690745188744, + "grad_norm": 0.09430058300495148, + "learning_rate": 0.000135696105043979, + "loss": 2.5686, + "step": 25706 + }, + { + "epoch": 0.7622987278711858, + "grad_norm": 0.08861999958753586, + "learning_rate": 0.000135663879988031, + "loss": 2.5651, + "step": 25707 + }, + { + "epoch": 0.7623283812234973, + "grad_norm": 0.09463126212358475, + "learning_rate": 0.0001356316581583419, + "loss": 2.5511, + "step": 25708 + }, + { + "epoch": 0.7623580345758088, + "grad_norm": 0.08223758637905121, + "learning_rate": 0.00013559943955519693, + "loss": 2.5844, + "step": 25709 + }, + { + "epoch": 0.7623876879281203, + "grad_norm": 0.08831992000341415, + "learning_rate": 0.00013556722417888155, + "loss": 2.5618, + "step": 25710 + }, + { + "epoch": 0.7624173412804317, + "grad_norm": 0.08825334906578064, + "learning_rate": 0.00013553501202968093, + "loss": 2.5873, + "step": 25711 + }, + { + "epoch": 0.7624469946327432, + "grad_norm": 0.08897584676742554, + "learning_rate": 0.00013550280310788032, + "loss": 2.5421, + "step": 25712 + }, + { + "epoch": 0.7624766479850547, + "grad_norm": 0.08550361543893814, + "learning_rate": 0.000135470597413765, + "loss": 2.5345, + "step": 25713 + }, + { + "epoch": 0.7625063013373662, + "grad_norm": 0.08660412579774857, + "learning_rate": 0.0001354383949476199, + "loss": 2.6021, + "step": 25714 + }, + { + "epoch": 0.7625359546896777, + "grad_norm": 0.08296673744916916, + "learning_rate": 0.00013540619570973073, + "loss": 2.5614, + "step": 25715 + }, + { + "epoch": 0.7625656080419891, + "grad_norm": 0.09789912402629852, + "learning_rate": 0.00013537399970038212, + "loss": 2.555, + "step": 25716 + }, + { + "epoch": 0.7625952613943007, + "grad_norm": 0.09221269190311432, + "learning_rate": 0.0001353418069198593, + "loss": 2.5842, + "step": 25717 + }, + { + "epoch": 0.7626249147466121, + "grad_norm": 0.0970025584101677, + "learning_rate": 0.00013530961736844737, + "loss": 2.5866, + "step": 25718 + }, + { + "epoch": 0.7626545680989236, + "grad_norm": 0.09173190593719482, + "learning_rate": 0.00013527743104643143, + "loss": 2.5728, + "step": 25719 + }, + { + "epoch": 0.762684221451235, + "grad_norm": 0.09241978079080582, + "learning_rate": 0.00013524524795409642, + "loss": 2.5574, + "step": 25720 + }, + { + "epoch": 0.7627138748035466, + "grad_norm": 0.09583377838134766, + "learning_rate": 0.00013521306809172735, + "loss": 2.5363, + "step": 25721 + }, + { + "epoch": 0.762743528155858, + "grad_norm": 0.08936747163534164, + "learning_rate": 0.00013518089145960916, + "loss": 2.5759, + "step": 25722 + }, + { + "epoch": 0.7627731815081695, + "grad_norm": 0.09437945485115051, + "learning_rate": 0.0001351487180580268, + "loss": 2.564, + "step": 25723 + }, + { + "epoch": 0.762802834860481, + "grad_norm": 0.09800832718610764, + "learning_rate": 0.0001351165478872652, + "loss": 2.5674, + "step": 25724 + }, + { + "epoch": 0.7628324882127925, + "grad_norm": 0.08656535297632217, + "learning_rate": 0.00013508438094760917, + "loss": 2.555, + "step": 25725 + }, + { + "epoch": 0.7628621415651039, + "grad_norm": 0.0871584489941597, + "learning_rate": 0.00013505221723934357, + "loss": 2.5573, + "step": 25726 + }, + { + "epoch": 0.7628917949174154, + "grad_norm": 0.09175252914428711, + "learning_rate": 0.00013502005676275326, + "loss": 2.581, + "step": 25727 + }, + { + "epoch": 0.7629214482697269, + "grad_norm": 0.09364400058984756, + "learning_rate": 0.000134987899518123, + "loss": 2.5786, + "step": 25728 + }, + { + "epoch": 0.7629511016220384, + "grad_norm": 0.08625319600105286, + "learning_rate": 0.00013495574550573752, + "loss": 2.5759, + "step": 25729 + }, + { + "epoch": 0.7629807549743498, + "grad_norm": 0.09297484904527664, + "learning_rate": 0.00013492359472588156, + "loss": 2.5859, + "step": 25730 + }, + { + "epoch": 0.7630104083266613, + "grad_norm": 0.08846878260374069, + "learning_rate": 0.00013489144717883988, + "loss": 2.5452, + "step": 25731 + }, + { + "epoch": 0.7630400616789728, + "grad_norm": 0.08720511198043823, + "learning_rate": 0.00013485930286489707, + "loss": 2.5462, + "step": 25732 + }, + { + "epoch": 0.7630697150312843, + "grad_norm": 0.08993225544691086, + "learning_rate": 0.0001348271617843378, + "loss": 2.5812, + "step": 25733 + }, + { + "epoch": 0.7630993683835957, + "grad_norm": 0.07820509374141693, + "learning_rate": 0.00013479502393744675, + "loss": 2.5667, + "step": 25734 + }, + { + "epoch": 0.7631290217359072, + "grad_norm": 0.08612462878227234, + "learning_rate": 0.00013476288932450837, + "loss": 2.5839, + "step": 25735 + }, + { + "epoch": 0.7631586750882188, + "grad_norm": 0.092344731092453, + "learning_rate": 0.00013473075794580736, + "loss": 2.5492, + "step": 25736 + }, + { + "epoch": 0.7631883284405302, + "grad_norm": 0.08511779457330704, + "learning_rate": 0.00013469862980162816, + "loss": 2.5966, + "step": 25737 + }, + { + "epoch": 0.7632179817928417, + "grad_norm": 0.08928700536489487, + "learning_rate": 0.00013466650489225528, + "loss": 2.549, + "step": 25738 + }, + { + "epoch": 0.7632476351451531, + "grad_norm": 0.09043098241090775, + "learning_rate": 0.0001346343832179734, + "loss": 2.5453, + "step": 25739 + }, + { + "epoch": 0.7632772884974647, + "grad_norm": 0.0879589095711708, + "learning_rate": 0.00013460226477906645, + "loss": 2.5855, + "step": 25740 + }, + { + "epoch": 0.7633069418497761, + "grad_norm": 0.09152525663375854, + "learning_rate": 0.00013457014957581932, + "loss": 2.5722, + "step": 25741 + }, + { + "epoch": 0.7633365952020876, + "grad_norm": 0.08664365857839584, + "learning_rate": 0.00013453803760851623, + "loss": 2.5678, + "step": 25742 + }, + { + "epoch": 0.763366248554399, + "grad_norm": 0.10037044435739517, + "learning_rate": 0.00013450592887744156, + "loss": 2.5838, + "step": 25743 + }, + { + "epoch": 0.7633959019067106, + "grad_norm": 0.09035349637269974, + "learning_rate": 0.00013447382338287962, + "loss": 2.568, + "step": 25744 + }, + { + "epoch": 0.763425555259022, + "grad_norm": 0.08889228105545044, + "learning_rate": 0.0001344417211251147, + "loss": 2.5515, + "step": 25745 + }, + { + "epoch": 0.7634552086113335, + "grad_norm": 0.09348779916763306, + "learning_rate": 0.0001344096221044311, + "loss": 2.5681, + "step": 25746 + }, + { + "epoch": 0.763484861963645, + "grad_norm": 0.08712705969810486, + "learning_rate": 0.00013437752632111305, + "loss": 2.5791, + "step": 25747 + }, + { + "epoch": 0.7635145153159565, + "grad_norm": 0.09983021020889282, + "learning_rate": 0.00013434543377544472, + "loss": 2.5948, + "step": 25748 + }, + { + "epoch": 0.7635441686682679, + "grad_norm": 0.09111617505550385, + "learning_rate": 0.00013431334446771054, + "loss": 2.5667, + "step": 25749 + }, + { + "epoch": 0.7635738220205794, + "grad_norm": 0.09318774938583374, + "learning_rate": 0.00013428125839819432, + "loss": 2.5701, + "step": 25750 + }, + { + "epoch": 0.7636034753728909, + "grad_norm": 0.09616635739803314, + "learning_rate": 0.0001342491755671803, + "loss": 2.5745, + "step": 25751 + }, + { + "epoch": 0.7636331287252024, + "grad_norm": 0.08950235694646835, + "learning_rate": 0.00013421709597495263, + "loss": 2.5606, + "step": 25752 + }, + { + "epoch": 0.7636627820775138, + "grad_norm": 0.08922881633043289, + "learning_rate": 0.00013418501962179525, + "loss": 2.5487, + "step": 25753 + }, + { + "epoch": 0.7636924354298253, + "grad_norm": 0.09035056084394455, + "learning_rate": 0.00013415294650799236, + "loss": 2.5235, + "step": 25754 + }, + { + "epoch": 0.7637220887821368, + "grad_norm": 0.0978742465376854, + "learning_rate": 0.000134120876633828, + "loss": 2.5856, + "step": 25755 + }, + { + "epoch": 0.7637517421344483, + "grad_norm": 0.084617018699646, + "learning_rate": 0.00013408880999958605, + "loss": 2.5713, + "step": 25756 + }, + { + "epoch": 0.7637813954867598, + "grad_norm": 0.09327267855405807, + "learning_rate": 0.00013405674660555057, + "loss": 2.5821, + "step": 25757 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 0.08608704805374146, + "learning_rate": 0.00013402468645200532, + "loss": 2.5585, + "step": 25758 + }, + { + "epoch": 0.7638407021913828, + "grad_norm": 0.092009998857975, + "learning_rate": 0.0001339926295392343, + "loss": 2.5801, + "step": 25759 + }, + { + "epoch": 0.7638703555436942, + "grad_norm": 0.0922277569770813, + "learning_rate": 0.00013396057586752158, + "loss": 2.5473, + "step": 25760 + }, + { + "epoch": 0.7639000088960057, + "grad_norm": 0.08271857351064682, + "learning_rate": 0.0001339285254371506, + "loss": 2.5568, + "step": 25761 + }, + { + "epoch": 0.7639296622483172, + "grad_norm": 0.09318019449710846, + "learning_rate": 0.00013389647824840534, + "loss": 2.5708, + "step": 25762 + }, + { + "epoch": 0.7639593156006287, + "grad_norm": 0.08255548775196075, + "learning_rate": 0.00013386443430156965, + "loss": 2.5984, + "step": 25763 + }, + { + "epoch": 0.7639889689529401, + "grad_norm": 0.0907827615737915, + "learning_rate": 0.00013383239359692723, + "loss": 2.5688, + "step": 25764 + }, + { + "epoch": 0.7640186223052516, + "grad_norm": 0.08132081478834152, + "learning_rate": 0.00013380035613476182, + "loss": 2.5788, + "step": 25765 + }, + { + "epoch": 0.7640482756575631, + "grad_norm": 0.09707090258598328, + "learning_rate": 0.00013376832191535693, + "loss": 2.5596, + "step": 25766 + }, + { + "epoch": 0.7640779290098746, + "grad_norm": 0.08031412959098816, + "learning_rate": 0.00013373629093899658, + "loss": 2.5694, + "step": 25767 + }, + { + "epoch": 0.764107582362186, + "grad_norm": 0.10489122569561005, + "learning_rate": 0.00013370426320596425, + "loss": 2.5267, + "step": 25768 + }, + { + "epoch": 0.7641372357144975, + "grad_norm": 0.08171102404594421, + "learning_rate": 0.00013367223871654354, + "loss": 2.5444, + "step": 25769 + }, + { + "epoch": 0.764166889066809, + "grad_norm": 0.10368307679891586, + "learning_rate": 0.00013364021747101816, + "loss": 2.5739, + "step": 25770 + }, + { + "epoch": 0.7641965424191205, + "grad_norm": 0.08811573684215546, + "learning_rate": 0.00013360819946967145, + "loss": 2.5769, + "step": 25771 + }, + { + "epoch": 0.7642261957714319, + "grad_norm": 0.09823977947235107, + "learning_rate": 0.00013357618471278697, + "loss": 2.5652, + "step": 25772 + }, + { + "epoch": 0.7642558491237434, + "grad_norm": 0.10088176280260086, + "learning_rate": 0.00013354417320064833, + "loss": 2.578, + "step": 25773 + }, + { + "epoch": 0.7642855024760549, + "grad_norm": 0.09332428872585297, + "learning_rate": 0.00013351216493353885, + "loss": 2.5563, + "step": 25774 + }, + { + "epoch": 0.7643151558283664, + "grad_norm": 0.10637380182743073, + "learning_rate": 0.0001334801599117421, + "loss": 2.5895, + "step": 25775 + }, + { + "epoch": 0.7643448091806779, + "grad_norm": 0.09809445589780807, + "learning_rate": 0.00013344815813554146, + "loss": 2.5704, + "step": 25776 + }, + { + "epoch": 0.7643744625329894, + "grad_norm": 0.09504156559705734, + "learning_rate": 0.0001334161596052203, + "loss": 2.5738, + "step": 25777 + }, + { + "epoch": 0.7644041158853009, + "grad_norm": 0.10454919934272766, + "learning_rate": 0.00013338416432106192, + "loss": 2.5555, + "step": 25778 + }, + { + "epoch": 0.7644337692376123, + "grad_norm": 0.0956936702132225, + "learning_rate": 0.0001333521722833496, + "loss": 2.5971, + "step": 25779 + }, + { + "epoch": 0.7644634225899238, + "grad_norm": 0.09839663654565811, + "learning_rate": 0.00013332018349236684, + "loss": 2.5193, + "step": 25780 + }, + { + "epoch": 0.7644930759422353, + "grad_norm": 0.10500527918338776, + "learning_rate": 0.00013328819794839697, + "loss": 2.5686, + "step": 25781 + }, + { + "epoch": 0.7645227292945468, + "grad_norm": 0.08716293424367905, + "learning_rate": 0.0001332562156517229, + "loss": 2.5373, + "step": 25782 + }, + { + "epoch": 0.7645523826468582, + "grad_norm": 0.09640521556138992, + "learning_rate": 0.000133224236602628, + "loss": 2.555, + "step": 25783 + }, + { + "epoch": 0.7645820359991697, + "grad_norm": 0.08338852226734161, + "learning_rate": 0.00013319226080139545, + "loss": 2.5591, + "step": 25784 + }, + { + "epoch": 0.7646116893514812, + "grad_norm": 0.09072554856538773, + "learning_rate": 0.00013316028824830834, + "loss": 2.5703, + "step": 25785 + }, + { + "epoch": 0.7646413427037927, + "grad_norm": 0.08470797538757324, + "learning_rate": 0.0001331283189436499, + "loss": 2.5621, + "step": 25786 + }, + { + "epoch": 0.7646709960561041, + "grad_norm": 0.08952182531356812, + "learning_rate": 0.0001330963528877031, + "loss": 2.5439, + "step": 25787 + }, + { + "epoch": 0.7647006494084156, + "grad_norm": 0.09572076797485352, + "learning_rate": 0.00013306439008075116, + "loss": 2.572, + "step": 25788 + }, + { + "epoch": 0.7647303027607271, + "grad_norm": 0.1574750393629074, + "learning_rate": 0.00013303243052307696, + "loss": 2.5229, + "step": 25789 + }, + { + "epoch": 0.7647599561130386, + "grad_norm": 0.0993688777089119, + "learning_rate": 0.00013300047421496364, + "loss": 2.5558, + "step": 25790 + }, + { + "epoch": 0.76478960946535, + "grad_norm": 0.09458722919225693, + "learning_rate": 0.00013296852115669404, + "loss": 2.553, + "step": 25791 + }, + { + "epoch": 0.7648192628176615, + "grad_norm": 0.0925995260477066, + "learning_rate": 0.00013293657134855124, + "loss": 2.5397, + "step": 25792 + }, + { + "epoch": 0.764848916169973, + "grad_norm": 0.09836281090974808, + "learning_rate": 0.0001329046247908181, + "loss": 2.5874, + "step": 25793 + }, + { + "epoch": 0.7648785695222845, + "grad_norm": 0.09731890261173248, + "learning_rate": 0.0001328726814837775, + "loss": 2.505, + "step": 25794 + }, + { + "epoch": 0.7649082228745959, + "grad_norm": 0.08691161870956421, + "learning_rate": 0.00013284074142771237, + "loss": 2.538, + "step": 25795 + }, + { + "epoch": 0.7649378762269075, + "grad_norm": 0.09162147343158722, + "learning_rate": 0.00013280880462290546, + "loss": 2.5766, + "step": 25796 + }, + { + "epoch": 0.764967529579219, + "grad_norm": 0.09453180432319641, + "learning_rate": 0.0001327768710696396, + "loss": 2.548, + "step": 25797 + }, + { + "epoch": 0.7649971829315304, + "grad_norm": 0.08713264018297195, + "learning_rate": 0.0001327449407681976, + "loss": 2.5959, + "step": 25798 + }, + { + "epoch": 0.7650268362838419, + "grad_norm": 0.09212135523557663, + "learning_rate": 0.00013271301371886213, + "loss": 2.5392, + "step": 25799 + }, + { + "epoch": 0.7650564896361534, + "grad_norm": 0.08810192346572876, + "learning_rate": 0.00013268108992191602, + "loss": 2.525, + "step": 25800 + }, + { + "epoch": 0.7650861429884649, + "grad_norm": 0.0880158394575119, + "learning_rate": 0.00013264916937764194, + "loss": 2.5232, + "step": 25801 + }, + { + "epoch": 0.7651157963407763, + "grad_norm": 0.08580370247364044, + "learning_rate": 0.00013261725208632246, + "loss": 2.6223, + "step": 25802 + }, + { + "epoch": 0.7651454496930878, + "grad_norm": 0.0938405990600586, + "learning_rate": 0.0001325853380482403, + "loss": 2.6032, + "step": 25803 + }, + { + "epoch": 0.7651751030453993, + "grad_norm": 0.08587406575679779, + "learning_rate": 0.00013255342726367804, + "loss": 2.5571, + "step": 25804 + }, + { + "epoch": 0.7652047563977108, + "grad_norm": 0.08382052183151245, + "learning_rate": 0.00013252151973291827, + "loss": 2.5452, + "step": 25805 + }, + { + "epoch": 0.7652344097500222, + "grad_norm": 0.09056374430656433, + "learning_rate": 0.0001324896154562435, + "loss": 2.5478, + "step": 25806 + }, + { + "epoch": 0.7652640631023337, + "grad_norm": 0.08330711722373962, + "learning_rate": 0.00013245771443393621, + "loss": 2.5094, + "step": 25807 + }, + { + "epoch": 0.7652937164546452, + "grad_norm": 0.0871630311012268, + "learning_rate": 0.00013242581666627902, + "loss": 2.5369, + "step": 25808 + }, + { + "epoch": 0.7653233698069567, + "grad_norm": 0.08987810462713242, + "learning_rate": 0.00013239392215355427, + "loss": 2.5587, + "step": 25809 + }, + { + "epoch": 0.7653530231592681, + "grad_norm": 0.08324034512042999, + "learning_rate": 0.00013236203089604448, + "loss": 2.5158, + "step": 25810 + }, + { + "epoch": 0.7653826765115797, + "grad_norm": 0.08824312686920166, + "learning_rate": 0.000132330142894032, + "loss": 2.5822, + "step": 25811 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 0.08179230242967606, + "learning_rate": 0.0001322982581477992, + "loss": 2.5663, + "step": 25812 + }, + { + "epoch": 0.7654419832162026, + "grad_norm": 0.0895630270242691, + "learning_rate": 0.00013226637665762848, + "loss": 2.5545, + "step": 25813 + }, + { + "epoch": 0.765471636568514, + "grad_norm": 0.08160830289125443, + "learning_rate": 0.00013223449842380208, + "loss": 2.5643, + "step": 25814 + }, + { + "epoch": 0.7655012899208256, + "grad_norm": 0.08960875868797302, + "learning_rate": 0.0001322026234466025, + "loss": 2.5837, + "step": 25815 + }, + { + "epoch": 0.765530943273137, + "grad_norm": 0.08471318334341049, + "learning_rate": 0.00013217075172631165, + "loss": 2.5621, + "step": 25816 + }, + { + "epoch": 0.7655605966254485, + "grad_norm": 0.08490723371505737, + "learning_rate": 0.00013213888326321193, + "loss": 2.5815, + "step": 25817 + }, + { + "epoch": 0.76559024997776, + "grad_norm": 0.08182796835899353, + "learning_rate": 0.00013210701805758542, + "loss": 2.533, + "step": 25818 + }, + { + "epoch": 0.7656199033300715, + "grad_norm": 0.08281636238098145, + "learning_rate": 0.00013207515610971448, + "loss": 2.5515, + "step": 25819 + }, + { + "epoch": 0.765649556682383, + "grad_norm": 0.08639807254076004, + "learning_rate": 0.00013204329741988124, + "loss": 2.5452, + "step": 25820 + }, + { + "epoch": 0.7656792100346944, + "grad_norm": 0.08667638897895813, + "learning_rate": 0.00013201144198836777, + "loss": 2.6134, + "step": 25821 + }, + { + "epoch": 0.765708863387006, + "grad_norm": 0.08589884638786316, + "learning_rate": 0.00013197958981545616, + "loss": 2.5319, + "step": 25822 + }, + { + "epoch": 0.7657385167393174, + "grad_norm": 0.08617536723613739, + "learning_rate": 0.00013194774090142841, + "loss": 2.5796, + "step": 25823 + }, + { + "epoch": 0.7657681700916289, + "grad_norm": 0.08450431376695633, + "learning_rate": 0.0001319158952465666, + "loss": 2.5575, + "step": 25824 + }, + { + "epoch": 0.7657978234439403, + "grad_norm": 0.08925094455480576, + "learning_rate": 0.0001318840528511529, + "loss": 2.5498, + "step": 25825 + }, + { + "epoch": 0.7658274767962518, + "grad_norm": 0.08829951286315918, + "learning_rate": 0.00013185221371546892, + "loss": 2.5415, + "step": 25826 + }, + { + "epoch": 0.7658571301485633, + "grad_norm": 0.0851793885231018, + "learning_rate": 0.00013182037783979677, + "loss": 2.56, + "step": 25827 + }, + { + "epoch": 0.7658867835008748, + "grad_norm": 0.0874049961566925, + "learning_rate": 0.0001317885452244184, + "loss": 2.5621, + "step": 25828 + }, + { + "epoch": 0.7659164368531862, + "grad_norm": 0.0893602967262268, + "learning_rate": 0.00013175671586961564, + "loss": 2.5697, + "step": 25829 + }, + { + "epoch": 0.7659460902054978, + "grad_norm": 0.08624925464391708, + "learning_rate": 0.00013172488977567038, + "loss": 2.6004, + "step": 25830 + }, + { + "epoch": 0.7659757435578092, + "grad_norm": 0.09262283891439438, + "learning_rate": 0.00013169306694286426, + "loss": 2.558, + "step": 25831 + }, + { + "epoch": 0.7660053969101207, + "grad_norm": 0.08528773486614227, + "learning_rate": 0.00013166124737147943, + "loss": 2.576, + "step": 25832 + }, + { + "epoch": 0.7660350502624321, + "grad_norm": 0.08836082369089127, + "learning_rate": 0.00013162943106179747, + "loss": 2.5601, + "step": 25833 + }, + { + "epoch": 0.7660647036147437, + "grad_norm": 0.08758091926574707, + "learning_rate": 0.00013159761801410014, + "loss": 2.5434, + "step": 25834 + }, + { + "epoch": 0.7660943569670551, + "grad_norm": 0.08828160911798477, + "learning_rate": 0.00013156580822866915, + "loss": 2.5736, + "step": 25835 + }, + { + "epoch": 0.7661240103193666, + "grad_norm": 0.08688760548830032, + "learning_rate": 0.00013153400170578627, + "loss": 2.5449, + "step": 25836 + }, + { + "epoch": 0.766153663671678, + "grad_norm": 0.09896361082792282, + "learning_rate": 0.00013150219844573297, + "loss": 2.6073, + "step": 25837 + }, + { + "epoch": 0.7661833170239896, + "grad_norm": 0.09429093450307846, + "learning_rate": 0.00013147039844879087, + "loss": 2.5516, + "step": 25838 + }, + { + "epoch": 0.7662129703763011, + "grad_norm": 0.10301874577999115, + "learning_rate": 0.00013143860171524176, + "loss": 2.5416, + "step": 25839 + }, + { + "epoch": 0.7662426237286125, + "grad_norm": 0.08758480846881866, + "learning_rate": 0.000131406808245367, + "loss": 2.5773, + "step": 25840 + }, + { + "epoch": 0.766272277080924, + "grad_norm": 0.10785981267690659, + "learning_rate": 0.00013137501803944823, + "loss": 2.5226, + "step": 25841 + }, + { + "epoch": 0.7663019304332355, + "grad_norm": 0.08774110674858093, + "learning_rate": 0.00013134323109776697, + "loss": 2.5603, + "step": 25842 + }, + { + "epoch": 0.766331583785547, + "grad_norm": 0.10061710327863693, + "learning_rate": 0.0001313114474206047, + "loss": 2.5656, + "step": 25843 + }, + { + "epoch": 0.7663612371378584, + "grad_norm": 0.10062852501869202, + "learning_rate": 0.00013127966700824268, + "loss": 2.5688, + "step": 25844 + }, + { + "epoch": 0.76639089049017, + "grad_norm": 0.0904424786567688, + "learning_rate": 0.00013124788986096264, + "loss": 2.5839, + "step": 25845 + }, + { + "epoch": 0.7664205438424814, + "grad_norm": 0.09479475766420364, + "learning_rate": 0.00013121611597904597, + "loss": 2.5562, + "step": 25846 + }, + { + "epoch": 0.7664501971947929, + "grad_norm": 0.08891429752111435, + "learning_rate": 0.00013118434536277374, + "loss": 2.5853, + "step": 25847 + }, + { + "epoch": 0.7664798505471043, + "grad_norm": 0.09157662838697433, + "learning_rate": 0.00013115257801242747, + "loss": 2.5844, + "step": 25848 + }, + { + "epoch": 0.7665095038994159, + "grad_norm": 0.09135770052671432, + "learning_rate": 0.00013112081392828846, + "loss": 2.5691, + "step": 25849 + }, + { + "epoch": 0.7665391572517273, + "grad_norm": 0.09243737906217575, + "learning_rate": 0.0001310890531106379, + "loss": 2.574, + "step": 25850 + }, + { + "epoch": 0.7665688106040388, + "grad_norm": 0.09135863184928894, + "learning_rate": 0.0001310572955597571, + "loss": 2.5541, + "step": 25851 + }, + { + "epoch": 0.7665984639563502, + "grad_norm": 0.09771846234798431, + "learning_rate": 0.00013102554127592732, + "loss": 2.5625, + "step": 25852 + }, + { + "epoch": 0.7666281173086618, + "grad_norm": 0.0890154093503952, + "learning_rate": 0.00013099379025942966, + "loss": 2.5784, + "step": 25853 + }, + { + "epoch": 0.7666577706609732, + "grad_norm": 0.09913822263479233, + "learning_rate": 0.00013096204251054533, + "loss": 2.5257, + "step": 25854 + }, + { + "epoch": 0.7666874240132847, + "grad_norm": 0.08418366312980652, + "learning_rate": 0.00013093029802955548, + "loss": 2.5719, + "step": 25855 + }, + { + "epoch": 0.7667170773655961, + "grad_norm": 0.0901140347123146, + "learning_rate": 0.00013089855681674119, + "loss": 2.5561, + "step": 25856 + }, + { + "epoch": 0.7667467307179077, + "grad_norm": 0.09118185937404633, + "learning_rate": 0.00013086681887238355, + "loss": 2.5639, + "step": 25857 + }, + { + "epoch": 0.7667763840702191, + "grad_norm": 0.09673136472702026, + "learning_rate": 0.00013083508419676354, + "loss": 2.619, + "step": 25858 + }, + { + "epoch": 0.7668060374225306, + "grad_norm": 0.09504172950983047, + "learning_rate": 0.00013080335279016226, + "loss": 2.5631, + "step": 25859 + }, + { + "epoch": 0.7668356907748421, + "grad_norm": 0.10147085040807724, + "learning_rate": 0.00013077162465286064, + "loss": 2.5336, + "step": 25860 + }, + { + "epoch": 0.7668653441271536, + "grad_norm": 0.09959974139928818, + "learning_rate": 0.00013073989978513968, + "loss": 2.5391, + "step": 25861 + }, + { + "epoch": 0.7668949974794651, + "grad_norm": 0.09439021348953247, + "learning_rate": 0.0001307081781872803, + "loss": 2.6134, + "step": 25862 + }, + { + "epoch": 0.7669246508317765, + "grad_norm": 0.08663712441921234, + "learning_rate": 0.00013067645985956339, + "loss": 2.5831, + "step": 25863 + }, + { + "epoch": 0.766954304184088, + "grad_norm": 0.09818682074546814, + "learning_rate": 0.0001306447448022698, + "loss": 2.5253, + "step": 25864 + }, + { + "epoch": 0.7669839575363995, + "grad_norm": 0.10079988092184067, + "learning_rate": 0.0001306130330156804, + "loss": 2.5644, + "step": 25865 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 0.09724127501249313, + "learning_rate": 0.00013058132450007598, + "loss": 2.5893, + "step": 25866 + }, + { + "epoch": 0.7670432642410224, + "grad_norm": 0.10018112510442734, + "learning_rate": 0.0001305496192557374, + "loss": 2.5522, + "step": 25867 + }, + { + "epoch": 0.767072917593334, + "grad_norm": 0.09521768242120743, + "learning_rate": 0.00013051791728294532, + "loss": 2.5532, + "step": 25868 + }, + { + "epoch": 0.7671025709456454, + "grad_norm": 0.10325180739164352, + "learning_rate": 0.0001304862185819805, + "loss": 2.5623, + "step": 25869 + }, + { + "epoch": 0.7671322242979569, + "grad_norm": 0.09861777722835541, + "learning_rate": 0.0001304545231531236, + "loss": 2.5578, + "step": 25870 + }, + { + "epoch": 0.7671618776502683, + "grad_norm": 0.10273673385381699, + "learning_rate": 0.0001304228309966554, + "loss": 2.545, + "step": 25871 + }, + { + "epoch": 0.7671915310025799, + "grad_norm": 0.09163213521242142, + "learning_rate": 0.00013039114211285647, + "loss": 2.6016, + "step": 25872 + }, + { + "epoch": 0.7672211843548913, + "grad_norm": 0.09757255017757416, + "learning_rate": 0.00013035945650200737, + "loss": 2.5513, + "step": 25873 + }, + { + "epoch": 0.7672508377072028, + "grad_norm": 0.08910365402698517, + "learning_rate": 0.00013032777416438872, + "loss": 2.5544, + "step": 25874 + }, + { + "epoch": 0.7672804910595142, + "grad_norm": 0.09780264645814896, + "learning_rate": 0.00013029609510028113, + "loss": 2.5381, + "step": 25875 + }, + { + "epoch": 0.7673101444118258, + "grad_norm": 0.09179077297449112, + "learning_rate": 0.00013026441930996508, + "loss": 2.5572, + "step": 25876 + }, + { + "epoch": 0.7673397977641372, + "grad_norm": 0.09940643608570099, + "learning_rate": 0.00013023274679372106, + "loss": 2.5789, + "step": 25877 + }, + { + "epoch": 0.7673694511164487, + "grad_norm": 0.08275251090526581, + "learning_rate": 0.00013020107755182947, + "loss": 2.5599, + "step": 25878 + }, + { + "epoch": 0.7673991044687601, + "grad_norm": 0.09398587793111801, + "learning_rate": 0.00013016941158457092, + "loss": 2.518, + "step": 25879 + }, + { + "epoch": 0.7674287578210717, + "grad_norm": 0.08930737525224686, + "learning_rate": 0.00013013774889222564, + "loss": 2.548, + "step": 25880 + }, + { + "epoch": 0.7674584111733832, + "grad_norm": 0.08099496364593506, + "learning_rate": 0.00013010608947507425, + "loss": 2.5713, + "step": 25881 + }, + { + "epoch": 0.7674880645256946, + "grad_norm": 0.09078529477119446, + "learning_rate": 0.00013007443333339674, + "loss": 2.529, + "step": 25882 + }, + { + "epoch": 0.7675177178780062, + "grad_norm": 0.08401687443256378, + "learning_rate": 0.00013004278046747347, + "loss": 2.5469, + "step": 25883 + }, + { + "epoch": 0.7675473712303176, + "grad_norm": 0.092036172747612, + "learning_rate": 0.00013001113087758508, + "loss": 2.5539, + "step": 25884 + }, + { + "epoch": 0.7675770245826291, + "grad_norm": 0.08637124300003052, + "learning_rate": 0.00012997948456401165, + "loss": 2.561, + "step": 25885 + }, + { + "epoch": 0.7676066779349405, + "grad_norm": 0.08720581978559494, + "learning_rate": 0.00012994784152703338, + "loss": 2.5508, + "step": 25886 + }, + { + "epoch": 0.7676363312872521, + "grad_norm": 0.0941794291138649, + "learning_rate": 0.00012991620176693041, + "loss": 2.5819, + "step": 25887 + }, + { + "epoch": 0.7676659846395635, + "grad_norm": 0.09550947695970535, + "learning_rate": 0.00012988456528398307, + "loss": 2.5694, + "step": 25888 + }, + { + "epoch": 0.767695637991875, + "grad_norm": 0.0869290754199028, + "learning_rate": 0.00012985293207847144, + "loss": 2.5605, + "step": 25889 + }, + { + "epoch": 0.7677252913441864, + "grad_norm": 0.09621694684028625, + "learning_rate": 0.00012982130215067562, + "loss": 2.5307, + "step": 25890 + }, + { + "epoch": 0.767754944696498, + "grad_norm": 0.08383625000715256, + "learning_rate": 0.00012978967550087588, + "loss": 2.5836, + "step": 25891 + }, + { + "epoch": 0.7677845980488094, + "grad_norm": 0.08481407165527344, + "learning_rate": 0.00012975805212935194, + "loss": 2.5611, + "step": 25892 + }, + { + "epoch": 0.7678142514011209, + "grad_norm": 0.09017327427864075, + "learning_rate": 0.000129726432036384, + "loss": 2.5811, + "step": 25893 + }, + { + "epoch": 0.7678439047534323, + "grad_norm": 0.08598881959915161, + "learning_rate": 0.000129694815222252, + "loss": 2.5715, + "step": 25894 + }, + { + "epoch": 0.7678735581057439, + "grad_norm": 0.08534421771764755, + "learning_rate": 0.00012966320168723594, + "loss": 2.5771, + "step": 25895 + }, + { + "epoch": 0.7679032114580553, + "grad_norm": 0.08444533497095108, + "learning_rate": 0.00012963159143161585, + "loss": 2.5608, + "step": 25896 + }, + { + "epoch": 0.7679328648103668, + "grad_norm": 0.08688984811306, + "learning_rate": 0.00012959998445567168, + "loss": 2.5518, + "step": 25897 + }, + { + "epoch": 0.7679625181626782, + "grad_norm": 0.09136547148227692, + "learning_rate": 0.00012956838075968313, + "loss": 2.5742, + "step": 25898 + }, + { + "epoch": 0.7679921715149898, + "grad_norm": 0.08803455531597137, + "learning_rate": 0.00012953678034393024, + "loss": 2.5816, + "step": 25899 + }, + { + "epoch": 0.7680218248673012, + "grad_norm": 0.0853561982512474, + "learning_rate": 0.0001295051832086927, + "loss": 2.5789, + "step": 25900 + }, + { + "epoch": 0.7680514782196127, + "grad_norm": 0.0826452448964119, + "learning_rate": 0.00012947358935425036, + "loss": 2.5744, + "step": 25901 + }, + { + "epoch": 0.7680811315719243, + "grad_norm": 0.08394315093755722, + "learning_rate": 0.00012944199878088314, + "loss": 2.546, + "step": 25902 + }, + { + "epoch": 0.7681107849242357, + "grad_norm": 0.0835176631808281, + "learning_rate": 0.00012941041148887045, + "loss": 2.5538, + "step": 25903 + }, + { + "epoch": 0.7681404382765472, + "grad_norm": 0.08526194840669632, + "learning_rate": 0.00012937882747849223, + "loss": 2.5714, + "step": 25904 + }, + { + "epoch": 0.7681700916288586, + "grad_norm": 0.09458112716674805, + "learning_rate": 0.00012934724675002808, + "loss": 2.5745, + "step": 25905 + }, + { + "epoch": 0.7681997449811702, + "grad_norm": 0.08700136840343475, + "learning_rate": 0.00012931566930375764, + "loss": 2.583, + "step": 25906 + }, + { + "epoch": 0.7682293983334816, + "grad_norm": 0.08967811614274979, + "learning_rate": 0.00012928409513996065, + "loss": 2.5762, + "step": 25907 + }, + { + "epoch": 0.7682590516857931, + "grad_norm": 0.08230839669704437, + "learning_rate": 0.00012925252425891642, + "loss": 2.5678, + "step": 25908 + }, + { + "epoch": 0.7682887050381045, + "grad_norm": 0.08806637674570084, + "learning_rate": 0.00012922095666090494, + "loss": 2.5856, + "step": 25909 + }, + { + "epoch": 0.7683183583904161, + "grad_norm": 0.0894610658288002, + "learning_rate": 0.00012918939234620548, + "loss": 2.6017, + "step": 25910 + }, + { + "epoch": 0.7683480117427275, + "grad_norm": 0.0869775041937828, + "learning_rate": 0.00012915783131509762, + "loss": 2.5851, + "step": 25911 + }, + { + "epoch": 0.768377665095039, + "grad_norm": 0.087290920317173, + "learning_rate": 0.000129126273567861, + "loss": 2.5872, + "step": 25912 + }, + { + "epoch": 0.7684073184473504, + "grad_norm": 0.08577793091535568, + "learning_rate": 0.00012909471910477465, + "loss": 2.5503, + "step": 25913 + }, + { + "epoch": 0.768436971799662, + "grad_norm": 0.08824630081653595, + "learning_rate": 0.00012906316792611828, + "loss": 2.5724, + "step": 25914 + }, + { + "epoch": 0.7684666251519734, + "grad_norm": 0.08160200715065002, + "learning_rate": 0.00012903162003217121, + "loss": 2.5683, + "step": 25915 + }, + { + "epoch": 0.7684962785042849, + "grad_norm": 0.0819411501288414, + "learning_rate": 0.00012900007542321291, + "loss": 2.5321, + "step": 25916 + }, + { + "epoch": 0.7685259318565963, + "grad_norm": 0.08666585385799408, + "learning_rate": 0.00012896853409952253, + "loss": 2.5939, + "step": 25917 + }, + { + "epoch": 0.7685555852089079, + "grad_norm": 0.0888749286532402, + "learning_rate": 0.0001289369960613795, + "loss": 2.519, + "step": 25918 + }, + { + "epoch": 0.7685852385612193, + "grad_norm": 0.09548868238925934, + "learning_rate": 0.00012890546130906307, + "loss": 2.5686, + "step": 25919 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 0.0905182883143425, + "learning_rate": 0.0001288739298428524, + "loss": 2.572, + "step": 25920 + }, + { + "epoch": 0.7686445452658422, + "grad_norm": 0.08777198940515518, + "learning_rate": 0.00012884240166302668, + "loss": 2.5651, + "step": 25921 + }, + { + "epoch": 0.7686741986181538, + "grad_norm": 0.08691054582595825, + "learning_rate": 0.0001288108767698655, + "loss": 2.553, + "step": 25922 + }, + { + "epoch": 0.7687038519704653, + "grad_norm": 0.09139062464237213, + "learning_rate": 0.00012877935516364763, + "loss": 2.5646, + "step": 25923 + }, + { + "epoch": 0.7687335053227767, + "grad_norm": 0.09025060385465622, + "learning_rate": 0.00012874783684465224, + "loss": 2.561, + "step": 25924 + }, + { + "epoch": 0.7687631586750883, + "grad_norm": 0.08903506398200989, + "learning_rate": 0.0001287163218131585, + "loss": 2.5895, + "step": 25925 + }, + { + "epoch": 0.7687928120273997, + "grad_norm": 0.08747293800115585, + "learning_rate": 0.00012868481006944545, + "loss": 2.5712, + "step": 25926 + }, + { + "epoch": 0.7688224653797112, + "grad_norm": 0.09454343467950821, + "learning_rate": 0.00012865330161379213, + "loss": 2.5247, + "step": 25927 + }, + { + "epoch": 0.7688521187320226, + "grad_norm": 0.08836585283279419, + "learning_rate": 0.00012862179644647752, + "loss": 2.5716, + "step": 25928 + }, + { + "epoch": 0.7688817720843342, + "grad_norm": 0.09722502529621124, + "learning_rate": 0.00012859029456778077, + "loss": 2.5677, + "step": 25929 + }, + { + "epoch": 0.7689114254366456, + "grad_norm": 0.08305805921554565, + "learning_rate": 0.0001285587959779806, + "loss": 2.5486, + "step": 25930 + }, + { + "epoch": 0.7689410787889571, + "grad_norm": 0.09867390990257263, + "learning_rate": 0.00012852730067735614, + "loss": 2.5459, + "step": 25931 + }, + { + "epoch": 0.7689707321412685, + "grad_norm": 0.08243594318628311, + "learning_rate": 0.0001284958086661861, + "loss": 2.5722, + "step": 25932 + }, + { + "epoch": 0.7690003854935801, + "grad_norm": 0.092593714594841, + "learning_rate": 0.0001284643199447495, + "loss": 2.5553, + "step": 25933 + }, + { + "epoch": 0.7690300388458915, + "grad_norm": 0.08299390226602554, + "learning_rate": 0.0001284328345133251, + "loss": 2.5253, + "step": 25934 + }, + { + "epoch": 0.769059692198203, + "grad_norm": 0.09601333737373352, + "learning_rate": 0.00012840135237219175, + "loss": 2.5507, + "step": 25935 + }, + { + "epoch": 0.7690893455505144, + "grad_norm": 0.08957450091838837, + "learning_rate": 0.00012836987352162822, + "loss": 2.5711, + "step": 25936 + }, + { + "epoch": 0.769118998902826, + "grad_norm": 0.09778095036745071, + "learning_rate": 0.0001283383979619132, + "loss": 2.5772, + "step": 25937 + }, + { + "epoch": 0.7691486522551374, + "grad_norm": 0.08531411737203598, + "learning_rate": 0.00012830692569332547, + "loss": 2.5612, + "step": 25938 + }, + { + "epoch": 0.7691783056074489, + "grad_norm": 0.09254399687051773, + "learning_rate": 0.00012827545671614376, + "loss": 2.5459, + "step": 25939 + }, + { + "epoch": 0.7692079589597604, + "grad_norm": 0.09014440327882767, + "learning_rate": 0.00012824399103064665, + "loss": 2.5794, + "step": 25940 + }, + { + "epoch": 0.7692376123120719, + "grad_norm": 0.08947951346635818, + "learning_rate": 0.00012821252863711282, + "loss": 2.5656, + "step": 25941 + }, + { + "epoch": 0.7692672656643833, + "grad_norm": 0.08609407395124435, + "learning_rate": 0.00012818106953582087, + "loss": 2.5664, + "step": 25942 + }, + { + "epoch": 0.7692969190166948, + "grad_norm": 0.08968185633420944, + "learning_rate": 0.00012814961372704936, + "loss": 2.5591, + "step": 25943 + }, + { + "epoch": 0.7693265723690064, + "grad_norm": 0.0832328051328659, + "learning_rate": 0.00012811816121107683, + "loss": 2.5273, + "step": 25944 + }, + { + "epoch": 0.7693562257213178, + "grad_norm": 0.08835688978433609, + "learning_rate": 0.00012808671198818184, + "loss": 2.5737, + "step": 25945 + }, + { + "epoch": 0.7693858790736293, + "grad_norm": 0.09076055139303207, + "learning_rate": 0.000128055266058643, + "loss": 2.5742, + "step": 25946 + }, + { + "epoch": 0.7694155324259407, + "grad_norm": 0.08235578238964081, + "learning_rate": 0.00012802382342273834, + "loss": 2.5876, + "step": 25947 + }, + { + "epoch": 0.7694451857782523, + "grad_norm": 0.08965402096509933, + "learning_rate": 0.0001279923840807467, + "loss": 2.5538, + "step": 25948 + }, + { + "epoch": 0.7694748391305637, + "grad_norm": 0.07995683699846268, + "learning_rate": 0.00012796094803294632, + "loss": 2.5443, + "step": 25949 + }, + { + "epoch": 0.7695044924828752, + "grad_norm": 0.0897856131196022, + "learning_rate": 0.00012792951527961565, + "loss": 2.5532, + "step": 25950 + }, + { + "epoch": 0.7695341458351866, + "grad_norm": 0.08360181003808975, + "learning_rate": 0.00012789808582103302, + "loss": 2.5683, + "step": 25951 + }, + { + "epoch": 0.7695637991874982, + "grad_norm": 0.7348471879959106, + "learning_rate": 0.00012786665965747662, + "loss": 2.6076, + "step": 25952 + }, + { + "epoch": 0.7695934525398096, + "grad_norm": 0.09775876253843307, + "learning_rate": 0.00012783523678922492, + "loss": 2.5607, + "step": 25953 + }, + { + "epoch": 0.7696231058921211, + "grad_norm": 0.10883191227912903, + "learning_rate": 0.00012780381721655605, + "loss": 2.5825, + "step": 25954 + }, + { + "epoch": 0.7696527592444325, + "grad_norm": 0.09227842837572098, + "learning_rate": 0.00012777240093974824, + "loss": 2.5874, + "step": 25955 + }, + { + "epoch": 0.7696824125967441, + "grad_norm": 0.09749369323253632, + "learning_rate": 0.0001277409879590797, + "loss": 2.5673, + "step": 25956 + }, + { + "epoch": 0.7697120659490555, + "grad_norm": 0.08831610530614853, + "learning_rate": 0.00012770957827482876, + "loss": 2.5607, + "step": 25957 + }, + { + "epoch": 0.769741719301367, + "grad_norm": 0.10121895372867584, + "learning_rate": 0.00012767817188727331, + "loss": 2.5736, + "step": 25958 + }, + { + "epoch": 0.7697713726536785, + "grad_norm": 0.09986047446727753, + "learning_rate": 0.00012764676879669151, + "loss": 2.55, + "step": 25959 + }, + { + "epoch": 0.76980102600599, + "grad_norm": 0.09750453382730484, + "learning_rate": 0.00012761536900336135, + "loss": 2.6237, + "step": 25960 + }, + { + "epoch": 0.7698306793583014, + "grad_norm": 0.09219242632389069, + "learning_rate": 0.00012758397250756115, + "loss": 2.5464, + "step": 25961 + }, + { + "epoch": 0.7698603327106129, + "grad_norm": 0.08675042539834976, + "learning_rate": 0.0001275525793095688, + "loss": 2.5615, + "step": 25962 + }, + { + "epoch": 0.7698899860629244, + "grad_norm": 0.08330877125263214, + "learning_rate": 0.00012752118940966234, + "loss": 2.5649, + "step": 25963 + }, + { + "epoch": 0.7699196394152359, + "grad_norm": 0.08883680403232574, + "learning_rate": 0.00012748980280811968, + "loss": 2.5468, + "step": 25964 + }, + { + "epoch": 0.7699492927675474, + "grad_norm": 0.08300849795341492, + "learning_rate": 0.0001274584195052187, + "loss": 2.5855, + "step": 25965 + }, + { + "epoch": 0.7699789461198588, + "grad_norm": 0.09705507755279541, + "learning_rate": 0.0001274270395012374, + "loss": 2.5741, + "step": 25966 + }, + { + "epoch": 0.7700085994721704, + "grad_norm": 0.09839485585689545, + "learning_rate": 0.00012739566279645372, + "loss": 2.5893, + "step": 25967 + }, + { + "epoch": 0.7700382528244818, + "grad_norm": 0.09169533848762512, + "learning_rate": 0.0001273642893911453, + "loss": 2.5924, + "step": 25968 + }, + { + "epoch": 0.7700679061767933, + "grad_norm": 0.08327962458133698, + "learning_rate": 0.00012733291928559005, + "loss": 2.6036, + "step": 25969 + }, + { + "epoch": 0.7700975595291047, + "grad_norm": 0.10273724794387817, + "learning_rate": 0.00012730155248006576, + "loss": 2.5602, + "step": 25970 + }, + { + "epoch": 0.7701272128814163, + "grad_norm": 0.08239983022212982, + "learning_rate": 0.00012727018897485022, + "loss": 2.5446, + "step": 25971 + }, + { + "epoch": 0.7701568662337277, + "grad_norm": 0.11019314080476761, + "learning_rate": 0.00012723882877022107, + "loss": 2.5685, + "step": 25972 + }, + { + "epoch": 0.7701865195860392, + "grad_norm": 0.07879601418972015, + "learning_rate": 0.000127207471866456, + "loss": 2.586, + "step": 25973 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 0.10982458293437958, + "learning_rate": 0.00012717611826383284, + "loss": 2.5818, + "step": 25974 + }, + { + "epoch": 0.7702458262906622, + "grad_norm": 0.08342532813549042, + "learning_rate": 0.00012714476796262919, + "loss": 2.5314, + "step": 25975 + }, + { + "epoch": 0.7702754796429736, + "grad_norm": 0.0944829061627388, + "learning_rate": 0.0001271134209631226, + "loss": 2.5621, + "step": 25976 + }, + { + "epoch": 0.7703051329952851, + "grad_norm": 0.08802958577871323, + "learning_rate": 0.00012708207726559067, + "loss": 2.5784, + "step": 25977 + }, + { + "epoch": 0.7703347863475966, + "grad_norm": 0.09998735785484314, + "learning_rate": 0.00012705073687031116, + "loss": 2.5826, + "step": 25978 + }, + { + "epoch": 0.7703644396999081, + "grad_norm": 0.09029266983270645, + "learning_rate": 0.00012701939977756115, + "loss": 2.5624, + "step": 25979 + }, + { + "epoch": 0.7703940930522195, + "grad_norm": 0.10119439661502838, + "learning_rate": 0.00012698806598761847, + "loss": 2.5754, + "step": 25980 + }, + { + "epoch": 0.770423746404531, + "grad_norm": 0.0926395058631897, + "learning_rate": 0.00012695673550076043, + "loss": 2.5512, + "step": 25981 + }, + { + "epoch": 0.7704533997568425, + "grad_norm": 0.0937577486038208, + "learning_rate": 0.00012692540831726456, + "loss": 2.5672, + "step": 25982 + }, + { + "epoch": 0.770483053109154, + "grad_norm": 0.09177558869123459, + "learning_rate": 0.0001268940844374082, + "loss": 2.575, + "step": 25983 + }, + { + "epoch": 0.7705127064614655, + "grad_norm": 0.0940660834312439, + "learning_rate": 0.00012686276386146878, + "loss": 2.5964, + "step": 25984 + }, + { + "epoch": 0.770542359813777, + "grad_norm": 0.08912267535924911, + "learning_rate": 0.00012683144658972366, + "loss": 2.5908, + "step": 25985 + }, + { + "epoch": 0.7705720131660885, + "grad_norm": 0.08076667040586472, + "learning_rate": 0.00012680013262245, + "loss": 2.5633, + "step": 25986 + }, + { + "epoch": 0.7706016665183999, + "grad_norm": 0.08917902410030365, + "learning_rate": 0.00012676882195992528, + "loss": 2.5796, + "step": 25987 + }, + { + "epoch": 0.7706313198707114, + "grad_norm": 0.08799756318330765, + "learning_rate": 0.00012673751460242694, + "loss": 2.5509, + "step": 25988 + }, + { + "epoch": 0.7706609732230228, + "grad_norm": 0.08892715722322464, + "learning_rate": 0.0001267062105502318, + "loss": 2.5317, + "step": 25989 + }, + { + "epoch": 0.7706906265753344, + "grad_norm": 0.09021290391683578, + "learning_rate": 0.00012667490980361722, + "loss": 2.5545, + "step": 25990 + }, + { + "epoch": 0.7707202799276458, + "grad_norm": 0.09237337857484818, + "learning_rate": 0.00012664361236286044, + "loss": 2.5633, + "step": 25991 + }, + { + "epoch": 0.7707499332799573, + "grad_norm": 0.08187185972929001, + "learning_rate": 0.00012661231822823853, + "loss": 2.5589, + "step": 25992 + }, + { + "epoch": 0.7707795866322688, + "grad_norm": 0.09320545196533203, + "learning_rate": 0.00012658102740002863, + "loss": 2.5625, + "step": 25993 + }, + { + "epoch": 0.7708092399845803, + "grad_norm": 0.08449369668960571, + "learning_rate": 0.00012654973987850788, + "loss": 2.5491, + "step": 25994 + }, + { + "epoch": 0.7708388933368917, + "grad_norm": 0.08609634637832642, + "learning_rate": 0.00012651845566395326, + "loss": 2.6036, + "step": 25995 + }, + { + "epoch": 0.7708685466892032, + "grad_norm": 0.08469746261835098, + "learning_rate": 0.0001264871747566418, + "loss": 2.5424, + "step": 25996 + }, + { + "epoch": 0.7708982000415147, + "grad_norm": 0.07724752277135849, + "learning_rate": 0.00012645589715685053, + "loss": 2.5858, + "step": 25997 + }, + { + "epoch": 0.7709278533938262, + "grad_norm": 0.08141043037176132, + "learning_rate": 0.0001264246228648564, + "loss": 2.5276, + "step": 25998 + }, + { + "epoch": 0.7709575067461376, + "grad_norm": 0.08511131256818771, + "learning_rate": 0.00012639335188093637, + "loss": 2.5885, + "step": 25999 + }, + { + "epoch": 0.7709871600984491, + "grad_norm": 0.08713936805725098, + "learning_rate": 0.00012636208420536732, + "loss": 2.5679, + "step": 26000 + }, + { + "epoch": 0.7710168134507606, + "grad_norm": 0.09228675067424774, + "learning_rate": 0.00012633081983842614, + "loss": 2.5641, + "step": 26001 + }, + { + "epoch": 0.7710464668030721, + "grad_norm": 0.08302930742502213, + "learning_rate": 0.00012629955878038973, + "loss": 2.5582, + "step": 26002 + }, + { + "epoch": 0.7710761201553835, + "grad_norm": 0.08797503262758255, + "learning_rate": 0.00012626830103153487, + "loss": 2.5641, + "step": 26003 + }, + { + "epoch": 0.771105773507695, + "grad_norm": 0.08863667398691177, + "learning_rate": 0.00012623704659213831, + "loss": 2.5812, + "step": 26004 + }, + { + "epoch": 0.7711354268600066, + "grad_norm": 0.10056056827306747, + "learning_rate": 0.0001262057954624769, + "loss": 2.5614, + "step": 26005 + }, + { + "epoch": 0.771165080212318, + "grad_norm": 0.08191220462322235, + "learning_rate": 0.00012617454764282733, + "loss": 2.5281, + "step": 26006 + }, + { + "epoch": 0.7711947335646295, + "grad_norm": 0.0963779017329216, + "learning_rate": 0.00012614330313346627, + "loss": 2.573, + "step": 26007 + }, + { + "epoch": 0.771224386916941, + "grad_norm": 0.09004376828670502, + "learning_rate": 0.00012611206193467045, + "loss": 2.5732, + "step": 26008 + }, + { + "epoch": 0.7712540402692525, + "grad_norm": 0.10724633932113647, + "learning_rate": 0.0001260808240467165, + "loss": 2.5409, + "step": 26009 + }, + { + "epoch": 0.7712836936215639, + "grad_norm": 0.0886676162481308, + "learning_rate": 0.00012604958946988104, + "loss": 2.5665, + "step": 26010 + }, + { + "epoch": 0.7713133469738754, + "grad_norm": 0.0931188091635704, + "learning_rate": 0.00012601835820444068, + "loss": 2.5407, + "step": 26011 + }, + { + "epoch": 0.7713430003261869, + "grad_norm": 0.08882389217615128, + "learning_rate": 0.00012598713025067194, + "loss": 2.5434, + "step": 26012 + }, + { + "epoch": 0.7713726536784984, + "grad_norm": 0.09321441501379013, + "learning_rate": 0.00012595590560885133, + "loss": 2.547, + "step": 26013 + }, + { + "epoch": 0.7714023070308098, + "grad_norm": 0.08891641348600388, + "learning_rate": 0.00012592468427925535, + "loss": 2.5747, + "step": 26014 + }, + { + "epoch": 0.7714319603831213, + "grad_norm": 0.08582451194524765, + "learning_rate": 0.00012589346626216058, + "loss": 2.5476, + "step": 26015 + }, + { + "epoch": 0.7714616137354328, + "grad_norm": 0.08550510555505753, + "learning_rate": 0.00012586225155784337, + "loss": 2.586, + "step": 26016 + }, + { + "epoch": 0.7714912670877443, + "grad_norm": 0.09393753856420517, + "learning_rate": 0.0001258310401665801, + "loss": 2.556, + "step": 26017 + }, + { + "epoch": 0.7715209204400557, + "grad_norm": 0.08590182662010193, + "learning_rate": 0.0001257998320886472, + "loss": 2.5421, + "step": 26018 + }, + { + "epoch": 0.7715505737923672, + "grad_norm": 0.09214287996292114, + "learning_rate": 0.00012576862732432105, + "loss": 2.5606, + "step": 26019 + }, + { + "epoch": 0.7715802271446787, + "grad_norm": 0.09377248585224152, + "learning_rate": 0.00012573742587387786, + "loss": 2.5899, + "step": 26020 + }, + { + "epoch": 0.7716098804969902, + "grad_norm": 0.08436097204685211, + "learning_rate": 0.00012570622773759412, + "loss": 2.5686, + "step": 26021 + }, + { + "epoch": 0.7716395338493016, + "grad_norm": 0.086911641061306, + "learning_rate": 0.00012567503291574606, + "loss": 2.5529, + "step": 26022 + }, + { + "epoch": 0.7716691872016131, + "grad_norm": 0.08469151705503464, + "learning_rate": 0.0001256438414086097, + "loss": 2.5292, + "step": 26023 + }, + { + "epoch": 0.7716988405539246, + "grad_norm": 0.08871246129274368, + "learning_rate": 0.00012561265321646138, + "loss": 2.5962, + "step": 26024 + }, + { + "epoch": 0.7717284939062361, + "grad_norm": 0.09049281477928162, + "learning_rate": 0.00012558146833957712, + "loss": 2.5764, + "step": 26025 + }, + { + "epoch": 0.7717581472585476, + "grad_norm": 0.09622488170862198, + "learning_rate": 0.0001255502867782334, + "loss": 2.5586, + "step": 26026 + }, + { + "epoch": 0.771787800610859, + "grad_norm": 0.09273168444633484, + "learning_rate": 0.00012551910853270614, + "loss": 2.5305, + "step": 26027 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 0.09133422374725342, + "learning_rate": 0.0001254879336032715, + "loss": 2.5592, + "step": 26028 + }, + { + "epoch": 0.771847107315482, + "grad_norm": 0.0945989191532135, + "learning_rate": 0.00012545676199020544, + "loss": 2.5525, + "step": 26029 + }, + { + "epoch": 0.7718767606677935, + "grad_norm": 0.092217355966568, + "learning_rate": 0.00012542559369378405, + "loss": 2.5707, + "step": 26030 + }, + { + "epoch": 0.771906414020105, + "grad_norm": 0.0885528177022934, + "learning_rate": 0.00012539442871428337, + "loss": 2.5567, + "step": 26031 + }, + { + "epoch": 0.7719360673724165, + "grad_norm": 0.08460863679647446, + "learning_rate": 0.00012536326705197927, + "loss": 2.523, + "step": 26032 + }, + { + "epoch": 0.7719657207247279, + "grad_norm": 0.10076603293418884, + "learning_rate": 0.00012533210870714788, + "loss": 2.5628, + "step": 26033 + }, + { + "epoch": 0.7719953740770394, + "grad_norm": 0.09154062718153, + "learning_rate": 0.0001253009536800649, + "loss": 2.553, + "step": 26034 + }, + { + "epoch": 0.7720250274293509, + "grad_norm": 0.09462737292051315, + "learning_rate": 0.00012526980197100624, + "loss": 2.5653, + "step": 26035 + }, + { + "epoch": 0.7720546807816624, + "grad_norm": 0.0882612094283104, + "learning_rate": 0.00012523865358024777, + "loss": 2.5936, + "step": 26036 + }, + { + "epoch": 0.7720843341339738, + "grad_norm": 0.08550867438316345, + "learning_rate": 0.00012520750850806544, + "loss": 2.5534, + "step": 26037 + }, + { + "epoch": 0.7721139874862853, + "grad_norm": 0.07671204954385757, + "learning_rate": 0.00012517636675473477, + "loss": 2.5506, + "step": 26038 + }, + { + "epoch": 0.7721436408385968, + "grad_norm": 0.08994647860527039, + "learning_rate": 0.00012514522832053181, + "loss": 2.5453, + "step": 26039 + }, + { + "epoch": 0.7721732941909083, + "grad_norm": 0.07827236503362656, + "learning_rate": 0.00012511409320573226, + "loss": 2.5653, + "step": 26040 + }, + { + "epoch": 0.7722029475432197, + "grad_norm": 0.07982301712036133, + "learning_rate": 0.0001250829614106117, + "loss": 2.5347, + "step": 26041 + }, + { + "epoch": 0.7722326008955313, + "grad_norm": 0.09433281421661377, + "learning_rate": 0.00012505183293544592, + "loss": 2.5481, + "step": 26042 + }, + { + "epoch": 0.7722622542478427, + "grad_norm": 0.0911400094628334, + "learning_rate": 0.00012502070778051068, + "loss": 2.5608, + "step": 26043 + }, + { + "epoch": 0.7722919076001542, + "grad_norm": 0.0889708399772644, + "learning_rate": 0.00012498958594608128, + "loss": 2.5739, + "step": 26044 + }, + { + "epoch": 0.7723215609524656, + "grad_norm": 0.09064130485057831, + "learning_rate": 0.00012495846743243344, + "loss": 2.5251, + "step": 26045 + }, + { + "epoch": 0.7723512143047772, + "grad_norm": 0.08940435945987701, + "learning_rate": 0.00012492735223984275, + "loss": 2.5597, + "step": 26046 + }, + { + "epoch": 0.7723808676570887, + "grad_norm": 0.09391053020954132, + "learning_rate": 0.00012489624036858476, + "loss": 2.5919, + "step": 26047 + }, + { + "epoch": 0.7724105210094001, + "grad_norm": 0.08690446615219116, + "learning_rate": 0.00012486513181893488, + "loss": 2.5558, + "step": 26048 + }, + { + "epoch": 0.7724401743617116, + "grad_norm": 0.08040236681699753, + "learning_rate": 0.0001248340265911687, + "loss": 2.5544, + "step": 26049 + }, + { + "epoch": 0.7724698277140231, + "grad_norm": 0.09112175554037094, + "learning_rate": 0.00012480292468556166, + "loss": 2.5659, + "step": 26050 + }, + { + "epoch": 0.7724994810663346, + "grad_norm": 0.08628752827644348, + "learning_rate": 0.00012477182610238892, + "loss": 2.5545, + "step": 26051 + }, + { + "epoch": 0.772529134418646, + "grad_norm": 0.09987781196832657, + "learning_rate": 0.00012474073084192623, + "loss": 2.5584, + "step": 26052 + }, + { + "epoch": 0.7725587877709575, + "grad_norm": 0.09343939274549484, + "learning_rate": 0.00012470963890444874, + "loss": 2.546, + "step": 26053 + }, + { + "epoch": 0.772588441123269, + "grad_norm": 0.08528292924165726, + "learning_rate": 0.000124678550290232, + "loss": 2.549, + "step": 26054 + }, + { + "epoch": 0.7726180944755805, + "grad_norm": 0.09237702190876007, + "learning_rate": 0.000124647464999551, + "loss": 2.5836, + "step": 26055 + }, + { + "epoch": 0.7726477478278919, + "grad_norm": 0.08882246166467667, + "learning_rate": 0.00012461638303268107, + "loss": 2.5364, + "step": 26056 + }, + { + "epoch": 0.7726774011802034, + "grad_norm": 0.08774299919605255, + "learning_rate": 0.0001245853043898975, + "loss": 2.5663, + "step": 26057 + }, + { + "epoch": 0.7727070545325149, + "grad_norm": 0.09271595627069473, + "learning_rate": 0.00012455422907147557, + "loss": 2.594, + "step": 26058 + }, + { + "epoch": 0.7727367078848264, + "grad_norm": 0.08849984407424927, + "learning_rate": 0.0001245231570776903, + "loss": 2.5396, + "step": 26059 + }, + { + "epoch": 0.7727663612371378, + "grad_norm": 0.08855020999908447, + "learning_rate": 0.000124492088408817, + "loss": 2.576, + "step": 26060 + }, + { + "epoch": 0.7727960145894494, + "grad_norm": 0.08605986088514328, + "learning_rate": 0.00012446102306513064, + "loss": 2.6017, + "step": 26061 + }, + { + "epoch": 0.7728256679417608, + "grad_norm": 0.08632569760084152, + "learning_rate": 0.0001244299610469064, + "loss": 2.5563, + "step": 26062 + }, + { + "epoch": 0.7728553212940723, + "grad_norm": 0.08364022523164749, + "learning_rate": 0.00012439890235441936, + "loss": 2.5573, + "step": 26063 + }, + { + "epoch": 0.7728849746463837, + "grad_norm": 0.08497951924800873, + "learning_rate": 0.0001243678469879445, + "loss": 2.5594, + "step": 26064 + }, + { + "epoch": 0.7729146279986953, + "grad_norm": 0.10110338777303696, + "learning_rate": 0.0001243367949477568, + "loss": 2.5529, + "step": 26065 + }, + { + "epoch": 0.7729442813510067, + "grad_norm": 0.09041465818881989, + "learning_rate": 0.00012430574623413133, + "loss": 2.5614, + "step": 26066 + }, + { + "epoch": 0.7729739347033182, + "grad_norm": 0.08394259959459305, + "learning_rate": 0.00012427470084734287, + "loss": 2.5882, + "step": 26067 + }, + { + "epoch": 0.7730035880556297, + "grad_norm": 0.08099101483821869, + "learning_rate": 0.00012424365878766646, + "loss": 2.5348, + "step": 26068 + }, + { + "epoch": 0.7730332414079412, + "grad_norm": 0.08412176370620728, + "learning_rate": 0.00012421262005537698, + "loss": 2.5514, + "step": 26069 + }, + { + "epoch": 0.7730628947602527, + "grad_norm": 0.08813028037548065, + "learning_rate": 0.00012418158465074924, + "loss": 2.5335, + "step": 26070 + }, + { + "epoch": 0.7730925481125641, + "grad_norm": 0.0869642123579979, + "learning_rate": 0.00012415055257405805, + "loss": 2.5312, + "step": 26071 + }, + { + "epoch": 0.7731222014648756, + "grad_norm": 0.08637218922376633, + "learning_rate": 0.00012411952382557827, + "loss": 2.5608, + "step": 26072 + }, + { + "epoch": 0.7731518548171871, + "grad_norm": 0.09073111414909363, + "learning_rate": 0.0001240884984055846, + "loss": 2.6005, + "step": 26073 + }, + { + "epoch": 0.7731815081694986, + "grad_norm": 0.08065105229616165, + "learning_rate": 0.00012405747631435182, + "loss": 2.5564, + "step": 26074 + }, + { + "epoch": 0.77321116152181, + "grad_norm": 0.0899730920791626, + "learning_rate": 0.0001240264575521546, + "loss": 2.5595, + "step": 26075 + }, + { + "epoch": 0.7732408148741216, + "grad_norm": 0.08398512005805969, + "learning_rate": 0.00012399544211926766, + "loss": 2.5491, + "step": 26076 + }, + { + "epoch": 0.773270468226433, + "grad_norm": 0.08571929484605789, + "learning_rate": 0.00012396443001596558, + "loss": 2.5796, + "step": 26077 + }, + { + "epoch": 0.7733001215787445, + "grad_norm": 0.09282969683408737, + "learning_rate": 0.00012393342124252305, + "loss": 2.6132, + "step": 26078 + }, + { + "epoch": 0.7733297749310559, + "grad_norm": 0.08495966345071793, + "learning_rate": 0.00012390241579921462, + "loss": 2.5735, + "step": 26079 + }, + { + "epoch": 0.7733594282833675, + "grad_norm": 0.09232618659734726, + "learning_rate": 0.00012387141368631484, + "loss": 2.5904, + "step": 26080 + }, + { + "epoch": 0.7733890816356789, + "grad_norm": 0.0786791667342186, + "learning_rate": 0.00012384041490409825, + "loss": 2.5287, + "step": 26081 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 0.08729352056980133, + "learning_rate": 0.00012380941945283936, + "loss": 2.5679, + "step": 26082 + }, + { + "epoch": 0.7734483883403018, + "grad_norm": 0.08433310687541962, + "learning_rate": 0.00012377842733281264, + "loss": 2.5553, + "step": 26083 + }, + { + "epoch": 0.7734780416926134, + "grad_norm": 0.08767900615930557, + "learning_rate": 0.00012374743854429255, + "loss": 2.5676, + "step": 26084 + }, + { + "epoch": 0.7735076950449248, + "grad_norm": 0.0813252255320549, + "learning_rate": 0.0001237164530875534, + "loss": 2.5589, + "step": 26085 + }, + { + "epoch": 0.7735373483972363, + "grad_norm": 0.08493740111589432, + "learning_rate": 0.0001236854709628697, + "loss": 2.5511, + "step": 26086 + }, + { + "epoch": 0.7735670017495477, + "grad_norm": 0.08353843539953232, + "learning_rate": 0.00012365449217051572, + "loss": 2.5566, + "step": 26087 + }, + { + "epoch": 0.7735966551018593, + "grad_norm": 0.08290254324674606, + "learning_rate": 0.000123623516710766, + "loss": 2.5401, + "step": 26088 + }, + { + "epoch": 0.7736263084541708, + "grad_norm": 0.0810200497508049, + "learning_rate": 0.00012359254458389434, + "loss": 2.5528, + "step": 26089 + }, + { + "epoch": 0.7736559618064822, + "grad_norm": 0.0863930955529213, + "learning_rate": 0.00012356157579017542, + "loss": 2.5782, + "step": 26090 + }, + { + "epoch": 0.7736856151587937, + "grad_norm": 0.09141339361667633, + "learning_rate": 0.0001235306103298834, + "loss": 2.5783, + "step": 26091 + }, + { + "epoch": 0.7737152685111052, + "grad_norm": 0.08070451021194458, + "learning_rate": 0.0001234996482032924, + "loss": 2.5877, + "step": 26092 + }, + { + "epoch": 0.7737449218634167, + "grad_norm": 0.08177272230386734, + "learning_rate": 0.0001234686894106767, + "loss": 2.5687, + "step": 26093 + }, + { + "epoch": 0.7737745752157281, + "grad_norm": 0.09004686772823334, + "learning_rate": 0.00012343773395231034, + "loss": 2.5756, + "step": 26094 + }, + { + "epoch": 0.7738042285680397, + "grad_norm": 0.08535947650671005, + "learning_rate": 0.00012340678182846748, + "loss": 2.59, + "step": 26095 + }, + { + "epoch": 0.7738338819203511, + "grad_norm": 0.08451342582702637, + "learning_rate": 0.00012337583303942222, + "loss": 2.5463, + "step": 26096 + }, + { + "epoch": 0.7738635352726626, + "grad_norm": 0.08355531841516495, + "learning_rate": 0.00012334488758544864, + "loss": 2.5542, + "step": 26097 + }, + { + "epoch": 0.773893188624974, + "grad_norm": 0.08970247954130173, + "learning_rate": 0.00012331394546682083, + "loss": 2.571, + "step": 26098 + }, + { + "epoch": 0.7739228419772856, + "grad_norm": 0.08927939087152481, + "learning_rate": 0.0001232830066838126, + "loss": 2.6158, + "step": 26099 + }, + { + "epoch": 0.773952495329597, + "grad_norm": 0.08923923969268799, + "learning_rate": 0.00012325207123669796, + "loss": 2.5887, + "step": 26100 + }, + { + "epoch": 0.7739821486819085, + "grad_norm": 0.0891711488366127, + "learning_rate": 0.0001232211391257509, + "loss": 2.5731, + "step": 26101 + }, + { + "epoch": 0.7740118020342199, + "grad_norm": 0.08763068914413452, + "learning_rate": 0.00012319021035124518, + "loss": 2.5778, + "step": 26102 + }, + { + "epoch": 0.7740414553865315, + "grad_norm": 0.08990918844938278, + "learning_rate": 0.00012315928491345496, + "loss": 2.5409, + "step": 26103 + }, + { + "epoch": 0.7740711087388429, + "grad_norm": 0.08956126123666763, + "learning_rate": 0.000123128362812654, + "loss": 2.5959, + "step": 26104 + }, + { + "epoch": 0.7741007620911544, + "grad_norm": 0.08621922135353088, + "learning_rate": 0.00012309744404911604, + "loss": 2.5664, + "step": 26105 + }, + { + "epoch": 0.7741304154434658, + "grad_norm": 0.09104348719120026, + "learning_rate": 0.00012306652862311492, + "loss": 2.5874, + "step": 26106 + }, + { + "epoch": 0.7741600687957774, + "grad_norm": 0.08997747302055359, + "learning_rate": 0.0001230356165349244, + "loss": 2.5759, + "step": 26107 + }, + { + "epoch": 0.7741897221480888, + "grad_norm": 0.0967646911740303, + "learning_rate": 0.0001230047077848182, + "loss": 2.5835, + "step": 26108 + }, + { + "epoch": 0.7742193755004003, + "grad_norm": 0.08853298425674438, + "learning_rate": 0.00012297380237307016, + "loss": 2.536, + "step": 26109 + }, + { + "epoch": 0.7742490288527119, + "grad_norm": 0.09393502026796341, + "learning_rate": 0.00012294290029995364, + "loss": 2.5563, + "step": 26110 + }, + { + "epoch": 0.7742786822050233, + "grad_norm": 0.0899033322930336, + "learning_rate": 0.0001229120015657425, + "loss": 2.5516, + "step": 26111 + }, + { + "epoch": 0.7743083355573348, + "grad_norm": 0.09156844764947891, + "learning_rate": 0.0001228811061707103, + "loss": 2.5666, + "step": 26112 + }, + { + "epoch": 0.7743379889096462, + "grad_norm": 0.0835186019539833, + "learning_rate": 0.00012285021411513063, + "loss": 2.548, + "step": 26113 + }, + { + "epoch": 0.7743676422619578, + "grad_norm": 0.0987747311592102, + "learning_rate": 0.00012281932539927703, + "loss": 2.5564, + "step": 26114 + }, + { + "epoch": 0.7743972956142692, + "grad_norm": 0.08039659261703491, + "learning_rate": 0.00012278844002342292, + "loss": 2.5786, + "step": 26115 + }, + { + "epoch": 0.7744269489665807, + "grad_norm": 0.08427484333515167, + "learning_rate": 0.0001227575579878421, + "loss": 2.5589, + "step": 26116 + }, + { + "epoch": 0.7744566023188921, + "grad_norm": 0.08915103226900101, + "learning_rate": 0.00012272667929280785, + "loss": 2.559, + "step": 26117 + }, + { + "epoch": 0.7744862556712037, + "grad_norm": 0.07847394049167633, + "learning_rate": 0.0001226958039385936, + "loss": 2.562, + "step": 26118 + }, + { + "epoch": 0.7745159090235151, + "grad_norm": 0.08695105463266373, + "learning_rate": 0.0001226649319254729, + "loss": 2.5668, + "step": 26119 + }, + { + "epoch": 0.7745455623758266, + "grad_norm": 0.08293971419334412, + "learning_rate": 0.00012263406325371885, + "loss": 2.5696, + "step": 26120 + }, + { + "epoch": 0.774575215728138, + "grad_norm": 0.10291845351457596, + "learning_rate": 0.00012260319792360496, + "loss": 2.5628, + "step": 26121 + }, + { + "epoch": 0.7746048690804496, + "grad_norm": 0.08174163848161697, + "learning_rate": 0.0001225723359354045, + "loss": 2.5493, + "step": 26122 + }, + { + "epoch": 0.774634522432761, + "grad_norm": 0.09015359729528427, + "learning_rate": 0.0001225414772893908, + "loss": 2.5856, + "step": 26123 + }, + { + "epoch": 0.7746641757850725, + "grad_norm": 0.08375348895788193, + "learning_rate": 0.00012251062198583712, + "loss": 2.5581, + "step": 26124 + }, + { + "epoch": 0.7746938291373839, + "grad_norm": 0.08786588162183762, + "learning_rate": 0.0001224797700250167, + "loss": 2.5557, + "step": 26125 + }, + { + "epoch": 0.7747234824896955, + "grad_norm": 0.08299622684717178, + "learning_rate": 0.0001224489214072027, + "loss": 2.5898, + "step": 26126 + }, + { + "epoch": 0.7747531358420069, + "grad_norm": 0.09181416779756546, + "learning_rate": 0.00012241807613266832, + "loss": 2.5761, + "step": 26127 + }, + { + "epoch": 0.7747827891943184, + "grad_norm": 0.0932580977678299, + "learning_rate": 0.0001223872342016865, + "loss": 2.5456, + "step": 26128 + }, + { + "epoch": 0.7748124425466298, + "grad_norm": 0.08957895636558533, + "learning_rate": 0.00012235639561453072, + "loss": 2.5706, + "step": 26129 + }, + { + "epoch": 0.7748420958989414, + "grad_norm": 0.0843127965927124, + "learning_rate": 0.00012232556037147403, + "loss": 2.5481, + "step": 26130 + }, + { + "epoch": 0.7748717492512529, + "grad_norm": 0.08795129507780075, + "learning_rate": 0.00012229472847278918, + "loss": 2.5444, + "step": 26131 + }, + { + "epoch": 0.7749014026035643, + "grad_norm": 0.08623360097408295, + "learning_rate": 0.0001222638999187493, + "loss": 2.5464, + "step": 26132 + }, + { + "epoch": 0.7749310559558759, + "grad_norm": 0.08389365673065186, + "learning_rate": 0.00012223307470962748, + "loss": 2.5744, + "step": 26133 + }, + { + "epoch": 0.7749607093081873, + "grad_norm": 0.08509200811386108, + "learning_rate": 0.00012220225284569658, + "loss": 2.5441, + "step": 26134 + }, + { + "epoch": 0.7749903626604988, + "grad_norm": 0.08909360319375992, + "learning_rate": 0.0001221714343272296, + "loss": 2.5441, + "step": 26135 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 0.08574876934289932, + "learning_rate": 0.00012214061915449942, + "loss": 2.5878, + "step": 26136 + }, + { + "epoch": 0.7750496693651218, + "grad_norm": 0.08682410418987274, + "learning_rate": 0.00012210980732777892, + "loss": 2.5537, + "step": 26137 + }, + { + "epoch": 0.7750793227174332, + "grad_norm": 0.08661599457263947, + "learning_rate": 0.0001220789988473409, + "loss": 2.5723, + "step": 26138 + }, + { + "epoch": 0.7751089760697447, + "grad_norm": 0.08942621946334839, + "learning_rate": 0.00012204819371345827, + "loss": 2.5877, + "step": 26139 + }, + { + "epoch": 0.7751386294220561, + "grad_norm": 0.08872438222169876, + "learning_rate": 0.00012201739192640376, + "loss": 2.58, + "step": 26140 + }, + { + "epoch": 0.7751682827743677, + "grad_norm": 0.0814598947763443, + "learning_rate": 0.00012198659348645008, + "loss": 2.5043, + "step": 26141 + }, + { + "epoch": 0.7751979361266791, + "grad_norm": 0.08467654883861542, + "learning_rate": 0.00012195579839387005, + "loss": 2.5638, + "step": 26142 + }, + { + "epoch": 0.7752275894789906, + "grad_norm": 0.08561563491821289, + "learning_rate": 0.0001219250066489363, + "loss": 2.5609, + "step": 26143 + }, + { + "epoch": 0.775257242831302, + "grad_norm": 0.08628559112548828, + "learning_rate": 0.00012189421825192148, + "loss": 2.5401, + "step": 26144 + }, + { + "epoch": 0.7752868961836136, + "grad_norm": 0.08751722425222397, + "learning_rate": 0.00012186343320309828, + "loss": 2.5682, + "step": 26145 + }, + { + "epoch": 0.775316549535925, + "grad_norm": 0.08513262122869492, + "learning_rate": 0.00012183265150273931, + "loss": 2.5637, + "step": 26146 + }, + { + "epoch": 0.7753462028882365, + "grad_norm": 0.0859871581196785, + "learning_rate": 0.00012180187315111706, + "loss": 2.5576, + "step": 26147 + }, + { + "epoch": 0.775375856240548, + "grad_norm": 0.08830196410417557, + "learning_rate": 0.00012177109814850417, + "loss": 2.6037, + "step": 26148 + }, + { + "epoch": 0.7754055095928595, + "grad_norm": 0.08772995322942734, + "learning_rate": 0.00012174032649517319, + "loss": 2.5642, + "step": 26149 + }, + { + "epoch": 0.7754351629451709, + "grad_norm": 0.08257041871547699, + "learning_rate": 0.00012170955819139645, + "loss": 2.5723, + "step": 26150 + }, + { + "epoch": 0.7754648162974824, + "grad_norm": 0.08506695926189423, + "learning_rate": 0.00012167879323744658, + "loss": 2.5311, + "step": 26151 + }, + { + "epoch": 0.775494469649794, + "grad_norm": 0.08480428159236908, + "learning_rate": 0.00012164803163359589, + "loss": 2.5586, + "step": 26152 + }, + { + "epoch": 0.7755241230021054, + "grad_norm": 0.08656249195337296, + "learning_rate": 0.00012161727338011696, + "loss": 2.545, + "step": 26153 + }, + { + "epoch": 0.7755537763544169, + "grad_norm": 0.08416203409433365, + "learning_rate": 0.00012158651847728175, + "loss": 2.5681, + "step": 26154 + }, + { + "epoch": 0.7755834297067283, + "grad_norm": 0.08893939852714539, + "learning_rate": 0.00012155576692536302, + "loss": 2.5595, + "step": 26155 + }, + { + "epoch": 0.7756130830590399, + "grad_norm": 0.08869830518960953, + "learning_rate": 0.00012152501872463295, + "loss": 2.5488, + "step": 26156 + }, + { + "epoch": 0.7756427364113513, + "grad_norm": 0.08933406323194504, + "learning_rate": 0.00012149427387536377, + "loss": 2.5513, + "step": 26157 + }, + { + "epoch": 0.7756723897636628, + "grad_norm": 0.08257041871547699, + "learning_rate": 0.00012146353237782782, + "loss": 2.5536, + "step": 26158 + }, + { + "epoch": 0.7757020431159742, + "grad_norm": 0.09548671543598175, + "learning_rate": 0.0001214327942322972, + "loss": 2.6026, + "step": 26159 + }, + { + "epoch": 0.7757316964682858, + "grad_norm": 0.08546826243400574, + "learning_rate": 0.0001214020594390442, + "loss": 2.5849, + "step": 26160 + }, + { + "epoch": 0.7757613498205972, + "grad_norm": 0.08789032697677612, + "learning_rate": 0.00012137132799834094, + "loss": 2.5399, + "step": 26161 + }, + { + "epoch": 0.7757910031729087, + "grad_norm": 0.0893956646323204, + "learning_rate": 0.00012134059991045959, + "loss": 2.5829, + "step": 26162 + }, + { + "epoch": 0.7758206565252201, + "grad_norm": 0.08767043799161911, + "learning_rate": 0.00012130987517567221, + "loss": 2.5508, + "step": 26163 + }, + { + "epoch": 0.7758503098775317, + "grad_norm": 0.08862456679344177, + "learning_rate": 0.00012127915379425098, + "loss": 2.563, + "step": 26164 + }, + { + "epoch": 0.7758799632298431, + "grad_norm": 0.09059568494558334, + "learning_rate": 0.00012124843576646777, + "loss": 2.5529, + "step": 26165 + }, + { + "epoch": 0.7759096165821546, + "grad_norm": 0.08926604688167572, + "learning_rate": 0.00012121772109259465, + "loss": 2.573, + "step": 26166 + }, + { + "epoch": 0.775939269934466, + "grad_norm": 0.09311400353908539, + "learning_rate": 0.00012118700977290348, + "loss": 2.5407, + "step": 26167 + }, + { + "epoch": 0.7759689232867776, + "grad_norm": 0.09107238054275513, + "learning_rate": 0.00012115630180766651, + "loss": 2.5391, + "step": 26168 + }, + { + "epoch": 0.775998576639089, + "grad_norm": 0.09724240005016327, + "learning_rate": 0.00012112559719715549, + "loss": 2.587, + "step": 26169 + }, + { + "epoch": 0.7760282299914005, + "grad_norm": 0.09239133447408676, + "learning_rate": 0.00012109489594164236, + "loss": 2.5595, + "step": 26170 + }, + { + "epoch": 0.776057883343712, + "grad_norm": 0.09640661627054214, + "learning_rate": 0.00012106419804139896, + "loss": 2.5558, + "step": 26171 + }, + { + "epoch": 0.7760875366960235, + "grad_norm": 0.08909745514392853, + "learning_rate": 0.00012103350349669717, + "loss": 2.5827, + "step": 26172 + }, + { + "epoch": 0.776117190048335, + "grad_norm": 0.09777885675430298, + "learning_rate": 0.00012100281230780868, + "loss": 2.5677, + "step": 26173 + }, + { + "epoch": 0.7761468434006464, + "grad_norm": 0.07975804805755615, + "learning_rate": 0.00012097212447500555, + "loss": 2.5323, + "step": 26174 + }, + { + "epoch": 0.776176496752958, + "grad_norm": 0.09464382380247116, + "learning_rate": 0.00012094143999855916, + "loss": 2.5769, + "step": 26175 + }, + { + "epoch": 0.7762061501052694, + "grad_norm": 0.09024257957935333, + "learning_rate": 0.00012091075887874136, + "loss": 2.5499, + "step": 26176 + }, + { + "epoch": 0.7762358034575809, + "grad_norm": 0.08835627883672714, + "learning_rate": 0.00012088008111582388, + "loss": 2.5774, + "step": 26177 + }, + { + "epoch": 0.7762654568098923, + "grad_norm": 0.08330260962247849, + "learning_rate": 0.00012084940671007833, + "loss": 2.5599, + "step": 26178 + }, + { + "epoch": 0.7762951101622039, + "grad_norm": 0.0884118601679802, + "learning_rate": 0.00012081873566177642, + "loss": 2.5892, + "step": 26179 + }, + { + "epoch": 0.7763247635145153, + "grad_norm": 0.08353570103645325, + "learning_rate": 0.00012078806797118946, + "loss": 2.5756, + "step": 26180 + }, + { + "epoch": 0.7763544168668268, + "grad_norm": 0.08433765918016434, + "learning_rate": 0.00012075740363858945, + "loss": 2.5446, + "step": 26181 + }, + { + "epoch": 0.7763840702191382, + "grad_norm": 0.0971968024969101, + "learning_rate": 0.00012072674266424776, + "loss": 2.5592, + "step": 26182 + }, + { + "epoch": 0.7764137235714498, + "grad_norm": 0.09138078987598419, + "learning_rate": 0.00012069608504843582, + "loss": 2.5764, + "step": 26183 + }, + { + "epoch": 0.7764433769237612, + "grad_norm": 0.0879025086760521, + "learning_rate": 0.00012066543079142517, + "loss": 2.5748, + "step": 26184 + }, + { + "epoch": 0.7764730302760727, + "grad_norm": 0.08979162573814392, + "learning_rate": 0.00012063477989348737, + "loss": 2.5255, + "step": 26185 + }, + { + "epoch": 0.7765026836283841, + "grad_norm": 0.09967998415231705, + "learning_rate": 0.00012060413235489354, + "loss": 2.5657, + "step": 26186 + }, + { + "epoch": 0.7765323369806957, + "grad_norm": 0.08683415502309799, + "learning_rate": 0.00012057348817591529, + "loss": 2.5581, + "step": 26187 + }, + { + "epoch": 0.7765619903330071, + "grad_norm": 0.09496407210826874, + "learning_rate": 0.00012054284735682392, + "loss": 2.5438, + "step": 26188 + }, + { + "epoch": 0.7765916436853186, + "grad_norm": 0.09569178521633148, + "learning_rate": 0.00012051220989789075, + "loss": 2.5517, + "step": 26189 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 0.09184101969003677, + "learning_rate": 0.00012048157579938712, + "loss": 2.5718, + "step": 26190 + }, + { + "epoch": 0.7766509503899416, + "grad_norm": 0.09620426595211029, + "learning_rate": 0.00012045094506158427, + "loss": 2.5568, + "step": 26191 + }, + { + "epoch": 0.776680603742253, + "grad_norm": 0.10241095721721649, + "learning_rate": 0.00012042031768475348, + "loss": 2.5539, + "step": 26192 + }, + { + "epoch": 0.7767102570945645, + "grad_norm": 0.08459426462650299, + "learning_rate": 0.00012038969366916574, + "loss": 2.5572, + "step": 26193 + }, + { + "epoch": 0.7767399104468761, + "grad_norm": 0.09786888211965561, + "learning_rate": 0.00012035907301509258, + "loss": 2.5301, + "step": 26194 + }, + { + "epoch": 0.7767695637991875, + "grad_norm": 0.08927401155233383, + "learning_rate": 0.00012032845572280516, + "loss": 2.5954, + "step": 26195 + }, + { + "epoch": 0.776799217151499, + "grad_norm": 0.09302876889705658, + "learning_rate": 0.00012029784179257425, + "loss": 2.5769, + "step": 26196 + }, + { + "epoch": 0.7768288705038104, + "grad_norm": 0.09383808821439743, + "learning_rate": 0.00012026723122467114, + "loss": 2.5826, + "step": 26197 + }, + { + "epoch": 0.776858523856122, + "grad_norm": 0.09390458464622498, + "learning_rate": 0.00012023662401936692, + "loss": 2.5914, + "step": 26198 + }, + { + "epoch": 0.7768881772084334, + "grad_norm": 0.09611599147319794, + "learning_rate": 0.00012020602017693249, + "loss": 2.5395, + "step": 26199 + }, + { + "epoch": 0.7769178305607449, + "grad_norm": 0.1020222082734108, + "learning_rate": 0.00012017541969763901, + "loss": 2.6088, + "step": 26200 + }, + { + "epoch": 0.7769474839130563, + "grad_norm": 0.09244292974472046, + "learning_rate": 0.00012014482258175741, + "loss": 2.5263, + "step": 26201 + }, + { + "epoch": 0.7769771372653679, + "grad_norm": 0.10234159976243973, + "learning_rate": 0.00012011422882955853, + "loss": 2.5382, + "step": 26202 + }, + { + "epoch": 0.7770067906176793, + "grad_norm": 0.0957159623503685, + "learning_rate": 0.00012008363844131343, + "loss": 2.5646, + "step": 26203 + }, + { + "epoch": 0.7770364439699908, + "grad_norm": 0.08909919112920761, + "learning_rate": 0.00012005305141729289, + "loss": 2.5651, + "step": 26204 + }, + { + "epoch": 0.7770660973223023, + "grad_norm": 0.09404688328504562, + "learning_rate": 0.00012002246775776782, + "loss": 2.5756, + "step": 26205 + }, + { + "epoch": 0.7770957506746138, + "grad_norm": 0.09158775210380554, + "learning_rate": 0.00011999188746300899, + "loss": 2.5793, + "step": 26206 + }, + { + "epoch": 0.7771254040269252, + "grad_norm": 0.08845555782318115, + "learning_rate": 0.00011996131053328723, + "loss": 2.5631, + "step": 26207 + }, + { + "epoch": 0.7771550573792367, + "grad_norm": 0.08677404373884201, + "learning_rate": 0.0001199307369688733, + "loss": 2.5553, + "step": 26208 + }, + { + "epoch": 0.7771847107315482, + "grad_norm": 0.08699612319469452, + "learning_rate": 0.0001199001667700379, + "loss": 2.5724, + "step": 26209 + }, + { + "epoch": 0.7772143640838597, + "grad_norm": 0.08912856131792068, + "learning_rate": 0.00011986959993705182, + "loss": 2.5395, + "step": 26210 + }, + { + "epoch": 0.7772440174361711, + "grad_norm": 0.08392411470413208, + "learning_rate": 0.00011983903647018568, + "loss": 2.5902, + "step": 26211 + }, + { + "epoch": 0.7772736707884826, + "grad_norm": 0.09042637050151825, + "learning_rate": 0.00011980847636971009, + "loss": 2.565, + "step": 26212 + }, + { + "epoch": 0.7773033241407942, + "grad_norm": 0.08331993222236633, + "learning_rate": 0.00011977791963589574, + "loss": 2.5676, + "step": 26213 + }, + { + "epoch": 0.7773329774931056, + "grad_norm": 0.09508009999990463, + "learning_rate": 0.00011974736626901312, + "loss": 2.5127, + "step": 26214 + }, + { + "epoch": 0.7773626308454171, + "grad_norm": 0.0902620404958725, + "learning_rate": 0.00011971681626933289, + "loss": 2.5737, + "step": 26215 + }, + { + "epoch": 0.7773922841977285, + "grad_norm": 0.08390375971794128, + "learning_rate": 0.00011968626963712554, + "loss": 2.5407, + "step": 26216 + }, + { + "epoch": 0.7774219375500401, + "grad_norm": 0.10281173139810562, + "learning_rate": 0.00011965572637266147, + "loss": 2.5729, + "step": 26217 + }, + { + "epoch": 0.7774515909023515, + "grad_norm": 0.08036863803863525, + "learning_rate": 0.00011962518647621135, + "loss": 2.5476, + "step": 26218 + }, + { + "epoch": 0.777481244254663, + "grad_norm": 0.09439501911401749, + "learning_rate": 0.00011959464994804542, + "loss": 2.589, + "step": 26219 + }, + { + "epoch": 0.7775108976069744, + "grad_norm": 0.09056825190782547, + "learning_rate": 0.00011956411678843416, + "loss": 2.5897, + "step": 26220 + }, + { + "epoch": 0.777540550959286, + "grad_norm": 0.09331859648227692, + "learning_rate": 0.00011953358699764794, + "loss": 2.5364, + "step": 26221 + }, + { + "epoch": 0.7775702043115974, + "grad_norm": 0.08857758343219757, + "learning_rate": 0.00011950306057595712, + "loss": 2.5264, + "step": 26222 + }, + { + "epoch": 0.7775998576639089, + "grad_norm": 0.09077584743499756, + "learning_rate": 0.00011947253752363202, + "loss": 2.5749, + "step": 26223 + }, + { + "epoch": 0.7776295110162204, + "grad_norm": 0.08472853899002075, + "learning_rate": 0.00011944201784094294, + "loss": 2.5546, + "step": 26224 + }, + { + "epoch": 0.7776591643685319, + "grad_norm": 0.10750250518321991, + "learning_rate": 0.00011941150152816005, + "loss": 2.6268, + "step": 26225 + }, + { + "epoch": 0.7776888177208433, + "grad_norm": 0.08647683262825012, + "learning_rate": 0.00011938098858555368, + "loss": 2.5413, + "step": 26226 + }, + { + "epoch": 0.7777184710731548, + "grad_norm": 0.09199830889701843, + "learning_rate": 0.00011935047901339401, + "loss": 2.5701, + "step": 26227 + }, + { + "epoch": 0.7777481244254663, + "grad_norm": 0.08277225494384766, + "learning_rate": 0.00011931997281195117, + "loss": 2.586, + "step": 26228 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.08612063527107239, + "learning_rate": 0.00011928946998149543, + "loss": 2.569, + "step": 26229 + }, + { + "epoch": 0.7778074311300892, + "grad_norm": 0.088788241147995, + "learning_rate": 0.00011925897052229667, + "loss": 2.53, + "step": 26230 + }, + { + "epoch": 0.7778370844824007, + "grad_norm": 0.08106882125139236, + "learning_rate": 0.0001192284744346251, + "loss": 2.5517, + "step": 26231 + }, + { + "epoch": 0.7778667378347122, + "grad_norm": 0.09437479823827744, + "learning_rate": 0.00011919798171875057, + "loss": 2.5517, + "step": 26232 + }, + { + "epoch": 0.7778963911870237, + "grad_norm": 0.08423130214214325, + "learning_rate": 0.0001191674923749434, + "loss": 2.5385, + "step": 26233 + }, + { + "epoch": 0.7779260445393352, + "grad_norm": 0.0913095623254776, + "learning_rate": 0.00011913700640347346, + "loss": 2.5665, + "step": 26234 + }, + { + "epoch": 0.7779556978916466, + "grad_norm": 0.07940103113651276, + "learning_rate": 0.00011910652380461074, + "loss": 2.5786, + "step": 26235 + }, + { + "epoch": 0.7779853512439582, + "grad_norm": 0.10037245601415634, + "learning_rate": 0.00011907604457862509, + "loss": 2.5468, + "step": 26236 + }, + { + "epoch": 0.7780150045962696, + "grad_norm": 0.08001718670129776, + "learning_rate": 0.00011904556872578649, + "loss": 2.5708, + "step": 26237 + }, + { + "epoch": 0.7780446579485811, + "grad_norm": 0.09553194791078568, + "learning_rate": 0.00011901509624636476, + "loss": 2.5702, + "step": 26238 + }, + { + "epoch": 0.7780743113008926, + "grad_norm": 0.09733118116855621, + "learning_rate": 0.00011898462714062974, + "loss": 2.5521, + "step": 26239 + }, + { + "epoch": 0.7781039646532041, + "grad_norm": 0.09473805129528046, + "learning_rate": 0.00011895416140885135, + "loss": 2.539, + "step": 26240 + }, + { + "epoch": 0.7781336180055155, + "grad_norm": 0.10371638834476471, + "learning_rate": 0.00011892369905129919, + "loss": 2.5719, + "step": 26241 + }, + { + "epoch": 0.778163271357827, + "grad_norm": 0.09930257499217987, + "learning_rate": 0.00011889324006824304, + "loss": 2.5591, + "step": 26242 + }, + { + "epoch": 0.7781929247101385, + "grad_norm": 0.0993354320526123, + "learning_rate": 0.00011886278445995269, + "loss": 2.5451, + "step": 26243 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 0.09344210475683212, + "learning_rate": 0.00011883233222669782, + "loss": 2.5573, + "step": 26244 + }, + { + "epoch": 0.7782522314147614, + "grad_norm": 0.09687460958957672, + "learning_rate": 0.00011880188336874786, + "loss": 2.5409, + "step": 26245 + }, + { + "epoch": 0.7782818847670729, + "grad_norm": 0.09778996556997299, + "learning_rate": 0.00011877143788637285, + "loss": 2.5642, + "step": 26246 + }, + { + "epoch": 0.7783115381193844, + "grad_norm": 0.10547676682472229, + "learning_rate": 0.00011874099577984221, + "loss": 2.5595, + "step": 26247 + }, + { + "epoch": 0.7783411914716959, + "grad_norm": 0.09669952094554901, + "learning_rate": 0.00011871055704942546, + "loss": 2.6011, + "step": 26248 + }, + { + "epoch": 0.7783708448240073, + "grad_norm": 0.09240047633647919, + "learning_rate": 0.00011868012169539222, + "loss": 2.5646, + "step": 26249 + }, + { + "epoch": 0.7784004981763188, + "grad_norm": 0.10310547053813934, + "learning_rate": 0.00011864968971801204, + "loss": 2.5704, + "step": 26250 + }, + { + "epoch": 0.7784301515286303, + "grad_norm": 0.08283640444278717, + "learning_rate": 0.00011861926111755423, + "loss": 2.5445, + "step": 26251 + }, + { + "epoch": 0.7784598048809418, + "grad_norm": 0.09525837749242783, + "learning_rate": 0.00011858883589428826, + "loss": 2.5429, + "step": 26252 + }, + { + "epoch": 0.7784894582332532, + "grad_norm": 0.09396833181381226, + "learning_rate": 0.00011855841404848366, + "loss": 2.5746, + "step": 26253 + }, + { + "epoch": 0.7785191115855647, + "grad_norm": 0.09125550091266632, + "learning_rate": 0.00011852799558040978, + "loss": 2.546, + "step": 26254 + }, + { + "epoch": 0.7785487649378763, + "grad_norm": 0.0899694412946701, + "learning_rate": 0.00011849758049033599, + "loss": 2.5637, + "step": 26255 + }, + { + "epoch": 0.7785784182901877, + "grad_norm": 0.09203339368104935, + "learning_rate": 0.00011846716877853159, + "loss": 2.5508, + "step": 26256 + }, + { + "epoch": 0.7786080716424992, + "grad_norm": 0.0986960306763649, + "learning_rate": 0.00011843676044526575, + "loss": 2.5742, + "step": 26257 + }, + { + "epoch": 0.7786377249948107, + "grad_norm": 0.09372316300868988, + "learning_rate": 0.00011840635549080803, + "loss": 2.5523, + "step": 26258 + }, + { + "epoch": 0.7786673783471222, + "grad_norm": 0.09215397387742996, + "learning_rate": 0.00011837595391542754, + "loss": 2.5751, + "step": 26259 + }, + { + "epoch": 0.7786970316994336, + "grad_norm": 0.09341444820165634, + "learning_rate": 0.00011834555571939348, + "loss": 2.5375, + "step": 26260 + }, + { + "epoch": 0.7787266850517451, + "grad_norm": 0.10561608523130417, + "learning_rate": 0.00011831516090297517, + "loss": 2.5612, + "step": 26261 + }, + { + "epoch": 0.7787563384040566, + "grad_norm": 0.09413941204547882, + "learning_rate": 0.00011828476946644151, + "loss": 2.5815, + "step": 26262 + }, + { + "epoch": 0.7787859917563681, + "grad_norm": 0.09199784696102142, + "learning_rate": 0.00011825438141006173, + "loss": 2.5438, + "step": 26263 + }, + { + "epoch": 0.7788156451086795, + "grad_norm": 0.09849037975072861, + "learning_rate": 0.00011822399673410489, + "loss": 2.5619, + "step": 26264 + }, + { + "epoch": 0.778845298460991, + "grad_norm": 0.08883219212293625, + "learning_rate": 0.0001181936154388401, + "loss": 2.5559, + "step": 26265 + }, + { + "epoch": 0.7788749518133025, + "grad_norm": 0.10711412876844406, + "learning_rate": 0.00011816323752453639, + "loss": 2.5879, + "step": 26266 + }, + { + "epoch": 0.778904605165614, + "grad_norm": 0.07988473773002625, + "learning_rate": 0.00011813286299146281, + "loss": 2.575, + "step": 26267 + }, + { + "epoch": 0.7789342585179254, + "grad_norm": 0.09758840501308441, + "learning_rate": 0.0001181024918398882, + "loss": 2.5573, + "step": 26268 + }, + { + "epoch": 0.778963911870237, + "grad_norm": 0.08289314806461334, + "learning_rate": 0.0001180721240700816, + "loss": 2.5653, + "step": 26269 + }, + { + "epoch": 0.7789935652225484, + "grad_norm": 0.0964532420039177, + "learning_rate": 0.00011804175968231178, + "loss": 2.5365, + "step": 26270 + }, + { + "epoch": 0.7790232185748599, + "grad_norm": 0.08473778516054153, + "learning_rate": 0.00011801139867684801, + "loss": 2.5722, + "step": 26271 + }, + { + "epoch": 0.7790528719271713, + "grad_norm": 0.09124522656202316, + "learning_rate": 0.00011798104105395869, + "loss": 2.558, + "step": 26272 + }, + { + "epoch": 0.7790825252794829, + "grad_norm": 0.08272287994623184, + "learning_rate": 0.00011795068681391285, + "loss": 2.5811, + "step": 26273 + }, + { + "epoch": 0.7791121786317943, + "grad_norm": 0.0907624140381813, + "learning_rate": 0.00011792033595697926, + "loss": 2.5519, + "step": 26274 + }, + { + "epoch": 0.7791418319841058, + "grad_norm": 0.08743470162153244, + "learning_rate": 0.00011788998848342669, + "loss": 2.6057, + "step": 26275 + }, + { + "epoch": 0.7791714853364173, + "grad_norm": 0.09321088343858719, + "learning_rate": 0.00011785964439352386, + "loss": 2.5546, + "step": 26276 + }, + { + "epoch": 0.7792011386887288, + "grad_norm": 0.08593180030584335, + "learning_rate": 0.00011782930368753947, + "loss": 2.5537, + "step": 26277 + }, + { + "epoch": 0.7792307920410403, + "grad_norm": 0.09492647647857666, + "learning_rate": 0.00011779896636574223, + "loss": 2.5604, + "step": 26278 + }, + { + "epoch": 0.7792604453933517, + "grad_norm": 0.09846854209899902, + "learning_rate": 0.00011776863242840069, + "loss": 2.5716, + "step": 26279 + }, + { + "epoch": 0.7792900987456632, + "grad_norm": 0.09679695963859558, + "learning_rate": 0.00011773830187578354, + "loss": 2.5361, + "step": 26280 + }, + { + "epoch": 0.7793197520979747, + "grad_norm": 0.09488916397094727, + "learning_rate": 0.00011770797470815931, + "loss": 2.5551, + "step": 26281 + }, + { + "epoch": 0.7793494054502862, + "grad_norm": 0.10359051078557968, + "learning_rate": 0.00011767765092579663, + "loss": 2.563, + "step": 26282 + }, + { + "epoch": 0.7793790588025976, + "grad_norm": 0.0996304377913475, + "learning_rate": 0.00011764733052896392, + "loss": 2.5435, + "step": 26283 + }, + { + "epoch": 0.7794087121549091, + "grad_norm": 0.0924847349524498, + "learning_rate": 0.0001176170135179298, + "loss": 2.5807, + "step": 26284 + }, + { + "epoch": 0.7794383655072206, + "grad_norm": 0.08743277937173843, + "learning_rate": 0.00011758669989296262, + "loss": 2.5741, + "step": 26285 + }, + { + "epoch": 0.7794680188595321, + "grad_norm": 0.08776206523180008, + "learning_rate": 0.00011755638965433085, + "loss": 2.5324, + "step": 26286 + }, + { + "epoch": 0.7794976722118435, + "grad_norm": 0.0866047814488411, + "learning_rate": 0.00011752608280230292, + "loss": 2.5385, + "step": 26287 + }, + { + "epoch": 0.779527325564155, + "grad_norm": 0.0910683274269104, + "learning_rate": 0.00011749577933714717, + "loss": 2.5759, + "step": 26288 + }, + { + "epoch": 0.7795569789164665, + "grad_norm": 0.08542834222316742, + "learning_rate": 0.00011746547925913192, + "loss": 2.5769, + "step": 26289 + }, + { + "epoch": 0.779586632268778, + "grad_norm": 0.09781809151172638, + "learning_rate": 0.00011743518256852554, + "loss": 2.5619, + "step": 26290 + }, + { + "epoch": 0.7796162856210894, + "grad_norm": 0.09268000721931458, + "learning_rate": 0.00011740488926559628, + "loss": 2.5949, + "step": 26291 + }, + { + "epoch": 0.779645938973401, + "grad_norm": 0.09087422490119934, + "learning_rate": 0.0001173745993506124, + "loss": 2.5259, + "step": 26292 + }, + { + "epoch": 0.7796755923257124, + "grad_norm": 0.08779485523700714, + "learning_rate": 0.00011734431282384211, + "loss": 2.5918, + "step": 26293 + }, + { + "epoch": 0.7797052456780239, + "grad_norm": 0.08949052542448044, + "learning_rate": 0.0001173140296855536, + "loss": 2.5727, + "step": 26294 + }, + { + "epoch": 0.7797348990303353, + "grad_norm": 0.08452853560447693, + "learning_rate": 0.0001172837499360152, + "loss": 2.548, + "step": 26295 + }, + { + "epoch": 0.7797645523826469, + "grad_norm": 0.08593863248825073, + "learning_rate": 0.0001172534735754946, + "loss": 2.537, + "step": 26296 + }, + { + "epoch": 0.7797942057349584, + "grad_norm": 0.09367925673723221, + "learning_rate": 0.00011722320060426033, + "loss": 2.5787, + "step": 26297 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 0.08305733650922775, + "learning_rate": 0.00011719293102258033, + "loss": 2.4932, + "step": 26298 + }, + { + "epoch": 0.7798535124395813, + "grad_norm": 0.09463603049516678, + "learning_rate": 0.00011716266483072263, + "loss": 2.5285, + "step": 26299 + }, + { + "epoch": 0.7798831657918928, + "grad_norm": 0.08923095464706421, + "learning_rate": 0.00011713240202895525, + "loss": 2.5316, + "step": 26300 + }, + { + "epoch": 0.7799128191442043, + "grad_norm": 0.09308422356843948, + "learning_rate": 0.00011710214261754621, + "loss": 2.5598, + "step": 26301 + }, + { + "epoch": 0.7799424724965157, + "grad_norm": 0.08883831650018692, + "learning_rate": 0.00011707188659676338, + "loss": 2.5417, + "step": 26302 + }, + { + "epoch": 0.7799721258488272, + "grad_norm": 0.08439048379659653, + "learning_rate": 0.00011704163396687468, + "loss": 2.597, + "step": 26303 + }, + { + "epoch": 0.7800017792011387, + "grad_norm": 0.08845274895429611, + "learning_rate": 0.0001170113847281481, + "loss": 2.564, + "step": 26304 + }, + { + "epoch": 0.7800314325534502, + "grad_norm": 0.08758802711963654, + "learning_rate": 0.00011698113888085144, + "loss": 2.565, + "step": 26305 + }, + { + "epoch": 0.7800610859057616, + "grad_norm": 0.08824703842401505, + "learning_rate": 0.0001169508964252527, + "loss": 2.5447, + "step": 26306 + }, + { + "epoch": 0.7800907392580732, + "grad_norm": 0.08364430069923401, + "learning_rate": 0.00011692065736161933, + "loss": 2.5987, + "step": 26307 + }, + { + "epoch": 0.7801203926103846, + "grad_norm": 0.08316943794488907, + "learning_rate": 0.0001168904216902193, + "loss": 2.595, + "step": 26308 + }, + { + "epoch": 0.7801500459626961, + "grad_norm": 0.08764394372701645, + "learning_rate": 0.00011686018941132026, + "loss": 2.5375, + "step": 26309 + }, + { + "epoch": 0.7801796993150075, + "grad_norm": 0.08220258355140686, + "learning_rate": 0.00011682996052519007, + "loss": 2.5929, + "step": 26310 + }, + { + "epoch": 0.7802093526673191, + "grad_norm": 0.09016250818967819, + "learning_rate": 0.0001167997350320964, + "loss": 2.5923, + "step": 26311 + }, + { + "epoch": 0.7802390060196305, + "grad_norm": 0.0837155357003212, + "learning_rate": 0.00011676951293230686, + "loss": 2.5771, + "step": 26312 + }, + { + "epoch": 0.780268659371942, + "grad_norm": 0.08506560325622559, + "learning_rate": 0.00011673929422608903, + "loss": 2.5772, + "step": 26313 + }, + { + "epoch": 0.7802983127242534, + "grad_norm": 0.08219853788614273, + "learning_rate": 0.00011670907891371058, + "loss": 2.5863, + "step": 26314 + }, + { + "epoch": 0.780327966076565, + "grad_norm": 0.09110531210899353, + "learning_rate": 0.00011667886699543901, + "loss": 2.5588, + "step": 26315 + }, + { + "epoch": 0.7803576194288764, + "grad_norm": 0.08646167814731598, + "learning_rate": 0.000116648658471542, + "loss": 2.5418, + "step": 26316 + }, + { + "epoch": 0.7803872727811879, + "grad_norm": 0.08800359815359116, + "learning_rate": 0.00011661845334228677, + "loss": 2.5673, + "step": 26317 + }, + { + "epoch": 0.7804169261334994, + "grad_norm": 0.0876619890332222, + "learning_rate": 0.00011658825160794096, + "loss": 2.5503, + "step": 26318 + }, + { + "epoch": 0.7804465794858109, + "grad_norm": 0.08868689835071564, + "learning_rate": 0.00011655805326877195, + "loss": 2.5651, + "step": 26319 + }, + { + "epoch": 0.7804762328381224, + "grad_norm": 0.08140242099761963, + "learning_rate": 0.00011652785832504726, + "loss": 2.5327, + "step": 26320 + }, + { + "epoch": 0.7805058861904338, + "grad_norm": 0.08816958218812943, + "learning_rate": 0.00011649766677703416, + "loss": 2.5831, + "step": 26321 + }, + { + "epoch": 0.7805355395427453, + "grad_norm": 0.08893788605928421, + "learning_rate": 0.0001164674786249999, + "loss": 2.5619, + "step": 26322 + }, + { + "epoch": 0.7805651928950568, + "grad_norm": 0.08832549303770065, + "learning_rate": 0.00011643729386921215, + "loss": 2.5853, + "step": 26323 + }, + { + "epoch": 0.7805948462473683, + "grad_norm": 0.09636911749839783, + "learning_rate": 0.00011640711250993796, + "loss": 2.592, + "step": 26324 + }, + { + "epoch": 0.7806244995996797, + "grad_norm": 0.08212500065565109, + "learning_rate": 0.00011637693454744464, + "loss": 2.592, + "step": 26325 + }, + { + "epoch": 0.7806541529519913, + "grad_norm": 0.09377378970384598, + "learning_rate": 0.0001163467599819995, + "loss": 2.5373, + "step": 26326 + }, + { + "epoch": 0.7806838063043027, + "grad_norm": 0.08768625557422638, + "learning_rate": 0.00011631658881386959, + "loss": 2.5949, + "step": 26327 + }, + { + "epoch": 0.7807134596566142, + "grad_norm": 0.08609430491924286, + "learning_rate": 0.0001162864210433221, + "loss": 2.5668, + "step": 26328 + }, + { + "epoch": 0.7807431130089256, + "grad_norm": 0.09779681265354156, + "learning_rate": 0.00011625625667062423, + "loss": 2.5485, + "step": 26329 + }, + { + "epoch": 0.7807727663612372, + "grad_norm": 0.0912029892206192, + "learning_rate": 0.00011622609569604308, + "loss": 2.548, + "step": 26330 + }, + { + "epoch": 0.7808024197135486, + "grad_norm": 0.09186487644910812, + "learning_rate": 0.00011619593811984574, + "loss": 2.5699, + "step": 26331 + }, + { + "epoch": 0.7808320730658601, + "grad_norm": 0.08965504169464111, + "learning_rate": 0.0001161657839422992, + "loss": 2.5562, + "step": 26332 + }, + { + "epoch": 0.7808617264181715, + "grad_norm": 0.09132493287324905, + "learning_rate": 0.0001161356331636706, + "loss": 2.5616, + "step": 26333 + }, + { + "epoch": 0.7808913797704831, + "grad_norm": 0.0903087854385376, + "learning_rate": 0.00011610548578422681, + "loss": 2.5495, + "step": 26334 + }, + { + "epoch": 0.7809210331227945, + "grad_norm": 0.08980656415224075, + "learning_rate": 0.00011607534180423469, + "loss": 2.5622, + "step": 26335 + }, + { + "epoch": 0.780950686475106, + "grad_norm": 0.0897594541311264, + "learning_rate": 0.0001160452012239615, + "loss": 2.5615, + "step": 26336 + }, + { + "epoch": 0.7809803398274174, + "grad_norm": 0.07923582196235657, + "learning_rate": 0.00011601506404367407, + "loss": 2.5388, + "step": 26337 + }, + { + "epoch": 0.781009993179729, + "grad_norm": 0.08822057396173477, + "learning_rate": 0.00011598493026363898, + "loss": 2.581, + "step": 26338 + }, + { + "epoch": 0.7810396465320405, + "grad_norm": 0.08215406537055969, + "learning_rate": 0.00011595479988412333, + "loss": 2.5438, + "step": 26339 + }, + { + "epoch": 0.7810692998843519, + "grad_norm": 0.08719569444656372, + "learning_rate": 0.00011592467290539377, + "loss": 2.5575, + "step": 26340 + }, + { + "epoch": 0.7810989532366635, + "grad_norm": 0.09088890254497528, + "learning_rate": 0.0001158945493277172, + "loss": 2.556, + "step": 26341 + }, + { + "epoch": 0.7811286065889749, + "grad_norm": 0.08462992310523987, + "learning_rate": 0.00011586442915136031, + "loss": 2.5597, + "step": 26342 + }, + { + "epoch": 0.7811582599412864, + "grad_norm": 0.08348067104816437, + "learning_rate": 0.00011583431237658986, + "loss": 2.5375, + "step": 26343 + }, + { + "epoch": 0.7811879132935978, + "grad_norm": 0.09259065240621567, + "learning_rate": 0.0001158041990036725, + "loss": 2.5789, + "step": 26344 + }, + { + "epoch": 0.7812175666459094, + "grad_norm": 0.08767011016607285, + "learning_rate": 0.0001157740890328749, + "loss": 2.5645, + "step": 26345 + }, + { + "epoch": 0.7812472199982208, + "grad_norm": 0.09639927744865417, + "learning_rate": 0.00011574398246446371, + "loss": 2.5703, + "step": 26346 + }, + { + "epoch": 0.7812768733505323, + "grad_norm": 0.08620790392160416, + "learning_rate": 0.00011571387929870547, + "loss": 2.5749, + "step": 26347 + }, + { + "epoch": 0.7813065267028437, + "grad_norm": 0.09248095750808716, + "learning_rate": 0.00011568377953586684, + "loss": 2.5611, + "step": 26348 + }, + { + "epoch": 0.7813361800551553, + "grad_norm": 0.09078086167573929, + "learning_rate": 0.00011565368317621428, + "loss": 2.5857, + "step": 26349 + }, + { + "epoch": 0.7813658334074667, + "grad_norm": 0.08936269581317902, + "learning_rate": 0.00011562359022001434, + "loss": 2.5389, + "step": 26350 + }, + { + "epoch": 0.7813954867597782, + "grad_norm": 0.09289894998073578, + "learning_rate": 0.00011559350066753349, + "loss": 2.6001, + "step": 26351 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 0.08031004667282104, + "learning_rate": 0.00011556341451903819, + "loss": 2.5755, + "step": 26352 + }, + { + "epoch": 0.7814547934644012, + "grad_norm": 0.0888429582118988, + "learning_rate": 0.0001155333317747948, + "loss": 2.5131, + "step": 26353 + }, + { + "epoch": 0.7814844468167126, + "grad_norm": 0.08184882998466492, + "learning_rate": 0.00011550325243506976, + "loss": 2.5485, + "step": 26354 + }, + { + "epoch": 0.7815141001690241, + "grad_norm": 0.08519510179758072, + "learning_rate": 0.00011547317650012945, + "loss": 2.5548, + "step": 26355 + }, + { + "epoch": 0.7815437535213355, + "grad_norm": 0.08998628705739975, + "learning_rate": 0.0001154431039702401, + "loss": 2.5255, + "step": 26356 + }, + { + "epoch": 0.7815734068736471, + "grad_norm": 0.08331034332513809, + "learning_rate": 0.00011541303484566811, + "loss": 2.5627, + "step": 26357 + }, + { + "epoch": 0.7816030602259585, + "grad_norm": 0.08563170582056046, + "learning_rate": 0.00011538296912667973, + "loss": 2.5746, + "step": 26358 + }, + { + "epoch": 0.78163271357827, + "grad_norm": 0.08802642673254013, + "learning_rate": 0.00011535290681354116, + "loss": 2.5363, + "step": 26359 + }, + { + "epoch": 0.7816623669305816, + "grad_norm": 0.08798521757125854, + "learning_rate": 0.00011532284790651864, + "loss": 2.5715, + "step": 26360 + }, + { + "epoch": 0.781692020282893, + "grad_norm": 0.08410614728927612, + "learning_rate": 0.0001152927924058783, + "loss": 2.5641, + "step": 26361 + }, + { + "epoch": 0.7817216736352045, + "grad_norm": 0.08582844585180283, + "learning_rate": 0.00011526274031188633, + "loss": 2.5917, + "step": 26362 + }, + { + "epoch": 0.7817513269875159, + "grad_norm": 0.08520620316267014, + "learning_rate": 0.00011523269162480887, + "loss": 2.5548, + "step": 26363 + }, + { + "epoch": 0.7817809803398275, + "grad_norm": 0.08531995117664337, + "learning_rate": 0.0001152026463449119, + "loss": 2.5515, + "step": 26364 + }, + { + "epoch": 0.7818106336921389, + "grad_norm": 0.08866671472787857, + "learning_rate": 0.00011517260447246159, + "loss": 2.5749, + "step": 26365 + }, + { + "epoch": 0.7818402870444504, + "grad_norm": 0.0945037379860878, + "learning_rate": 0.00011514256600772394, + "loss": 2.5532, + "step": 26366 + }, + { + "epoch": 0.7818699403967618, + "grad_norm": 0.08597569912672043, + "learning_rate": 0.00011511253095096491, + "loss": 2.5536, + "step": 26367 + }, + { + "epoch": 0.7818995937490734, + "grad_norm": 0.08985859155654907, + "learning_rate": 0.00011508249930245047, + "loss": 2.5455, + "step": 26368 + }, + { + "epoch": 0.7819292471013848, + "grad_norm": 0.08518259227275848, + "learning_rate": 0.00011505247106244655, + "loss": 2.5417, + "step": 26369 + }, + { + "epoch": 0.7819589004536963, + "grad_norm": 0.09081476926803589, + "learning_rate": 0.0001150224462312191, + "loss": 2.557, + "step": 26370 + }, + { + "epoch": 0.7819885538060077, + "grad_norm": 0.08321244269609451, + "learning_rate": 0.0001149924248090341, + "loss": 2.5731, + "step": 26371 + }, + { + "epoch": 0.7820182071583193, + "grad_norm": 0.10547157377004623, + "learning_rate": 0.00011496240679615716, + "loss": 2.592, + "step": 26372 + }, + { + "epoch": 0.7820478605106307, + "grad_norm": 0.08926062285900116, + "learning_rate": 0.00011493239219285418, + "loss": 2.5542, + "step": 26373 + }, + { + "epoch": 0.7820775138629422, + "grad_norm": 0.09134334325790405, + "learning_rate": 0.00011490238099939081, + "loss": 2.5321, + "step": 26374 + }, + { + "epoch": 0.7821071672152536, + "grad_norm": 0.09132391959428787, + "learning_rate": 0.0001148723732160331, + "loss": 2.5884, + "step": 26375 + }, + { + "epoch": 0.7821368205675652, + "grad_norm": 0.08790414035320282, + "learning_rate": 0.00011484236884304661, + "loss": 2.5372, + "step": 26376 + }, + { + "epoch": 0.7821664739198766, + "grad_norm": 0.08637288957834244, + "learning_rate": 0.0001148123678806971, + "loss": 2.5536, + "step": 26377 + }, + { + "epoch": 0.7821961272721881, + "grad_norm": 0.08800044655799866, + "learning_rate": 0.00011478237032925015, + "loss": 2.6019, + "step": 26378 + }, + { + "epoch": 0.7822257806244995, + "grad_norm": 0.08890330046415329, + "learning_rate": 0.00011475237618897144, + "loss": 2.5419, + "step": 26379 + }, + { + "epoch": 0.7822554339768111, + "grad_norm": 0.0829230323433876, + "learning_rate": 0.00011472238546012659, + "loss": 2.5774, + "step": 26380 + }, + { + "epoch": 0.7822850873291226, + "grad_norm": 0.0894579142332077, + "learning_rate": 0.00011469239814298115, + "loss": 2.5264, + "step": 26381 + }, + { + "epoch": 0.782314740681434, + "grad_norm": 0.07982859760522842, + "learning_rate": 0.00011466241423780077, + "loss": 2.5073, + "step": 26382 + }, + { + "epoch": 0.7823443940337456, + "grad_norm": 0.0822538211941719, + "learning_rate": 0.00011463243374485072, + "loss": 2.5454, + "step": 26383 + }, + { + "epoch": 0.782374047386057, + "grad_norm": 0.0783396065235138, + "learning_rate": 0.00011460245666439667, + "loss": 2.5901, + "step": 26384 + }, + { + "epoch": 0.7824037007383685, + "grad_norm": 0.08481647819280624, + "learning_rate": 0.00011457248299670398, + "loss": 2.5989, + "step": 26385 + }, + { + "epoch": 0.7824333540906799, + "grad_norm": 0.0812889039516449, + "learning_rate": 0.00011454251274203808, + "loss": 2.5644, + "step": 26386 + }, + { + "epoch": 0.7824630074429915, + "grad_norm": 0.08376596868038177, + "learning_rate": 0.00011451254590066429, + "loss": 2.5763, + "step": 26387 + }, + { + "epoch": 0.7824926607953029, + "grad_norm": 0.08700224757194519, + "learning_rate": 0.00011448258247284821, + "loss": 2.5821, + "step": 26388 + }, + { + "epoch": 0.7825223141476144, + "grad_norm": 0.07999301701784134, + "learning_rate": 0.00011445262245885502, + "loss": 2.5454, + "step": 26389 + }, + { + "epoch": 0.7825519674999258, + "grad_norm": 0.08489630371332169, + "learning_rate": 0.00011442266585895006, + "loss": 2.526, + "step": 26390 + }, + { + "epoch": 0.7825816208522374, + "grad_norm": 0.0875602439045906, + "learning_rate": 0.00011439271267339851, + "loss": 2.5852, + "step": 26391 + }, + { + "epoch": 0.7826112742045488, + "grad_norm": 0.08623231202363968, + "learning_rate": 0.00011436276290246589, + "loss": 2.5395, + "step": 26392 + }, + { + "epoch": 0.7826409275568603, + "grad_norm": 0.08438363671302795, + "learning_rate": 0.00011433281654641703, + "loss": 2.5603, + "step": 26393 + }, + { + "epoch": 0.7826705809091717, + "grad_norm": 0.09525344520807266, + "learning_rate": 0.00011430287360551733, + "loss": 2.5686, + "step": 26394 + }, + { + "epoch": 0.7827002342614833, + "grad_norm": 0.0926346406340599, + "learning_rate": 0.00011427293408003181, + "loss": 2.6001, + "step": 26395 + }, + { + "epoch": 0.7827298876137947, + "grad_norm": 0.09586765617132187, + "learning_rate": 0.0001142429979702257, + "loss": 2.5538, + "step": 26396 + }, + { + "epoch": 0.7827595409661062, + "grad_norm": 0.08429916203022003, + "learning_rate": 0.0001142130652763641, + "loss": 2.5275, + "step": 26397 + }, + { + "epoch": 0.7827891943184176, + "grad_norm": 0.09963391721248627, + "learning_rate": 0.00011418313599871194, + "loss": 2.5823, + "step": 26398 + }, + { + "epoch": 0.7828188476707292, + "grad_norm": 0.09345310926437378, + "learning_rate": 0.00011415321013753444, + "loss": 2.5852, + "step": 26399 + }, + { + "epoch": 0.7828485010230406, + "grad_norm": 0.10209519416093826, + "learning_rate": 0.00011412328769309627, + "loss": 2.5447, + "step": 26400 + }, + { + "epoch": 0.7828781543753521, + "grad_norm": 0.10308162868022919, + "learning_rate": 0.00011409336866566278, + "loss": 2.5529, + "step": 26401 + }, + { + "epoch": 0.7829078077276637, + "grad_norm": 0.09235449880361557, + "learning_rate": 0.00011406345305549893, + "loss": 2.5488, + "step": 26402 + }, + { + "epoch": 0.7829374610799751, + "grad_norm": 0.10069145262241364, + "learning_rate": 0.00011403354086286927, + "loss": 2.5546, + "step": 26403 + }, + { + "epoch": 0.7829671144322866, + "grad_norm": 0.10594379901885986, + "learning_rate": 0.00011400363208803882, + "loss": 2.5548, + "step": 26404 + }, + { + "epoch": 0.782996767784598, + "grad_norm": 0.10633165389299393, + "learning_rate": 0.00011397372673127255, + "loss": 2.57, + "step": 26405 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 0.09406142681837082, + "learning_rate": 0.00011394382479283511, + "loss": 2.5279, + "step": 26406 + }, + { + "epoch": 0.783056074489221, + "grad_norm": 0.10160072892904282, + "learning_rate": 0.0001139139262729914, + "loss": 2.5457, + "step": 26407 + }, + { + "epoch": 0.7830857278415325, + "grad_norm": 0.09095980226993561, + "learning_rate": 0.00011388403117200619, + "loss": 2.5935, + "step": 26408 + }, + { + "epoch": 0.7831153811938439, + "grad_norm": 0.08927012234926224, + "learning_rate": 0.00011385413949014411, + "loss": 2.5387, + "step": 26409 + }, + { + "epoch": 0.7831450345461555, + "grad_norm": 0.08994391560554504, + "learning_rate": 0.00011382425122766993, + "loss": 2.5467, + "step": 26410 + }, + { + "epoch": 0.7831746878984669, + "grad_norm": 0.08795544505119324, + "learning_rate": 0.00011379436638484824, + "loss": 2.6022, + "step": 26411 + }, + { + "epoch": 0.7832043412507784, + "grad_norm": 0.08749289065599442, + "learning_rate": 0.0001137644849619438, + "loss": 2.5718, + "step": 26412 + }, + { + "epoch": 0.7832339946030898, + "grad_norm": 0.09011899679899216, + "learning_rate": 0.00011373460695922105, + "loss": 2.5485, + "step": 26413 + }, + { + "epoch": 0.7832636479554014, + "grad_norm": 0.08395123481750488, + "learning_rate": 0.00011370473237694473, + "loss": 2.5367, + "step": 26414 + }, + { + "epoch": 0.7832933013077128, + "grad_norm": 0.08786709606647491, + "learning_rate": 0.0001136748612153793, + "loss": 2.5298, + "step": 26415 + }, + { + "epoch": 0.7833229546600243, + "grad_norm": 0.08948937058448792, + "learning_rate": 0.00011364499347478929, + "loss": 2.5542, + "step": 26416 + }, + { + "epoch": 0.7833526080123357, + "grad_norm": 0.09942037612199783, + "learning_rate": 0.00011361512915543914, + "loss": 2.5603, + "step": 26417 + }, + { + "epoch": 0.7833822613646473, + "grad_norm": 0.0889078751206398, + "learning_rate": 0.00011358526825759342, + "loss": 2.5434, + "step": 26418 + }, + { + "epoch": 0.7834119147169587, + "grad_norm": 0.09366843849420547, + "learning_rate": 0.00011355541078151643, + "loss": 2.5776, + "step": 26419 + }, + { + "epoch": 0.7834415680692702, + "grad_norm": 0.08749107271432877, + "learning_rate": 0.00011352555672747262, + "loss": 2.5771, + "step": 26420 + }, + { + "epoch": 0.7834712214215818, + "grad_norm": 0.09640005230903625, + "learning_rate": 0.00011349570609572629, + "loss": 2.5868, + "step": 26421 + }, + { + "epoch": 0.7835008747738932, + "grad_norm": 0.0892028734087944, + "learning_rate": 0.00011346585888654187, + "loss": 2.5572, + "step": 26422 + }, + { + "epoch": 0.7835305281262047, + "grad_norm": 0.09384749829769135, + "learning_rate": 0.00011343601510018364, + "loss": 2.5775, + "step": 26423 + }, + { + "epoch": 0.7835601814785161, + "grad_norm": 0.09030800312757492, + "learning_rate": 0.00011340617473691578, + "loss": 2.5721, + "step": 26424 + }, + { + "epoch": 0.7835898348308277, + "grad_norm": 0.09419909119606018, + "learning_rate": 0.00011337633779700268, + "loss": 2.5249, + "step": 26425 + }, + { + "epoch": 0.7836194881831391, + "grad_norm": 0.08030176162719727, + "learning_rate": 0.00011334650428070841, + "loss": 2.5509, + "step": 26426 + }, + { + "epoch": 0.7836491415354506, + "grad_norm": 0.10168691724538803, + "learning_rate": 0.00011331667418829728, + "loss": 2.5746, + "step": 26427 + }, + { + "epoch": 0.783678794887762, + "grad_norm": 0.08122743666172028, + "learning_rate": 0.00011328684752003332, + "loss": 2.58, + "step": 26428 + }, + { + "epoch": 0.7837084482400736, + "grad_norm": 0.09122925251722336, + "learning_rate": 0.00011325702427618068, + "loss": 2.5581, + "step": 26429 + }, + { + "epoch": 0.783738101592385, + "grad_norm": 0.0831770971417427, + "learning_rate": 0.00011322720445700352, + "loss": 2.5646, + "step": 26430 + }, + { + "epoch": 0.7837677549446965, + "grad_norm": 0.09536456316709518, + "learning_rate": 0.00011319738806276586, + "loss": 2.6133, + "step": 26431 + }, + { + "epoch": 0.783797408297008, + "grad_norm": 0.08305954933166504, + "learning_rate": 0.0001131675750937317, + "loss": 2.5647, + "step": 26432 + }, + { + "epoch": 0.7838270616493195, + "grad_norm": 0.08914753794670105, + "learning_rate": 0.00011313776555016509, + "loss": 2.5583, + "step": 26433 + }, + { + "epoch": 0.7838567150016309, + "grad_norm": 0.08604729175567627, + "learning_rate": 0.00011310795943232993, + "loss": 2.5469, + "step": 26434 + }, + { + "epoch": 0.7838863683539424, + "grad_norm": 0.09004413336515427, + "learning_rate": 0.0001130781567404902, + "loss": 2.5856, + "step": 26435 + }, + { + "epoch": 0.7839160217062539, + "grad_norm": 0.08305374532938004, + "learning_rate": 0.00011304835747490983, + "loss": 2.5748, + "step": 26436 + }, + { + "epoch": 0.7839456750585654, + "grad_norm": 0.09304092824459076, + "learning_rate": 0.00011301856163585278, + "loss": 2.5644, + "step": 26437 + }, + { + "epoch": 0.7839753284108768, + "grad_norm": 0.0756017193198204, + "learning_rate": 0.0001129887692235827, + "loss": 2.5487, + "step": 26438 + }, + { + "epoch": 0.7840049817631883, + "grad_norm": 0.09488886594772339, + "learning_rate": 0.00011295898023836332, + "loss": 2.5563, + "step": 26439 + }, + { + "epoch": 0.7840346351154998, + "grad_norm": 0.08420221507549286, + "learning_rate": 0.00011292919468045875, + "loss": 2.5824, + "step": 26440 + }, + { + "epoch": 0.7840642884678113, + "grad_norm": 0.09116534143686295, + "learning_rate": 0.00011289941255013264, + "loss": 2.542, + "step": 26441 + }, + { + "epoch": 0.7840939418201228, + "grad_norm": 0.08137194812297821, + "learning_rate": 0.00011286963384764865, + "loss": 2.5785, + "step": 26442 + }, + { + "epoch": 0.7841235951724342, + "grad_norm": 0.09649596363306046, + "learning_rate": 0.0001128398585732705, + "loss": 2.5698, + "step": 26443 + }, + { + "epoch": 0.7841532485247458, + "grad_norm": 0.08486807346343994, + "learning_rate": 0.00011281008672726185, + "loss": 2.5974, + "step": 26444 + }, + { + "epoch": 0.7841829018770572, + "grad_norm": 0.09188052266836166, + "learning_rate": 0.00011278031830988633, + "loss": 2.528, + "step": 26445 + }, + { + "epoch": 0.7842125552293687, + "grad_norm": 0.08955465257167816, + "learning_rate": 0.0001127505533214076, + "loss": 2.5362, + "step": 26446 + }, + { + "epoch": 0.7842422085816801, + "grad_norm": 0.09559620171785355, + "learning_rate": 0.00011272079176208927, + "loss": 2.5867, + "step": 26447 + }, + { + "epoch": 0.7842718619339917, + "grad_norm": 0.0898011177778244, + "learning_rate": 0.00011269103363219474, + "loss": 2.5625, + "step": 26448 + }, + { + "epoch": 0.7843015152863031, + "grad_norm": 0.09740689396858215, + "learning_rate": 0.00011266127893198752, + "loss": 2.5675, + "step": 26449 + }, + { + "epoch": 0.7843311686386146, + "grad_norm": 0.08608543872833252, + "learning_rate": 0.00011263152766173118, + "loss": 2.5707, + "step": 26450 + }, + { + "epoch": 0.784360821990926, + "grad_norm": 0.08916082978248596, + "learning_rate": 0.00011260177982168906, + "loss": 2.5498, + "step": 26451 + }, + { + "epoch": 0.7843904753432376, + "grad_norm": 0.08818331360816956, + "learning_rate": 0.00011257203541212479, + "loss": 2.574, + "step": 26452 + }, + { + "epoch": 0.784420128695549, + "grad_norm": 0.08208959549665451, + "learning_rate": 0.0001125422944333016, + "loss": 2.5616, + "step": 26453 + }, + { + "epoch": 0.7844497820478605, + "grad_norm": 0.08713537454605103, + "learning_rate": 0.00011251255688548295, + "loss": 2.5299, + "step": 26454 + }, + { + "epoch": 0.784479435400172, + "grad_norm": 0.08189304918050766, + "learning_rate": 0.00011248282276893213, + "loss": 2.527, + "step": 26455 + }, + { + "epoch": 0.7845090887524835, + "grad_norm": 0.09260990470647812, + "learning_rate": 0.00011245309208391241, + "loss": 2.5654, + "step": 26456 + }, + { + "epoch": 0.7845387421047949, + "grad_norm": 0.08635392040014267, + "learning_rate": 0.00011242336483068704, + "loss": 2.5857, + "step": 26457 + }, + { + "epoch": 0.7845683954571064, + "grad_norm": 0.09552416205406189, + "learning_rate": 0.00011239364100951949, + "loss": 2.5558, + "step": 26458 + }, + { + "epoch": 0.7845980488094179, + "grad_norm": 0.09234360605478287, + "learning_rate": 0.00011236392062067263, + "loss": 2.5679, + "step": 26459 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 0.08951307088136673, + "learning_rate": 0.00011233420366440977, + "loss": 2.5565, + "step": 26460 + }, + { + "epoch": 0.7846573555140408, + "grad_norm": 0.09003342688083649, + "learning_rate": 0.00011230449014099414, + "loss": 2.5327, + "step": 26461 + }, + { + "epoch": 0.7846870088663523, + "grad_norm": 0.08946520835161209, + "learning_rate": 0.00011227478005068875, + "loss": 2.5124, + "step": 26462 + }, + { + "epoch": 0.7847166622186639, + "grad_norm": 0.09007302671670914, + "learning_rate": 0.00011224507339375672, + "loss": 2.5618, + "step": 26463 + }, + { + "epoch": 0.7847463155709753, + "grad_norm": 0.0896611362695694, + "learning_rate": 0.00011221537017046101, + "loss": 2.5486, + "step": 26464 + }, + { + "epoch": 0.7847759689232868, + "grad_norm": 0.09105946868658066, + "learning_rate": 0.00011218567038106487, + "loss": 2.5999, + "step": 26465 + }, + { + "epoch": 0.7848056222755982, + "grad_norm": 0.07583606988191605, + "learning_rate": 0.00011215597402583122, + "loss": 2.5453, + "step": 26466 + }, + { + "epoch": 0.7848352756279098, + "grad_norm": 0.09020809829235077, + "learning_rate": 0.00011212628110502298, + "loss": 2.5351, + "step": 26467 + }, + { + "epoch": 0.7848649289802212, + "grad_norm": 0.08092722296714783, + "learning_rate": 0.00011209659161890323, + "loss": 2.5694, + "step": 26468 + }, + { + "epoch": 0.7848945823325327, + "grad_norm": 0.08550863713026047, + "learning_rate": 0.00011206690556773463, + "loss": 2.5312, + "step": 26469 + }, + { + "epoch": 0.7849242356848442, + "grad_norm": 0.09111843258142471, + "learning_rate": 0.00011203722295178015, + "loss": 2.5492, + "step": 26470 + }, + { + "epoch": 0.7849538890371557, + "grad_norm": 0.08554108440876007, + "learning_rate": 0.00011200754377130268, + "loss": 2.587, + "step": 26471 + }, + { + "epoch": 0.7849835423894671, + "grad_norm": 0.09298105537891388, + "learning_rate": 0.00011197786802656495, + "loss": 2.5561, + "step": 26472 + }, + { + "epoch": 0.7850131957417786, + "grad_norm": 0.09075990319252014, + "learning_rate": 0.00011194819571782982, + "loss": 2.5615, + "step": 26473 + }, + { + "epoch": 0.7850428490940901, + "grad_norm": 0.09543034434318542, + "learning_rate": 0.00011191852684536008, + "loss": 2.595, + "step": 26474 + }, + { + "epoch": 0.7850725024464016, + "grad_norm": 0.0847722738981247, + "learning_rate": 0.00011188886140941835, + "loss": 2.5805, + "step": 26475 + }, + { + "epoch": 0.785102155798713, + "grad_norm": 0.08700481057167053, + "learning_rate": 0.00011185919941026739, + "loss": 2.5314, + "step": 26476 + }, + { + "epoch": 0.7851318091510245, + "grad_norm": 0.08496703207492828, + "learning_rate": 0.00011182954084816965, + "loss": 2.5676, + "step": 26477 + }, + { + "epoch": 0.785161462503336, + "grad_norm": 0.0959080159664154, + "learning_rate": 0.00011179988572338818, + "loss": 2.5521, + "step": 26478 + }, + { + "epoch": 0.7851911158556475, + "grad_norm": 0.0867827907204628, + "learning_rate": 0.00011177023403618542, + "loss": 2.5726, + "step": 26479 + }, + { + "epoch": 0.7852207692079589, + "grad_norm": 0.09026861935853958, + "learning_rate": 0.00011174058578682378, + "loss": 2.5361, + "step": 26480 + }, + { + "epoch": 0.7852504225602704, + "grad_norm": 0.09672532975673676, + "learning_rate": 0.00011171094097556588, + "loss": 2.5648, + "step": 26481 + }, + { + "epoch": 0.7852800759125819, + "grad_norm": 0.07787754386663437, + "learning_rate": 0.00011168129960267426, + "loss": 2.5665, + "step": 26482 + }, + { + "epoch": 0.7853097292648934, + "grad_norm": 0.09636089205741882, + "learning_rate": 0.00011165166166841139, + "loss": 2.5625, + "step": 26483 + }, + { + "epoch": 0.7853393826172049, + "grad_norm": 0.08557651937007904, + "learning_rate": 0.00011162202717303971, + "loss": 2.5717, + "step": 26484 + }, + { + "epoch": 0.7853690359695163, + "grad_norm": 0.09134300798177719, + "learning_rate": 0.00011159239611682159, + "loss": 2.531, + "step": 26485 + }, + { + "epoch": 0.7853986893218279, + "grad_norm": 0.0831720158457756, + "learning_rate": 0.00011156276850001956, + "loss": 2.594, + "step": 26486 + }, + { + "epoch": 0.7854283426741393, + "grad_norm": 0.08185378462076187, + "learning_rate": 0.00011153314432289586, + "loss": 2.576, + "step": 26487 + }, + { + "epoch": 0.7854579960264508, + "grad_norm": 0.08781078457832336, + "learning_rate": 0.00011150352358571281, + "loss": 2.5767, + "step": 26488 + }, + { + "epoch": 0.7854876493787623, + "grad_norm": 0.08582714200019836, + "learning_rate": 0.00011147390628873278, + "loss": 2.5801, + "step": 26489 + }, + { + "epoch": 0.7855173027310738, + "grad_norm": 0.08170662820339203, + "learning_rate": 0.00011144429243221798, + "loss": 2.5596, + "step": 26490 + }, + { + "epoch": 0.7855469560833852, + "grad_norm": 0.08846273273229599, + "learning_rate": 0.00011141468201643068, + "loss": 2.5613, + "step": 26491 + }, + { + "epoch": 0.7855766094356967, + "grad_norm": 0.08170121908187866, + "learning_rate": 0.00011138507504163303, + "loss": 2.5678, + "step": 26492 + }, + { + "epoch": 0.7856062627880082, + "grad_norm": 0.08888266235589981, + "learning_rate": 0.00011135547150808727, + "loss": 2.5249, + "step": 26493 + }, + { + "epoch": 0.7856359161403197, + "grad_norm": 0.09196146577596664, + "learning_rate": 0.00011132587141605555, + "loss": 2.5494, + "step": 26494 + }, + { + "epoch": 0.7856655694926311, + "grad_norm": 0.09143669903278351, + "learning_rate": 0.00011129627476579996, + "loss": 2.5815, + "step": 26495 + }, + { + "epoch": 0.7856952228449426, + "grad_norm": 0.09573407471179962, + "learning_rate": 0.00011126668155758252, + "loss": 2.5759, + "step": 26496 + }, + { + "epoch": 0.7857248761972541, + "grad_norm": 0.0882403776049614, + "learning_rate": 0.00011123709179166535, + "loss": 2.5658, + "step": 26497 + }, + { + "epoch": 0.7857545295495656, + "grad_norm": 0.10729075223207474, + "learning_rate": 0.00011120750546831049, + "loss": 2.5727, + "step": 26498 + }, + { + "epoch": 0.785784182901877, + "grad_norm": 0.08158572018146515, + "learning_rate": 0.00011117792258777992, + "loss": 2.5825, + "step": 26499 + }, + { + "epoch": 0.7858138362541885, + "grad_norm": 0.089191734790802, + "learning_rate": 0.00011114834315033557, + "loss": 2.5366, + "step": 26500 + }, + { + "epoch": 0.7858434896065, + "grad_norm": 0.09007400274276733, + "learning_rate": 0.00011111876715623936, + "loss": 2.5586, + "step": 26501 + }, + { + "epoch": 0.7858731429588115, + "grad_norm": 0.0901680439710617, + "learning_rate": 0.0001110891946057534, + "loss": 2.5344, + "step": 26502 + }, + { + "epoch": 0.7859027963111229, + "grad_norm": 0.08077678084373474, + "learning_rate": 0.00011105962549913906, + "loss": 2.529, + "step": 26503 + }, + { + "epoch": 0.7859324496634345, + "grad_norm": 0.08147674798965454, + "learning_rate": 0.00011103005983665864, + "loss": 2.5635, + "step": 26504 + }, + { + "epoch": 0.785962103015746, + "grad_norm": 0.0857582688331604, + "learning_rate": 0.00011100049761857378, + "loss": 2.5477, + "step": 26505 + }, + { + "epoch": 0.7859917563680574, + "grad_norm": 0.08489983528852463, + "learning_rate": 0.00011097093884514636, + "loss": 2.5614, + "step": 26506 + }, + { + "epoch": 0.7860214097203689, + "grad_norm": 0.09045572578907013, + "learning_rate": 0.00011094138351663801, + "loss": 2.5532, + "step": 26507 + }, + { + "epoch": 0.7860510630726804, + "grad_norm": 0.08070476353168488, + "learning_rate": 0.00011091183163331048, + "loss": 2.5506, + "step": 26508 + }, + { + "epoch": 0.7860807164249919, + "grad_norm": 0.09051147848367691, + "learning_rate": 0.00011088228319542548, + "loss": 2.5529, + "step": 26509 + }, + { + "epoch": 0.7861103697773033, + "grad_norm": 0.0869855061173439, + "learning_rate": 0.00011085273820324466, + "loss": 2.5847, + "step": 26510 + }, + { + "epoch": 0.7861400231296148, + "grad_norm": 0.091656893491745, + "learning_rate": 0.00011082319665702962, + "loss": 2.5529, + "step": 26511 + }, + { + "epoch": 0.7861696764819263, + "grad_norm": 0.09308450669050217, + "learning_rate": 0.00011079365855704198, + "loss": 2.5468, + "step": 26512 + }, + { + "epoch": 0.7861993298342378, + "grad_norm": 0.09195193648338318, + "learning_rate": 0.00011076412390354346, + "loss": 2.591, + "step": 26513 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 0.0832533910870552, + "learning_rate": 0.00011073459269679532, + "loss": 2.5611, + "step": 26514 + }, + { + "epoch": 0.7862586365388607, + "grad_norm": 0.11156277358531952, + "learning_rate": 0.00011070506493705913, + "loss": 2.5509, + "step": 26515 + }, + { + "epoch": 0.7862882898911722, + "grad_norm": 0.08894721418619156, + "learning_rate": 0.00011067554062459629, + "loss": 2.5594, + "step": 26516 + }, + { + "epoch": 0.7863179432434837, + "grad_norm": 0.11073776334524155, + "learning_rate": 0.00011064601975966848, + "loss": 2.5763, + "step": 26517 + }, + { + "epoch": 0.7863475965957951, + "grad_norm": 0.082863450050354, + "learning_rate": 0.000110616502342537, + "loss": 2.5408, + "step": 26518 + }, + { + "epoch": 0.7863772499481066, + "grad_norm": 0.10012874007225037, + "learning_rate": 0.00011058698837346326, + "loss": 2.576, + "step": 26519 + }, + { + "epoch": 0.7864069033004181, + "grad_norm": 0.08780862390995026, + "learning_rate": 0.00011055747785270853, + "loss": 2.5653, + "step": 26520 + }, + { + "epoch": 0.7864365566527296, + "grad_norm": 0.10290390998125076, + "learning_rate": 0.00011052797078053423, + "loss": 2.5698, + "step": 26521 + }, + { + "epoch": 0.786466210005041, + "grad_norm": 0.08823831379413605, + "learning_rate": 0.00011049846715720158, + "loss": 2.5674, + "step": 26522 + }, + { + "epoch": 0.7864958633573526, + "grad_norm": 0.09851031005382538, + "learning_rate": 0.00011046896698297198, + "loss": 2.5915, + "step": 26523 + }, + { + "epoch": 0.786525516709664, + "grad_norm": 0.0903107300400734, + "learning_rate": 0.00011043947025810647, + "loss": 2.5827, + "step": 26524 + }, + { + "epoch": 0.7865551700619755, + "grad_norm": 0.0933857262134552, + "learning_rate": 0.00011040997698286625, + "loss": 2.5732, + "step": 26525 + }, + { + "epoch": 0.786584823414287, + "grad_norm": 0.08781720697879791, + "learning_rate": 0.00011038048715751258, + "loss": 2.5525, + "step": 26526 + }, + { + "epoch": 0.7866144767665985, + "grad_norm": 0.09956791996955872, + "learning_rate": 0.00011035100078230653, + "loss": 2.5824, + "step": 26527 + }, + { + "epoch": 0.78664413011891, + "grad_norm": 0.08544167876243591, + "learning_rate": 0.00011032151785750932, + "loss": 2.5696, + "step": 26528 + }, + { + "epoch": 0.7866737834712214, + "grad_norm": 0.10064157098531723, + "learning_rate": 0.00011029203838338181, + "loss": 2.5427, + "step": 26529 + }, + { + "epoch": 0.7867034368235329, + "grad_norm": 0.08498499542474747, + "learning_rate": 0.00011026256236018528, + "loss": 2.5625, + "step": 26530 + }, + { + "epoch": 0.7867330901758444, + "grad_norm": 0.08836200833320618, + "learning_rate": 0.0001102330897881807, + "loss": 2.5802, + "step": 26531 + }, + { + "epoch": 0.7867627435281559, + "grad_norm": 0.08245958387851715, + "learning_rate": 0.00011020362066762902, + "loss": 2.5469, + "step": 26532 + }, + { + "epoch": 0.7867923968804673, + "grad_norm": 0.08832573145627975, + "learning_rate": 0.00011017415499879114, + "loss": 2.5323, + "step": 26533 + }, + { + "epoch": 0.7868220502327788, + "grad_norm": 0.08252027630805969, + "learning_rate": 0.00011014469278192817, + "loss": 2.5458, + "step": 26534 + }, + { + "epoch": 0.7868517035850903, + "grad_norm": 0.0861366018652916, + "learning_rate": 0.00011011523401730078, + "loss": 2.543, + "step": 26535 + }, + { + "epoch": 0.7868813569374018, + "grad_norm": 0.08468380570411682, + "learning_rate": 0.00011008577870516989, + "loss": 2.5278, + "step": 26536 + }, + { + "epoch": 0.7869110102897132, + "grad_norm": 0.08151724189519882, + "learning_rate": 0.00011005632684579636, + "loss": 2.584, + "step": 26537 + }, + { + "epoch": 0.7869406636420248, + "grad_norm": 0.08217247575521469, + "learning_rate": 0.00011002687843944099, + "loss": 2.543, + "step": 26538 + }, + { + "epoch": 0.7869703169943362, + "grad_norm": 0.09008309245109558, + "learning_rate": 0.00010999743348636454, + "loss": 2.5482, + "step": 26539 + }, + { + "epoch": 0.7869999703466477, + "grad_norm": 0.08172745257616043, + "learning_rate": 0.00010996799198682772, + "loss": 2.5553, + "step": 26540 + }, + { + "epoch": 0.7870296236989591, + "grad_norm": 0.09012428671121597, + "learning_rate": 0.00010993855394109136, + "loss": 2.5616, + "step": 26541 + }, + { + "epoch": 0.7870592770512707, + "grad_norm": 0.09228992462158203, + "learning_rate": 0.00010990911934941588, + "loss": 2.5491, + "step": 26542 + }, + { + "epoch": 0.7870889304035821, + "grad_norm": 0.08225883543491364, + "learning_rate": 0.00010987968821206223, + "loss": 2.5579, + "step": 26543 + }, + { + "epoch": 0.7871185837558936, + "grad_norm": 0.08252720534801483, + "learning_rate": 0.00010985026052929104, + "loss": 2.5661, + "step": 26544 + }, + { + "epoch": 0.787148237108205, + "grad_norm": 0.09260360896587372, + "learning_rate": 0.00010982083630136264, + "loss": 2.5906, + "step": 26545 + }, + { + "epoch": 0.7871778904605166, + "grad_norm": 0.08050613850355148, + "learning_rate": 0.00010979141552853772, + "loss": 2.5508, + "step": 26546 + }, + { + "epoch": 0.7872075438128281, + "grad_norm": 0.07953575998544693, + "learning_rate": 0.0001097619982110768, + "loss": 2.5718, + "step": 26547 + }, + { + "epoch": 0.7872371971651395, + "grad_norm": 0.09037350863218307, + "learning_rate": 0.00010973258434924033, + "loss": 2.5325, + "step": 26548 + }, + { + "epoch": 0.787266850517451, + "grad_norm": 0.08279678970575333, + "learning_rate": 0.00010970317394328883, + "loss": 2.5632, + "step": 26549 + }, + { + "epoch": 0.7872965038697625, + "grad_norm": 0.0936778113245964, + "learning_rate": 0.00010967376699348274, + "loss": 2.5808, + "step": 26550 + }, + { + "epoch": 0.787326157222074, + "grad_norm": 0.08233750611543655, + "learning_rate": 0.00010964436350008245, + "loss": 2.5625, + "step": 26551 + }, + { + "epoch": 0.7873558105743854, + "grad_norm": 0.08583687990903854, + "learning_rate": 0.00010961496346334826, + "loss": 2.5477, + "step": 26552 + }, + { + "epoch": 0.787385463926697, + "grad_norm": 0.08579076826572418, + "learning_rate": 0.00010958556688354065, + "loss": 2.5871, + "step": 26553 + }, + { + "epoch": 0.7874151172790084, + "grad_norm": 0.09411388635635376, + "learning_rate": 0.0001095561737609198, + "loss": 2.5565, + "step": 26554 + }, + { + "epoch": 0.7874447706313199, + "grad_norm": 0.08889336884021759, + "learning_rate": 0.00010952678409574606, + "loss": 2.549, + "step": 26555 + }, + { + "epoch": 0.7874744239836313, + "grad_norm": 0.08740351349115372, + "learning_rate": 0.00010949739788827972, + "loss": 2.5473, + "step": 26556 + }, + { + "epoch": 0.7875040773359429, + "grad_norm": 0.09197992831468582, + "learning_rate": 0.00010946801513878091, + "loss": 2.572, + "step": 26557 + }, + { + "epoch": 0.7875337306882543, + "grad_norm": 0.08284694701433182, + "learning_rate": 0.00010943863584750985, + "loss": 2.5078, + "step": 26558 + }, + { + "epoch": 0.7875633840405658, + "grad_norm": 0.0965447649359703, + "learning_rate": 0.00010940926001472673, + "loss": 2.5619, + "step": 26559 + }, + { + "epoch": 0.7875930373928772, + "grad_norm": 0.08767972886562347, + "learning_rate": 0.00010937988764069167, + "loss": 2.5834, + "step": 26560 + }, + { + "epoch": 0.7876226907451888, + "grad_norm": 0.07858936488628387, + "learning_rate": 0.00010935051872566476, + "loss": 2.5792, + "step": 26561 + }, + { + "epoch": 0.7876523440975002, + "grad_norm": 0.08863027393817902, + "learning_rate": 0.000109321153269906, + "loss": 2.5521, + "step": 26562 + }, + { + "epoch": 0.7876819974498117, + "grad_norm": 0.08782152831554413, + "learning_rate": 0.00010929179127367555, + "loss": 2.6022, + "step": 26563 + }, + { + "epoch": 0.7877116508021231, + "grad_norm": 0.08368534594774246, + "learning_rate": 0.00010926243273723329, + "loss": 2.547, + "step": 26564 + }, + { + "epoch": 0.7877413041544347, + "grad_norm": 0.0814802497625351, + "learning_rate": 0.00010923307766083934, + "loss": 2.525, + "step": 26565 + }, + { + "epoch": 0.7877709575067461, + "grad_norm": 0.0811910405755043, + "learning_rate": 0.00010920372604475348, + "loss": 2.5521, + "step": 26566 + }, + { + "epoch": 0.7878006108590576, + "grad_norm": 0.09631797671318054, + "learning_rate": 0.00010917437788923578, + "loss": 2.5875, + "step": 26567 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 0.08302954584360123, + "learning_rate": 0.000109145033194546, + "loss": 2.5998, + "step": 26568 + }, + { + "epoch": 0.7878599175636806, + "grad_norm": 0.09261438250541687, + "learning_rate": 0.00010911569196094406, + "loss": 2.5113, + "step": 26569 + }, + { + "epoch": 0.7878895709159921, + "grad_norm": 0.0822986513376236, + "learning_rate": 0.00010908635418868974, + "loss": 2.5249, + "step": 26570 + }, + { + "epoch": 0.7879192242683035, + "grad_norm": 0.08669736981391907, + "learning_rate": 0.00010905701987804284, + "loss": 2.5465, + "step": 26571 + }, + { + "epoch": 0.787948877620615, + "grad_norm": 0.08439595997333527, + "learning_rate": 0.00010902768902926318, + "loss": 2.5821, + "step": 26572 + }, + { + "epoch": 0.7879785309729265, + "grad_norm": 0.08907236903905869, + "learning_rate": 0.0001089983616426104, + "loss": 2.5547, + "step": 26573 + }, + { + "epoch": 0.788008184325238, + "grad_norm": 0.08138392865657806, + "learning_rate": 0.00010896903771834427, + "loss": 2.5708, + "step": 26574 + }, + { + "epoch": 0.7880378376775494, + "grad_norm": 0.09129474312067032, + "learning_rate": 0.00010893971725672447, + "loss": 2.5834, + "step": 26575 + }, + { + "epoch": 0.788067491029861, + "grad_norm": 0.09014502167701721, + "learning_rate": 0.00010891040025801052, + "loss": 2.5397, + "step": 26576 + }, + { + "epoch": 0.7880971443821724, + "grad_norm": 0.08560419827699661, + "learning_rate": 0.00010888108672246217, + "loss": 2.5779, + "step": 26577 + }, + { + "epoch": 0.7881267977344839, + "grad_norm": 0.09079582244157791, + "learning_rate": 0.00010885177665033902, + "loss": 2.5822, + "step": 26578 + }, + { + "epoch": 0.7881564510867953, + "grad_norm": 0.08706121146678925, + "learning_rate": 0.00010882247004190038, + "loss": 2.5322, + "step": 26579 + }, + { + "epoch": 0.7881861044391069, + "grad_norm": 0.09400045871734619, + "learning_rate": 0.00010879316689740598, + "loss": 2.5588, + "step": 26580 + }, + { + "epoch": 0.7882157577914183, + "grad_norm": 0.09098852425813675, + "learning_rate": 0.00010876386721711507, + "loss": 2.5636, + "step": 26581 + }, + { + "epoch": 0.7882454111437298, + "grad_norm": 0.08750497549772263, + "learning_rate": 0.00010873457100128737, + "loss": 2.5887, + "step": 26582 + }, + { + "epoch": 0.7882750644960412, + "grad_norm": 0.09027241170406342, + "learning_rate": 0.00010870527825018222, + "loss": 2.5565, + "step": 26583 + }, + { + "epoch": 0.7883047178483528, + "grad_norm": 0.08258388936519623, + "learning_rate": 0.000108675988964059, + "loss": 2.5352, + "step": 26584 + }, + { + "epoch": 0.7883343712006642, + "grad_norm": 0.09557683020830154, + "learning_rate": 0.0001086467031431771, + "loss": 2.5709, + "step": 26585 + }, + { + "epoch": 0.7883640245529757, + "grad_norm": 0.07737202942371368, + "learning_rate": 0.0001086174207877958, + "loss": 2.5273, + "step": 26586 + }, + { + "epoch": 0.7883936779052871, + "grad_norm": 0.09300235658884048, + "learning_rate": 0.0001085881418981744, + "loss": 2.5575, + "step": 26587 + }, + { + "epoch": 0.7884233312575987, + "grad_norm": 0.08179054409265518, + "learning_rate": 0.0001085588664745722, + "loss": 2.5618, + "step": 26588 + }, + { + "epoch": 0.7884529846099102, + "grad_norm": 0.09211718291044235, + "learning_rate": 0.00010852959451724859, + "loss": 2.5782, + "step": 26589 + }, + { + "epoch": 0.7884826379622216, + "grad_norm": 0.08337690681219101, + "learning_rate": 0.00010850032602646243, + "loss": 2.5306, + "step": 26590 + }, + { + "epoch": 0.7885122913145332, + "grad_norm": 0.08605366945266724, + "learning_rate": 0.00010847106100247312, + "loss": 2.5634, + "step": 26591 + }, + { + "epoch": 0.7885419446668446, + "grad_norm": 0.08292406797409058, + "learning_rate": 0.00010844179944553973, + "loss": 2.5764, + "step": 26592 + }, + { + "epoch": 0.7885715980191561, + "grad_norm": 0.09583034366369247, + "learning_rate": 0.0001084125413559215, + "loss": 2.5905, + "step": 26593 + }, + { + "epoch": 0.7886012513714675, + "grad_norm": 0.08507979661226273, + "learning_rate": 0.00010838328673387721, + "loss": 2.5371, + "step": 26594 + }, + { + "epoch": 0.7886309047237791, + "grad_norm": 0.08390794694423676, + "learning_rate": 0.00010835403557966627, + "loss": 2.5348, + "step": 26595 + }, + { + "epoch": 0.7886605580760905, + "grad_norm": 0.09071236848831177, + "learning_rate": 0.00010832478789354761, + "loss": 2.5665, + "step": 26596 + }, + { + "epoch": 0.788690211428402, + "grad_norm": 0.09216050058603287, + "learning_rate": 0.00010829554367578021, + "loss": 2.5694, + "step": 26597 + }, + { + "epoch": 0.7887198647807134, + "grad_norm": 0.08747369050979614, + "learning_rate": 0.00010826630292662292, + "loss": 2.5501, + "step": 26598 + }, + { + "epoch": 0.788749518133025, + "grad_norm": 0.09666190296411514, + "learning_rate": 0.00010823706564633495, + "loss": 2.5553, + "step": 26599 + }, + { + "epoch": 0.7887791714853364, + "grad_norm": 0.09030599147081375, + "learning_rate": 0.00010820783183517491, + "loss": 2.5835, + "step": 26600 + }, + { + "epoch": 0.7888088248376479, + "grad_norm": 0.0923956111073494, + "learning_rate": 0.0001081786014934017, + "loss": 2.6008, + "step": 26601 + }, + { + "epoch": 0.7888384781899593, + "grad_norm": 0.08658251911401749, + "learning_rate": 0.00010814937462127428, + "loss": 2.561, + "step": 26602 + }, + { + "epoch": 0.7888681315422709, + "grad_norm": 0.09259182214736938, + "learning_rate": 0.00010812015121905139, + "loss": 2.5683, + "step": 26603 + }, + { + "epoch": 0.7888977848945823, + "grad_norm": 0.09047134220600128, + "learning_rate": 0.00010809093128699177, + "loss": 2.5752, + "step": 26604 + }, + { + "epoch": 0.7889274382468938, + "grad_norm": 0.08968814462423325, + "learning_rate": 0.00010806171482535431, + "loss": 2.5711, + "step": 26605 + }, + { + "epoch": 0.7889570915992052, + "grad_norm": 0.08800341188907623, + "learning_rate": 0.00010803250183439762, + "loss": 2.5441, + "step": 26606 + }, + { + "epoch": 0.7889867449515168, + "grad_norm": 0.08984420448541641, + "learning_rate": 0.00010800329231438022, + "loss": 2.5503, + "step": 26607 + }, + { + "epoch": 0.7890163983038282, + "grad_norm": 0.08523139357566833, + "learning_rate": 0.0001079740862655611, + "loss": 2.5767, + "step": 26608 + }, + { + "epoch": 0.7890460516561397, + "grad_norm": 0.09079854935407639, + "learning_rate": 0.00010794488368819877, + "loss": 2.5895, + "step": 26609 + }, + { + "epoch": 0.7890757050084513, + "grad_norm": 0.08547148108482361, + "learning_rate": 0.00010791568458255191, + "loss": 2.5282, + "step": 26610 + }, + { + "epoch": 0.7891053583607627, + "grad_norm": 0.08602859824895859, + "learning_rate": 0.00010788648894887887, + "loss": 2.5335, + "step": 26611 + }, + { + "epoch": 0.7891350117130742, + "grad_norm": 0.08409584313631058, + "learning_rate": 0.00010785729678743822, + "loss": 2.5287, + "step": 26612 + }, + { + "epoch": 0.7891646650653856, + "grad_norm": 0.08818072825670242, + "learning_rate": 0.00010782810809848853, + "loss": 2.5625, + "step": 26613 + }, + { + "epoch": 0.7891943184176972, + "grad_norm": 0.0826297178864479, + "learning_rate": 0.00010779892288228826, + "loss": 2.5664, + "step": 26614 + }, + { + "epoch": 0.7892239717700086, + "grad_norm": 0.08796311169862747, + "learning_rate": 0.00010776974113909587, + "loss": 2.5528, + "step": 26615 + }, + { + "epoch": 0.7892536251223201, + "grad_norm": 0.08119155466556549, + "learning_rate": 0.00010774056286916973, + "loss": 2.5687, + "step": 26616 + }, + { + "epoch": 0.7892832784746315, + "grad_norm": 0.08921261876821518, + "learning_rate": 0.0001077113880727682, + "loss": 2.5575, + "step": 26617 + }, + { + "epoch": 0.7893129318269431, + "grad_norm": 0.08403433114290237, + "learning_rate": 0.00010768221675014972, + "loss": 2.5555, + "step": 26618 + }, + { + "epoch": 0.7893425851792545, + "grad_norm": 0.09611790627241135, + "learning_rate": 0.00010765304890157251, + "loss": 2.5783, + "step": 26619 + }, + { + "epoch": 0.789372238531566, + "grad_norm": 0.07972610741853714, + "learning_rate": 0.00010762388452729494, + "loss": 2.5715, + "step": 26620 + }, + { + "epoch": 0.7894018918838774, + "grad_norm": 0.08734092861413956, + "learning_rate": 0.00010759472362757522, + "loss": 2.5067, + "step": 26621 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 0.0775870829820633, + "learning_rate": 0.00010756556620267154, + "loss": 2.5587, + "step": 26622 + }, + { + "epoch": 0.7894611985885004, + "grad_norm": 0.08636803179979324, + "learning_rate": 0.00010753641225284217, + "loss": 2.57, + "step": 26623 + }, + { + "epoch": 0.7894908519408119, + "grad_norm": 0.07819926738739014, + "learning_rate": 0.00010750726177834519, + "loss": 2.5162, + "step": 26624 + }, + { + "epoch": 0.7895205052931233, + "grad_norm": 0.08570986986160278, + "learning_rate": 0.0001074781147794388, + "loss": 2.5517, + "step": 26625 + }, + { + "epoch": 0.7895501586454349, + "grad_norm": 0.08087397366762161, + "learning_rate": 0.00010744897125638109, + "loss": 2.5354, + "step": 26626 + }, + { + "epoch": 0.7895798119977463, + "grad_norm": 0.07783824950456619, + "learning_rate": 0.00010741983120943011, + "loss": 2.5407, + "step": 26627 + }, + { + "epoch": 0.7896094653500578, + "grad_norm": 0.08597083389759064, + "learning_rate": 0.00010739069463884394, + "loss": 2.5337, + "step": 26628 + }, + { + "epoch": 0.7896391187023694, + "grad_norm": 0.08749096095561981, + "learning_rate": 0.00010736156154488053, + "loss": 2.5556, + "step": 26629 + }, + { + "epoch": 0.7896687720546808, + "grad_norm": 0.09766211360692978, + "learning_rate": 0.0001073324319277979, + "loss": 2.5702, + "step": 26630 + }, + { + "epoch": 0.7896984254069923, + "grad_norm": 0.08146025985479355, + "learning_rate": 0.00010730330578785397, + "loss": 2.547, + "step": 26631 + }, + { + "epoch": 0.7897280787593037, + "grad_norm": 0.09183508902788162, + "learning_rate": 0.00010727418312530668, + "loss": 2.5658, + "step": 26632 + }, + { + "epoch": 0.7897577321116153, + "grad_norm": 0.09311721473932266, + "learning_rate": 0.00010724506394041388, + "loss": 2.5729, + "step": 26633 + }, + { + "epoch": 0.7897873854639267, + "grad_norm": 0.0887913778424263, + "learning_rate": 0.0001072159482334335, + "loss": 2.5471, + "step": 26634 + }, + { + "epoch": 0.7898170388162382, + "grad_norm": 0.09246368706226349, + "learning_rate": 0.00010718683600462332, + "loss": 2.5866, + "step": 26635 + }, + { + "epoch": 0.7898466921685496, + "grad_norm": 0.09518177062273026, + "learning_rate": 0.00010715772725424111, + "loss": 2.5649, + "step": 26636 + }, + { + "epoch": 0.7898763455208612, + "grad_norm": 0.10339898616075516, + "learning_rate": 0.00010712862198254463, + "loss": 2.5567, + "step": 26637 + }, + { + "epoch": 0.7899059988731726, + "grad_norm": 0.08895042538642883, + "learning_rate": 0.00010709952018979169, + "loss": 2.5526, + "step": 26638 + }, + { + "epoch": 0.7899356522254841, + "grad_norm": 0.10315386205911636, + "learning_rate": 0.00010707042187623989, + "loss": 2.554, + "step": 26639 + }, + { + "epoch": 0.7899653055777955, + "grad_norm": 0.08807697147130966, + "learning_rate": 0.00010704132704214698, + "loss": 2.5649, + "step": 26640 + }, + { + "epoch": 0.7899949589301071, + "grad_norm": 0.09255584329366684, + "learning_rate": 0.00010701223568777058, + "loss": 2.5583, + "step": 26641 + }, + { + "epoch": 0.7900246122824185, + "grad_norm": 0.09667587280273438, + "learning_rate": 0.00010698314781336826, + "loss": 2.5514, + "step": 26642 + }, + { + "epoch": 0.79005426563473, + "grad_norm": 0.08868508785963058, + "learning_rate": 0.00010695406341919766, + "loss": 2.5804, + "step": 26643 + }, + { + "epoch": 0.7900839189870414, + "grad_norm": 0.09967290610074997, + "learning_rate": 0.0001069249825055164, + "loss": 2.577, + "step": 26644 + }, + { + "epoch": 0.790113572339353, + "grad_norm": 0.08507232367992401, + "learning_rate": 0.00010689590507258162, + "loss": 2.5878, + "step": 26645 + }, + { + "epoch": 0.7901432256916644, + "grad_norm": 0.09780240058898926, + "learning_rate": 0.00010686683112065121, + "loss": 2.554, + "step": 26646 + }, + { + "epoch": 0.7901728790439759, + "grad_norm": 0.08403231203556061, + "learning_rate": 0.00010683776064998252, + "loss": 2.558, + "step": 26647 + }, + { + "epoch": 0.7902025323962873, + "grad_norm": 0.09057125449180603, + "learning_rate": 0.00010680869366083295, + "loss": 2.5456, + "step": 26648 + }, + { + "epoch": 0.7902321857485989, + "grad_norm": 0.08296945691108704, + "learning_rate": 0.00010677963015345988, + "loss": 2.5369, + "step": 26649 + }, + { + "epoch": 0.7902618391009104, + "grad_norm": 0.09394524246454239, + "learning_rate": 0.00010675057012812061, + "loss": 2.5407, + "step": 26650 + }, + { + "epoch": 0.7902914924532218, + "grad_norm": 0.08080904930830002, + "learning_rate": 0.00010672151358507265, + "loss": 2.5685, + "step": 26651 + }, + { + "epoch": 0.7903211458055334, + "grad_norm": 0.09070155769586563, + "learning_rate": 0.0001066924605245731, + "loss": 2.5539, + "step": 26652 + }, + { + "epoch": 0.7903507991578448, + "grad_norm": 0.07830779999494553, + "learning_rate": 0.00010666341094687937, + "loss": 2.5331, + "step": 26653 + }, + { + "epoch": 0.7903804525101563, + "grad_norm": 0.08171924203634262, + "learning_rate": 0.0001066343648522488, + "loss": 2.563, + "step": 26654 + }, + { + "epoch": 0.7904101058624677, + "grad_norm": 0.07842331379652023, + "learning_rate": 0.00010660532224093828, + "loss": 2.5146, + "step": 26655 + }, + { + "epoch": 0.7904397592147793, + "grad_norm": 0.08749416470527649, + "learning_rate": 0.00010657628311320517, + "loss": 2.545, + "step": 26656 + }, + { + "epoch": 0.7904694125670907, + "grad_norm": 0.08121920377016068, + "learning_rate": 0.00010654724746930661, + "loss": 2.5392, + "step": 26657 + }, + { + "epoch": 0.7904990659194022, + "grad_norm": 0.08306697010993958, + "learning_rate": 0.00010651821530949957, + "loss": 2.5476, + "step": 26658 + }, + { + "epoch": 0.7905287192717136, + "grad_norm": 0.09308336675167084, + "learning_rate": 0.0001064891866340414, + "loss": 2.5696, + "step": 26659 + }, + { + "epoch": 0.7905583726240252, + "grad_norm": 0.08207471668720245, + "learning_rate": 0.00010646016144318904, + "loss": 2.5469, + "step": 26660 + }, + { + "epoch": 0.7905880259763366, + "grad_norm": 0.09431791305541992, + "learning_rate": 0.00010643113973719948, + "loss": 2.5501, + "step": 26661 + }, + { + "epoch": 0.7906176793286481, + "grad_norm": 0.08799469470977783, + "learning_rate": 0.00010640212151632977, + "loss": 2.5868, + "step": 26662 + }, + { + "epoch": 0.7906473326809595, + "grad_norm": 0.08005188405513763, + "learning_rate": 0.00010637310678083678, + "loss": 2.5661, + "step": 26663 + }, + { + "epoch": 0.7906769860332711, + "grad_norm": 0.09560790657997131, + "learning_rate": 0.00010634409553097751, + "loss": 2.5433, + "step": 26664 + }, + { + "epoch": 0.7907066393855825, + "grad_norm": 0.09103468805551529, + "learning_rate": 0.00010631508776700905, + "loss": 2.5594, + "step": 26665 + }, + { + "epoch": 0.790736292737894, + "grad_norm": 0.08903571963310242, + "learning_rate": 0.00010628608348918783, + "loss": 2.5651, + "step": 26666 + }, + { + "epoch": 0.7907659460902055, + "grad_norm": 0.08973584324121475, + "learning_rate": 0.00010625708269777096, + "loss": 2.5728, + "step": 26667 + }, + { + "epoch": 0.790795599442517, + "grad_norm": 0.09288080036640167, + "learning_rate": 0.00010622808539301526, + "loss": 2.52, + "step": 26668 + }, + { + "epoch": 0.7908252527948284, + "grad_norm": 0.08464677631855011, + "learning_rate": 0.00010619909157517738, + "loss": 2.5507, + "step": 26669 + }, + { + "epoch": 0.7908549061471399, + "grad_norm": 0.08776404708623886, + "learning_rate": 0.00010617010124451415, + "loss": 2.524, + "step": 26670 + }, + { + "epoch": 0.7908845594994515, + "grad_norm": 0.08514538407325745, + "learning_rate": 0.00010614111440128216, + "loss": 2.5539, + "step": 26671 + }, + { + "epoch": 0.7909142128517629, + "grad_norm": 0.0896250531077385, + "learning_rate": 0.00010611213104573836, + "loss": 2.5934, + "step": 26672 + }, + { + "epoch": 0.7909438662040744, + "grad_norm": 0.08211299031972885, + "learning_rate": 0.00010608315117813921, + "loss": 2.5517, + "step": 26673 + }, + { + "epoch": 0.7909735195563858, + "grad_norm": 0.08496693521738052, + "learning_rate": 0.00010605417479874141, + "loss": 2.5395, + "step": 26674 + }, + { + "epoch": 0.7910031729086974, + "grad_norm": 0.09344599395990372, + "learning_rate": 0.00010602520190780157, + "loss": 2.5671, + "step": 26675 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 0.08153089880943298, + "learning_rate": 0.00010599623250557616, + "loss": 2.5595, + "step": 26676 + }, + { + "epoch": 0.7910624796133203, + "grad_norm": 0.0806528851389885, + "learning_rate": 0.0001059672665923217, + "loss": 2.5451, + "step": 26677 + }, + { + "epoch": 0.7910921329656317, + "grad_norm": 0.08835318684577942, + "learning_rate": 0.00010593830416829469, + "loss": 2.5936, + "step": 26678 + }, + { + "epoch": 0.7911217863179433, + "grad_norm": 0.08327174186706543, + "learning_rate": 0.00010590934523375168, + "loss": 2.5565, + "step": 26679 + }, + { + "epoch": 0.7911514396702547, + "grad_norm": 0.08323442935943604, + "learning_rate": 0.00010588038978894904, + "loss": 2.5724, + "step": 26680 + }, + { + "epoch": 0.7911810930225662, + "grad_norm": 0.08800999820232391, + "learning_rate": 0.0001058514378341432, + "loss": 2.5796, + "step": 26681 + }, + { + "epoch": 0.7912107463748776, + "grad_norm": 0.07969176024198532, + "learning_rate": 0.00010582248936959055, + "loss": 2.5824, + "step": 26682 + }, + { + "epoch": 0.7912403997271892, + "grad_norm": 0.08991961181163788, + "learning_rate": 0.0001057935443955474, + "loss": 2.5678, + "step": 26683 + }, + { + "epoch": 0.7912700530795006, + "grad_norm": 0.0848252922296524, + "learning_rate": 0.00010576460291226997, + "loss": 2.5347, + "step": 26684 + }, + { + "epoch": 0.7912997064318121, + "grad_norm": 0.08767151087522507, + "learning_rate": 0.00010573566492001474, + "loss": 2.5613, + "step": 26685 + }, + { + "epoch": 0.7913293597841236, + "grad_norm": 0.0884767472743988, + "learning_rate": 0.00010570673041903806, + "loss": 2.5721, + "step": 26686 + }, + { + "epoch": 0.7913590131364351, + "grad_norm": 0.08914544433355331, + "learning_rate": 0.00010567779940959577, + "loss": 2.5834, + "step": 26687 + }, + { + "epoch": 0.7913886664887465, + "grad_norm": 0.08792434632778168, + "learning_rate": 0.00010564887189194428, + "loss": 2.5564, + "step": 26688 + }, + { + "epoch": 0.791418319841058, + "grad_norm": 0.09470577538013458, + "learning_rate": 0.00010561994786633972, + "loss": 2.584, + "step": 26689 + }, + { + "epoch": 0.7914479731933695, + "grad_norm": 0.0824807658791542, + "learning_rate": 0.00010559102733303822, + "loss": 2.5613, + "step": 26690 + }, + { + "epoch": 0.791477626545681, + "grad_norm": 0.09069401025772095, + "learning_rate": 0.0001055621102922959, + "loss": 2.5607, + "step": 26691 + }, + { + "epoch": 0.7915072798979925, + "grad_norm": 0.07843286544084549, + "learning_rate": 0.00010553319674436873, + "loss": 2.5925, + "step": 26692 + }, + { + "epoch": 0.7915369332503039, + "grad_norm": 0.08999418467283249, + "learning_rate": 0.00010550428668951284, + "loss": 2.5401, + "step": 26693 + }, + { + "epoch": 0.7915665866026155, + "grad_norm": 0.08139971643686295, + "learning_rate": 0.00010547538012798424, + "loss": 2.5277, + "step": 26694 + }, + { + "epoch": 0.7915962399549269, + "grad_norm": 0.08476737141609192, + "learning_rate": 0.00010544647706003885, + "loss": 2.548, + "step": 26695 + }, + { + "epoch": 0.7916258933072384, + "grad_norm": 0.0929698571562767, + "learning_rate": 0.00010541757748593262, + "loss": 2.5759, + "step": 26696 + }, + { + "epoch": 0.7916555466595498, + "grad_norm": 0.08077434450387955, + "learning_rate": 0.00010538868140592145, + "loss": 2.5396, + "step": 26697 + }, + { + "epoch": 0.7916852000118614, + "grad_norm": 0.08082806318998337, + "learning_rate": 0.00010535978882026126, + "loss": 2.5684, + "step": 26698 + }, + { + "epoch": 0.7917148533641728, + "grad_norm": 0.0928138867020607, + "learning_rate": 0.0001053308997292079, + "loss": 2.5423, + "step": 26699 + }, + { + "epoch": 0.7917445067164843, + "grad_norm": 0.08311820775270462, + "learning_rate": 0.00010530201413301716, + "loss": 2.5332, + "step": 26700 + }, + { + "epoch": 0.7917741600687958, + "grad_norm": 0.0872202217578888, + "learning_rate": 0.00010527313203194483, + "loss": 2.5811, + "step": 26701 + }, + { + "epoch": 0.7918038134211073, + "grad_norm": 0.08547793328762054, + "learning_rate": 0.00010524425342624666, + "loss": 2.5679, + "step": 26702 + }, + { + "epoch": 0.7918334667734187, + "grad_norm": 0.08015470206737518, + "learning_rate": 0.0001052153783161784, + "loss": 2.598, + "step": 26703 + }, + { + "epoch": 0.7918631201257302, + "grad_norm": 0.09710992127656937, + "learning_rate": 0.0001051865067019957, + "loss": 2.5328, + "step": 26704 + }, + { + "epoch": 0.7918927734780417, + "grad_norm": 0.08213699609041214, + "learning_rate": 0.00010515763858395428, + "loss": 2.5577, + "step": 26705 + }, + { + "epoch": 0.7919224268303532, + "grad_norm": 0.09623242169618607, + "learning_rate": 0.00010512877396230969, + "loss": 2.5307, + "step": 26706 + }, + { + "epoch": 0.7919520801826646, + "grad_norm": 0.09002181142568588, + "learning_rate": 0.00010509991283731762, + "loss": 2.5431, + "step": 26707 + }, + { + "epoch": 0.7919817335349761, + "grad_norm": 0.07941786199808121, + "learning_rate": 0.00010507105520923365, + "loss": 2.5381, + "step": 26708 + }, + { + "epoch": 0.7920113868872876, + "grad_norm": 0.08794385939836502, + "learning_rate": 0.00010504220107831336, + "loss": 2.4985, + "step": 26709 + }, + { + "epoch": 0.7920410402395991, + "grad_norm": 0.09061659127473831, + "learning_rate": 0.00010501335044481192, + "loss": 2.5793, + "step": 26710 + }, + { + "epoch": 0.7920706935919105, + "grad_norm": 0.08410802483558655, + "learning_rate": 0.00010498450330898518, + "loss": 2.5759, + "step": 26711 + }, + { + "epoch": 0.792100346944222, + "grad_norm": 0.08776361495256424, + "learning_rate": 0.0001049556596710885, + "loss": 2.5268, + "step": 26712 + }, + { + "epoch": 0.7921300002965336, + "grad_norm": 0.09003464132547379, + "learning_rate": 0.00010492681953137723, + "loss": 2.571, + "step": 26713 + }, + { + "epoch": 0.792159653648845, + "grad_norm": 0.08080898970365524, + "learning_rate": 0.00010489798289010682, + "loss": 2.549, + "step": 26714 + }, + { + "epoch": 0.7921893070011565, + "grad_norm": 0.08648906648159027, + "learning_rate": 0.00010486914974753253, + "loss": 2.5479, + "step": 26715 + }, + { + "epoch": 0.792218960353468, + "grad_norm": 0.09604499489068985, + "learning_rate": 0.00010484032010390982, + "loss": 2.545, + "step": 26716 + }, + { + "epoch": 0.7922486137057795, + "grad_norm": 0.08315459638834, + "learning_rate": 0.00010481149395949386, + "loss": 2.532, + "step": 26717 + }, + { + "epoch": 0.7922782670580909, + "grad_norm": 0.09040262550115585, + "learning_rate": 0.00010478267131453994, + "loss": 2.5847, + "step": 26718 + }, + { + "epoch": 0.7923079204104024, + "grad_norm": 0.08264866471290588, + "learning_rate": 0.00010475385216930333, + "loss": 2.5827, + "step": 26719 + }, + { + "epoch": 0.7923375737627139, + "grad_norm": 0.0928775742650032, + "learning_rate": 0.00010472503652403931, + "loss": 2.593, + "step": 26720 + }, + { + "epoch": 0.7923672271150254, + "grad_norm": 0.07970406860113144, + "learning_rate": 0.00010469622437900283, + "loss": 2.5479, + "step": 26721 + }, + { + "epoch": 0.7923968804673368, + "grad_norm": 0.09759467095136642, + "learning_rate": 0.0001046674157344491, + "loss": 2.5416, + "step": 26722 + }, + { + "epoch": 0.7924265338196483, + "grad_norm": 0.09057241678237915, + "learning_rate": 0.00010463861059063317, + "loss": 2.5758, + "step": 26723 + }, + { + "epoch": 0.7924561871719598, + "grad_norm": 0.0962500348687172, + "learning_rate": 0.00010460980894781036, + "loss": 2.5545, + "step": 26724 + }, + { + "epoch": 0.7924858405242713, + "grad_norm": 0.09253334254026413, + "learning_rate": 0.0001045810108062355, + "loss": 2.518, + "step": 26725 + }, + { + "epoch": 0.7925154938765827, + "grad_norm": 0.1062317043542862, + "learning_rate": 0.00010455221616616368, + "loss": 2.5663, + "step": 26726 + }, + { + "epoch": 0.7925451472288942, + "grad_norm": 0.08967683464288712, + "learning_rate": 0.0001045234250278499, + "loss": 2.5402, + "step": 26727 + }, + { + "epoch": 0.7925748005812057, + "grad_norm": 0.09611547738313675, + "learning_rate": 0.00010449463739154903, + "loss": 2.508, + "step": 26728 + }, + { + "epoch": 0.7926044539335172, + "grad_norm": 0.08525119721889496, + "learning_rate": 0.00010446585325751606, + "loss": 2.5336, + "step": 26729 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 0.10281606763601303, + "learning_rate": 0.00010443707262600599, + "loss": 2.5618, + "step": 26730 + }, + { + "epoch": 0.7926637606381401, + "grad_norm": 0.08364783227443695, + "learning_rate": 0.00010440829549727337, + "loss": 2.5474, + "step": 26731 + }, + { + "epoch": 0.7926934139904516, + "grad_norm": 0.10023048520088196, + "learning_rate": 0.00010437952187157323, + "loss": 2.5696, + "step": 26732 + }, + { + "epoch": 0.7927230673427631, + "grad_norm": 0.08198712766170502, + "learning_rate": 0.0001043507517491603, + "loss": 2.5633, + "step": 26733 + }, + { + "epoch": 0.7927527206950746, + "grad_norm": 0.09764163196086884, + "learning_rate": 0.0001043219851302894, + "loss": 2.5788, + "step": 26734 + }, + { + "epoch": 0.792782374047386, + "grad_norm": 0.08297279477119446, + "learning_rate": 0.00010429322201521524, + "loss": 2.5808, + "step": 26735 + }, + { + "epoch": 0.7928120273996976, + "grad_norm": 0.09504665434360504, + "learning_rate": 0.00010426446240419235, + "loss": 2.5621, + "step": 26736 + }, + { + "epoch": 0.792841680752009, + "grad_norm": 0.0897071585059166, + "learning_rate": 0.00010423570629747575, + "loss": 2.5249, + "step": 26737 + }, + { + "epoch": 0.7928713341043205, + "grad_norm": 0.08244463801383972, + "learning_rate": 0.00010420695369531991, + "loss": 2.5573, + "step": 26738 + }, + { + "epoch": 0.792900987456632, + "grad_norm": 0.08298204094171524, + "learning_rate": 0.00010417820459797939, + "loss": 2.5618, + "step": 26739 + }, + { + "epoch": 0.7929306408089435, + "grad_norm": 0.08929387480020523, + "learning_rate": 0.00010414945900570883, + "loss": 2.573, + "step": 26740 + }, + { + "epoch": 0.7929602941612549, + "grad_norm": 0.08991899341344833, + "learning_rate": 0.00010412071691876291, + "loss": 2.5289, + "step": 26741 + }, + { + "epoch": 0.7929899475135664, + "grad_norm": 0.09172739833593369, + "learning_rate": 0.00010409197833739581, + "loss": 2.5555, + "step": 26742 + }, + { + "epoch": 0.7930196008658779, + "grad_norm": 0.08473795652389526, + "learning_rate": 0.00010406324326186223, + "loss": 2.5319, + "step": 26743 + }, + { + "epoch": 0.7930492542181894, + "grad_norm": 0.09861864894628525, + "learning_rate": 0.00010403451169241663, + "loss": 2.5637, + "step": 26744 + }, + { + "epoch": 0.7930789075705008, + "grad_norm": 0.09196417033672333, + "learning_rate": 0.00010400578362931334, + "loss": 2.5768, + "step": 26745 + }, + { + "epoch": 0.7931085609228123, + "grad_norm": 0.08928507566452026, + "learning_rate": 0.0001039770590728068, + "loss": 2.5516, + "step": 26746 + }, + { + "epoch": 0.7931382142751238, + "grad_norm": 0.09220432490110397, + "learning_rate": 0.0001039483380231514, + "loss": 2.5737, + "step": 26747 + }, + { + "epoch": 0.7931678676274353, + "grad_norm": 0.08794165402650833, + "learning_rate": 0.0001039196204806015, + "loss": 2.5523, + "step": 26748 + }, + { + "epoch": 0.7931975209797467, + "grad_norm": 0.09714522957801819, + "learning_rate": 0.00010389090644541116, + "loss": 2.5665, + "step": 26749 + }, + { + "epoch": 0.7932271743320582, + "grad_norm": 0.08648417890071869, + "learning_rate": 0.00010386219591783496, + "loss": 2.5645, + "step": 26750 + }, + { + "epoch": 0.7932568276843697, + "grad_norm": 0.0922674685716629, + "learning_rate": 0.00010383348889812716, + "loss": 2.5279, + "step": 26751 + }, + { + "epoch": 0.7932864810366812, + "grad_norm": 0.08626767247915268, + "learning_rate": 0.0001038047853865417, + "loss": 2.5528, + "step": 26752 + }, + { + "epoch": 0.7933161343889926, + "grad_norm": 0.08613576740026474, + "learning_rate": 0.00010377608538333283, + "loss": 2.536, + "step": 26753 + }, + { + "epoch": 0.7933457877413042, + "grad_norm": 0.08291593939065933, + "learning_rate": 0.00010374738888875478, + "loss": 2.5512, + "step": 26754 + }, + { + "epoch": 0.7933754410936157, + "grad_norm": 0.08428838104009628, + "learning_rate": 0.0001037186959030616, + "loss": 2.5474, + "step": 26755 + }, + { + "epoch": 0.7934050944459271, + "grad_norm": 0.08811398595571518, + "learning_rate": 0.00010369000642650739, + "loss": 2.5294, + "step": 26756 + }, + { + "epoch": 0.7934347477982386, + "grad_norm": 0.08464948832988739, + "learning_rate": 0.00010366132045934618, + "loss": 2.519, + "step": 26757 + }, + { + "epoch": 0.7934644011505501, + "grad_norm": 0.08196853846311569, + "learning_rate": 0.00010363263800183204, + "loss": 2.5773, + "step": 26758 + }, + { + "epoch": 0.7934940545028616, + "grad_norm": 0.08997776359319687, + "learning_rate": 0.00010360395905421887, + "loss": 2.5618, + "step": 26759 + }, + { + "epoch": 0.793523707855173, + "grad_norm": 0.08498113602399826, + "learning_rate": 0.00010357528361676072, + "loss": 2.553, + "step": 26760 + }, + { + "epoch": 0.7935533612074845, + "grad_norm": 0.0847739428281784, + "learning_rate": 0.00010354661168971147, + "loss": 2.5796, + "step": 26761 + }, + { + "epoch": 0.793583014559796, + "grad_norm": 0.08164001256227493, + "learning_rate": 0.00010351794327332503, + "loss": 2.5538, + "step": 26762 + }, + { + "epoch": 0.7936126679121075, + "grad_norm": 0.0793151706457138, + "learning_rate": 0.00010348927836785527, + "loss": 2.5604, + "step": 26763 + }, + { + "epoch": 0.7936423212644189, + "grad_norm": 0.07626264542341232, + "learning_rate": 0.00010346061697355603, + "loss": 2.5662, + "step": 26764 + }, + { + "epoch": 0.7936719746167304, + "grad_norm": 0.0807216465473175, + "learning_rate": 0.00010343195909068104, + "loss": 2.5361, + "step": 26765 + }, + { + "epoch": 0.7937016279690419, + "grad_norm": 0.08319125324487686, + "learning_rate": 0.00010340330471948417, + "loss": 2.5841, + "step": 26766 + }, + { + "epoch": 0.7937312813213534, + "grad_norm": 0.08348985016345978, + "learning_rate": 0.00010337465386021905, + "loss": 2.5667, + "step": 26767 + }, + { + "epoch": 0.7937609346736648, + "grad_norm": 0.07820197939872742, + "learning_rate": 0.00010334600651313952, + "loss": 2.529, + "step": 26768 + }, + { + "epoch": 0.7937905880259764, + "grad_norm": 0.08653217554092407, + "learning_rate": 0.00010331736267849912, + "loss": 2.5574, + "step": 26769 + }, + { + "epoch": 0.7938202413782878, + "grad_norm": 0.08553742617368698, + "learning_rate": 0.00010328872235655163, + "loss": 2.5373, + "step": 26770 + }, + { + "epoch": 0.7938498947305993, + "grad_norm": 0.08286039531230927, + "learning_rate": 0.00010326008554755057, + "loss": 2.5539, + "step": 26771 + }, + { + "epoch": 0.7938795480829107, + "grad_norm": 0.08272125571966171, + "learning_rate": 0.00010323145225174952, + "loss": 2.5698, + "step": 26772 + }, + { + "epoch": 0.7939092014352223, + "grad_norm": 0.08144926279783249, + "learning_rate": 0.0001032028224694021, + "loss": 2.5719, + "step": 26773 + }, + { + "epoch": 0.7939388547875337, + "grad_norm": 0.08377066254615784, + "learning_rate": 0.0001031741962007618, + "loss": 2.5468, + "step": 26774 + }, + { + "epoch": 0.7939685081398452, + "grad_norm": 0.07769124954938889, + "learning_rate": 0.00010314557344608211, + "loss": 2.5398, + "step": 26775 + }, + { + "epoch": 0.7939981614921567, + "grad_norm": 0.08651076257228851, + "learning_rate": 0.00010311695420561645, + "loss": 2.5694, + "step": 26776 + }, + { + "epoch": 0.7940278148444682, + "grad_norm": 0.08174967765808105, + "learning_rate": 0.00010308833847961829, + "loss": 2.5622, + "step": 26777 + }, + { + "epoch": 0.7940574681967797, + "grad_norm": 0.08349641412496567, + "learning_rate": 0.00010305972626834103, + "loss": 2.5458, + "step": 26778 + }, + { + "epoch": 0.7940871215490911, + "grad_norm": 0.08901368826627731, + "learning_rate": 0.00010303111757203804, + "loss": 2.5522, + "step": 26779 + }, + { + "epoch": 0.7941167749014026, + "grad_norm": 0.081881083548069, + "learning_rate": 0.00010300251239096264, + "loss": 2.5296, + "step": 26780 + }, + { + "epoch": 0.7941464282537141, + "grad_norm": 0.09472133964300156, + "learning_rate": 0.00010297391072536816, + "loss": 2.5556, + "step": 26781 + }, + { + "epoch": 0.7941760816060256, + "grad_norm": 0.09290928393602371, + "learning_rate": 0.00010294531257550782, + "loss": 2.5701, + "step": 26782 + }, + { + "epoch": 0.794205734958337, + "grad_norm": 0.08763408660888672, + "learning_rate": 0.00010291671794163487, + "loss": 2.5851, + "step": 26783 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 0.08754859119653702, + "learning_rate": 0.00010288812682400256, + "loss": 2.5701, + "step": 26784 + }, + { + "epoch": 0.79426504166296, + "grad_norm": 0.08980170637369156, + "learning_rate": 0.00010285953922286406, + "loss": 2.5632, + "step": 26785 + }, + { + "epoch": 0.7942946950152715, + "grad_norm": 0.08913212269544601, + "learning_rate": 0.00010283095513847268, + "loss": 2.57, + "step": 26786 + }, + { + "epoch": 0.7943243483675829, + "grad_norm": 0.08883444964885712, + "learning_rate": 0.00010280237457108115, + "loss": 2.5654, + "step": 26787 + }, + { + "epoch": 0.7943540017198945, + "grad_norm": 0.0922640785574913, + "learning_rate": 0.00010277379752094268, + "loss": 2.5397, + "step": 26788 + }, + { + "epoch": 0.7943836550722059, + "grad_norm": 0.08360162377357483, + "learning_rate": 0.00010274522398831054, + "loss": 2.5789, + "step": 26789 + }, + { + "epoch": 0.7944133084245174, + "grad_norm": 0.08456900715827942, + "learning_rate": 0.00010271665397343766, + "loss": 2.5613, + "step": 26790 + }, + { + "epoch": 0.7944429617768288, + "grad_norm": 0.0879364088177681, + "learning_rate": 0.00010268808747657698, + "loss": 2.5371, + "step": 26791 + }, + { + "epoch": 0.7944726151291404, + "grad_norm": 0.09007623791694641, + "learning_rate": 0.0001026595244979815, + "loss": 2.5139, + "step": 26792 + }, + { + "epoch": 0.7945022684814518, + "grad_norm": 0.08837471157312393, + "learning_rate": 0.00010263096503790409, + "loss": 2.5508, + "step": 26793 + }, + { + "epoch": 0.7945319218337633, + "grad_norm": 0.08676723390817642, + "learning_rate": 0.00010260240909659773, + "loss": 2.5323, + "step": 26794 + }, + { + "epoch": 0.7945615751860747, + "grad_norm": 0.0823565274477005, + "learning_rate": 0.00010257385667431524, + "loss": 2.5211, + "step": 26795 + }, + { + "epoch": 0.7945912285383863, + "grad_norm": 0.08583806455135345, + "learning_rate": 0.0001025453077713096, + "loss": 2.5344, + "step": 26796 + }, + { + "epoch": 0.7946208818906978, + "grad_norm": 0.0865999162197113, + "learning_rate": 0.0001025167623878333, + "loss": 2.5947, + "step": 26797 + }, + { + "epoch": 0.7946505352430092, + "grad_norm": 0.08886100351810455, + "learning_rate": 0.00010248822052413937, + "loss": 2.5504, + "step": 26798 + }, + { + "epoch": 0.7946801885953207, + "grad_norm": 0.08188407868146896, + "learning_rate": 0.00010245968218048046, + "loss": 2.5475, + "step": 26799 + }, + { + "epoch": 0.7947098419476322, + "grad_norm": 0.08764605224132538, + "learning_rate": 0.00010243114735710928, + "loss": 2.5468, + "step": 26800 + }, + { + "epoch": 0.7947394952999437, + "grad_norm": 0.09040458500385284, + "learning_rate": 0.00010240261605427842, + "loss": 2.5772, + "step": 26801 + }, + { + "epoch": 0.7947691486522551, + "grad_norm": 0.0925130844116211, + "learning_rate": 0.00010237408827224076, + "loss": 2.5632, + "step": 26802 + }, + { + "epoch": 0.7947988020045667, + "grad_norm": 0.07781386375427246, + "learning_rate": 0.00010234556401124878, + "loss": 2.5441, + "step": 26803 + }, + { + "epoch": 0.7948284553568781, + "grad_norm": 0.08582183718681335, + "learning_rate": 0.00010231704327155517, + "loss": 2.5677, + "step": 26804 + }, + { + "epoch": 0.7948581087091896, + "grad_norm": 0.0897032767534256, + "learning_rate": 0.00010228852605341232, + "loss": 2.5675, + "step": 26805 + }, + { + "epoch": 0.794887762061501, + "grad_norm": 0.08533928543329239, + "learning_rate": 0.00010226001235707299, + "loss": 2.5223, + "step": 26806 + }, + { + "epoch": 0.7949174154138126, + "grad_norm": 0.09312872588634491, + "learning_rate": 0.00010223150218278943, + "loss": 2.5894, + "step": 26807 + }, + { + "epoch": 0.794947068766124, + "grad_norm": 0.0813106968998909, + "learning_rate": 0.00010220299553081414, + "loss": 2.5509, + "step": 26808 + }, + { + "epoch": 0.7949767221184355, + "grad_norm": 0.08642087876796722, + "learning_rate": 0.00010217449240139964, + "loss": 2.5335, + "step": 26809 + }, + { + "epoch": 0.7950063754707469, + "grad_norm": 0.08941833674907684, + "learning_rate": 0.00010214599279479825, + "loss": 2.5786, + "step": 26810 + }, + { + "epoch": 0.7950360288230585, + "grad_norm": 0.08500517904758453, + "learning_rate": 0.00010211749671126241, + "loss": 2.556, + "step": 26811 + }, + { + "epoch": 0.7950656821753699, + "grad_norm": 0.08169298619031906, + "learning_rate": 0.00010208900415104444, + "loss": 2.5841, + "step": 26812 + }, + { + "epoch": 0.7950953355276814, + "grad_norm": 0.0854891762137413, + "learning_rate": 0.00010206051511439651, + "loss": 2.5264, + "step": 26813 + }, + { + "epoch": 0.7951249888799928, + "grad_norm": 0.08547800779342651, + "learning_rate": 0.00010203202960157116, + "loss": 2.5681, + "step": 26814 + }, + { + "epoch": 0.7951546422323044, + "grad_norm": 0.08987490087747574, + "learning_rate": 0.00010200354761282049, + "loss": 2.594, + "step": 26815 + }, + { + "epoch": 0.7951842955846158, + "grad_norm": 0.0839558094739914, + "learning_rate": 0.00010197506914839671, + "loss": 2.5416, + "step": 26816 + }, + { + "epoch": 0.7952139489369273, + "grad_norm": 0.08878972381353378, + "learning_rate": 0.00010194659420855218, + "loss": 2.5456, + "step": 26817 + }, + { + "epoch": 0.7952436022892388, + "grad_norm": 0.08365847915410995, + "learning_rate": 0.00010191812279353868, + "loss": 2.5225, + "step": 26818 + }, + { + "epoch": 0.7952732556415503, + "grad_norm": 0.08547809720039368, + "learning_rate": 0.00010188965490360862, + "loss": 2.5831, + "step": 26819 + }, + { + "epoch": 0.7953029089938618, + "grad_norm": 0.08691230416297913, + "learning_rate": 0.00010186119053901393, + "loss": 2.5359, + "step": 26820 + }, + { + "epoch": 0.7953325623461732, + "grad_norm": 0.09211868047714233, + "learning_rate": 0.00010183272970000678, + "loss": 2.5566, + "step": 26821 + }, + { + "epoch": 0.7953622156984848, + "grad_norm": 0.08355113863945007, + "learning_rate": 0.00010180427238683915, + "loss": 2.5577, + "step": 26822 + }, + { + "epoch": 0.7953918690507962, + "grad_norm": 0.09186078608036041, + "learning_rate": 0.00010177581859976304, + "loss": 2.5488, + "step": 26823 + }, + { + "epoch": 0.7954215224031077, + "grad_norm": 0.08771234005689621, + "learning_rate": 0.00010174736833903037, + "loss": 2.5517, + "step": 26824 + }, + { + "epoch": 0.7954511757554191, + "grad_norm": 0.090121790766716, + "learning_rate": 0.00010171892160489315, + "loss": 2.5978, + "step": 26825 + }, + { + "epoch": 0.7954808291077307, + "grad_norm": 0.09097189456224442, + "learning_rate": 0.00010169047839760309, + "loss": 2.5635, + "step": 26826 + }, + { + "epoch": 0.7955104824600421, + "grad_norm": 0.08630067855119705, + "learning_rate": 0.00010166203871741247, + "loss": 2.5771, + "step": 26827 + }, + { + "epoch": 0.7955401358123536, + "grad_norm": 0.0850902870297432, + "learning_rate": 0.00010163360256457276, + "loss": 2.5536, + "step": 26828 + }, + { + "epoch": 0.795569789164665, + "grad_norm": 0.08032293617725372, + "learning_rate": 0.00010160516993933588, + "loss": 2.5457, + "step": 26829 + }, + { + "epoch": 0.7955994425169766, + "grad_norm": 0.0881548821926117, + "learning_rate": 0.00010157674084195362, + "loss": 2.5436, + "step": 26830 + }, + { + "epoch": 0.795629095869288, + "grad_norm": 0.0797918364405632, + "learning_rate": 0.00010154831527267766, + "loss": 2.5377, + "step": 26831 + }, + { + "epoch": 0.7956587492215995, + "grad_norm": 0.08294414728879929, + "learning_rate": 0.00010151989323175981, + "loss": 2.5649, + "step": 26832 + }, + { + "epoch": 0.7956884025739109, + "grad_norm": 0.08663814514875412, + "learning_rate": 0.00010149147471945169, + "loss": 2.538, + "step": 26833 + }, + { + "epoch": 0.7957180559262225, + "grad_norm": 0.08369763195514679, + "learning_rate": 0.00010146305973600495, + "loss": 2.5623, + "step": 26834 + }, + { + "epoch": 0.7957477092785339, + "grad_norm": 0.09047764539718628, + "learning_rate": 0.0001014346482816712, + "loss": 2.567, + "step": 26835 + }, + { + "epoch": 0.7957773626308454, + "grad_norm": 0.08356039226055145, + "learning_rate": 0.0001014062403567021, + "loss": 2.5612, + "step": 26836 + }, + { + "epoch": 0.795807015983157, + "grad_norm": 0.08128082007169724, + "learning_rate": 0.00010137783596134914, + "loss": 2.568, + "step": 26837 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 0.08097030222415924, + "learning_rate": 0.00010134943509586386, + "loss": 2.5549, + "step": 26838 + }, + { + "epoch": 0.7958663226877799, + "grad_norm": 0.08138974010944366, + "learning_rate": 0.00010132103776049779, + "loss": 2.5825, + "step": 26839 + }, + { + "epoch": 0.7958959760400913, + "grad_norm": 0.08111787587404251, + "learning_rate": 0.00010129264395550236, + "loss": 2.5775, + "step": 26840 + }, + { + "epoch": 0.7959256293924029, + "grad_norm": 0.09030656516551971, + "learning_rate": 0.00010126425368112896, + "loss": 2.58, + "step": 26841 + }, + { + "epoch": 0.7959552827447143, + "grad_norm": 0.08112598210573196, + "learning_rate": 0.0001012358669376291, + "loss": 2.5542, + "step": 26842 + }, + { + "epoch": 0.7959849360970258, + "grad_norm": 0.08981627970933914, + "learning_rate": 0.00010120748372525401, + "loss": 2.5617, + "step": 26843 + }, + { + "epoch": 0.7960145894493372, + "grad_norm": 0.08517707884311676, + "learning_rate": 0.00010117910404425512, + "loss": 2.5799, + "step": 26844 + }, + { + "epoch": 0.7960442428016488, + "grad_norm": 0.09577421844005585, + "learning_rate": 0.00010115072789488378, + "loss": 2.5891, + "step": 26845 + }, + { + "epoch": 0.7960738961539602, + "grad_norm": 0.0899602472782135, + "learning_rate": 0.00010112235527739116, + "loss": 2.5818, + "step": 26846 + }, + { + "epoch": 0.7961035495062717, + "grad_norm": 0.09024304151535034, + "learning_rate": 0.00010109398619202853, + "loss": 2.5677, + "step": 26847 + }, + { + "epoch": 0.7961332028585831, + "grad_norm": 0.08507534116506577, + "learning_rate": 0.00010106562063904718, + "loss": 2.5327, + "step": 26848 + }, + { + "epoch": 0.7961628562108947, + "grad_norm": 0.08691088110208511, + "learning_rate": 0.00010103725861869817, + "loss": 2.5335, + "step": 26849 + }, + { + "epoch": 0.7961925095632061, + "grad_norm": 0.08964448422193527, + "learning_rate": 0.00010100890013123277, + "loss": 2.5635, + "step": 26850 + }, + { + "epoch": 0.7962221629155176, + "grad_norm": 0.08429533988237381, + "learning_rate": 0.0001009805451769022, + "loss": 2.589, + "step": 26851 + }, + { + "epoch": 0.796251816267829, + "grad_norm": 0.08988363295793533, + "learning_rate": 0.00010095219375595704, + "loss": 2.5451, + "step": 26852 + }, + { + "epoch": 0.7962814696201406, + "grad_norm": 0.08185078203678131, + "learning_rate": 0.00010092384586864888, + "loss": 2.5576, + "step": 26853 + }, + { + "epoch": 0.796311122972452, + "grad_norm": 0.08524706214666367, + "learning_rate": 0.00010089550151522859, + "loss": 2.548, + "step": 26854 + }, + { + "epoch": 0.7963407763247635, + "grad_norm": 0.0778670683503151, + "learning_rate": 0.00010086716069594709, + "loss": 2.563, + "step": 26855 + }, + { + "epoch": 0.7963704296770749, + "grad_norm": 0.08673980832099915, + "learning_rate": 0.00010083882341105543, + "loss": 2.5721, + "step": 26856 + }, + { + "epoch": 0.7964000830293865, + "grad_norm": 0.08471216261386871, + "learning_rate": 0.00010081048966080448, + "loss": 2.5321, + "step": 26857 + }, + { + "epoch": 0.796429736381698, + "grad_norm": 0.08846669644117355, + "learning_rate": 0.00010078215944544517, + "loss": 2.5581, + "step": 26858 + }, + { + "epoch": 0.7964593897340094, + "grad_norm": 0.08063050359487534, + "learning_rate": 0.00010075383276522837, + "loss": 2.5797, + "step": 26859 + }, + { + "epoch": 0.796489043086321, + "grad_norm": 0.08993005007505417, + "learning_rate": 0.00010072550962040494, + "loss": 2.58, + "step": 26860 + }, + { + "epoch": 0.7965186964386324, + "grad_norm": 0.08865777403116226, + "learning_rate": 0.00010069719001122563, + "loss": 2.5926, + "step": 26861 + }, + { + "epoch": 0.7965483497909439, + "grad_norm": 0.0813632383942604, + "learning_rate": 0.00010066887393794133, + "loss": 2.5769, + "step": 26862 + }, + { + "epoch": 0.7965780031432553, + "grad_norm": 0.08785799890756607, + "learning_rate": 0.00010064056140080263, + "loss": 2.558, + "step": 26863 + }, + { + "epoch": 0.7966076564955669, + "grad_norm": 0.08919807523488998, + "learning_rate": 0.00010061225240006028, + "loss": 2.5728, + "step": 26864 + }, + { + "epoch": 0.7966373098478783, + "grad_norm": 0.08614765107631683, + "learning_rate": 0.00010058394693596484, + "loss": 2.5243, + "step": 26865 + }, + { + "epoch": 0.7966669632001898, + "grad_norm": 0.09773698449134827, + "learning_rate": 0.00010055564500876729, + "loss": 2.5866, + "step": 26866 + }, + { + "epoch": 0.7966966165525012, + "grad_norm": 0.08281893283128738, + "learning_rate": 0.00010052734661871804, + "loss": 2.5281, + "step": 26867 + }, + { + "epoch": 0.7967262699048128, + "grad_norm": 0.08339927345514297, + "learning_rate": 0.00010049905176606766, + "loss": 2.5675, + "step": 26868 + }, + { + "epoch": 0.7967559232571242, + "grad_norm": 0.08525171130895615, + "learning_rate": 0.00010047076045106679, + "loss": 2.5823, + "step": 26869 + }, + { + "epoch": 0.7967855766094357, + "grad_norm": 0.08328527212142944, + "learning_rate": 0.00010044247267396595, + "loss": 2.554, + "step": 26870 + }, + { + "epoch": 0.7968152299617471, + "grad_norm": 0.08448683470487595, + "learning_rate": 0.00010041418843501555, + "loss": 2.5809, + "step": 26871 + }, + { + "epoch": 0.7968448833140587, + "grad_norm": 0.08496168255805969, + "learning_rate": 0.00010038590773446627, + "loss": 2.5874, + "step": 26872 + }, + { + "epoch": 0.7968745366663701, + "grad_norm": 0.08447198569774628, + "learning_rate": 0.00010035763057256819, + "loss": 2.5733, + "step": 26873 + }, + { + "epoch": 0.7969041900186816, + "grad_norm": 0.08270621299743652, + "learning_rate": 0.0001003293569495719, + "loss": 2.5516, + "step": 26874 + }, + { + "epoch": 0.796933843370993, + "grad_norm": 0.09492473304271698, + "learning_rate": 0.00010030108686572775, + "loss": 2.5911, + "step": 26875 + }, + { + "epoch": 0.7969634967233046, + "grad_norm": 0.08460760116577148, + "learning_rate": 0.00010027282032128615, + "loss": 2.551, + "step": 26876 + }, + { + "epoch": 0.796993150075616, + "grad_norm": 0.08813070505857468, + "learning_rate": 0.00010024455731649728, + "loss": 2.5777, + "step": 26877 + }, + { + "epoch": 0.7970228034279275, + "grad_norm": 0.08060575276613235, + "learning_rate": 0.00010021629785161135, + "loss": 2.5669, + "step": 26878 + }, + { + "epoch": 0.7970524567802391, + "grad_norm": 0.08938594162464142, + "learning_rate": 0.00010018804192687886, + "loss": 2.5733, + "step": 26879 + }, + { + "epoch": 0.7970821101325505, + "grad_norm": 0.08738432824611664, + "learning_rate": 0.00010015978954254984, + "loss": 2.5407, + "step": 26880 + }, + { + "epoch": 0.797111763484862, + "grad_norm": 0.08505063503980637, + "learning_rate": 0.00010013154069887458, + "loss": 2.5629, + "step": 26881 + }, + { + "epoch": 0.7971414168371734, + "grad_norm": 0.09007972478866577, + "learning_rate": 0.0001001032953961033, + "loss": 2.5645, + "step": 26882 + }, + { + "epoch": 0.797171070189485, + "grad_norm": 0.08662401884794235, + "learning_rate": 0.00010007505363448578, + "loss": 2.5491, + "step": 26883 + }, + { + "epoch": 0.7972007235417964, + "grad_norm": 0.08150692284107208, + "learning_rate": 0.00010004681541427236, + "loss": 2.5604, + "step": 26884 + }, + { + "epoch": 0.7972303768941079, + "grad_norm": 0.08676442503929138, + "learning_rate": 0.00010001858073571302, + "loss": 2.5969, + "step": 26885 + }, + { + "epoch": 0.7972600302464193, + "grad_norm": 0.08577550202608109, + "learning_rate": 9.999034959905784e-05, + "loss": 2.598, + "step": 26886 + }, + { + "epoch": 0.7972896835987309, + "grad_norm": 0.08891541510820389, + "learning_rate": 9.996212200455673e-05, + "loss": 2.5736, + "step": 26887 + }, + { + "epoch": 0.7973193369510423, + "grad_norm": 0.08610353618860245, + "learning_rate": 9.993389795245972e-05, + "loss": 2.5497, + "step": 26888 + }, + { + "epoch": 0.7973489903033538, + "grad_norm": 0.09405432641506195, + "learning_rate": 9.990567744301671e-05, + "loss": 2.5618, + "step": 26889 + }, + { + "epoch": 0.7973786436556652, + "grad_norm": 0.08529945462942123, + "learning_rate": 9.987746047647755e-05, + "loss": 2.5413, + "step": 26890 + }, + { + "epoch": 0.7974082970079768, + "grad_norm": 0.080337293446064, + "learning_rate": 9.984924705309212e-05, + "loss": 2.5355, + "step": 26891 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 0.08466782420873642, + "learning_rate": 9.982103717311037e-05, + "loss": 2.5954, + "step": 26892 + }, + { + "epoch": 0.7974676037125997, + "grad_norm": 0.09039321541786194, + "learning_rate": 9.979283083678214e-05, + "loss": 2.5313, + "step": 26893 + }, + { + "epoch": 0.7974972570649111, + "grad_norm": 0.07518647611141205, + "learning_rate": 9.976462804435699e-05, + "loss": 2.5477, + "step": 26894 + }, + { + "epoch": 0.7975269104172227, + "grad_norm": 0.09358075261116028, + "learning_rate": 9.973642879608475e-05, + "loss": 2.56, + "step": 26895 + }, + { + "epoch": 0.7975565637695341, + "grad_norm": 0.07880385220050812, + "learning_rate": 9.970823309221517e-05, + "loss": 2.5432, + "step": 26896 + }, + { + "epoch": 0.7975862171218456, + "grad_norm": 0.08679176867008209, + "learning_rate": 9.968004093299782e-05, + "loss": 2.5579, + "step": 26897 + }, + { + "epoch": 0.797615870474157, + "grad_norm": 0.0885976180434227, + "learning_rate": 9.965185231868245e-05, + "loss": 2.5614, + "step": 26898 + }, + { + "epoch": 0.7976455238264686, + "grad_norm": 0.08105207979679108, + "learning_rate": 9.962366724951872e-05, + "loss": 2.5278, + "step": 26899 + }, + { + "epoch": 0.7976751771787801, + "grad_norm": 0.08291030675172806, + "learning_rate": 9.959548572575606e-05, + "loss": 2.5187, + "step": 26900 + }, + { + "epoch": 0.7977048305310915, + "grad_norm": 0.07964259386062622, + "learning_rate": 9.95673077476441e-05, + "loss": 2.5733, + "step": 26901 + }, + { + "epoch": 0.7977344838834031, + "grad_norm": 0.08109621703624725, + "learning_rate": 9.953913331543241e-05, + "loss": 2.5515, + "step": 26902 + }, + { + "epoch": 0.7977641372357145, + "grad_norm": 0.08465342968702316, + "learning_rate": 9.951096242937041e-05, + "loss": 2.5314, + "step": 26903 + }, + { + "epoch": 0.797793790588026, + "grad_norm": 0.08079072088003159, + "learning_rate": 9.948279508970754e-05, + "loss": 2.5737, + "step": 26904 + }, + { + "epoch": 0.7978234439403374, + "grad_norm": 0.08009757846593857, + "learning_rate": 9.945463129669336e-05, + "loss": 2.5216, + "step": 26905 + }, + { + "epoch": 0.797853097292649, + "grad_norm": 0.07942777127027512, + "learning_rate": 9.942647105057706e-05, + "loss": 2.5571, + "step": 26906 + }, + { + "epoch": 0.7978827506449604, + "grad_norm": 0.08226048946380615, + "learning_rate": 9.939831435160818e-05, + "loss": 2.5669, + "step": 26907 + }, + { + "epoch": 0.7979124039972719, + "grad_norm": 0.0810224711894989, + "learning_rate": 9.9370161200036e-05, + "loss": 2.5349, + "step": 26908 + }, + { + "epoch": 0.7979420573495833, + "grad_norm": 0.07935523241758347, + "learning_rate": 9.934201159610979e-05, + "loss": 2.5466, + "step": 26909 + }, + { + "epoch": 0.7979717107018949, + "grad_norm": 0.07616191357374191, + "learning_rate": 9.931386554007888e-05, + "loss": 2.5613, + "step": 26910 + }, + { + "epoch": 0.7980013640542063, + "grad_norm": 0.08391731977462769, + "learning_rate": 9.928572303219241e-05, + "loss": 2.547, + "step": 26911 + }, + { + "epoch": 0.7980310174065178, + "grad_norm": 0.08313781023025513, + "learning_rate": 9.925758407269963e-05, + "loss": 2.593, + "step": 26912 + }, + { + "epoch": 0.7980606707588292, + "grad_norm": 0.08741917461156845, + "learning_rate": 9.92294486618498e-05, + "loss": 2.5416, + "step": 26913 + }, + { + "epoch": 0.7980903241111408, + "grad_norm": 0.08077529817819595, + "learning_rate": 9.920131679989197e-05, + "loss": 2.5506, + "step": 26914 + }, + { + "epoch": 0.7981199774634522, + "grad_norm": 0.08201595395803452, + "learning_rate": 9.917318848707524e-05, + "loss": 2.5604, + "step": 26915 + }, + { + "epoch": 0.7981496308157637, + "grad_norm": 0.08758760988712311, + "learning_rate": 9.914506372364873e-05, + "loss": 2.5789, + "step": 26916 + }, + { + "epoch": 0.7981792841680752, + "grad_norm": 0.0770404040813446, + "learning_rate": 9.911694250986153e-05, + "loss": 2.5492, + "step": 26917 + }, + { + "epoch": 0.7982089375203867, + "grad_norm": 0.08962246775627136, + "learning_rate": 9.90888248459626e-05, + "loss": 2.5575, + "step": 26918 + }, + { + "epoch": 0.7982385908726981, + "grad_norm": 0.08174284547567368, + "learning_rate": 9.90607107322009e-05, + "loss": 2.5448, + "step": 26919 + }, + { + "epoch": 0.7982682442250096, + "grad_norm": 0.08084946870803833, + "learning_rate": 9.903260016882548e-05, + "loss": 2.5578, + "step": 26920 + }, + { + "epoch": 0.7982978975773212, + "grad_norm": 0.08670562505722046, + "learning_rate": 9.900449315608517e-05, + "loss": 2.5576, + "step": 26921 + }, + { + "epoch": 0.7983275509296326, + "grad_norm": 0.08498581498861313, + "learning_rate": 9.897638969422895e-05, + "loss": 2.5271, + "step": 26922 + }, + { + "epoch": 0.7983572042819441, + "grad_norm": 0.0860031396150589, + "learning_rate": 9.894828978350562e-05, + "loss": 2.5495, + "step": 26923 + }, + { + "epoch": 0.7983868576342555, + "grad_norm": 0.08077466487884521, + "learning_rate": 9.892019342416402e-05, + "loss": 2.5594, + "step": 26924 + }, + { + "epoch": 0.7984165109865671, + "grad_norm": 0.0869925320148468, + "learning_rate": 9.889210061645293e-05, + "loss": 2.5705, + "step": 26925 + }, + { + "epoch": 0.7984461643388785, + "grad_norm": 0.08152158558368683, + "learning_rate": 9.886401136062118e-05, + "loss": 2.5465, + "step": 26926 + }, + { + "epoch": 0.79847581769119, + "grad_norm": 0.07824862003326416, + "learning_rate": 9.883592565691752e-05, + "loss": 2.5701, + "step": 26927 + }, + { + "epoch": 0.7985054710435014, + "grad_norm": 0.08255426585674286, + "learning_rate": 9.880784350559052e-05, + "loss": 2.5742, + "step": 26928 + }, + { + "epoch": 0.798535124395813, + "grad_norm": 0.08388771861791611, + "learning_rate": 9.877976490688895e-05, + "loss": 2.5582, + "step": 26929 + }, + { + "epoch": 0.7985647777481244, + "grad_norm": 0.08456280082464218, + "learning_rate": 9.875168986106125e-05, + "loss": 2.5395, + "step": 26930 + }, + { + "epoch": 0.7985944311004359, + "grad_norm": 0.07739713042974472, + "learning_rate": 9.872361836835637e-05, + "loss": 2.5671, + "step": 26931 + }, + { + "epoch": 0.7986240844527474, + "grad_norm": 0.08272472769021988, + "learning_rate": 9.869555042902273e-05, + "loss": 2.5401, + "step": 26932 + }, + { + "epoch": 0.7986537378050589, + "grad_norm": 0.08315922319889069, + "learning_rate": 9.866748604330883e-05, + "loss": 2.5329, + "step": 26933 + }, + { + "epoch": 0.7986833911573703, + "grad_norm": 0.08598313480615616, + "learning_rate": 9.863942521146329e-05, + "loss": 2.5906, + "step": 26934 + }, + { + "epoch": 0.7987130445096818, + "grad_norm": 0.08405715972185135, + "learning_rate": 9.861136793373449e-05, + "loss": 2.5938, + "step": 26935 + }, + { + "epoch": 0.7987426978619933, + "grad_norm": 0.07903331518173218, + "learning_rate": 9.858331421037093e-05, + "loss": 2.5364, + "step": 26936 + }, + { + "epoch": 0.7987723512143048, + "grad_norm": 0.08269710093736649, + "learning_rate": 9.855526404162107e-05, + "loss": 2.5747, + "step": 26937 + }, + { + "epoch": 0.7988020045666162, + "grad_norm": 0.08301999419927597, + "learning_rate": 9.852721742773336e-05, + "loss": 2.544, + "step": 26938 + }, + { + "epoch": 0.7988316579189277, + "grad_norm": 0.08194810152053833, + "learning_rate": 9.849917436895589e-05, + "loss": 2.5312, + "step": 26939 + }, + { + "epoch": 0.7988613112712392, + "grad_norm": 0.08764414489269257, + "learning_rate": 9.847113486553715e-05, + "loss": 2.5742, + "step": 26940 + }, + { + "epoch": 0.7988909646235507, + "grad_norm": 0.08878768980503082, + "learning_rate": 9.844309891772546e-05, + "loss": 2.5705, + "step": 26941 + }, + { + "epoch": 0.7989206179758622, + "grad_norm": 0.08554546535015106, + "learning_rate": 9.841506652576904e-05, + "loss": 2.5903, + "step": 26942 + }, + { + "epoch": 0.7989502713281736, + "grad_norm": 0.08103298395872116, + "learning_rate": 9.838703768991603e-05, + "loss": 2.5599, + "step": 26943 + }, + { + "epoch": 0.7989799246804852, + "grad_norm": 0.08890768140554428, + "learning_rate": 9.835901241041484e-05, + "loss": 2.5514, + "step": 26944 + }, + { + "epoch": 0.7990095780327966, + "grad_norm": 0.07695148140192032, + "learning_rate": 9.833099068751355e-05, + "loss": 2.5259, + "step": 26945 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 0.0927482321858406, + "learning_rate": 9.830297252146025e-05, + "loss": 2.5171, + "step": 26946 + }, + { + "epoch": 0.7990688847374195, + "grad_norm": 0.08265659213066101, + "learning_rate": 9.827495791250313e-05, + "loss": 2.5587, + "step": 26947 + }, + { + "epoch": 0.7990985380897311, + "grad_norm": 0.08197487890720367, + "learning_rate": 9.82469468608903e-05, + "loss": 2.544, + "step": 26948 + }, + { + "epoch": 0.7991281914420425, + "grad_norm": 0.08538317680358887, + "learning_rate": 9.82189393668696e-05, + "loss": 2.6032, + "step": 26949 + }, + { + "epoch": 0.799157844794354, + "grad_norm": 0.08239589631557465, + "learning_rate": 9.819093543068919e-05, + "loss": 2.5617, + "step": 26950 + }, + { + "epoch": 0.7991874981466655, + "grad_norm": 0.08611136674880981, + "learning_rate": 9.8162935052597e-05, + "loss": 2.5307, + "step": 26951 + }, + { + "epoch": 0.799217151498977, + "grad_norm": 0.08777357637882233, + "learning_rate": 9.813493823284098e-05, + "loss": 2.5582, + "step": 26952 + }, + { + "epoch": 0.7992468048512884, + "grad_norm": 0.07429665327072144, + "learning_rate": 9.810694497166906e-05, + "loss": 2.5545, + "step": 26953 + }, + { + "epoch": 0.7992764582035999, + "grad_norm": 0.0900687649846077, + "learning_rate": 9.807895526932914e-05, + "loss": 2.5637, + "step": 26954 + }, + { + "epoch": 0.7993061115559114, + "grad_norm": 0.07514595240354538, + "learning_rate": 9.805096912606904e-05, + "loss": 2.5619, + "step": 26955 + }, + { + "epoch": 0.7993357649082229, + "grad_norm": 0.08269189298152924, + "learning_rate": 9.802298654213648e-05, + "loss": 2.5295, + "step": 26956 + }, + { + "epoch": 0.7993654182605343, + "grad_norm": 0.08026207983493805, + "learning_rate": 9.799500751777952e-05, + "loss": 2.5518, + "step": 26957 + }, + { + "epoch": 0.7993950716128458, + "grad_norm": 0.08689874410629272, + "learning_rate": 9.796703205324575e-05, + "loss": 2.5231, + "step": 26958 + }, + { + "epoch": 0.7994247249651573, + "grad_norm": 0.0798986628651619, + "learning_rate": 9.793906014878306e-05, + "loss": 2.5455, + "step": 26959 + }, + { + "epoch": 0.7994543783174688, + "grad_norm": 0.09176073223352432, + "learning_rate": 9.791109180463886e-05, + "loss": 2.5682, + "step": 26960 + }, + { + "epoch": 0.7994840316697802, + "grad_norm": 0.0819605365395546, + "learning_rate": 9.788312702106094e-05, + "loss": 2.5733, + "step": 26961 + }, + { + "epoch": 0.7995136850220917, + "grad_norm": 0.08251440525054932, + "learning_rate": 9.785516579829701e-05, + "loss": 2.5361, + "step": 26962 + }, + { + "epoch": 0.7995433383744033, + "grad_norm": 0.08372779935598373, + "learning_rate": 9.782720813659457e-05, + "loss": 2.5634, + "step": 26963 + }, + { + "epoch": 0.7995729917267147, + "grad_norm": 0.08178569376468658, + "learning_rate": 9.779925403620127e-05, + "loss": 2.574, + "step": 26964 + }, + { + "epoch": 0.7996026450790262, + "grad_norm": 0.08303399384021759, + "learning_rate": 9.777130349736458e-05, + "loss": 2.5714, + "step": 26965 + }, + { + "epoch": 0.7996322984313377, + "grad_norm": 0.08205241709947586, + "learning_rate": 9.774335652033206e-05, + "loss": 2.6129, + "step": 26966 + }, + { + "epoch": 0.7996619517836492, + "grad_norm": 0.08798228204250336, + "learning_rate": 9.771541310535115e-05, + "loss": 2.5241, + "step": 26967 + }, + { + "epoch": 0.7996916051359606, + "grad_norm": 0.08327921479940414, + "learning_rate": 9.768747325266935e-05, + "loss": 2.5677, + "step": 26968 + }, + { + "epoch": 0.7997212584882721, + "grad_norm": 0.09673888236284256, + "learning_rate": 9.765953696253399e-05, + "loss": 2.521, + "step": 26969 + }, + { + "epoch": 0.7997509118405836, + "grad_norm": 0.08373049646615982, + "learning_rate": 9.763160423519247e-05, + "loss": 2.5571, + "step": 26970 + }, + { + "epoch": 0.7997805651928951, + "grad_norm": 0.08580643683671951, + "learning_rate": 9.760367507089218e-05, + "loss": 2.5753, + "step": 26971 + }, + { + "epoch": 0.7998102185452065, + "grad_norm": 0.08786680549383163, + "learning_rate": 9.757574946988046e-05, + "loss": 2.5462, + "step": 26972 + }, + { + "epoch": 0.799839871897518, + "grad_norm": 0.08543993532657623, + "learning_rate": 9.754782743240453e-05, + "loss": 2.5831, + "step": 26973 + }, + { + "epoch": 0.7998695252498295, + "grad_norm": 0.08094020187854767, + "learning_rate": 9.751990895871166e-05, + "loss": 2.5561, + "step": 26974 + }, + { + "epoch": 0.799899178602141, + "grad_norm": 0.07943360507488251, + "learning_rate": 9.749199404904907e-05, + "loss": 2.5477, + "step": 26975 + }, + { + "epoch": 0.7999288319544524, + "grad_norm": 0.08620204031467438, + "learning_rate": 9.746408270366397e-05, + "loss": 2.5687, + "step": 26976 + }, + { + "epoch": 0.799958485306764, + "grad_norm": 0.08415798842906952, + "learning_rate": 9.743617492280349e-05, + "loss": 2.5731, + "step": 26977 + }, + { + "epoch": 0.7999881386590754, + "grad_norm": 0.08924207836389542, + "learning_rate": 9.740827070671482e-05, + "loss": 2.5572, + "step": 26978 + }, + { + "epoch": 0.8000177920113869, + "grad_norm": 0.0916592925786972, + "learning_rate": 9.738037005564499e-05, + "loss": 2.5169, + "step": 26979 + }, + { + "epoch": 0.8000474453636983, + "grad_norm": 0.09059539437294006, + "learning_rate": 9.735247296984112e-05, + "loss": 2.5776, + "step": 26980 + }, + { + "epoch": 0.8000770987160098, + "grad_norm": 0.10304538160562515, + "learning_rate": 9.73245794495502e-05, + "loss": 2.5633, + "step": 26981 + }, + { + "epoch": 0.8001067520683213, + "grad_norm": 0.09021081030368805, + "learning_rate": 9.729668949501924e-05, + "loss": 2.5868, + "step": 26982 + }, + { + "epoch": 0.8001364054206328, + "grad_norm": 0.08914662152528763, + "learning_rate": 9.726880310649522e-05, + "loss": 2.5355, + "step": 26983 + }, + { + "epoch": 0.8001660587729443, + "grad_norm": 0.09097349643707275, + "learning_rate": 9.724092028422504e-05, + "loss": 2.5559, + "step": 26984 + }, + { + "epoch": 0.8001957121252558, + "grad_norm": 0.09065566211938858, + "learning_rate": 9.721304102845569e-05, + "loss": 2.5505, + "step": 26985 + }, + { + "epoch": 0.8002253654775673, + "grad_norm": 0.09093352407217026, + "learning_rate": 9.7185165339434e-05, + "loss": 2.5753, + "step": 26986 + }, + { + "epoch": 0.8002550188298787, + "grad_norm": 0.08890057355165482, + "learning_rate": 9.71572932174068e-05, + "loss": 2.5587, + "step": 26987 + }, + { + "epoch": 0.8002846721821902, + "grad_norm": 0.08433130383491516, + "learning_rate": 9.712942466262093e-05, + "loss": 2.5476, + "step": 26988 + }, + { + "epoch": 0.8003143255345017, + "grad_norm": 0.08929115533828735, + "learning_rate": 9.710155967532314e-05, + "loss": 2.5689, + "step": 26989 + }, + { + "epoch": 0.8003439788868132, + "grad_norm": 0.0824219286441803, + "learning_rate": 9.707369825576023e-05, + "loss": 2.548, + "step": 26990 + }, + { + "epoch": 0.8003736322391246, + "grad_norm": 0.08644495904445648, + "learning_rate": 9.704584040417885e-05, + "loss": 2.565, + "step": 26991 + }, + { + "epoch": 0.8004032855914361, + "grad_norm": 0.09114915877580643, + "learning_rate": 9.701798612082569e-05, + "loss": 2.568, + "step": 26992 + }, + { + "epoch": 0.8004329389437476, + "grad_norm": 0.08806601911783218, + "learning_rate": 9.699013540594765e-05, + "loss": 2.5671, + "step": 26993 + }, + { + "epoch": 0.8004625922960591, + "grad_norm": 0.08644063770771027, + "learning_rate": 9.69622882597908e-05, + "loss": 2.5777, + "step": 26994 + }, + { + "epoch": 0.8004922456483705, + "grad_norm": 0.0821286290884018, + "learning_rate": 9.693444468260221e-05, + "loss": 2.5691, + "step": 26995 + }, + { + "epoch": 0.800521899000682, + "grad_norm": 0.0915977880358696, + "learning_rate": 9.69066046746283e-05, + "loss": 2.5828, + "step": 26996 + }, + { + "epoch": 0.8005515523529935, + "grad_norm": 0.08364821970462799, + "learning_rate": 9.687876823611564e-05, + "loss": 2.5673, + "step": 26997 + }, + { + "epoch": 0.800581205705305, + "grad_norm": 0.09461814910173416, + "learning_rate": 9.685093536731066e-05, + "loss": 2.5525, + "step": 26998 + }, + { + "epoch": 0.8006108590576164, + "grad_norm": 0.08397652953863144, + "learning_rate": 9.682310606845979e-05, + "loss": 2.5807, + "step": 26999 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 0.08448788523674011, + "learning_rate": 9.67952803398096e-05, + "loss": 2.5558, + "step": 27000 + }, + { + "epoch": 0.8006701657622394, + "grad_norm": 0.10247310250997543, + "learning_rate": 9.676745818160637e-05, + "loss": 2.5414, + "step": 27001 + }, + { + "epoch": 0.8006998191145509, + "grad_norm": 0.08466248959302902, + "learning_rate": 9.673963959409654e-05, + "loss": 2.5496, + "step": 27002 + }, + { + "epoch": 0.8007294724668623, + "grad_norm": 0.08719632774591446, + "learning_rate": 9.671182457752653e-05, + "loss": 2.5562, + "step": 27003 + }, + { + "epoch": 0.8007591258191739, + "grad_norm": 0.08597993105649948, + "learning_rate": 9.668401313214237e-05, + "loss": 2.5575, + "step": 27004 + }, + { + "epoch": 0.8007887791714854, + "grad_norm": 0.08833741396665573, + "learning_rate": 9.665620525819058e-05, + "loss": 2.5488, + "step": 27005 + }, + { + "epoch": 0.8008184325237968, + "grad_norm": 0.08682442456483841, + "learning_rate": 9.662840095591724e-05, + "loss": 2.5701, + "step": 27006 + }, + { + "epoch": 0.8008480858761083, + "grad_norm": 0.07918037474155426, + "learning_rate": 9.66006002255686e-05, + "loss": 2.5289, + "step": 27007 + }, + { + "epoch": 0.8008777392284198, + "grad_norm": 0.08833874762058258, + "learning_rate": 9.657280306739097e-05, + "loss": 2.5698, + "step": 27008 + }, + { + "epoch": 0.8009073925807313, + "grad_norm": 0.08230911195278168, + "learning_rate": 9.654500948163042e-05, + "loss": 2.5612, + "step": 27009 + }, + { + "epoch": 0.8009370459330427, + "grad_norm": 0.0810118094086647, + "learning_rate": 9.651721946853304e-05, + "loss": 2.562, + "step": 27010 + }, + { + "epoch": 0.8009666992853542, + "grad_norm": 0.08806276321411133, + "learning_rate": 9.648943302834501e-05, + "loss": 2.556, + "step": 27011 + }, + { + "epoch": 0.8009963526376657, + "grad_norm": 0.08298207819461823, + "learning_rate": 9.646165016131225e-05, + "loss": 2.5693, + "step": 27012 + }, + { + "epoch": 0.8010260059899772, + "grad_norm": 0.08601059019565582, + "learning_rate": 9.643387086768086e-05, + "loss": 2.5678, + "step": 27013 + }, + { + "epoch": 0.8010556593422886, + "grad_norm": 0.08526194840669632, + "learning_rate": 9.640609514769694e-05, + "loss": 2.5583, + "step": 27014 + }, + { + "epoch": 0.8010853126946001, + "grad_norm": 0.09635844081640244, + "learning_rate": 9.63783230016062e-05, + "loss": 2.5576, + "step": 27015 + }, + { + "epoch": 0.8011149660469116, + "grad_norm": 0.09571391344070435, + "learning_rate": 9.635055442965468e-05, + "loss": 2.5828, + "step": 27016 + }, + { + "epoch": 0.8011446193992231, + "grad_norm": 0.09365732222795486, + "learning_rate": 9.632278943208833e-05, + "loss": 2.5699, + "step": 27017 + }, + { + "epoch": 0.8011742727515345, + "grad_norm": 0.08271149545907974, + "learning_rate": 9.629502800915291e-05, + "loss": 2.5619, + "step": 27018 + }, + { + "epoch": 0.801203926103846, + "grad_norm": 0.09550416469573975, + "learning_rate": 9.626727016109437e-05, + "loss": 2.5797, + "step": 27019 + }, + { + "epoch": 0.8012335794561575, + "grad_norm": 0.08685679733753204, + "learning_rate": 9.623951588815827e-05, + "loss": 2.5571, + "step": 27020 + }, + { + "epoch": 0.801263232808469, + "grad_norm": 0.09050820767879486, + "learning_rate": 9.621176519059072e-05, + "loss": 2.5433, + "step": 27021 + }, + { + "epoch": 0.8012928861607804, + "grad_norm": 0.08608049899339676, + "learning_rate": 9.61840180686373e-05, + "loss": 2.5518, + "step": 27022 + }, + { + "epoch": 0.801322539513092, + "grad_norm": 0.08343170583248138, + "learning_rate": 9.615627452254371e-05, + "loss": 2.5433, + "step": 27023 + }, + { + "epoch": 0.8013521928654034, + "grad_norm": 0.08969003707170486, + "learning_rate": 9.612853455255577e-05, + "loss": 2.5707, + "step": 27024 + }, + { + "epoch": 0.8013818462177149, + "grad_norm": 0.08796533197164536, + "learning_rate": 9.610079815891882e-05, + "loss": 2.5877, + "step": 27025 + }, + { + "epoch": 0.8014114995700264, + "grad_norm": 0.09180177748203278, + "learning_rate": 9.607306534187865e-05, + "loss": 2.5751, + "step": 27026 + }, + { + "epoch": 0.8014411529223379, + "grad_norm": 0.08217532932758331, + "learning_rate": 9.604533610168081e-05, + "loss": 2.5626, + "step": 27027 + }, + { + "epoch": 0.8014708062746494, + "grad_norm": 0.0916953906416893, + "learning_rate": 9.601761043857088e-05, + "loss": 2.555, + "step": 27028 + }, + { + "epoch": 0.8015004596269608, + "grad_norm": 0.09399110823869705, + "learning_rate": 9.598988835279431e-05, + "loss": 2.5421, + "step": 27029 + }, + { + "epoch": 0.8015301129792723, + "grad_norm": 0.08252270519733429, + "learning_rate": 9.596216984459665e-05, + "loss": 2.5631, + "step": 27030 + }, + { + "epoch": 0.8015597663315838, + "grad_norm": 0.08785758167505264, + "learning_rate": 9.593445491422331e-05, + "loss": 2.5452, + "step": 27031 + }, + { + "epoch": 0.8015894196838953, + "grad_norm": 0.08780767768621445, + "learning_rate": 9.590674356191975e-05, + "loss": 2.575, + "step": 27032 + }, + { + "epoch": 0.8016190730362067, + "grad_norm": 0.08920478820800781, + "learning_rate": 9.587903578793122e-05, + "loss": 2.6012, + "step": 27033 + }, + { + "epoch": 0.8016487263885183, + "grad_norm": 0.08198774605989456, + "learning_rate": 9.585133159250331e-05, + "loss": 2.5472, + "step": 27034 + }, + { + "epoch": 0.8016783797408297, + "grad_norm": 0.07972542941570282, + "learning_rate": 9.582363097588137e-05, + "loss": 2.5557, + "step": 27035 + }, + { + "epoch": 0.8017080330931412, + "grad_norm": 0.09634774923324585, + "learning_rate": 9.579593393831044e-05, + "loss": 2.5749, + "step": 27036 + }, + { + "epoch": 0.8017376864454526, + "grad_norm": 0.07966920733451843, + "learning_rate": 9.576824048003585e-05, + "loss": 2.5848, + "step": 27037 + }, + { + "epoch": 0.8017673397977642, + "grad_norm": 0.08854725956916809, + "learning_rate": 9.574055060130287e-05, + "loss": 2.5568, + "step": 27038 + }, + { + "epoch": 0.8017969931500756, + "grad_norm": 0.08248195052146912, + "learning_rate": 9.571286430235676e-05, + "loss": 2.5331, + "step": 27039 + }, + { + "epoch": 0.8018266465023871, + "grad_norm": 0.08354125916957855, + "learning_rate": 9.568518158344258e-05, + "loss": 2.5708, + "step": 27040 + }, + { + "epoch": 0.8018562998546985, + "grad_norm": 0.08828288316726685, + "learning_rate": 9.565750244480554e-05, + "loss": 2.5598, + "step": 27041 + }, + { + "epoch": 0.8018859532070101, + "grad_norm": 0.07628411799669266, + "learning_rate": 9.56298268866907e-05, + "loss": 2.5724, + "step": 27042 + }, + { + "epoch": 0.8019156065593215, + "grad_norm": 0.08927681297063828, + "learning_rate": 9.56021549093432e-05, + "loss": 2.5901, + "step": 27043 + }, + { + "epoch": 0.801945259911633, + "grad_norm": 0.08379452675580978, + "learning_rate": 9.557448651300798e-05, + "loss": 2.584, + "step": 27044 + }, + { + "epoch": 0.8019749132639445, + "grad_norm": 0.08344881981611252, + "learning_rate": 9.554682169793011e-05, + "loss": 2.5471, + "step": 27045 + }, + { + "epoch": 0.802004566616256, + "grad_norm": 0.0829310342669487, + "learning_rate": 9.55191604643546e-05, + "loss": 2.5461, + "step": 27046 + }, + { + "epoch": 0.8020342199685675, + "grad_norm": 0.08154652267694473, + "learning_rate": 9.549150281252633e-05, + "loss": 2.5485, + "step": 27047 + }, + { + "epoch": 0.8020638733208789, + "grad_norm": 0.08543016761541367, + "learning_rate": 9.54638487426902e-05, + "loss": 2.5261, + "step": 27048 + }, + { + "epoch": 0.8020935266731904, + "grad_norm": 0.08977074921131134, + "learning_rate": 9.54361982550912e-05, + "loss": 2.5689, + "step": 27049 + }, + { + "epoch": 0.8021231800255019, + "grad_norm": 0.08223647624254227, + "learning_rate": 9.540855134997406e-05, + "loss": 2.5449, + "step": 27050 + }, + { + "epoch": 0.8021528333778134, + "grad_norm": 0.09863695502281189, + "learning_rate": 9.538090802758365e-05, + "loss": 2.5365, + "step": 27051 + }, + { + "epoch": 0.8021824867301248, + "grad_norm": 0.0817018374800682, + "learning_rate": 9.535326828816471e-05, + "loss": 2.5484, + "step": 27052 + }, + { + "epoch": 0.8022121400824364, + "grad_norm": 0.09539516270160675, + "learning_rate": 9.53256321319621e-05, + "loss": 2.5511, + "step": 27053 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 0.08111903816461563, + "learning_rate": 9.529799955922042e-05, + "loss": 2.566, + "step": 27054 + }, + { + "epoch": 0.8022714467870593, + "grad_norm": 0.09141048789024353, + "learning_rate": 9.527037057018446e-05, + "loss": 2.5535, + "step": 27055 + }, + { + "epoch": 0.8023011001393707, + "grad_norm": 0.08035886287689209, + "learning_rate": 9.524274516509885e-05, + "loss": 2.5376, + "step": 27056 + }, + { + "epoch": 0.8023307534916823, + "grad_norm": 0.08814556151628494, + "learning_rate": 9.521512334420818e-05, + "loss": 2.5762, + "step": 27057 + }, + { + "epoch": 0.8023604068439937, + "grad_norm": 0.08554253727197647, + "learning_rate": 9.51875051077572e-05, + "loss": 2.5666, + "step": 27058 + }, + { + "epoch": 0.8023900601963052, + "grad_norm": 0.07887585461139679, + "learning_rate": 9.51598904559901e-05, + "loss": 2.5522, + "step": 27059 + }, + { + "epoch": 0.8024197135486166, + "grad_norm": 0.08588653057813644, + "learning_rate": 9.513227938915181e-05, + "loss": 2.5021, + "step": 27060 + }, + { + "epoch": 0.8024493669009282, + "grad_norm": 0.0814613327383995, + "learning_rate": 9.510467190748667e-05, + "loss": 2.553, + "step": 27061 + }, + { + "epoch": 0.8024790202532396, + "grad_norm": 0.08552451431751251, + "learning_rate": 9.507706801123916e-05, + "loss": 2.522, + "step": 27062 + }, + { + "epoch": 0.8025086736055511, + "grad_norm": 0.07789597660303116, + "learning_rate": 9.50494677006537e-05, + "loss": 2.543, + "step": 27063 + }, + { + "epoch": 0.8025383269578625, + "grad_norm": 0.0815691277384758, + "learning_rate": 9.50218709759747e-05, + "loss": 2.5725, + "step": 27064 + }, + { + "epoch": 0.8025679803101741, + "grad_norm": 0.08389319479465485, + "learning_rate": 9.499427783744658e-05, + "loss": 2.577, + "step": 27065 + }, + { + "epoch": 0.8025976336624856, + "grad_norm": 0.08568942546844482, + "learning_rate": 9.496668828531363e-05, + "loss": 2.5469, + "step": 27066 + }, + { + "epoch": 0.802627287014797, + "grad_norm": 0.08893322944641113, + "learning_rate": 9.493910231982017e-05, + "loss": 2.5525, + "step": 27067 + }, + { + "epoch": 0.8026569403671086, + "grad_norm": 0.09553544968366623, + "learning_rate": 9.49115199412105e-05, + "loss": 2.5366, + "step": 27068 + }, + { + "epoch": 0.80268659371942, + "grad_norm": 0.08544327318668365, + "learning_rate": 9.488394114972898e-05, + "loss": 2.554, + "step": 27069 + }, + { + "epoch": 0.8027162470717315, + "grad_norm": 0.08972176164388657, + "learning_rate": 9.485636594561958e-05, + "loss": 2.5491, + "step": 27070 + }, + { + "epoch": 0.8027459004240429, + "grad_norm": 0.09163450449705124, + "learning_rate": 9.482879432912661e-05, + "loss": 2.56, + "step": 27071 + }, + { + "epoch": 0.8027755537763545, + "grad_norm": 0.09310851991176605, + "learning_rate": 9.48012263004941e-05, + "loss": 2.5846, + "step": 27072 + }, + { + "epoch": 0.8028052071286659, + "grad_norm": 0.08813758939504623, + "learning_rate": 9.477366185996634e-05, + "loss": 2.527, + "step": 27073 + }, + { + "epoch": 0.8028348604809774, + "grad_norm": 0.08954901993274689, + "learning_rate": 9.474610100778741e-05, + "loss": 2.5451, + "step": 27074 + }, + { + "epoch": 0.8028645138332888, + "grad_norm": 0.08761294186115265, + "learning_rate": 9.47185437442013e-05, + "loss": 2.5309, + "step": 27075 + }, + { + "epoch": 0.8028941671856004, + "grad_norm": 0.08999552577733994, + "learning_rate": 9.469099006945203e-05, + "loss": 2.5288, + "step": 27076 + }, + { + "epoch": 0.8029238205379118, + "grad_norm": 0.08740336447954178, + "learning_rate": 9.466343998378368e-05, + "loss": 2.5373, + "step": 27077 + }, + { + "epoch": 0.8029534738902233, + "grad_norm": 0.08242958039045334, + "learning_rate": 9.463589348744011e-05, + "loss": 2.5253, + "step": 27078 + }, + { + "epoch": 0.8029831272425347, + "grad_norm": 0.08729604631662369, + "learning_rate": 9.460835058066541e-05, + "loss": 2.529, + "step": 27079 + }, + { + "epoch": 0.8030127805948463, + "grad_norm": 0.08518325537443161, + "learning_rate": 9.458081126370322e-05, + "loss": 2.5406, + "step": 27080 + }, + { + "epoch": 0.8030424339471577, + "grad_norm": 0.09205001592636108, + "learning_rate": 9.45532755367975e-05, + "loss": 2.5348, + "step": 27081 + }, + { + "epoch": 0.8030720872994692, + "grad_norm": 0.08621805906295776, + "learning_rate": 9.452574340019216e-05, + "loss": 2.5638, + "step": 27082 + }, + { + "epoch": 0.8031017406517806, + "grad_norm": 0.08566564321517944, + "learning_rate": 9.449821485413096e-05, + "loss": 2.5624, + "step": 27083 + }, + { + "epoch": 0.8031313940040922, + "grad_norm": 0.08053320646286011, + "learning_rate": 9.447068989885766e-05, + "loss": 2.5218, + "step": 27084 + }, + { + "epoch": 0.8031610473564036, + "grad_norm": 0.09608659148216248, + "learning_rate": 9.444316853461587e-05, + "loss": 2.5487, + "step": 27085 + }, + { + "epoch": 0.8031907007087151, + "grad_norm": 0.08488215506076813, + "learning_rate": 9.44156507616496e-05, + "loss": 2.5648, + "step": 27086 + }, + { + "epoch": 0.8032203540610267, + "grad_norm": 0.08952479809522629, + "learning_rate": 9.438813658020234e-05, + "loss": 2.5541, + "step": 27087 + }, + { + "epoch": 0.8032500074133381, + "grad_norm": 0.0893835723400116, + "learning_rate": 9.436062599051776e-05, + "loss": 2.5494, + "step": 27088 + }, + { + "epoch": 0.8032796607656496, + "grad_norm": 0.08405471593141556, + "learning_rate": 9.433311899283942e-05, + "loss": 2.5529, + "step": 27089 + }, + { + "epoch": 0.803309314117961, + "grad_norm": 0.09192666411399841, + "learning_rate": 9.430561558741114e-05, + "loss": 2.5629, + "step": 27090 + }, + { + "epoch": 0.8033389674702726, + "grad_norm": 0.07553096860647202, + "learning_rate": 9.427811577447609e-05, + "loss": 2.5263, + "step": 27091 + }, + { + "epoch": 0.803368620822584, + "grad_norm": 0.097996287047863, + "learning_rate": 9.4250619554278e-05, + "loss": 2.5574, + "step": 27092 + }, + { + "epoch": 0.8033982741748955, + "grad_norm": 0.08271737396717072, + "learning_rate": 9.422312692706032e-05, + "loss": 2.5219, + "step": 27093 + }, + { + "epoch": 0.8034279275272069, + "grad_norm": 0.08775898814201355, + "learning_rate": 9.419563789306645e-05, + "loss": 2.5979, + "step": 27094 + }, + { + "epoch": 0.8034575808795185, + "grad_norm": 0.09185175597667694, + "learning_rate": 9.41681524525399e-05, + "loss": 2.5041, + "step": 27095 + }, + { + "epoch": 0.8034872342318299, + "grad_norm": 0.0909031331539154, + "learning_rate": 9.4140670605724e-05, + "loss": 2.5567, + "step": 27096 + }, + { + "epoch": 0.8035168875841414, + "grad_norm": 0.09227985888719559, + "learning_rate": 9.411319235286219e-05, + "loss": 2.5389, + "step": 27097 + }, + { + "epoch": 0.8035465409364528, + "grad_norm": 0.08318685740232468, + "learning_rate": 9.408571769419755e-05, + "loss": 2.5208, + "step": 27098 + }, + { + "epoch": 0.8035761942887644, + "grad_norm": 0.08852287381887436, + "learning_rate": 9.40582466299737e-05, + "loss": 2.5704, + "step": 27099 + }, + { + "epoch": 0.8036058476410758, + "grad_norm": 0.08221285790205002, + "learning_rate": 9.403077916043384e-05, + "loss": 2.5717, + "step": 27100 + }, + { + "epoch": 0.8036355009933873, + "grad_norm": 0.07992016524076462, + "learning_rate": 9.400331528582101e-05, + "loss": 2.5542, + "step": 27101 + }, + { + "epoch": 0.8036651543456987, + "grad_norm": 0.07926777750253677, + "learning_rate": 9.397585500637856e-05, + "loss": 2.5349, + "step": 27102 + }, + { + "epoch": 0.8036948076980103, + "grad_norm": 0.08177323639392853, + "learning_rate": 9.394839832234958e-05, + "loss": 2.5466, + "step": 27103 + }, + { + "epoch": 0.8037244610503217, + "grad_norm": 0.08317501097917557, + "learning_rate": 9.392094523397721e-05, + "loss": 2.5599, + "step": 27104 + }, + { + "epoch": 0.8037541144026332, + "grad_norm": 0.08011899888515472, + "learning_rate": 9.389349574150457e-05, + "loss": 2.5584, + "step": 27105 + }, + { + "epoch": 0.8037837677549446, + "grad_norm": 0.08707847446203232, + "learning_rate": 9.386604984517477e-05, + "loss": 2.5733, + "step": 27106 + }, + { + "epoch": 0.8038134211072562, + "grad_norm": 0.07949115335941315, + "learning_rate": 9.383860754523076e-05, + "loss": 2.5507, + "step": 27107 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 0.08208838850259781, + "learning_rate": 9.38111688419156e-05, + "loss": 2.5281, + "step": 27108 + }, + { + "epoch": 0.8038727278118791, + "grad_norm": 0.08865516632795334, + "learning_rate": 9.378373373547233e-05, + "loss": 2.5665, + "step": 27109 + }, + { + "epoch": 0.8039023811641907, + "grad_norm": 0.07765430212020874, + "learning_rate": 9.375630222614373e-05, + "loss": 2.5581, + "step": 27110 + }, + { + "epoch": 0.8039320345165021, + "grad_norm": 0.08028652518987656, + "learning_rate": 9.372887431417288e-05, + "loss": 2.5673, + "step": 27111 + }, + { + "epoch": 0.8039616878688136, + "grad_norm": 0.08697871118783951, + "learning_rate": 9.370144999980257e-05, + "loss": 2.5363, + "step": 27112 + }, + { + "epoch": 0.803991341221125, + "grad_norm": 0.08137553930282593, + "learning_rate": 9.367402928327562e-05, + "loss": 2.5627, + "step": 27113 + }, + { + "epoch": 0.8040209945734366, + "grad_norm": 0.08866169303655624, + "learning_rate": 9.364661216483494e-05, + "loss": 2.5202, + "step": 27114 + }, + { + "epoch": 0.804050647925748, + "grad_norm": 0.08629599213600159, + "learning_rate": 9.361919864472317e-05, + "loss": 2.5122, + "step": 27115 + }, + { + "epoch": 0.8040803012780595, + "grad_norm": 0.08508268743753433, + "learning_rate": 9.359178872318325e-05, + "loss": 2.5556, + "step": 27116 + }, + { + "epoch": 0.8041099546303709, + "grad_norm": 0.08392281830310822, + "learning_rate": 9.356438240045778e-05, + "loss": 2.537, + "step": 27117 + }, + { + "epoch": 0.8041396079826825, + "grad_norm": 0.09031965583562851, + "learning_rate": 9.353697967678942e-05, + "loss": 2.5787, + "step": 27118 + }, + { + "epoch": 0.8041692613349939, + "grad_norm": 0.09799328446388245, + "learning_rate": 9.350958055242093e-05, + "loss": 2.5494, + "step": 27119 + }, + { + "epoch": 0.8041989146873054, + "grad_norm": 0.079761803150177, + "learning_rate": 9.348218502759482e-05, + "loss": 2.5364, + "step": 27120 + }, + { + "epoch": 0.8042285680396168, + "grad_norm": 0.09077619016170502, + "learning_rate": 9.345479310255378e-05, + "loss": 2.5831, + "step": 27121 + }, + { + "epoch": 0.8042582213919284, + "grad_norm": 0.08731305599212646, + "learning_rate": 9.34274047775403e-05, + "loss": 2.551, + "step": 27122 + }, + { + "epoch": 0.8042878747442398, + "grad_norm": 0.08807423710823059, + "learning_rate": 9.340002005279697e-05, + "loss": 2.5866, + "step": 27123 + }, + { + "epoch": 0.8043175280965513, + "grad_norm": 0.094168521463871, + "learning_rate": 9.337263892856624e-05, + "loss": 2.5409, + "step": 27124 + }, + { + "epoch": 0.8043471814488627, + "grad_norm": 0.09009742736816406, + "learning_rate": 9.334526140509059e-05, + "loss": 2.5657, + "step": 27125 + }, + { + "epoch": 0.8043768348011743, + "grad_norm": 0.09437598288059235, + "learning_rate": 9.331788748261244e-05, + "loss": 2.5457, + "step": 27126 + }, + { + "epoch": 0.8044064881534857, + "grad_norm": 0.08345860242843628, + "learning_rate": 9.329051716137421e-05, + "loss": 2.5471, + "step": 27127 + }, + { + "epoch": 0.8044361415057972, + "grad_norm": 0.08858368545770645, + "learning_rate": 9.326315044161826e-05, + "loss": 2.5726, + "step": 27128 + }, + { + "epoch": 0.8044657948581088, + "grad_norm": 0.0833599865436554, + "learning_rate": 9.323578732358695e-05, + "loss": 2.5562, + "step": 27129 + }, + { + "epoch": 0.8044954482104202, + "grad_norm": 0.09260187298059464, + "learning_rate": 9.320842780752253e-05, + "loss": 2.5716, + "step": 27130 + }, + { + "epoch": 0.8045251015627317, + "grad_norm": 0.08744846284389496, + "learning_rate": 9.318107189366737e-05, + "loss": 2.5464, + "step": 27131 + }, + { + "epoch": 0.8045547549150431, + "grad_norm": 0.10305298864841461, + "learning_rate": 9.31537195822636e-05, + "loss": 2.5479, + "step": 27132 + }, + { + "epoch": 0.8045844082673547, + "grad_norm": 0.08364538848400116, + "learning_rate": 9.312637087355347e-05, + "loss": 2.5663, + "step": 27133 + }, + { + "epoch": 0.8046140616196661, + "grad_norm": 0.10554323345422745, + "learning_rate": 9.309902576777929e-05, + "loss": 2.5531, + "step": 27134 + }, + { + "epoch": 0.8046437149719776, + "grad_norm": 0.0885186716914177, + "learning_rate": 9.307168426518297e-05, + "loss": 2.5359, + "step": 27135 + }, + { + "epoch": 0.804673368324289, + "grad_norm": 0.09788565337657928, + "learning_rate": 9.304434636600673e-05, + "loss": 2.5316, + "step": 27136 + }, + { + "epoch": 0.8047030216766006, + "grad_norm": 0.08714038133621216, + "learning_rate": 9.301701207049251e-05, + "loss": 2.5498, + "step": 27137 + }, + { + "epoch": 0.804732675028912, + "grad_norm": 0.09137552231550217, + "learning_rate": 9.298968137888264e-05, + "loss": 2.5685, + "step": 27138 + }, + { + "epoch": 0.8047623283812235, + "grad_norm": 0.08183371275663376, + "learning_rate": 9.296235429141903e-05, + "loss": 2.6029, + "step": 27139 + }, + { + "epoch": 0.804791981733535, + "grad_norm": 0.09338207542896271, + "learning_rate": 9.293503080834365e-05, + "loss": 2.5423, + "step": 27140 + }, + { + "epoch": 0.8048216350858465, + "grad_norm": 0.08454801142215729, + "learning_rate": 9.290771092989842e-05, + "loss": 2.5582, + "step": 27141 + }, + { + "epoch": 0.8048512884381579, + "grad_norm": 0.08587336540222168, + "learning_rate": 9.288039465632526e-05, + "loss": 2.5583, + "step": 27142 + }, + { + "epoch": 0.8048809417904694, + "grad_norm": 0.09227335453033447, + "learning_rate": 9.285308198786612e-05, + "loss": 2.5783, + "step": 27143 + }, + { + "epoch": 0.8049105951427808, + "grad_norm": 0.09542107582092285, + "learning_rate": 9.28257729247628e-05, + "loss": 2.5716, + "step": 27144 + }, + { + "epoch": 0.8049402484950924, + "grad_norm": 0.09064224362373352, + "learning_rate": 9.279846746725729e-05, + "loss": 2.5784, + "step": 27145 + }, + { + "epoch": 0.8049699018474038, + "grad_norm": 0.08638561517000198, + "learning_rate": 9.277116561559113e-05, + "loss": 2.5743, + "step": 27146 + }, + { + "epoch": 0.8049995551997153, + "grad_norm": 0.09211703389883041, + "learning_rate": 9.274386737000617e-05, + "loss": 2.5298, + "step": 27147 + }, + { + "epoch": 0.8050292085520268, + "grad_norm": 0.08893940597772598, + "learning_rate": 9.271657273074419e-05, + "loss": 2.5534, + "step": 27148 + }, + { + "epoch": 0.8050588619043383, + "grad_norm": 0.08094016462564468, + "learning_rate": 9.268928169804685e-05, + "loss": 2.5812, + "step": 27149 + }, + { + "epoch": 0.8050885152566498, + "grad_norm": 0.09134780615568161, + "learning_rate": 9.266199427215576e-05, + "loss": 2.5535, + "step": 27150 + }, + { + "epoch": 0.8051181686089612, + "grad_norm": 0.0886651873588562, + "learning_rate": 9.263471045331274e-05, + "loss": 2.5498, + "step": 27151 + }, + { + "epoch": 0.8051478219612728, + "grad_norm": 0.08673643320798874, + "learning_rate": 9.260743024175921e-05, + "loss": 2.5392, + "step": 27152 + }, + { + "epoch": 0.8051774753135842, + "grad_norm": 0.0871662124991417, + "learning_rate": 9.258015363773692e-05, + "loss": 2.5455, + "step": 27153 + }, + { + "epoch": 0.8052071286658957, + "grad_norm": 0.09915335476398468, + "learning_rate": 9.25528806414872e-05, + "loss": 2.5388, + "step": 27154 + }, + { + "epoch": 0.8052367820182071, + "grad_norm": 0.08474797010421753, + "learning_rate": 9.252561125325187e-05, + "loss": 2.5726, + "step": 27155 + }, + { + "epoch": 0.8052664353705187, + "grad_norm": 0.10443658381700516, + "learning_rate": 9.249834547327201e-05, + "loss": 2.552, + "step": 27156 + }, + { + "epoch": 0.8052960887228301, + "grad_norm": 0.08584795147180557, + "learning_rate": 9.247108330178927e-05, + "loss": 2.5598, + "step": 27157 + }, + { + "epoch": 0.8053257420751416, + "grad_norm": 0.09309300780296326, + "learning_rate": 9.244382473904505e-05, + "loss": 2.5753, + "step": 27158 + }, + { + "epoch": 0.805355395427453, + "grad_norm": 0.08473578095436096, + "learning_rate": 9.241656978528073e-05, + "loss": 2.5529, + "step": 27159 + }, + { + "epoch": 0.8053850487797646, + "grad_norm": 0.08837595582008362, + "learning_rate": 9.238931844073762e-05, + "loss": 2.5405, + "step": 27160 + }, + { + "epoch": 0.805414702132076, + "grad_norm": 0.0914078801870346, + "learning_rate": 9.236207070565705e-05, + "loss": 2.5738, + "step": 27161 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 0.08648226410150528, + "learning_rate": 9.233482658028031e-05, + "loss": 2.576, + "step": 27162 + }, + { + "epoch": 0.805474008836699, + "grad_norm": 0.08988554030656815, + "learning_rate": 9.230758606484857e-05, + "loss": 2.5555, + "step": 27163 + }, + { + "epoch": 0.8055036621890105, + "grad_norm": 0.08958348631858826, + "learning_rate": 9.228034915960321e-05, + "loss": 2.5633, + "step": 27164 + }, + { + "epoch": 0.8055333155413219, + "grad_norm": 0.09428317099809647, + "learning_rate": 9.22531158647854e-05, + "loss": 2.5612, + "step": 27165 + }, + { + "epoch": 0.8055629688936334, + "grad_norm": 0.08540605753660202, + "learning_rate": 9.222588618063632e-05, + "loss": 2.5107, + "step": 27166 + }, + { + "epoch": 0.8055926222459449, + "grad_norm": 0.0853445827960968, + "learning_rate": 9.219866010739691e-05, + "loss": 2.5305, + "step": 27167 + }, + { + "epoch": 0.8056222755982564, + "grad_norm": 0.08939754217863083, + "learning_rate": 9.217143764530834e-05, + "loss": 2.5107, + "step": 27168 + }, + { + "epoch": 0.8056519289505678, + "grad_norm": 0.08962711691856384, + "learning_rate": 9.214421879461172e-05, + "loss": 2.5892, + "step": 27169 + }, + { + "epoch": 0.8056815823028793, + "grad_norm": 0.08381153643131256, + "learning_rate": 9.211700355554803e-05, + "loss": 2.5469, + "step": 27170 + }, + { + "epoch": 0.8057112356551909, + "grad_norm": 0.09153848141431808, + "learning_rate": 9.208979192835832e-05, + "loss": 2.5398, + "step": 27171 + }, + { + "epoch": 0.8057408890075023, + "grad_norm": 0.09388794004917145, + "learning_rate": 9.206258391328348e-05, + "loss": 2.5415, + "step": 27172 + }, + { + "epoch": 0.8057705423598138, + "grad_norm": 0.08769894391298294, + "learning_rate": 9.203537951056445e-05, + "loss": 2.5569, + "step": 27173 + }, + { + "epoch": 0.8058001957121252, + "grad_norm": 0.09165961295366287, + "learning_rate": 9.20081787204422e-05, + "loss": 2.5388, + "step": 27174 + }, + { + "epoch": 0.8058298490644368, + "grad_norm": 0.08498650044202805, + "learning_rate": 9.19809815431576e-05, + "loss": 2.5635, + "step": 27175 + }, + { + "epoch": 0.8058595024167482, + "grad_norm": 0.08485870063304901, + "learning_rate": 9.195378797895138e-05, + "loss": 2.5463, + "step": 27176 + }, + { + "epoch": 0.8058891557690597, + "grad_norm": 0.08683105558156967, + "learning_rate": 9.192659802806441e-05, + "loss": 2.5679, + "step": 27177 + }, + { + "epoch": 0.8059188091213711, + "grad_norm": 0.0875626653432846, + "learning_rate": 9.189941169073751e-05, + "loss": 2.5345, + "step": 27178 + }, + { + "epoch": 0.8059484624736827, + "grad_norm": 0.08964836597442627, + "learning_rate": 9.187222896721131e-05, + "loss": 2.508, + "step": 27179 + }, + { + "epoch": 0.8059781158259941, + "grad_norm": 0.09754453599452972, + "learning_rate": 9.184504985772663e-05, + "loss": 2.5664, + "step": 27180 + }, + { + "epoch": 0.8060077691783056, + "grad_norm": 0.0858624055981636, + "learning_rate": 9.18178743625241e-05, + "loss": 2.5306, + "step": 27181 + }, + { + "epoch": 0.806037422530617, + "grad_norm": 0.09266085922718048, + "learning_rate": 9.179070248184429e-05, + "loss": 2.5597, + "step": 27182 + }, + { + "epoch": 0.8060670758829286, + "grad_norm": 0.08479072898626328, + "learning_rate": 9.176353421592792e-05, + "loss": 2.529, + "step": 27183 + }, + { + "epoch": 0.80609672923524, + "grad_norm": 0.0932430848479271, + "learning_rate": 9.173636956501552e-05, + "loss": 2.5295, + "step": 27184 + }, + { + "epoch": 0.8061263825875515, + "grad_norm": 0.08744124323129654, + "learning_rate": 9.170920852934766e-05, + "loss": 2.5377, + "step": 27185 + }, + { + "epoch": 0.806156035939863, + "grad_norm": 0.08074875921010971, + "learning_rate": 9.168205110916483e-05, + "loss": 2.568, + "step": 27186 + }, + { + "epoch": 0.8061856892921745, + "grad_norm": 0.09365368634462357, + "learning_rate": 9.165489730470749e-05, + "loss": 2.5305, + "step": 27187 + }, + { + "epoch": 0.8062153426444859, + "grad_norm": 0.087981678545475, + "learning_rate": 9.162774711621619e-05, + "loss": 2.5876, + "step": 27188 + }, + { + "epoch": 0.8062449959967974, + "grad_norm": 0.08751653879880905, + "learning_rate": 9.160060054393121e-05, + "loss": 2.5802, + "step": 27189 + }, + { + "epoch": 0.8062746493491089, + "grad_norm": 0.09096699953079224, + "learning_rate": 9.157345758809304e-05, + "loss": 2.5875, + "step": 27190 + }, + { + "epoch": 0.8063043027014204, + "grad_norm": 0.08497175574302673, + "learning_rate": 9.154631824894205e-05, + "loss": 2.5296, + "step": 27191 + }, + { + "epoch": 0.8063339560537319, + "grad_norm": 0.09054294228553772, + "learning_rate": 9.151918252671849e-05, + "loss": 2.5621, + "step": 27192 + }, + { + "epoch": 0.8063636094060433, + "grad_norm": 0.08535125106573105, + "learning_rate": 9.149205042166269e-05, + "loss": 2.5539, + "step": 27193 + }, + { + "epoch": 0.8063932627583549, + "grad_norm": 0.09375783056020737, + "learning_rate": 9.14649219340149e-05, + "loss": 2.5093, + "step": 27194 + }, + { + "epoch": 0.8064229161106663, + "grad_norm": 0.09020760655403137, + "learning_rate": 9.143779706401533e-05, + "loss": 2.5649, + "step": 27195 + }, + { + "epoch": 0.8064525694629778, + "grad_norm": 0.08186337351799011, + "learning_rate": 9.141067581190426e-05, + "loss": 2.5499, + "step": 27196 + }, + { + "epoch": 0.8064822228152893, + "grad_norm": 0.09546951949596405, + "learning_rate": 9.138355817792176e-05, + "loss": 2.5379, + "step": 27197 + }, + { + "epoch": 0.8065118761676008, + "grad_norm": 0.08257407695055008, + "learning_rate": 9.135644416230798e-05, + "loss": 2.5757, + "step": 27198 + }, + { + "epoch": 0.8065415295199122, + "grad_norm": 0.11272603273391724, + "learning_rate": 9.1329333765303e-05, + "loss": 2.5327, + "step": 27199 + }, + { + "epoch": 0.8065711828722237, + "grad_norm": 0.08548673987388611, + "learning_rate": 9.130222698714707e-05, + "loss": 2.5645, + "step": 27200 + }, + { + "epoch": 0.8066008362245352, + "grad_norm": 0.07845814526081085, + "learning_rate": 9.127512382807984e-05, + "loss": 2.5013, + "step": 27201 + }, + { + "epoch": 0.8066304895768467, + "grad_norm": 0.08343102782964706, + "learning_rate": 9.124802428834162e-05, + "loss": 2.5723, + "step": 27202 + }, + { + "epoch": 0.8066601429291581, + "grad_norm": 0.08776593208312988, + "learning_rate": 9.122092836817236e-05, + "loss": 2.5634, + "step": 27203 + }, + { + "epoch": 0.8066897962814696, + "grad_norm": 0.08458821475505829, + "learning_rate": 9.11938360678119e-05, + "loss": 2.561, + "step": 27204 + }, + { + "epoch": 0.8067194496337811, + "grad_norm": 0.09079433232545853, + "learning_rate": 9.116674738750025e-05, + "loss": 2.5909, + "step": 27205 + }, + { + "epoch": 0.8067491029860926, + "grad_norm": 0.08594826608896255, + "learning_rate": 9.113966232747717e-05, + "loss": 2.5687, + "step": 27206 + }, + { + "epoch": 0.806778756338404, + "grad_norm": 0.08449357748031616, + "learning_rate": 9.111258088798258e-05, + "loss": 2.5505, + "step": 27207 + }, + { + "epoch": 0.8068084096907155, + "grad_norm": 0.08508380502462387, + "learning_rate": 9.108550306925628e-05, + "loss": 2.5673, + "step": 27208 + }, + { + "epoch": 0.806838063043027, + "grad_norm": 0.0814172700047493, + "learning_rate": 9.105842887153804e-05, + "loss": 2.5213, + "step": 27209 + }, + { + "epoch": 0.8068677163953385, + "grad_norm": 0.07920223474502563, + "learning_rate": 9.10313582950677e-05, + "loss": 2.5507, + "step": 27210 + }, + { + "epoch": 0.8068973697476499, + "grad_norm": 0.08752724528312683, + "learning_rate": 9.100429134008481e-05, + "loss": 2.5717, + "step": 27211 + }, + { + "epoch": 0.8069270230999614, + "grad_norm": 0.08449140191078186, + "learning_rate": 9.097722800682905e-05, + "loss": 2.5718, + "step": 27212 + }, + { + "epoch": 0.806956676452273, + "grad_norm": 0.08490315824747086, + "learning_rate": 9.09501682955402e-05, + "loss": 2.5577, + "step": 27213 + }, + { + "epoch": 0.8069863298045844, + "grad_norm": 0.08424320816993713, + "learning_rate": 9.092311220645772e-05, + "loss": 2.5352, + "step": 27214 + }, + { + "epoch": 0.8070159831568959, + "grad_norm": 0.07813844084739685, + "learning_rate": 9.089605973982134e-05, + "loss": 2.5461, + "step": 27215 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 0.08213941752910614, + "learning_rate": 9.086901089587063e-05, + "loss": 2.5564, + "step": 27216 + }, + { + "epoch": 0.8070752898615189, + "grad_norm": 0.07970205694437027, + "learning_rate": 9.08419656748451e-05, + "loss": 2.5745, + "step": 27217 + }, + { + "epoch": 0.8071049432138303, + "grad_norm": 0.07597661763429642, + "learning_rate": 9.081492407698411e-05, + "loss": 2.5382, + "step": 27218 + }, + { + "epoch": 0.8071345965661418, + "grad_norm": 0.07692022621631622, + "learning_rate": 9.078788610252725e-05, + "loss": 2.5262, + "step": 27219 + }, + { + "epoch": 0.8071642499184533, + "grad_norm": 0.08411438018083572, + "learning_rate": 9.07608517517139e-05, + "loss": 2.5667, + "step": 27220 + }, + { + "epoch": 0.8071939032707648, + "grad_norm": 0.08145549148321152, + "learning_rate": 9.07338210247835e-05, + "loss": 2.5732, + "step": 27221 + }, + { + "epoch": 0.8072235566230762, + "grad_norm": 0.08674898743629456, + "learning_rate": 9.070679392197534e-05, + "loss": 2.5598, + "step": 27222 + }, + { + "epoch": 0.8072532099753877, + "grad_norm": 0.08803395926952362, + "learning_rate": 9.067977044352871e-05, + "loss": 2.5673, + "step": 27223 + }, + { + "epoch": 0.8072828633276992, + "grad_norm": 0.08549468964338303, + "learning_rate": 9.065275058968303e-05, + "loss": 2.569, + "step": 27224 + }, + { + "epoch": 0.8073125166800107, + "grad_norm": 0.08942229300737381, + "learning_rate": 9.062573436067745e-05, + "loss": 2.5584, + "step": 27225 + }, + { + "epoch": 0.8073421700323221, + "grad_norm": 0.07943151146173477, + "learning_rate": 9.059872175675126e-05, + "loss": 2.56, + "step": 27226 + }, + { + "epoch": 0.8073718233846336, + "grad_norm": 0.08107995986938477, + "learning_rate": 9.057171277814358e-05, + "loss": 2.5326, + "step": 27227 + }, + { + "epoch": 0.8074014767369451, + "grad_norm": 0.08758348971605301, + "learning_rate": 9.054470742509374e-05, + "loss": 2.5644, + "step": 27228 + }, + { + "epoch": 0.8074311300892566, + "grad_norm": 0.08620677888393402, + "learning_rate": 9.051770569784085e-05, + "loss": 2.565, + "step": 27229 + }, + { + "epoch": 0.807460783441568, + "grad_norm": 0.08632313460111618, + "learning_rate": 9.049070759662392e-05, + "loss": 2.5683, + "step": 27230 + }, + { + "epoch": 0.8074904367938796, + "grad_norm": 0.08370267599821091, + "learning_rate": 9.046371312168222e-05, + "loss": 2.5377, + "step": 27231 + }, + { + "epoch": 0.807520090146191, + "grad_norm": 0.0866411030292511, + "learning_rate": 9.04367222732545e-05, + "loss": 2.5569, + "step": 27232 + }, + { + "epoch": 0.8075497434985025, + "grad_norm": 0.07908803224563599, + "learning_rate": 9.04097350515799e-05, + "loss": 2.5735, + "step": 27233 + }, + { + "epoch": 0.807579396850814, + "grad_norm": 0.09094138443470001, + "learning_rate": 9.038275145689739e-05, + "loss": 2.5758, + "step": 27234 + }, + { + "epoch": 0.8076090502031255, + "grad_norm": 0.08168381452560425, + "learning_rate": 9.035577148944596e-05, + "loss": 2.5051, + "step": 27235 + }, + { + "epoch": 0.807638703555437, + "grad_norm": 0.09741193056106567, + "learning_rate": 9.032879514946445e-05, + "loss": 2.5764, + "step": 27236 + }, + { + "epoch": 0.8076683569077484, + "grad_norm": 0.08507335186004639, + "learning_rate": 9.030182243719181e-05, + "loss": 2.5578, + "step": 27237 + }, + { + "epoch": 0.8076980102600599, + "grad_norm": 0.0892958715558052, + "learning_rate": 9.027485335286684e-05, + "loss": 2.5342, + "step": 27238 + }, + { + "epoch": 0.8077276636123714, + "grad_norm": 0.09135685861110687, + "learning_rate": 9.024788789672838e-05, + "loss": 2.5316, + "step": 27239 + }, + { + "epoch": 0.8077573169646829, + "grad_norm": 0.07761748135089874, + "learning_rate": 9.022092606901505e-05, + "loss": 2.5658, + "step": 27240 + }, + { + "epoch": 0.8077869703169943, + "grad_norm": 0.09321673214435577, + "learning_rate": 9.019396786996592e-05, + "loss": 2.5526, + "step": 27241 + }, + { + "epoch": 0.8078166236693058, + "grad_norm": 0.08484958112239838, + "learning_rate": 9.01670132998197e-05, + "loss": 2.5264, + "step": 27242 + }, + { + "epoch": 0.8078462770216173, + "grad_norm": 0.08440767973661423, + "learning_rate": 9.014006235881473e-05, + "loss": 2.5317, + "step": 27243 + }, + { + "epoch": 0.8078759303739288, + "grad_norm": 0.08452589809894562, + "learning_rate": 9.011311504718988e-05, + "loss": 2.5229, + "step": 27244 + }, + { + "epoch": 0.8079055837262402, + "grad_norm": 0.08038047701120377, + "learning_rate": 9.008617136518377e-05, + "loss": 2.5367, + "step": 27245 + }, + { + "epoch": 0.8079352370785517, + "grad_norm": 0.08374357968568802, + "learning_rate": 9.005923131303496e-05, + "loss": 2.5798, + "step": 27246 + }, + { + "epoch": 0.8079648904308632, + "grad_norm": 0.08139440417289734, + "learning_rate": 9.003229489098203e-05, + "loss": 2.5661, + "step": 27247 + }, + { + "epoch": 0.8079945437831747, + "grad_norm": 0.0730137825012207, + "learning_rate": 9.000536209926353e-05, + "loss": 2.5339, + "step": 27248 + }, + { + "epoch": 0.8080241971354861, + "grad_norm": 0.07719502598047256, + "learning_rate": 8.997843293811786e-05, + "loss": 2.5581, + "step": 27249 + }, + { + "epoch": 0.8080538504877977, + "grad_norm": 0.08236635476350784, + "learning_rate": 8.995150740778358e-05, + "loss": 2.5517, + "step": 27250 + }, + { + "epoch": 0.8080835038401091, + "grad_norm": 0.08757361024618149, + "learning_rate": 8.992458550849908e-05, + "loss": 2.5764, + "step": 27251 + }, + { + "epoch": 0.8081131571924206, + "grad_norm": 0.0871884822845459, + "learning_rate": 8.989766724050274e-05, + "loss": 2.5284, + "step": 27252 + }, + { + "epoch": 0.808142810544732, + "grad_norm": 0.0840027928352356, + "learning_rate": 8.987075260403299e-05, + "loss": 2.5879, + "step": 27253 + }, + { + "epoch": 0.8081724638970436, + "grad_norm": 0.08224823325872421, + "learning_rate": 8.984384159932807e-05, + "loss": 2.5615, + "step": 27254 + }, + { + "epoch": 0.8082021172493551, + "grad_norm": 0.07868728041648865, + "learning_rate": 8.98169342266264e-05, + "loss": 2.5536, + "step": 27255 + }, + { + "epoch": 0.8082317706016665, + "grad_norm": 0.09318762272596359, + "learning_rate": 8.979003048616613e-05, + "loss": 2.5193, + "step": 27256 + }, + { + "epoch": 0.808261423953978, + "grad_norm": 0.08691784739494324, + "learning_rate": 8.976313037818562e-05, + "loss": 2.5553, + "step": 27257 + }, + { + "epoch": 0.8082910773062895, + "grad_norm": 0.08575180917978287, + "learning_rate": 8.973623390292296e-05, + "loss": 2.4806, + "step": 27258 + }, + { + "epoch": 0.808320730658601, + "grad_norm": 0.09287870675325394, + "learning_rate": 8.970934106061634e-05, + "loss": 2.5817, + "step": 27259 + }, + { + "epoch": 0.8083503840109124, + "grad_norm": 0.08273753523826599, + "learning_rate": 8.968245185150398e-05, + "loss": 2.5437, + "step": 27260 + }, + { + "epoch": 0.808380037363224, + "grad_norm": 0.08769606053829193, + "learning_rate": 8.965556627582394e-05, + "loss": 2.5833, + "step": 27261 + }, + { + "epoch": 0.8084096907155354, + "grad_norm": 0.08518931269645691, + "learning_rate": 8.962868433381427e-05, + "loss": 2.5537, + "step": 27262 + }, + { + "epoch": 0.8084393440678469, + "grad_norm": 0.08931389451026917, + "learning_rate": 8.960180602571305e-05, + "loss": 2.5686, + "step": 27263 + }, + { + "epoch": 0.8084689974201583, + "grad_norm": 0.08812297135591507, + "learning_rate": 8.957493135175825e-05, + "loss": 2.5411, + "step": 27264 + }, + { + "epoch": 0.8084986507724699, + "grad_norm": 0.07997924834489822, + "learning_rate": 8.954806031218793e-05, + "loss": 2.5476, + "step": 27265 + }, + { + "epoch": 0.8085283041247813, + "grad_norm": 0.0929432362318039, + "learning_rate": 8.952119290723999e-05, + "loss": 2.5313, + "step": 27266 + }, + { + "epoch": 0.8085579574770928, + "grad_norm": 0.08756405860185623, + "learning_rate": 8.949432913715233e-05, + "loss": 2.5535, + "step": 27267 + }, + { + "epoch": 0.8085876108294042, + "grad_norm": 0.08935125917196274, + "learning_rate": 8.946746900216279e-05, + "loss": 2.5122, + "step": 27268 + }, + { + "epoch": 0.8086172641817158, + "grad_norm": 0.08071815222501755, + "learning_rate": 8.944061250250935e-05, + "loss": 2.5438, + "step": 27269 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 0.08228367567062378, + "learning_rate": 8.941375963842973e-05, + "loss": 2.5385, + "step": 27270 + }, + { + "epoch": 0.8086765708863387, + "grad_norm": 0.07596591114997864, + "learning_rate": 8.938691041016178e-05, + "loss": 2.5565, + "step": 27271 + }, + { + "epoch": 0.8087062242386501, + "grad_norm": 0.08595212548971176, + "learning_rate": 8.936006481794318e-05, + "loss": 2.5433, + "step": 27272 + }, + { + "epoch": 0.8087358775909617, + "grad_norm": 0.08561423420906067, + "learning_rate": 8.933322286201173e-05, + "loss": 2.5506, + "step": 27273 + }, + { + "epoch": 0.8087655309432732, + "grad_norm": 0.0829935148358345, + "learning_rate": 8.930638454260504e-05, + "loss": 2.5858, + "step": 27274 + }, + { + "epoch": 0.8087951842955846, + "grad_norm": 0.08630276471376419, + "learning_rate": 8.927954985996084e-05, + "loss": 2.5584, + "step": 27275 + }, + { + "epoch": 0.8088248376478961, + "grad_norm": 0.08296886831521988, + "learning_rate": 8.925271881431679e-05, + "loss": 2.5156, + "step": 27276 + }, + { + "epoch": 0.8088544910002076, + "grad_norm": 0.08110889047384262, + "learning_rate": 8.922589140591036e-05, + "loss": 2.5461, + "step": 27277 + }, + { + "epoch": 0.8088841443525191, + "grad_norm": 0.09102726727724075, + "learning_rate": 8.919906763497914e-05, + "loss": 2.5821, + "step": 27278 + }, + { + "epoch": 0.8089137977048305, + "grad_norm": 0.083770751953125, + "learning_rate": 8.917224750176056e-05, + "loss": 2.557, + "step": 27279 + }, + { + "epoch": 0.808943451057142, + "grad_norm": 0.0893232524394989, + "learning_rate": 8.914543100649242e-05, + "loss": 2.5138, + "step": 27280 + }, + { + "epoch": 0.8089731044094535, + "grad_norm": 0.08431801199913025, + "learning_rate": 8.911861814941197e-05, + "loss": 2.5755, + "step": 27281 + }, + { + "epoch": 0.809002757761765, + "grad_norm": 0.08530184626579285, + "learning_rate": 8.90918089307567e-05, + "loss": 2.5803, + "step": 27282 + }, + { + "epoch": 0.8090324111140764, + "grad_norm": 0.08377610146999359, + "learning_rate": 8.906500335076395e-05, + "loss": 2.5439, + "step": 27283 + }, + { + "epoch": 0.809062064466388, + "grad_norm": 0.08271834254264832, + "learning_rate": 8.903820140967116e-05, + "loss": 2.5188, + "step": 27284 + }, + { + "epoch": 0.8090917178186994, + "grad_norm": 0.0832534208893776, + "learning_rate": 8.901140310771566e-05, + "loss": 2.5724, + "step": 27285 + }, + { + "epoch": 0.8091213711710109, + "grad_norm": 0.08335760980844498, + "learning_rate": 8.898460844513484e-05, + "loss": 2.5055, + "step": 27286 + }, + { + "epoch": 0.8091510245233223, + "grad_norm": 0.08121860772371292, + "learning_rate": 8.895781742216574e-05, + "loss": 2.5158, + "step": 27287 + }, + { + "epoch": 0.8091806778756339, + "grad_norm": 0.08464854210615158, + "learning_rate": 8.893103003904573e-05, + "loss": 2.6147, + "step": 27288 + }, + { + "epoch": 0.8092103312279453, + "grad_norm": 0.09068761020898819, + "learning_rate": 8.890424629601196e-05, + "loss": 2.5324, + "step": 27289 + }, + { + "epoch": 0.8092399845802568, + "grad_norm": 0.0852959156036377, + "learning_rate": 8.887746619330166e-05, + "loss": 2.4994, + "step": 27290 + }, + { + "epoch": 0.8092696379325682, + "grad_norm": 0.08460068702697754, + "learning_rate": 8.885068973115201e-05, + "loss": 2.5823, + "step": 27291 + }, + { + "epoch": 0.8092992912848798, + "grad_norm": 0.08059611171483994, + "learning_rate": 8.882391690979996e-05, + "loss": 2.5206, + "step": 27292 + }, + { + "epoch": 0.8093289446371912, + "grad_norm": 0.08426472544670105, + "learning_rate": 8.879714772948278e-05, + "loss": 2.52, + "step": 27293 + }, + { + "epoch": 0.8093585979895027, + "grad_norm": 0.08970203250646591, + "learning_rate": 8.877038219043748e-05, + "loss": 2.5759, + "step": 27294 + }, + { + "epoch": 0.8093882513418142, + "grad_norm": 0.07927025109529495, + "learning_rate": 8.8743620292901e-05, + "loss": 2.5627, + "step": 27295 + }, + { + "epoch": 0.8094179046941257, + "grad_norm": 0.08982503414154053, + "learning_rate": 8.871686203711038e-05, + "loss": 2.5332, + "step": 27296 + }, + { + "epoch": 0.8094475580464372, + "grad_norm": 0.07873018085956573, + "learning_rate": 8.869010742330264e-05, + "loss": 2.5357, + "step": 27297 + }, + { + "epoch": 0.8094772113987486, + "grad_norm": 0.0835869163274765, + "learning_rate": 8.866335645171447e-05, + "loss": 2.5394, + "step": 27298 + }, + { + "epoch": 0.8095068647510602, + "grad_norm": 0.08835247904062271, + "learning_rate": 8.863660912258292e-05, + "loss": 2.5496, + "step": 27299 + }, + { + "epoch": 0.8095365181033716, + "grad_norm": 0.09176039695739746, + "learning_rate": 8.86098654361448e-05, + "loss": 2.5585, + "step": 27300 + }, + { + "epoch": 0.8095661714556831, + "grad_norm": 0.0799422487616539, + "learning_rate": 8.858312539263691e-05, + "loss": 2.5741, + "step": 27301 + }, + { + "epoch": 0.8095958248079945, + "grad_norm": 0.08734720200300217, + "learning_rate": 8.855638899229607e-05, + "loss": 2.5666, + "step": 27302 + }, + { + "epoch": 0.8096254781603061, + "grad_norm": 0.08843422681093216, + "learning_rate": 8.852965623535903e-05, + "loss": 2.5211, + "step": 27303 + }, + { + "epoch": 0.8096551315126175, + "grad_norm": 0.08986977487802505, + "learning_rate": 8.850292712206249e-05, + "loss": 2.5628, + "step": 27304 + }, + { + "epoch": 0.809684784864929, + "grad_norm": 0.095467209815979, + "learning_rate": 8.847620165264308e-05, + "loss": 2.5415, + "step": 27305 + }, + { + "epoch": 0.8097144382172404, + "grad_norm": 0.08874645084142685, + "learning_rate": 8.844947982733765e-05, + "loss": 2.5446, + "step": 27306 + }, + { + "epoch": 0.809744091569552, + "grad_norm": 0.09660443663597107, + "learning_rate": 8.842276164638286e-05, + "loss": 2.549, + "step": 27307 + }, + { + "epoch": 0.8097737449218634, + "grad_norm": 0.0868678167462349, + "learning_rate": 8.839604711001497e-05, + "loss": 2.5308, + "step": 27308 + }, + { + "epoch": 0.8098033982741749, + "grad_norm": 0.08632826060056686, + "learning_rate": 8.836933621847082e-05, + "loss": 2.5474, + "step": 27309 + }, + { + "epoch": 0.8098330516264863, + "grad_norm": 0.09466986358165741, + "learning_rate": 8.83426289719868e-05, + "loss": 2.5735, + "step": 27310 + }, + { + "epoch": 0.8098627049787979, + "grad_norm": 0.07506114989519119, + "learning_rate": 8.831592537079946e-05, + "loss": 2.5761, + "step": 27311 + }, + { + "epoch": 0.8098923583311093, + "grad_norm": 0.08506161719560623, + "learning_rate": 8.828922541514534e-05, + "loss": 2.5409, + "step": 27312 + }, + { + "epoch": 0.8099220116834208, + "grad_norm": 0.08373412489891052, + "learning_rate": 8.826252910526072e-05, + "loss": 2.5644, + "step": 27313 + }, + { + "epoch": 0.8099516650357322, + "grad_norm": 0.08818294107913971, + "learning_rate": 8.823583644138211e-05, + "loss": 2.5761, + "step": 27314 + }, + { + "epoch": 0.8099813183880438, + "grad_norm": 0.08285947889089584, + "learning_rate": 8.82091474237458e-05, + "loss": 2.5302, + "step": 27315 + }, + { + "epoch": 0.8100109717403553, + "grad_norm": 0.08200524747371674, + "learning_rate": 8.818246205258822e-05, + "loss": 2.511, + "step": 27316 + }, + { + "epoch": 0.8100406250926667, + "grad_norm": 0.08296222984790802, + "learning_rate": 8.815578032814565e-05, + "loss": 2.5493, + "step": 27317 + }, + { + "epoch": 0.8100702784449783, + "grad_norm": 0.07840599864721298, + "learning_rate": 8.81291022506543e-05, + "loss": 2.5219, + "step": 27318 + }, + { + "epoch": 0.8100999317972897, + "grad_norm": 0.08984462171792984, + "learning_rate": 8.810242782035044e-05, + "loss": 2.5408, + "step": 27319 + }, + { + "epoch": 0.8101295851496012, + "grad_norm": 0.07741333544254303, + "learning_rate": 8.807575703747028e-05, + "loss": 2.577, + "step": 27320 + }, + { + "epoch": 0.8101592385019126, + "grad_norm": 0.08571401238441467, + "learning_rate": 8.804908990224996e-05, + "loss": 2.5757, + "step": 27321 + }, + { + "epoch": 0.8101888918542242, + "grad_norm": 0.08181820064783096, + "learning_rate": 8.802242641492575e-05, + "loss": 2.5315, + "step": 27322 + }, + { + "epoch": 0.8102185452065356, + "grad_norm": 0.09456123411655426, + "learning_rate": 8.799576657573361e-05, + "loss": 2.5738, + "step": 27323 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 0.07505124807357788, + "learning_rate": 8.796911038490968e-05, + "loss": 2.5885, + "step": 27324 + }, + { + "epoch": 0.8102778519111585, + "grad_norm": 0.0880398377776146, + "learning_rate": 8.794245784269006e-05, + "loss": 2.5464, + "step": 27325 + }, + { + "epoch": 0.8103075052634701, + "grad_norm": 0.08892618864774704, + "learning_rate": 8.791580894931062e-05, + "loss": 2.5434, + "step": 27326 + }, + { + "epoch": 0.8103371586157815, + "grad_norm": 0.08675041794776917, + "learning_rate": 8.788916370500749e-05, + "loss": 2.5796, + "step": 27327 + }, + { + "epoch": 0.810366811968093, + "grad_norm": 0.09199152141809464, + "learning_rate": 8.78625221100165e-05, + "loss": 2.5285, + "step": 27328 + }, + { + "epoch": 0.8103964653204044, + "grad_norm": 0.08589424937963486, + "learning_rate": 8.783588416457367e-05, + "loss": 2.6102, + "step": 27329 + }, + { + "epoch": 0.810426118672716, + "grad_norm": 0.09247377514839172, + "learning_rate": 8.780924986891481e-05, + "loss": 2.548, + "step": 27330 + }, + { + "epoch": 0.8104557720250274, + "grad_norm": 0.09178514033555984, + "learning_rate": 8.77826192232758e-05, + "loss": 2.5219, + "step": 27331 + }, + { + "epoch": 0.8104854253773389, + "grad_norm": 0.0920218974351883, + "learning_rate": 8.775599222789244e-05, + "loss": 2.5852, + "step": 27332 + }, + { + "epoch": 0.8105150787296503, + "grad_norm": 0.08702158182859421, + "learning_rate": 8.772936888300053e-05, + "loss": 2.5741, + "step": 27333 + }, + { + "epoch": 0.8105447320819619, + "grad_norm": 0.08939892798662186, + "learning_rate": 8.770274918883586e-05, + "loss": 2.5391, + "step": 27334 + }, + { + "epoch": 0.8105743854342733, + "grad_norm": 0.08665677160024643, + "learning_rate": 8.767613314563405e-05, + "loss": 2.5969, + "step": 27335 + }, + { + "epoch": 0.8106040387865848, + "grad_norm": 0.08674220740795135, + "learning_rate": 8.764952075363092e-05, + "loss": 2.5539, + "step": 27336 + }, + { + "epoch": 0.8106336921388964, + "grad_norm": 0.08368730545043945, + "learning_rate": 8.762291201306199e-05, + "loss": 2.5834, + "step": 27337 + }, + { + "epoch": 0.8106633454912078, + "grad_norm": 0.09368577599525452, + "learning_rate": 8.759630692416304e-05, + "loss": 2.5782, + "step": 27338 + }, + { + "epoch": 0.8106929988435193, + "grad_norm": 0.08542183041572571, + "learning_rate": 8.756970548716953e-05, + "loss": 2.5024, + "step": 27339 + }, + { + "epoch": 0.8107226521958307, + "grad_norm": 0.08224324882030487, + "learning_rate": 8.75431077023171e-05, + "loss": 2.5336, + "step": 27340 + }, + { + "epoch": 0.8107523055481423, + "grad_norm": 0.09761907160282135, + "learning_rate": 8.751651356984119e-05, + "loss": 2.5737, + "step": 27341 + }, + { + "epoch": 0.8107819589004537, + "grad_norm": 0.08399603515863419, + "learning_rate": 8.748992308997755e-05, + "loss": 2.5764, + "step": 27342 + }, + { + "epoch": 0.8108116122527652, + "grad_norm": 0.09577136486768723, + "learning_rate": 8.746333626296127e-05, + "loss": 2.542, + "step": 27343 + }, + { + "epoch": 0.8108412656050766, + "grad_norm": 0.08677530288696289, + "learning_rate": 8.743675308902787e-05, + "loss": 2.5231, + "step": 27344 + }, + { + "epoch": 0.8108709189573882, + "grad_norm": 0.0955810546875, + "learning_rate": 8.741017356841297e-05, + "loss": 2.5791, + "step": 27345 + }, + { + "epoch": 0.8109005723096996, + "grad_norm": 0.08033601939678192, + "learning_rate": 8.738359770135179e-05, + "loss": 2.5821, + "step": 27346 + }, + { + "epoch": 0.8109302256620111, + "grad_norm": 0.08477357029914856, + "learning_rate": 8.735702548807966e-05, + "loss": 2.6141, + "step": 27347 + }, + { + "epoch": 0.8109598790143225, + "grad_norm": 0.08564193546772003, + "learning_rate": 8.733045692883191e-05, + "loss": 2.5298, + "step": 27348 + }, + { + "epoch": 0.8109895323666341, + "grad_norm": 0.08256185054779053, + "learning_rate": 8.730389202384382e-05, + "loss": 2.5485, + "step": 27349 + }, + { + "epoch": 0.8110191857189455, + "grad_norm": 0.07844484597444534, + "learning_rate": 8.727733077335053e-05, + "loss": 2.5416, + "step": 27350 + }, + { + "epoch": 0.811048839071257, + "grad_norm": 0.08724987506866455, + "learning_rate": 8.725077317758739e-05, + "loss": 2.5611, + "step": 27351 + }, + { + "epoch": 0.8110784924235684, + "grad_norm": 0.0855749100446701, + "learning_rate": 8.722421923678959e-05, + "loss": 2.5651, + "step": 27352 + }, + { + "epoch": 0.81110814577588, + "grad_norm": 0.0911383256316185, + "learning_rate": 8.719766895119207e-05, + "loss": 2.5566, + "step": 27353 + }, + { + "epoch": 0.8111377991281914, + "grad_norm": 0.08274415880441666, + "learning_rate": 8.717112232103008e-05, + "loss": 2.5508, + "step": 27354 + }, + { + "epoch": 0.8111674524805029, + "grad_norm": 0.08870076388120651, + "learning_rate": 8.714457934653863e-05, + "loss": 2.5163, + "step": 27355 + }, + { + "epoch": 0.8111971058328143, + "grad_norm": 0.0905737578868866, + "learning_rate": 8.711804002795276e-05, + "loss": 2.5858, + "step": 27356 + }, + { + "epoch": 0.8112267591851259, + "grad_norm": 0.0893714651465416, + "learning_rate": 8.709150436550744e-05, + "loss": 2.5431, + "step": 27357 + }, + { + "epoch": 0.8112564125374374, + "grad_norm": 0.0892692506313324, + "learning_rate": 8.706497235943783e-05, + "loss": 2.5074, + "step": 27358 + }, + { + "epoch": 0.8112860658897488, + "grad_norm": 0.09372442960739136, + "learning_rate": 8.703844400997878e-05, + "loss": 2.5627, + "step": 27359 + }, + { + "epoch": 0.8113157192420604, + "grad_norm": 0.09268846362829208, + "learning_rate": 8.701191931736518e-05, + "loss": 2.5552, + "step": 27360 + }, + { + "epoch": 0.8113453725943718, + "grad_norm": 0.0850028470158577, + "learning_rate": 8.698539828183193e-05, + "loss": 2.565, + "step": 27361 + }, + { + "epoch": 0.8113750259466833, + "grad_norm": 0.09741716831922531, + "learning_rate": 8.695888090361386e-05, + "loss": 2.5646, + "step": 27362 + }, + { + "epoch": 0.8114046792989947, + "grad_norm": 0.0906304270029068, + "learning_rate": 8.693236718294595e-05, + "loss": 2.5598, + "step": 27363 + }, + { + "epoch": 0.8114343326513063, + "grad_norm": 0.0864669680595398, + "learning_rate": 8.690585712006272e-05, + "loss": 2.5339, + "step": 27364 + }, + { + "epoch": 0.8114639860036177, + "grad_norm": 0.09669236093759537, + "learning_rate": 8.687935071519898e-05, + "loss": 2.5428, + "step": 27365 + }, + { + "epoch": 0.8114936393559292, + "grad_norm": 0.08499164134263992, + "learning_rate": 8.685284796858955e-05, + "loss": 2.552, + "step": 27366 + }, + { + "epoch": 0.8115232927082406, + "grad_norm": 0.09377532452344894, + "learning_rate": 8.682634888046903e-05, + "loss": 2.5707, + "step": 27367 + }, + { + "epoch": 0.8115529460605522, + "grad_norm": 0.0834646075963974, + "learning_rate": 8.679985345107211e-05, + "loss": 2.5332, + "step": 27368 + }, + { + "epoch": 0.8115825994128636, + "grad_norm": 0.09798081964254379, + "learning_rate": 8.677336168063332e-05, + "loss": 2.5321, + "step": 27369 + }, + { + "epoch": 0.8116122527651751, + "grad_norm": 0.08224013447761536, + "learning_rate": 8.674687356938743e-05, + "loss": 2.551, + "step": 27370 + }, + { + "epoch": 0.8116419061174865, + "grad_norm": 0.08912309259176254, + "learning_rate": 8.67203891175689e-05, + "loss": 2.5657, + "step": 27371 + }, + { + "epoch": 0.8116715594697981, + "grad_norm": 0.0912666916847229, + "learning_rate": 8.66939083254123e-05, + "loss": 2.5844, + "step": 27372 + }, + { + "epoch": 0.8117012128221095, + "grad_norm": 0.08211526274681091, + "learning_rate": 8.666743119315218e-05, + "loss": 2.5662, + "step": 27373 + }, + { + "epoch": 0.811730866174421, + "grad_norm": 0.07979489117860794, + "learning_rate": 8.664095772102282e-05, + "loss": 2.5592, + "step": 27374 + }, + { + "epoch": 0.8117605195267324, + "grad_norm": 0.09651858359575272, + "learning_rate": 8.661448790925868e-05, + "loss": 2.5761, + "step": 27375 + }, + { + "epoch": 0.811790172879044, + "grad_norm": 0.08529927581548691, + "learning_rate": 8.658802175809427e-05, + "loss": 2.556, + "step": 27376 + }, + { + "epoch": 0.8118198262313554, + "grad_norm": 0.09428542852401733, + "learning_rate": 8.656155926776383e-05, + "loss": 2.5338, + "step": 27377 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 0.07997696846723557, + "learning_rate": 8.653510043850176e-05, + "loss": 2.5674, + "step": 27378 + }, + { + "epoch": 0.8118791329359785, + "grad_norm": 0.0876702293753624, + "learning_rate": 8.650864527054237e-05, + "loss": 2.5819, + "step": 27379 + }, + { + "epoch": 0.8119087862882899, + "grad_norm": 0.0878354161977768, + "learning_rate": 8.648219376411986e-05, + "loss": 2.5783, + "step": 27380 + }, + { + "epoch": 0.8119384396406014, + "grad_norm": 0.07515079528093338, + "learning_rate": 8.645574591946859e-05, + "loss": 2.5621, + "step": 27381 + }, + { + "epoch": 0.8119680929929128, + "grad_norm": 0.08776384592056274, + "learning_rate": 8.642930173682245e-05, + "loss": 2.5336, + "step": 27382 + }, + { + "epoch": 0.8119977463452244, + "grad_norm": 0.08124780654907227, + "learning_rate": 8.640286121641611e-05, + "loss": 2.5765, + "step": 27383 + }, + { + "epoch": 0.8120273996975358, + "grad_norm": 0.08389297127723694, + "learning_rate": 8.637642435848336e-05, + "loss": 2.5465, + "step": 27384 + }, + { + "epoch": 0.8120570530498473, + "grad_norm": 0.07858417928218842, + "learning_rate": 8.634999116325832e-05, + "loss": 2.5269, + "step": 27385 + }, + { + "epoch": 0.8120867064021587, + "grad_norm": 0.08105406165122986, + "learning_rate": 8.63235616309751e-05, + "loss": 2.5393, + "step": 27386 + }, + { + "epoch": 0.8121163597544703, + "grad_norm": 0.08921210467815399, + "learning_rate": 8.629713576186776e-05, + "loss": 2.5763, + "step": 27387 + }, + { + "epoch": 0.8121460131067817, + "grad_norm": 0.07536692917346954, + "learning_rate": 8.627071355617027e-05, + "loss": 2.5657, + "step": 27388 + }, + { + "epoch": 0.8121756664590932, + "grad_norm": 0.09026975929737091, + "learning_rate": 8.624429501411667e-05, + "loss": 2.5463, + "step": 27389 + }, + { + "epoch": 0.8122053198114046, + "grad_norm": 0.07910200953483582, + "learning_rate": 8.621788013594084e-05, + "loss": 2.5464, + "step": 27390 + }, + { + "epoch": 0.8122349731637162, + "grad_norm": 0.08789621293544769, + "learning_rate": 8.619146892187674e-05, + "loss": 2.5716, + "step": 27391 + }, + { + "epoch": 0.8122646265160276, + "grad_norm": 0.08067602664232254, + "learning_rate": 8.616506137215813e-05, + "loss": 2.5478, + "step": 27392 + }, + { + "epoch": 0.8122942798683391, + "grad_norm": 0.08631215244531631, + "learning_rate": 8.613865748701899e-05, + "loss": 2.5582, + "step": 27393 + }, + { + "epoch": 0.8123239332206506, + "grad_norm": 0.0854489728808403, + "learning_rate": 8.611225726669309e-05, + "loss": 2.5817, + "step": 27394 + }, + { + "epoch": 0.8123535865729621, + "grad_norm": 0.07962467521429062, + "learning_rate": 8.608586071141417e-05, + "loss": 2.5913, + "step": 27395 + }, + { + "epoch": 0.8123832399252735, + "grad_norm": 0.08039887249469757, + "learning_rate": 8.605946782141599e-05, + "loss": 2.5489, + "step": 27396 + }, + { + "epoch": 0.812412893277585, + "grad_norm": 0.1550820767879486, + "learning_rate": 8.603307859693233e-05, + "loss": 2.5857, + "step": 27397 + }, + { + "epoch": 0.8124425466298965, + "grad_norm": 0.08337342739105225, + "learning_rate": 8.600669303819675e-05, + "loss": 2.5698, + "step": 27398 + }, + { + "epoch": 0.812472199982208, + "grad_norm": 0.08496293425559998, + "learning_rate": 8.598031114544303e-05, + "loss": 2.5616, + "step": 27399 + }, + { + "epoch": 0.8125018533345195, + "grad_norm": 0.08250927925109863, + "learning_rate": 8.595393291890463e-05, + "loss": 2.5376, + "step": 27400 + }, + { + "epoch": 0.8125315066868309, + "grad_norm": 0.08217862248420715, + "learning_rate": 8.592755835881527e-05, + "loss": 2.5602, + "step": 27401 + }, + { + "epoch": 0.8125611600391425, + "grad_norm": 0.08562687784433365, + "learning_rate": 8.590118746540847e-05, + "loss": 2.5565, + "step": 27402 + }, + { + "epoch": 0.8125908133914539, + "grad_norm": 0.08391623198986053, + "learning_rate": 8.587482023891773e-05, + "loss": 2.5476, + "step": 27403 + }, + { + "epoch": 0.8126204667437654, + "grad_norm": 0.07837963104248047, + "learning_rate": 8.584845667957653e-05, + "loss": 2.5434, + "step": 27404 + }, + { + "epoch": 0.8126501200960768, + "grad_norm": 0.08964235335588455, + "learning_rate": 8.582209678761837e-05, + "loss": 2.5667, + "step": 27405 + }, + { + "epoch": 0.8126797734483884, + "grad_norm": 0.08281470835208893, + "learning_rate": 8.57957405632766e-05, + "loss": 2.5762, + "step": 27406 + }, + { + "epoch": 0.8127094268006998, + "grad_norm": 0.08346429467201233, + "learning_rate": 8.576938800678474e-05, + "loss": 2.5815, + "step": 27407 + }, + { + "epoch": 0.8127390801530113, + "grad_norm": 0.08204148709774017, + "learning_rate": 8.574303911837589e-05, + "loss": 2.5512, + "step": 27408 + }, + { + "epoch": 0.8127687335053227, + "grad_norm": 0.08511649072170258, + "learning_rate": 8.571669389828358e-05, + "loss": 2.5362, + "step": 27409 + }, + { + "epoch": 0.8127983868576343, + "grad_norm": 0.07902523130178452, + "learning_rate": 8.569035234674105e-05, + "loss": 2.5565, + "step": 27410 + }, + { + "epoch": 0.8128280402099457, + "grad_norm": 0.08530299365520477, + "learning_rate": 8.566401446398165e-05, + "loss": 2.5662, + "step": 27411 + }, + { + "epoch": 0.8128576935622572, + "grad_norm": 0.08079558610916138, + "learning_rate": 8.563768025023844e-05, + "loss": 2.5626, + "step": 27412 + }, + { + "epoch": 0.8128873469145687, + "grad_norm": 0.08317991346120834, + "learning_rate": 8.561134970574474e-05, + "loss": 2.5557, + "step": 27413 + }, + { + "epoch": 0.8129170002668802, + "grad_norm": 0.08218461275100708, + "learning_rate": 8.558502283073366e-05, + "loss": 2.5694, + "step": 27414 + }, + { + "epoch": 0.8129466536191916, + "grad_norm": 0.08190792053937912, + "learning_rate": 8.555869962543834e-05, + "loss": 2.5509, + "step": 27415 + }, + { + "epoch": 0.8129763069715031, + "grad_norm": 0.0841231420636177, + "learning_rate": 8.553238009009184e-05, + "loss": 2.582, + "step": 27416 + }, + { + "epoch": 0.8130059603238146, + "grad_norm": 0.07626636326313019, + "learning_rate": 8.550606422492729e-05, + "loss": 2.5468, + "step": 27417 + }, + { + "epoch": 0.8130356136761261, + "grad_norm": 0.08201707154512405, + "learning_rate": 8.547975203017777e-05, + "loss": 2.5932, + "step": 27418 + }, + { + "epoch": 0.8130652670284375, + "grad_norm": 0.08015192300081253, + "learning_rate": 8.545344350607609e-05, + "loss": 2.5765, + "step": 27419 + }, + { + "epoch": 0.813094920380749, + "grad_norm": 0.08139830827713013, + "learning_rate": 8.542713865285534e-05, + "loss": 2.5523, + "step": 27420 + }, + { + "epoch": 0.8131245737330606, + "grad_norm": 0.07918334752321243, + "learning_rate": 8.540083747074834e-05, + "loss": 2.5626, + "step": 27421 + }, + { + "epoch": 0.813154227085372, + "grad_norm": 0.07987658679485321, + "learning_rate": 8.537453995998818e-05, + "loss": 2.5551, + "step": 27422 + }, + { + "epoch": 0.8131838804376835, + "grad_norm": 0.08511464297771454, + "learning_rate": 8.534824612080766e-05, + "loss": 2.5696, + "step": 27423 + }, + { + "epoch": 0.813213533789995, + "grad_norm": 0.08025214076042175, + "learning_rate": 8.532195595343955e-05, + "loss": 2.5258, + "step": 27424 + }, + { + "epoch": 0.8132431871423065, + "grad_norm": 0.08635443449020386, + "learning_rate": 8.529566945811673e-05, + "loss": 2.5727, + "step": 27425 + }, + { + "epoch": 0.8132728404946179, + "grad_norm": 0.08159037679433823, + "learning_rate": 8.526938663507194e-05, + "loss": 2.5479, + "step": 27426 + }, + { + "epoch": 0.8133024938469294, + "grad_norm": 0.08252428472042084, + "learning_rate": 8.52431074845379e-05, + "loss": 2.557, + "step": 27427 + }, + { + "epoch": 0.8133321471992409, + "grad_norm": 0.08294765651226044, + "learning_rate": 8.521683200674745e-05, + "loss": 2.5836, + "step": 27428 + }, + { + "epoch": 0.8133618005515524, + "grad_norm": 0.08279556781053543, + "learning_rate": 8.519056020193305e-05, + "loss": 2.5835, + "step": 27429 + }, + { + "epoch": 0.8133914539038638, + "grad_norm": 0.08205214142799377, + "learning_rate": 8.516429207032744e-05, + "loss": 2.5425, + "step": 27430 + }, + { + "epoch": 0.8134211072561753, + "grad_norm": 0.08774237334728241, + "learning_rate": 8.513802761216327e-05, + "loss": 2.513, + "step": 27431 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 0.0838257372379303, + "learning_rate": 8.511176682767302e-05, + "loss": 2.498, + "step": 27432 + }, + { + "epoch": 0.8134804139607983, + "grad_norm": 0.08146971464157104, + "learning_rate": 8.508550971708929e-05, + "loss": 2.5267, + "step": 27433 + }, + { + "epoch": 0.8135100673131097, + "grad_norm": 0.08502914011478424, + "learning_rate": 8.505925628064448e-05, + "loss": 2.546, + "step": 27434 + }, + { + "epoch": 0.8135397206654212, + "grad_norm": 0.08159635961055756, + "learning_rate": 8.503300651857132e-05, + "loss": 2.5567, + "step": 27435 + }, + { + "epoch": 0.8135693740177327, + "grad_norm": 0.08519547432661057, + "learning_rate": 8.500676043110211e-05, + "loss": 2.5593, + "step": 27436 + }, + { + "epoch": 0.8135990273700442, + "grad_norm": 0.08058910071849823, + "learning_rate": 8.498051801846923e-05, + "loss": 2.5679, + "step": 27437 + }, + { + "epoch": 0.8136286807223556, + "grad_norm": 0.08090193569660187, + "learning_rate": 8.495427928090515e-05, + "loss": 2.5683, + "step": 27438 + }, + { + "epoch": 0.8136583340746671, + "grad_norm": 0.08517172932624817, + "learning_rate": 8.492804421864225e-05, + "loss": 2.5866, + "step": 27439 + }, + { + "epoch": 0.8136879874269786, + "grad_norm": 0.0770314633846283, + "learning_rate": 8.490181283191268e-05, + "loss": 2.5526, + "step": 27440 + }, + { + "epoch": 0.8137176407792901, + "grad_norm": 0.08617235720157623, + "learning_rate": 8.487558512094878e-05, + "loss": 2.5415, + "step": 27441 + }, + { + "epoch": 0.8137472941316016, + "grad_norm": 0.08413047343492508, + "learning_rate": 8.484936108598285e-05, + "loss": 2.5658, + "step": 27442 + }, + { + "epoch": 0.813776947483913, + "grad_norm": 0.08223512023687363, + "learning_rate": 8.482314072724706e-05, + "loss": 2.5965, + "step": 27443 + }, + { + "epoch": 0.8138066008362246, + "grad_norm": 0.08311478048563004, + "learning_rate": 8.479692404497363e-05, + "loss": 2.5486, + "step": 27444 + }, + { + "epoch": 0.813836254188536, + "grad_norm": 0.0836019366979599, + "learning_rate": 8.477071103939471e-05, + "loss": 2.5256, + "step": 27445 + }, + { + "epoch": 0.8138659075408475, + "grad_norm": 0.07914748787879944, + "learning_rate": 8.474450171074244e-05, + "loss": 2.534, + "step": 27446 + }, + { + "epoch": 0.813895560893159, + "grad_norm": 0.08324021100997925, + "learning_rate": 8.471829605924874e-05, + "loss": 2.5619, + "step": 27447 + }, + { + "epoch": 0.8139252142454705, + "grad_norm": 0.0799989402294159, + "learning_rate": 8.469209408514595e-05, + "loss": 2.5476, + "step": 27448 + }, + { + "epoch": 0.8139548675977819, + "grad_norm": 0.08376419544219971, + "learning_rate": 8.466589578866607e-05, + "loss": 2.5833, + "step": 27449 + }, + { + "epoch": 0.8139845209500934, + "grad_norm": 0.0817282646894455, + "learning_rate": 8.463970117004083e-05, + "loss": 2.5731, + "step": 27450 + }, + { + "epoch": 0.8140141743024049, + "grad_norm": 0.08166518062353134, + "learning_rate": 8.461351022950236e-05, + "loss": 2.557, + "step": 27451 + }, + { + "epoch": 0.8140438276547164, + "grad_norm": 0.087864950299263, + "learning_rate": 8.458732296728255e-05, + "loss": 2.5766, + "step": 27452 + }, + { + "epoch": 0.8140734810070278, + "grad_norm": 0.07802750915288925, + "learning_rate": 8.456113938361326e-05, + "loss": 2.5213, + "step": 27453 + }, + { + "epoch": 0.8141031343593393, + "grad_norm": 0.08726304769515991, + "learning_rate": 8.453495947872641e-05, + "loss": 2.5642, + "step": 27454 + }, + { + "epoch": 0.8141327877116508, + "grad_norm": 0.09315840154886246, + "learning_rate": 8.450878325285382e-05, + "loss": 2.5504, + "step": 27455 + }, + { + "epoch": 0.8141624410639623, + "grad_norm": 0.08556415140628815, + "learning_rate": 8.448261070622731e-05, + "loss": 2.5617, + "step": 27456 + }, + { + "epoch": 0.8141920944162737, + "grad_norm": 0.08940945565700531, + "learning_rate": 8.445644183907858e-05, + "loss": 2.5468, + "step": 27457 + }, + { + "epoch": 0.8142217477685852, + "grad_norm": 0.09034908562898636, + "learning_rate": 8.443027665163938e-05, + "loss": 2.6099, + "step": 27458 + }, + { + "epoch": 0.8142514011208967, + "grad_norm": 0.08758231997489929, + "learning_rate": 8.440411514414137e-05, + "loss": 2.5292, + "step": 27459 + }, + { + "epoch": 0.8142810544732082, + "grad_norm": 0.08880516141653061, + "learning_rate": 8.43779573168163e-05, + "loss": 2.5049, + "step": 27460 + }, + { + "epoch": 0.8143107078255196, + "grad_norm": 0.08069854974746704, + "learning_rate": 8.435180316989576e-05, + "loss": 2.5436, + "step": 27461 + }, + { + "epoch": 0.8143403611778312, + "grad_norm": 0.0913977101445198, + "learning_rate": 8.432565270361131e-05, + "loss": 2.5624, + "step": 27462 + }, + { + "epoch": 0.8143700145301427, + "grad_norm": 0.08281628042459488, + "learning_rate": 8.429950591819463e-05, + "loss": 2.6007, + "step": 27463 + }, + { + "epoch": 0.8143996678824541, + "grad_norm": 0.08942483365535736, + "learning_rate": 8.42733628138771e-05, + "loss": 2.5268, + "step": 27464 + }, + { + "epoch": 0.8144293212347656, + "grad_norm": 0.08484049886465073, + "learning_rate": 8.424722339089036e-05, + "loss": 2.5698, + "step": 27465 + }, + { + "epoch": 0.8144589745870771, + "grad_norm": 0.07974127680063248, + "learning_rate": 8.422108764946579e-05, + "loss": 2.5267, + "step": 27466 + }, + { + "epoch": 0.8144886279393886, + "grad_norm": 0.0864025428891182, + "learning_rate": 8.419495558983487e-05, + "loss": 2.5543, + "step": 27467 + }, + { + "epoch": 0.8145182812917, + "grad_norm": 0.08327249437570572, + "learning_rate": 8.416882721222896e-05, + "loss": 2.5537, + "step": 27468 + }, + { + "epoch": 0.8145479346440115, + "grad_norm": 0.07996539771556854, + "learning_rate": 8.414270251687945e-05, + "loss": 2.5586, + "step": 27469 + }, + { + "epoch": 0.814577587996323, + "grad_norm": 0.08078528195619583, + "learning_rate": 8.411658150401774e-05, + "loss": 2.5261, + "step": 27470 + }, + { + "epoch": 0.8146072413486345, + "grad_norm": 0.0821303054690361, + "learning_rate": 8.409046417387505e-05, + "loss": 2.5687, + "step": 27471 + }, + { + "epoch": 0.8146368947009459, + "grad_norm": 0.09180691093206406, + "learning_rate": 8.406435052668271e-05, + "loss": 2.5926, + "step": 27472 + }, + { + "epoch": 0.8146665480532574, + "grad_norm": 0.0851016417145729, + "learning_rate": 8.403824056267195e-05, + "loss": 2.5346, + "step": 27473 + }, + { + "epoch": 0.8146962014055689, + "grad_norm": 0.08481957763433456, + "learning_rate": 8.401213428207394e-05, + "loss": 2.58, + "step": 27474 + }, + { + "epoch": 0.8147258547578804, + "grad_norm": 0.10035242140293121, + "learning_rate": 8.398603168511992e-05, + "loss": 2.5796, + "step": 27475 + }, + { + "epoch": 0.8147555081101918, + "grad_norm": 0.08006682991981506, + "learning_rate": 8.395993277204095e-05, + "loss": 2.5599, + "step": 27476 + }, + { + "epoch": 0.8147851614625033, + "grad_norm": 0.08943682909011841, + "learning_rate": 8.393383754306821e-05, + "loss": 2.5713, + "step": 27477 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.08880888670682907, + "learning_rate": 8.390774599843282e-05, + "loss": 2.549, + "step": 27478 + }, + { + "epoch": 0.8148444681671263, + "grad_norm": 0.0815349742770195, + "learning_rate": 8.388165813836568e-05, + "loss": 2.5374, + "step": 27479 + }, + { + "epoch": 0.8148741215194377, + "grad_norm": 0.10394613444805145, + "learning_rate": 8.385557396309789e-05, + "loss": 2.553, + "step": 27480 + }, + { + "epoch": 0.8149037748717493, + "grad_norm": 0.08321291953325272, + "learning_rate": 8.382949347286045e-05, + "loss": 2.5743, + "step": 27481 + }, + { + "epoch": 0.8149334282240608, + "grad_norm": 0.08926641196012497, + "learning_rate": 8.380341666788427e-05, + "loss": 2.5488, + "step": 27482 + }, + { + "epoch": 0.8149630815763722, + "grad_norm": 0.09566277265548706, + "learning_rate": 8.377734354840039e-05, + "loss": 2.5971, + "step": 27483 + }, + { + "epoch": 0.8149927349286837, + "grad_norm": 0.0880020260810852, + "learning_rate": 8.375127411463945e-05, + "loss": 2.5298, + "step": 27484 + }, + { + "epoch": 0.8150223882809952, + "grad_norm": 0.08939067274332047, + "learning_rate": 8.372520836683244e-05, + "loss": 2.5528, + "step": 27485 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 0.09026680886745453, + "learning_rate": 8.369914630521003e-05, + "loss": 2.5803, + "step": 27486 + }, + { + "epoch": 0.8150816949856181, + "grad_norm": 0.08762539178133011, + "learning_rate": 8.367308793000328e-05, + "loss": 2.5565, + "step": 27487 + }, + { + "epoch": 0.8151113483379296, + "grad_norm": 0.08884136378765106, + "learning_rate": 8.36470332414428e-05, + "loss": 2.5405, + "step": 27488 + }, + { + "epoch": 0.8151410016902411, + "grad_norm": 0.08815746754407883, + "learning_rate": 8.362098223975928e-05, + "loss": 2.5655, + "step": 27489 + }, + { + "epoch": 0.8151706550425526, + "grad_norm": 0.09962418675422668, + "learning_rate": 8.359493492518344e-05, + "loss": 2.5829, + "step": 27490 + }, + { + "epoch": 0.815200308394864, + "grad_norm": 0.08185303211212158, + "learning_rate": 8.356889129794593e-05, + "loss": 2.5435, + "step": 27491 + }, + { + "epoch": 0.8152299617471755, + "grad_norm": 0.09119881689548492, + "learning_rate": 8.354285135827733e-05, + "loss": 2.546, + "step": 27492 + }, + { + "epoch": 0.815259615099487, + "grad_norm": 0.08874782174825668, + "learning_rate": 8.351681510640829e-05, + "loss": 2.5528, + "step": 27493 + }, + { + "epoch": 0.8152892684517985, + "grad_norm": 0.08714999258518219, + "learning_rate": 8.349078254256948e-05, + "loss": 2.5668, + "step": 27494 + }, + { + "epoch": 0.8153189218041099, + "grad_norm": 0.08921507000923157, + "learning_rate": 8.346475366699119e-05, + "loss": 2.5388, + "step": 27495 + }, + { + "epoch": 0.8153485751564215, + "grad_norm": 0.0794229805469513, + "learning_rate": 8.343872847990392e-05, + "loss": 2.5419, + "step": 27496 + }, + { + "epoch": 0.8153782285087329, + "grad_norm": 0.08366136252880096, + "learning_rate": 8.34127069815383e-05, + "loss": 2.5369, + "step": 27497 + }, + { + "epoch": 0.8154078818610444, + "grad_norm": 0.08722782880067825, + "learning_rate": 8.33866891721246e-05, + "loss": 2.5554, + "step": 27498 + }, + { + "epoch": 0.8154375352133558, + "grad_norm": 0.0804109126329422, + "learning_rate": 8.336067505189316e-05, + "loss": 2.5519, + "step": 27499 + }, + { + "epoch": 0.8154671885656674, + "grad_norm": 0.08163207769393921, + "learning_rate": 8.333466462107465e-05, + "loss": 2.5693, + "step": 27500 + }, + { + "epoch": 0.8154968419179788, + "grad_norm": 0.08997726440429688, + "learning_rate": 8.330865787989911e-05, + "loss": 2.5798, + "step": 27501 + }, + { + "epoch": 0.8155264952702903, + "grad_norm": 0.08565358072519302, + "learning_rate": 8.328265482859698e-05, + "loss": 2.5525, + "step": 27502 + }, + { + "epoch": 0.8155561486226018, + "grad_norm": 0.0866776630282402, + "learning_rate": 8.325665546739846e-05, + "loss": 2.58, + "step": 27503 + }, + { + "epoch": 0.8155858019749133, + "grad_norm": 0.08136318624019623, + "learning_rate": 8.323065979653394e-05, + "loss": 2.5492, + "step": 27504 + }, + { + "epoch": 0.8156154553272248, + "grad_norm": 0.10700005292892456, + "learning_rate": 8.320466781623331e-05, + "loss": 2.5559, + "step": 27505 + }, + { + "epoch": 0.8156451086795362, + "grad_norm": 0.0757804736495018, + "learning_rate": 8.317867952672686e-05, + "loss": 2.5696, + "step": 27506 + }, + { + "epoch": 0.8156747620318477, + "grad_norm": 0.0978534072637558, + "learning_rate": 8.315269492824478e-05, + "loss": 2.57, + "step": 27507 + }, + { + "epoch": 0.8157044153841592, + "grad_norm": 0.07890839874744415, + "learning_rate": 8.312671402101717e-05, + "loss": 2.5763, + "step": 27508 + }, + { + "epoch": 0.8157340687364707, + "grad_norm": 0.08614323288202286, + "learning_rate": 8.310073680527402e-05, + "loss": 2.5353, + "step": 27509 + }, + { + "epoch": 0.8157637220887821, + "grad_norm": 0.08098002523183823, + "learning_rate": 8.307476328124542e-05, + "loss": 2.5216, + "step": 27510 + }, + { + "epoch": 0.8157933754410936, + "grad_norm": 0.08308549970388412, + "learning_rate": 8.304879344916139e-05, + "loss": 2.5436, + "step": 27511 + }, + { + "epoch": 0.8158230287934051, + "grad_norm": 0.08342941850423813, + "learning_rate": 8.30228273092517e-05, + "loss": 2.5118, + "step": 27512 + }, + { + "epoch": 0.8158526821457166, + "grad_norm": 0.0877772867679596, + "learning_rate": 8.299686486174657e-05, + "loss": 2.5603, + "step": 27513 + }, + { + "epoch": 0.815882335498028, + "grad_norm": 0.09828522801399231, + "learning_rate": 8.29709061068758e-05, + "loss": 2.5452, + "step": 27514 + }, + { + "epoch": 0.8159119888503396, + "grad_norm": 0.08829618990421295, + "learning_rate": 8.294495104486932e-05, + "loss": 2.5471, + "step": 27515 + }, + { + "epoch": 0.815941642202651, + "grad_norm": 0.08881532400846481, + "learning_rate": 8.291899967595678e-05, + "loss": 2.5467, + "step": 27516 + }, + { + "epoch": 0.8159712955549625, + "grad_norm": 0.08097191900014877, + "learning_rate": 8.289305200036812e-05, + "loss": 2.5615, + "step": 27517 + }, + { + "epoch": 0.8160009489072739, + "grad_norm": 0.09000236541032791, + "learning_rate": 8.286710801833303e-05, + "loss": 2.5728, + "step": 27518 + }, + { + "epoch": 0.8160306022595855, + "grad_norm": 0.08057336509227753, + "learning_rate": 8.284116773008132e-05, + "loss": 2.5643, + "step": 27519 + }, + { + "epoch": 0.8160602556118969, + "grad_norm": 0.0821133479475975, + "learning_rate": 8.281523113584272e-05, + "loss": 2.5134, + "step": 27520 + }, + { + "epoch": 0.8160899089642084, + "grad_norm": 0.07978269457817078, + "learning_rate": 8.27892982358468e-05, + "loss": 2.5493, + "step": 27521 + }, + { + "epoch": 0.8161195623165198, + "grad_norm": 0.08712659776210785, + "learning_rate": 8.276336903032327e-05, + "loss": 2.5378, + "step": 27522 + }, + { + "epoch": 0.8161492156688314, + "grad_norm": 0.08511599898338318, + "learning_rate": 8.273744351950174e-05, + "loss": 2.5499, + "step": 27523 + }, + { + "epoch": 0.8161788690211429, + "grad_norm": 0.07697834074497223, + "learning_rate": 8.271152170361174e-05, + "loss": 2.586, + "step": 27524 + }, + { + "epoch": 0.8162085223734543, + "grad_norm": 0.07981076091527939, + "learning_rate": 8.268560358288285e-05, + "loss": 2.5565, + "step": 27525 + }, + { + "epoch": 0.8162381757257658, + "grad_norm": 0.07951851934194565, + "learning_rate": 8.265968915754463e-05, + "loss": 2.5742, + "step": 27526 + }, + { + "epoch": 0.8162678290780773, + "grad_norm": 0.0839327946305275, + "learning_rate": 8.263377842782644e-05, + "loss": 2.5317, + "step": 27527 + }, + { + "epoch": 0.8162974824303888, + "grad_norm": 0.07923818379640579, + "learning_rate": 8.260787139395775e-05, + "loss": 2.5403, + "step": 27528 + }, + { + "epoch": 0.8163271357827002, + "grad_norm": 0.07968898117542267, + "learning_rate": 8.258196805616808e-05, + "loss": 2.5309, + "step": 27529 + }, + { + "epoch": 0.8163567891350118, + "grad_norm": 0.08172553777694702, + "learning_rate": 8.255606841468672e-05, + "loss": 2.5728, + "step": 27530 + }, + { + "epoch": 0.8163864424873232, + "grad_norm": 0.07831957191228867, + "learning_rate": 8.253017246974298e-05, + "loss": 2.5399, + "step": 27531 + }, + { + "epoch": 0.8164160958396347, + "grad_norm": 0.08066461235284805, + "learning_rate": 8.250428022156626e-05, + "loss": 2.5516, + "step": 27532 + }, + { + "epoch": 0.8164457491919461, + "grad_norm": 0.0776297077536583, + "learning_rate": 8.247839167038579e-05, + "loss": 2.5349, + "step": 27533 + }, + { + "epoch": 0.8164754025442577, + "grad_norm": 0.08042863756418228, + "learning_rate": 8.24525068164308e-05, + "loss": 2.5632, + "step": 27534 + }, + { + "epoch": 0.8165050558965691, + "grad_norm": 0.08358512073755264, + "learning_rate": 8.242662565993059e-05, + "loss": 2.5936, + "step": 27535 + }, + { + "epoch": 0.8165347092488806, + "grad_norm": 0.08274545520544052, + "learning_rate": 8.240074820111421e-05, + "loss": 2.5598, + "step": 27536 + }, + { + "epoch": 0.816564362601192, + "grad_norm": 0.08388868719339371, + "learning_rate": 8.237487444021096e-05, + "loss": 2.5449, + "step": 27537 + }, + { + "epoch": 0.8165940159535036, + "grad_norm": 0.08053132891654968, + "learning_rate": 8.234900437744985e-05, + "loss": 2.562, + "step": 27538 + }, + { + "epoch": 0.816623669305815, + "grad_norm": 0.0800044909119606, + "learning_rate": 8.232313801305996e-05, + "loss": 2.5867, + "step": 27539 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 0.08775138109922409, + "learning_rate": 8.229727534727044e-05, + "loss": 2.5515, + "step": 27540 + }, + { + "epoch": 0.8166829760104379, + "grad_norm": 0.0756964311003685, + "learning_rate": 8.227141638031022e-05, + "loss": 2.5495, + "step": 27541 + }, + { + "epoch": 0.8167126293627495, + "grad_norm": 0.08151186257600784, + "learning_rate": 8.224556111240827e-05, + "loss": 2.5458, + "step": 27542 + }, + { + "epoch": 0.8167422827150609, + "grad_norm": 0.08147142827510834, + "learning_rate": 8.221970954379365e-05, + "loss": 2.5457, + "step": 27543 + }, + { + "epoch": 0.8167719360673724, + "grad_norm": 0.08698852360248566, + "learning_rate": 8.219386167469517e-05, + "loss": 2.5698, + "step": 27544 + }, + { + "epoch": 0.816801589419684, + "grad_norm": 0.08726026117801666, + "learning_rate": 8.216801750534176e-05, + "loss": 2.5373, + "step": 27545 + }, + { + "epoch": 0.8168312427719954, + "grad_norm": 0.09251682460308075, + "learning_rate": 8.214217703596228e-05, + "loss": 2.5617, + "step": 27546 + }, + { + "epoch": 0.8168608961243069, + "grad_norm": 0.08842673152685165, + "learning_rate": 8.211634026678554e-05, + "loss": 2.5572, + "step": 27547 + }, + { + "epoch": 0.8168905494766183, + "grad_norm": 0.08638539165258408, + "learning_rate": 8.20905071980404e-05, + "loss": 2.5283, + "step": 27548 + }, + { + "epoch": 0.8169202028289299, + "grad_norm": 0.08965624868869781, + "learning_rate": 8.206467782995558e-05, + "loss": 2.5617, + "step": 27549 + }, + { + "epoch": 0.8169498561812413, + "grad_norm": 0.09126310795545578, + "learning_rate": 8.203885216275958e-05, + "loss": 2.5639, + "step": 27550 + }, + { + "epoch": 0.8169795095335528, + "grad_norm": 0.08694038540124893, + "learning_rate": 8.201303019668143e-05, + "loss": 2.563, + "step": 27551 + }, + { + "epoch": 0.8170091628858642, + "grad_norm": 0.08951418846845627, + "learning_rate": 8.198721193194964e-05, + "loss": 2.5467, + "step": 27552 + }, + { + "epoch": 0.8170388162381758, + "grad_norm": 0.08720213174819946, + "learning_rate": 8.196139736879276e-05, + "loss": 2.5484, + "step": 27553 + }, + { + "epoch": 0.8170684695904872, + "grad_norm": 0.08257801085710526, + "learning_rate": 8.193558650743955e-05, + "loss": 2.5557, + "step": 27554 + }, + { + "epoch": 0.8170981229427987, + "grad_norm": 0.08281321078538895, + "learning_rate": 8.190977934811843e-05, + "loss": 2.5585, + "step": 27555 + }, + { + "epoch": 0.8171277762951101, + "grad_norm": 0.08630039542913437, + "learning_rate": 8.1883975891058e-05, + "loss": 2.5304, + "step": 27556 + }, + { + "epoch": 0.8171574296474217, + "grad_norm": 0.08825412392616272, + "learning_rate": 8.185817613648672e-05, + "loss": 2.5485, + "step": 27557 + }, + { + "epoch": 0.8171870829997331, + "grad_norm": 0.080286905169487, + "learning_rate": 8.183238008463312e-05, + "loss": 2.5355, + "step": 27558 + }, + { + "epoch": 0.8172167363520446, + "grad_norm": 0.09250859916210175, + "learning_rate": 8.180658773572563e-05, + "loss": 2.564, + "step": 27559 + }, + { + "epoch": 0.817246389704356, + "grad_norm": 0.08904929459095001, + "learning_rate": 8.178079908999248e-05, + "loss": 2.5296, + "step": 27560 + }, + { + "epoch": 0.8172760430566676, + "grad_norm": 0.09656061977148056, + "learning_rate": 8.175501414766212e-05, + "loss": 2.6029, + "step": 27561 + }, + { + "epoch": 0.817305696408979, + "grad_norm": 0.08135049790143967, + "learning_rate": 8.172923290896295e-05, + "loss": 2.543, + "step": 27562 + }, + { + "epoch": 0.8173353497612905, + "grad_norm": 0.08742361515760422, + "learning_rate": 8.170345537412305e-05, + "loss": 2.5501, + "step": 27563 + }, + { + "epoch": 0.8173650031136019, + "grad_norm": 0.08048617094755173, + "learning_rate": 8.167768154337101e-05, + "loss": 2.5642, + "step": 27564 + }, + { + "epoch": 0.8173946564659135, + "grad_norm": 0.0899108499288559, + "learning_rate": 8.165191141693489e-05, + "loss": 2.5669, + "step": 27565 + }, + { + "epoch": 0.817424309818225, + "grad_norm": 0.08423932641744614, + "learning_rate": 8.162614499504289e-05, + "loss": 2.5978, + "step": 27566 + }, + { + "epoch": 0.8174539631705364, + "grad_norm": 0.08683323115110397, + "learning_rate": 8.160038227792322e-05, + "loss": 2.541, + "step": 27567 + }, + { + "epoch": 0.817483616522848, + "grad_norm": 0.0816381648182869, + "learning_rate": 8.157462326580395e-05, + "loss": 2.5022, + "step": 27568 + }, + { + "epoch": 0.8175132698751594, + "grad_norm": 0.09193814545869827, + "learning_rate": 8.15488679589132e-05, + "loss": 2.5329, + "step": 27569 + }, + { + "epoch": 0.8175429232274709, + "grad_norm": 0.0853496566414833, + "learning_rate": 8.152311635747922e-05, + "loss": 2.5427, + "step": 27570 + }, + { + "epoch": 0.8175725765797823, + "grad_norm": 0.09651993215084076, + "learning_rate": 8.149736846172972e-05, + "loss": 2.5613, + "step": 27571 + }, + { + "epoch": 0.8176022299320939, + "grad_norm": 0.09111884981393814, + "learning_rate": 8.147162427189287e-05, + "loss": 2.5627, + "step": 27572 + }, + { + "epoch": 0.8176318832844053, + "grad_norm": 0.08589036762714386, + "learning_rate": 8.144588378819661e-05, + "loss": 2.5452, + "step": 27573 + }, + { + "epoch": 0.8176615366367168, + "grad_norm": 0.0914936289191246, + "learning_rate": 8.142014701086892e-05, + "loss": 2.5669, + "step": 27574 + }, + { + "epoch": 0.8176911899890282, + "grad_norm": 0.08544456213712692, + "learning_rate": 8.139441394013769e-05, + "loss": 2.5216, + "step": 27575 + }, + { + "epoch": 0.8177208433413398, + "grad_norm": 0.09236156195402145, + "learning_rate": 8.13686845762306e-05, + "loss": 2.5507, + "step": 27576 + }, + { + "epoch": 0.8177504966936512, + "grad_norm": 0.07834504544734955, + "learning_rate": 8.134295891937582e-05, + "loss": 2.5625, + "step": 27577 + }, + { + "epoch": 0.8177801500459627, + "grad_norm": 0.09498057514429092, + "learning_rate": 8.131723696980098e-05, + "loss": 2.5625, + "step": 27578 + }, + { + "epoch": 0.8178098033982741, + "grad_norm": 0.08752096444368362, + "learning_rate": 8.129151872773388e-05, + "loss": 2.5122, + "step": 27579 + }, + { + "epoch": 0.8178394567505857, + "grad_norm": 0.08255764842033386, + "learning_rate": 8.12658041934024e-05, + "loss": 2.568, + "step": 27580 + }, + { + "epoch": 0.8178691101028971, + "grad_norm": 0.0875399112701416, + "learning_rate": 8.124009336703397e-05, + "loss": 2.5412, + "step": 27581 + }, + { + "epoch": 0.8178987634552086, + "grad_norm": 0.08189629763364792, + "learning_rate": 8.121438624885636e-05, + "loss": 2.5321, + "step": 27582 + }, + { + "epoch": 0.81792841680752, + "grad_norm": 0.08790723979473114, + "learning_rate": 8.11886828390973e-05, + "loss": 2.5565, + "step": 27583 + }, + { + "epoch": 0.8179580701598316, + "grad_norm": 0.07927638292312622, + "learning_rate": 8.116298313798432e-05, + "loss": 2.5409, + "step": 27584 + }, + { + "epoch": 0.817987723512143, + "grad_norm": 0.09147040545940399, + "learning_rate": 8.113728714574497e-05, + "loss": 2.5795, + "step": 27585 + }, + { + "epoch": 0.8180173768644545, + "grad_norm": 0.07814186066389084, + "learning_rate": 8.111159486260689e-05, + "loss": 2.5324, + "step": 27586 + }, + { + "epoch": 0.8180470302167661, + "grad_norm": 0.09404870867729187, + "learning_rate": 8.108590628879752e-05, + "loss": 2.5243, + "step": 27587 + }, + { + "epoch": 0.8180766835690775, + "grad_norm": 0.08549356460571289, + "learning_rate": 8.106022142454434e-05, + "loss": 2.5465, + "step": 27588 + }, + { + "epoch": 0.818106336921389, + "grad_norm": 0.07885143905878067, + "learning_rate": 8.103454027007473e-05, + "loss": 2.5087, + "step": 27589 + }, + { + "epoch": 0.8181359902737004, + "grad_norm": 0.09390407800674438, + "learning_rate": 8.100886282561626e-05, + "loss": 2.5716, + "step": 27590 + }, + { + "epoch": 0.818165643626012, + "grad_norm": 0.08381808549165726, + "learning_rate": 8.098318909139634e-05, + "loss": 2.5253, + "step": 27591 + }, + { + "epoch": 0.8181952969783234, + "grad_norm": 0.08602749556303024, + "learning_rate": 8.095751906764214e-05, + "loss": 2.5593, + "step": 27592 + }, + { + "epoch": 0.8182249503306349, + "grad_norm": 0.08913461863994598, + "learning_rate": 8.093185275458098e-05, + "loss": 2.5344, + "step": 27593 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 0.08755827695131302, + "learning_rate": 8.090619015244022e-05, + "loss": 2.5481, + "step": 27594 + }, + { + "epoch": 0.8182842570352579, + "grad_norm": 0.08749818056821823, + "learning_rate": 8.088053126144712e-05, + "loss": 2.5237, + "step": 27595 + }, + { + "epoch": 0.8183139103875693, + "grad_norm": 0.09654494374990463, + "learning_rate": 8.085487608182878e-05, + "loss": 2.5669, + "step": 27596 + }, + { + "epoch": 0.8183435637398808, + "grad_norm": 0.09065999835729599, + "learning_rate": 8.082922461381253e-05, + "loss": 2.5607, + "step": 27597 + }, + { + "epoch": 0.8183732170921922, + "grad_norm": 0.08332441747188568, + "learning_rate": 8.080357685762541e-05, + "loss": 2.5388, + "step": 27598 + }, + { + "epoch": 0.8184028704445038, + "grad_norm": 0.09211690723896027, + "learning_rate": 8.077793281349461e-05, + "loss": 2.5475, + "step": 27599 + }, + { + "epoch": 0.8184325237968152, + "grad_norm": 0.07411779463291168, + "learning_rate": 8.075229248164711e-05, + "loss": 2.5235, + "step": 27600 + }, + { + "epoch": 0.8184621771491267, + "grad_norm": 0.08982135355472565, + "learning_rate": 8.072665586231004e-05, + "loss": 2.5537, + "step": 27601 + }, + { + "epoch": 0.8184918305014381, + "grad_norm": 0.08691553771495819, + "learning_rate": 8.070102295571041e-05, + "loss": 2.5531, + "step": 27602 + }, + { + "epoch": 0.8185214838537497, + "grad_norm": 0.08900110423564911, + "learning_rate": 8.067539376207523e-05, + "loss": 2.5454, + "step": 27603 + }, + { + "epoch": 0.8185511372060611, + "grad_norm": 0.08472555875778198, + "learning_rate": 8.064976828163134e-05, + "loss": 2.5472, + "step": 27604 + }, + { + "epoch": 0.8185807905583726, + "grad_norm": 0.08102656155824661, + "learning_rate": 8.06241465146058e-05, + "loss": 2.5464, + "step": 27605 + }, + { + "epoch": 0.818610443910684, + "grad_norm": 0.08845008909702301, + "learning_rate": 8.05985284612254e-05, + "loss": 2.5701, + "step": 27606 + }, + { + "epoch": 0.8186400972629956, + "grad_norm": 0.08048375695943832, + "learning_rate": 8.057291412171703e-05, + "loss": 2.5354, + "step": 27607 + }, + { + "epoch": 0.8186697506153071, + "grad_norm": 0.08045004308223724, + "learning_rate": 8.054730349630746e-05, + "loss": 2.5295, + "step": 27608 + }, + { + "epoch": 0.8186994039676185, + "grad_norm": 0.0826822966337204, + "learning_rate": 8.052169658522357e-05, + "loss": 2.5332, + "step": 27609 + }, + { + "epoch": 0.8187290573199301, + "grad_norm": 0.07754850387573242, + "learning_rate": 8.049609338869201e-05, + "loss": 2.5773, + "step": 27610 + }, + { + "epoch": 0.8187587106722415, + "grad_norm": 0.08360616117715836, + "learning_rate": 8.047049390693955e-05, + "loss": 2.5172, + "step": 27611 + }, + { + "epoch": 0.818788364024553, + "grad_norm": 0.08607473969459534, + "learning_rate": 8.04448981401929e-05, + "loss": 2.5575, + "step": 27612 + }, + { + "epoch": 0.8188180173768644, + "grad_norm": 0.08259433507919312, + "learning_rate": 8.04193060886787e-05, + "loss": 2.5717, + "step": 27613 + }, + { + "epoch": 0.818847670729176, + "grad_norm": 0.08156022429466248, + "learning_rate": 8.039371775262372e-05, + "loss": 2.5302, + "step": 27614 + }, + { + "epoch": 0.8188773240814874, + "grad_norm": 0.08491954207420349, + "learning_rate": 8.036813313225411e-05, + "loss": 2.5493, + "step": 27615 + }, + { + "epoch": 0.8189069774337989, + "grad_norm": 0.084518663585186, + "learning_rate": 8.03425522277968e-05, + "loss": 2.5581, + "step": 27616 + }, + { + "epoch": 0.8189366307861103, + "grad_norm": 0.0773031935095787, + "learning_rate": 8.031697503947827e-05, + "loss": 2.527, + "step": 27617 + }, + { + "epoch": 0.8189662841384219, + "grad_norm": 0.0899886041879654, + "learning_rate": 8.029140156752495e-05, + "loss": 2.538, + "step": 27618 + }, + { + "epoch": 0.8189959374907333, + "grad_norm": 0.08268292248249054, + "learning_rate": 8.026583181216329e-05, + "loss": 2.5592, + "step": 27619 + }, + { + "epoch": 0.8190255908430448, + "grad_norm": 0.08728715032339096, + "learning_rate": 8.024026577361976e-05, + "loss": 2.5559, + "step": 27620 + }, + { + "epoch": 0.8190552441953562, + "grad_norm": 0.08623746782541275, + "learning_rate": 8.021470345212073e-05, + "loss": 2.5523, + "step": 27621 + }, + { + "epoch": 0.8190848975476678, + "grad_norm": 0.08453096449375153, + "learning_rate": 8.018914484789252e-05, + "loss": 2.5323, + "step": 27622 + }, + { + "epoch": 0.8191145508999792, + "grad_norm": 0.08969195932149887, + "learning_rate": 8.016358996116157e-05, + "loss": 2.5526, + "step": 27623 + }, + { + "epoch": 0.8191442042522907, + "grad_norm": 0.0834379717707634, + "learning_rate": 8.013803879215403e-05, + "loss": 2.5508, + "step": 27624 + }, + { + "epoch": 0.8191738576046022, + "grad_norm": 0.0878918245434761, + "learning_rate": 8.011249134109638e-05, + "loss": 2.5337, + "step": 27625 + }, + { + "epoch": 0.8192035109569137, + "grad_norm": 0.08450216054916382, + "learning_rate": 8.008694760821456e-05, + "loss": 2.5599, + "step": 27626 + }, + { + "epoch": 0.8192331643092251, + "grad_norm": 0.08957945555448532, + "learning_rate": 8.006140759373486e-05, + "loss": 2.529, + "step": 27627 + }, + { + "epoch": 0.8192628176615366, + "grad_norm": 0.08326572924852371, + "learning_rate": 8.003587129788337e-05, + "loss": 2.5851, + "step": 27628 + }, + { + "epoch": 0.8192924710138482, + "grad_norm": 0.09279988706111908, + "learning_rate": 8.001033872088648e-05, + "loss": 2.547, + "step": 27629 + }, + { + "epoch": 0.8193221243661596, + "grad_norm": 0.09000565111637115, + "learning_rate": 7.998480986297013e-05, + "loss": 2.54, + "step": 27630 + }, + { + "epoch": 0.8193517777184711, + "grad_norm": 0.0834898129105568, + "learning_rate": 7.995928472436037e-05, + "loss": 2.5345, + "step": 27631 + }, + { + "epoch": 0.8193814310707825, + "grad_norm": 0.08023115247488022, + "learning_rate": 7.993376330528323e-05, + "loss": 2.5703, + "step": 27632 + }, + { + "epoch": 0.8194110844230941, + "grad_norm": 0.08564569056034088, + "learning_rate": 7.990824560596472e-05, + "loss": 2.5554, + "step": 27633 + }, + { + "epoch": 0.8194407377754055, + "grad_norm": 0.08727449923753738, + "learning_rate": 7.988273162663078e-05, + "loss": 2.5444, + "step": 27634 + }, + { + "epoch": 0.819470391127717, + "grad_norm": 0.08736857771873474, + "learning_rate": 7.985722136750755e-05, + "loss": 2.5871, + "step": 27635 + }, + { + "epoch": 0.8195000444800284, + "grad_norm": 0.08421767503023148, + "learning_rate": 7.983171482882057e-05, + "loss": 2.5147, + "step": 27636 + }, + { + "epoch": 0.81952969783234, + "grad_norm": 0.08296343684196472, + "learning_rate": 7.980621201079591e-05, + "loss": 2.5261, + "step": 27637 + }, + { + "epoch": 0.8195593511846514, + "grad_norm": 0.08899570256471634, + "learning_rate": 7.978071291365935e-05, + "loss": 2.5743, + "step": 27638 + }, + { + "epoch": 0.8195890045369629, + "grad_norm": 0.08298096060752869, + "learning_rate": 7.97552175376367e-05, + "loss": 2.562, + "step": 27639 + }, + { + "epoch": 0.8196186578892743, + "grad_norm": 0.08532126992940903, + "learning_rate": 7.972972588295374e-05, + "loss": 2.5213, + "step": 27640 + }, + { + "epoch": 0.8196483112415859, + "grad_norm": 0.09067290276288986, + "learning_rate": 7.970423794983606e-05, + "loss": 2.53, + "step": 27641 + }, + { + "epoch": 0.8196779645938973, + "grad_norm": 0.08072931319475174, + "learning_rate": 7.967875373850964e-05, + "loss": 2.564, + "step": 27642 + }, + { + "epoch": 0.8197076179462088, + "grad_norm": 0.08476506918668747, + "learning_rate": 7.965327324920002e-05, + "loss": 2.5478, + "step": 27643 + }, + { + "epoch": 0.8197372712985203, + "grad_norm": 0.08318736404180527, + "learning_rate": 7.962779648213276e-05, + "loss": 2.5588, + "step": 27644 + }, + { + "epoch": 0.8197669246508318, + "grad_norm": 0.08664727956056595, + "learning_rate": 7.960232343753354e-05, + "loss": 2.5227, + "step": 27645 + }, + { + "epoch": 0.8197965780031432, + "grad_norm": 0.08227275311946869, + "learning_rate": 7.957685411562804e-05, + "loss": 2.5266, + "step": 27646 + }, + { + "epoch": 0.8198262313554547, + "grad_norm": 0.0825745239853859, + "learning_rate": 7.955138851664156e-05, + "loss": 2.5405, + "step": 27647 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 0.08505495637655258, + "learning_rate": 7.952592664079966e-05, + "loss": 2.5709, + "step": 27648 + }, + { + "epoch": 0.8198855380600777, + "grad_norm": 0.086638443171978, + "learning_rate": 7.950046848832787e-05, + "loss": 2.5333, + "step": 27649 + }, + { + "epoch": 0.8199151914123892, + "grad_norm": 0.08641493320465088, + "learning_rate": 7.947501405945162e-05, + "loss": 2.5388, + "step": 27650 + }, + { + "epoch": 0.8199448447647006, + "grad_norm": 0.08257663249969482, + "learning_rate": 7.944956335439629e-05, + "loss": 2.5522, + "step": 27651 + }, + { + "epoch": 0.8199744981170122, + "grad_norm": 0.08803920447826385, + "learning_rate": 7.942411637338732e-05, + "loss": 2.5483, + "step": 27652 + }, + { + "epoch": 0.8200041514693236, + "grad_norm": 0.09396836161613464, + "learning_rate": 7.939867311664989e-05, + "loss": 2.5376, + "step": 27653 + }, + { + "epoch": 0.8200338048216351, + "grad_norm": 0.08083456754684448, + "learning_rate": 7.937323358440934e-05, + "loss": 2.5727, + "step": 27654 + }, + { + "epoch": 0.8200634581739465, + "grad_norm": 0.08394593745470047, + "learning_rate": 7.934779777689116e-05, + "loss": 2.5275, + "step": 27655 + }, + { + "epoch": 0.8200931115262581, + "grad_norm": 0.08020243793725967, + "learning_rate": 7.93223656943205e-05, + "loss": 2.5598, + "step": 27656 + }, + { + "epoch": 0.8201227648785695, + "grad_norm": 0.08444011956453323, + "learning_rate": 7.929693733692239e-05, + "loss": 2.5057, + "step": 27657 + }, + { + "epoch": 0.820152418230881, + "grad_norm": 0.07494199275970459, + "learning_rate": 7.92715127049221e-05, + "loss": 2.5065, + "step": 27658 + }, + { + "epoch": 0.8201820715831925, + "grad_norm": 0.08392887562513351, + "learning_rate": 7.92460917985448e-05, + "loss": 2.5533, + "step": 27659 + }, + { + "epoch": 0.820211724935504, + "grad_norm": 0.08117653429508209, + "learning_rate": 7.922067461801557e-05, + "loss": 2.5528, + "step": 27660 + }, + { + "epoch": 0.8202413782878154, + "grad_norm": 0.07916080951690674, + "learning_rate": 7.919526116355952e-05, + "loss": 2.5455, + "step": 27661 + }, + { + "epoch": 0.8202710316401269, + "grad_norm": 0.08003543317317963, + "learning_rate": 7.916985143540168e-05, + "loss": 2.5584, + "step": 27662 + }, + { + "epoch": 0.8203006849924384, + "grad_norm": 0.07471980154514313, + "learning_rate": 7.914444543376698e-05, + "loss": 2.5658, + "step": 27663 + }, + { + "epoch": 0.8203303383447499, + "grad_norm": 0.08535250276327133, + "learning_rate": 7.911904315888047e-05, + "loss": 2.5141, + "step": 27664 + }, + { + "epoch": 0.8203599916970613, + "grad_norm": 0.07876253128051758, + "learning_rate": 7.909364461096707e-05, + "loss": 2.534, + "step": 27665 + }, + { + "epoch": 0.8203896450493728, + "grad_norm": 0.08129636198282242, + "learning_rate": 7.906824979025174e-05, + "loss": 2.5909, + "step": 27666 + }, + { + "epoch": 0.8204192984016843, + "grad_norm": 0.0831943079829216, + "learning_rate": 7.904285869695926e-05, + "loss": 2.5256, + "step": 27667 + }, + { + "epoch": 0.8204489517539958, + "grad_norm": 0.07713939249515533, + "learning_rate": 7.901747133131453e-05, + "loss": 2.5368, + "step": 27668 + }, + { + "epoch": 0.8204786051063072, + "grad_norm": 0.07958640158176422, + "learning_rate": 7.899208769354237e-05, + "loss": 2.5711, + "step": 27669 + }, + { + "epoch": 0.8205082584586187, + "grad_norm": 0.07956639677286148, + "learning_rate": 7.896670778386756e-05, + "loss": 2.5511, + "step": 27670 + }, + { + "epoch": 0.8205379118109303, + "grad_norm": 0.08394895493984222, + "learning_rate": 7.894133160251476e-05, + "loss": 2.569, + "step": 27671 + }, + { + "epoch": 0.8205675651632417, + "grad_norm": 0.07592610269784927, + "learning_rate": 7.891595914970878e-05, + "loss": 2.5533, + "step": 27672 + }, + { + "epoch": 0.8205972185155532, + "grad_norm": 0.08764338493347168, + "learning_rate": 7.889059042567425e-05, + "loss": 2.5924, + "step": 27673 + }, + { + "epoch": 0.8206268718678646, + "grad_norm": 0.07804784178733826, + "learning_rate": 7.886522543063584e-05, + "loss": 2.5777, + "step": 27674 + }, + { + "epoch": 0.8206565252201762, + "grad_norm": 0.08328051120042801, + "learning_rate": 7.883986416481814e-05, + "loss": 2.5465, + "step": 27675 + }, + { + "epoch": 0.8206861785724876, + "grad_norm": 0.07933003455400467, + "learning_rate": 7.881450662844575e-05, + "loss": 2.5363, + "step": 27676 + }, + { + "epoch": 0.8207158319247991, + "grad_norm": 0.07889944314956665, + "learning_rate": 7.878915282174314e-05, + "loss": 2.546, + "step": 27677 + }, + { + "epoch": 0.8207454852771106, + "grad_norm": 0.08848046511411667, + "learning_rate": 7.876380274493494e-05, + "loss": 2.5889, + "step": 27678 + }, + { + "epoch": 0.8207751386294221, + "grad_norm": 0.08021602779626846, + "learning_rate": 7.873845639824556e-05, + "loss": 2.5589, + "step": 27679 + }, + { + "epoch": 0.8208047919817335, + "grad_norm": 0.09837134927511215, + "learning_rate": 7.871311378189943e-05, + "loss": 2.541, + "step": 27680 + }, + { + "epoch": 0.820834445334045, + "grad_norm": 0.08464498072862625, + "learning_rate": 7.868777489612105e-05, + "loss": 2.5757, + "step": 27681 + }, + { + "epoch": 0.8208640986863565, + "grad_norm": 0.0862135961651802, + "learning_rate": 7.866243974113469e-05, + "loss": 2.5653, + "step": 27682 + }, + { + "epoch": 0.820893752038668, + "grad_norm": 0.09425905346870422, + "learning_rate": 7.863710831716475e-05, + "loss": 2.6058, + "step": 27683 + }, + { + "epoch": 0.8209234053909794, + "grad_norm": 0.08567523956298828, + "learning_rate": 7.861178062443552e-05, + "loss": 2.5694, + "step": 27684 + }, + { + "epoch": 0.8209530587432909, + "grad_norm": 0.08569884300231934, + "learning_rate": 7.858645666317138e-05, + "loss": 2.5679, + "step": 27685 + }, + { + "epoch": 0.8209827120956024, + "grad_norm": 0.08906886726617813, + "learning_rate": 7.856113643359642e-05, + "loss": 2.539, + "step": 27686 + }, + { + "epoch": 0.8210123654479139, + "grad_norm": 0.08761893212795258, + "learning_rate": 7.8535819935935e-05, + "loss": 2.5428, + "step": 27687 + }, + { + "epoch": 0.8210420188002253, + "grad_norm": 0.08414575457572937, + "learning_rate": 7.85105071704112e-05, + "loss": 2.5312, + "step": 27688 + }, + { + "epoch": 0.8210716721525368, + "grad_norm": 0.08598321676254272, + "learning_rate": 7.84851981372492e-05, + "loss": 2.5484, + "step": 27689 + }, + { + "epoch": 0.8211013255048484, + "grad_norm": 0.0921211987733841, + "learning_rate": 7.845989283667326e-05, + "loss": 2.575, + "step": 27690 + }, + { + "epoch": 0.8211309788571598, + "grad_norm": 0.08512192219495773, + "learning_rate": 7.843459126890722e-05, + "loss": 2.5493, + "step": 27691 + }, + { + "epoch": 0.8211606322094713, + "grad_norm": 0.09074456989765167, + "learning_rate": 7.840929343417519e-05, + "loss": 2.5316, + "step": 27692 + }, + { + "epoch": 0.8211902855617828, + "grad_norm": 0.08747448772192001, + "learning_rate": 7.838399933270118e-05, + "loss": 2.5503, + "step": 27693 + }, + { + "epoch": 0.8212199389140943, + "grad_norm": 0.09645907580852509, + "learning_rate": 7.835870896470926e-05, + "loss": 2.5693, + "step": 27694 + }, + { + "epoch": 0.8212495922664057, + "grad_norm": 0.07908033579587936, + "learning_rate": 7.83334223304234e-05, + "loss": 2.5505, + "step": 27695 + }, + { + "epoch": 0.8212792456187172, + "grad_norm": 0.08755219727754593, + "learning_rate": 7.830813943006748e-05, + "loss": 2.5173, + "step": 27696 + }, + { + "epoch": 0.8213088989710287, + "grad_norm": 0.0904708281159401, + "learning_rate": 7.828286026386533e-05, + "loss": 2.5489, + "step": 27697 + }, + { + "epoch": 0.8213385523233402, + "grad_norm": 0.08649822324514389, + "learning_rate": 7.825758483204087e-05, + "loss": 2.5563, + "step": 27698 + }, + { + "epoch": 0.8213682056756516, + "grad_norm": 0.08181203901767731, + "learning_rate": 7.823231313481787e-05, + "loss": 2.5399, + "step": 27699 + }, + { + "epoch": 0.8213978590279631, + "grad_norm": 0.08352270722389221, + "learning_rate": 7.82070451724201e-05, + "loss": 2.6106, + "step": 27700 + }, + { + "epoch": 0.8214275123802746, + "grad_norm": 0.08203772455453873, + "learning_rate": 7.81817809450715e-05, + "loss": 2.5726, + "step": 27701 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 0.08826643228530884, + "learning_rate": 7.815652045299554e-05, + "loss": 2.555, + "step": 27702 + }, + { + "epoch": 0.8214868190848975, + "grad_norm": 0.08026609569787979, + "learning_rate": 7.813126369641593e-05, + "loss": 2.5656, + "step": 27703 + }, + { + "epoch": 0.821516472437209, + "grad_norm": 0.09017308801412582, + "learning_rate": 7.810601067555645e-05, + "loss": 2.5551, + "step": 27704 + }, + { + "epoch": 0.8215461257895205, + "grad_norm": 0.08578979223966599, + "learning_rate": 7.808076139064064e-05, + "loss": 2.5222, + "step": 27705 + }, + { + "epoch": 0.821575779141832, + "grad_norm": 0.08439174294471741, + "learning_rate": 7.805551584189203e-05, + "loss": 2.5518, + "step": 27706 + }, + { + "epoch": 0.8216054324941434, + "grad_norm": 0.0929519310593605, + "learning_rate": 7.803027402953433e-05, + "loss": 2.568, + "step": 27707 + }, + { + "epoch": 0.821635085846455, + "grad_norm": 0.08549226075410843, + "learning_rate": 7.800503595379099e-05, + "loss": 2.5395, + "step": 27708 + }, + { + "epoch": 0.8216647391987664, + "grad_norm": 0.0895070731639862, + "learning_rate": 7.79798016148855e-05, + "loss": 2.5607, + "step": 27709 + }, + { + "epoch": 0.8216943925510779, + "grad_norm": 0.0859900638461113, + "learning_rate": 7.795457101304126e-05, + "loss": 2.5673, + "step": 27710 + }, + { + "epoch": 0.8217240459033894, + "grad_norm": 0.09079454094171524, + "learning_rate": 7.792934414848191e-05, + "loss": 2.5629, + "step": 27711 + }, + { + "epoch": 0.8217536992557009, + "grad_norm": 0.08168262988328934, + "learning_rate": 7.790412102143051e-05, + "loss": 2.5889, + "step": 27712 + }, + { + "epoch": 0.8217833526080124, + "grad_norm": 0.08085271716117859, + "learning_rate": 7.787890163211058e-05, + "loss": 2.5669, + "step": 27713 + }, + { + "epoch": 0.8218130059603238, + "grad_norm": 0.08060424774885178, + "learning_rate": 7.785368598074549e-05, + "loss": 2.5374, + "step": 27714 + }, + { + "epoch": 0.8218426593126353, + "grad_norm": 0.0817689448595047, + "learning_rate": 7.782847406755839e-05, + "loss": 2.5293, + "step": 27715 + }, + { + "epoch": 0.8218723126649468, + "grad_norm": 0.08219344913959503, + "learning_rate": 7.780326589277264e-05, + "loss": 2.5304, + "step": 27716 + }, + { + "epoch": 0.8219019660172583, + "grad_norm": 0.08437154442071915, + "learning_rate": 7.777806145661149e-05, + "loss": 2.566, + "step": 27717 + }, + { + "epoch": 0.8219316193695697, + "grad_norm": 0.08035816997289658, + "learning_rate": 7.7752860759298e-05, + "loss": 2.5694, + "step": 27718 + }, + { + "epoch": 0.8219612727218812, + "grad_norm": 0.07977404445409775, + "learning_rate": 7.772766380105534e-05, + "loss": 2.5803, + "step": 27719 + }, + { + "epoch": 0.8219909260741927, + "grad_norm": 0.08059951663017273, + "learning_rate": 7.770247058210683e-05, + "loss": 2.5194, + "step": 27720 + }, + { + "epoch": 0.8220205794265042, + "grad_norm": 0.08581680059432983, + "learning_rate": 7.767728110267535e-05, + "loss": 2.5989, + "step": 27721 + }, + { + "epoch": 0.8220502327788156, + "grad_norm": 0.08545002341270447, + "learning_rate": 7.765209536298423e-05, + "loss": 2.5503, + "step": 27722 + }, + { + "epoch": 0.8220798861311271, + "grad_norm": 0.0754598081111908, + "learning_rate": 7.762691336325617e-05, + "loss": 2.5461, + "step": 27723 + }, + { + "epoch": 0.8221095394834386, + "grad_norm": 0.08409828692674637, + "learning_rate": 7.760173510371426e-05, + "loss": 2.5388, + "step": 27724 + }, + { + "epoch": 0.8221391928357501, + "grad_norm": 0.07936174422502518, + "learning_rate": 7.757656058458151e-05, + "loss": 2.5615, + "step": 27725 + }, + { + "epoch": 0.8221688461880615, + "grad_norm": 0.07713021337985992, + "learning_rate": 7.755138980608084e-05, + "loss": 2.5658, + "step": 27726 + }, + { + "epoch": 0.822198499540373, + "grad_norm": 0.08198591321706772, + "learning_rate": 7.752622276843513e-05, + "loss": 2.5641, + "step": 27727 + }, + { + "epoch": 0.8222281528926845, + "grad_norm": 0.07785127311944962, + "learning_rate": 7.75010594718672e-05, + "loss": 2.5713, + "step": 27728 + }, + { + "epoch": 0.822257806244996, + "grad_norm": 0.08029647916555405, + "learning_rate": 7.747589991659992e-05, + "loss": 2.5084, + "step": 27729 + }, + { + "epoch": 0.8222874595973074, + "grad_norm": 0.07633432745933533, + "learning_rate": 7.745074410285607e-05, + "loss": 2.5393, + "step": 27730 + }, + { + "epoch": 0.822317112949619, + "grad_norm": 0.08110303431749344, + "learning_rate": 7.742559203085831e-05, + "loss": 2.566, + "step": 27731 + }, + { + "epoch": 0.8223467663019305, + "grad_norm": 0.0819731056690216, + "learning_rate": 7.740044370082971e-05, + "loss": 2.5407, + "step": 27732 + }, + { + "epoch": 0.8223764196542419, + "grad_norm": 0.0775369182229042, + "learning_rate": 7.737529911299256e-05, + "loss": 2.545, + "step": 27733 + }, + { + "epoch": 0.8224060730065534, + "grad_norm": 0.08231303095817566, + "learning_rate": 7.735015826756969e-05, + "loss": 2.5475, + "step": 27734 + }, + { + "epoch": 0.8224357263588649, + "grad_norm": 0.0810607299208641, + "learning_rate": 7.732502116478373e-05, + "loss": 2.5419, + "step": 27735 + }, + { + "epoch": 0.8224653797111764, + "grad_norm": 0.08374422043561935, + "learning_rate": 7.729988780485725e-05, + "loss": 2.5615, + "step": 27736 + }, + { + "epoch": 0.8224950330634878, + "grad_norm": 0.09070556610822678, + "learning_rate": 7.727475818801283e-05, + "loss": 2.5765, + "step": 27737 + }, + { + "epoch": 0.8225246864157993, + "grad_norm": 0.08644586056470871, + "learning_rate": 7.724963231447302e-05, + "loss": 2.5355, + "step": 27738 + }, + { + "epoch": 0.8225543397681108, + "grad_norm": 0.08498074114322662, + "learning_rate": 7.722451018446025e-05, + "loss": 2.5591, + "step": 27739 + }, + { + "epoch": 0.8225839931204223, + "grad_norm": 0.09036724269390106, + "learning_rate": 7.719939179819696e-05, + "loss": 2.5579, + "step": 27740 + }, + { + "epoch": 0.8226136464727337, + "grad_norm": 0.08544497191905975, + "learning_rate": 7.717427715590569e-05, + "loss": 2.5451, + "step": 27741 + }, + { + "epoch": 0.8226432998250452, + "grad_norm": 0.08651323616504669, + "learning_rate": 7.714916625780877e-05, + "loss": 2.5405, + "step": 27742 + }, + { + "epoch": 0.8226729531773567, + "grad_norm": 0.0909985899925232, + "learning_rate": 7.712405910412851e-05, + "loss": 2.5148, + "step": 27743 + }, + { + "epoch": 0.8227026065296682, + "grad_norm": 0.08216066658496857, + "learning_rate": 7.709895569508734e-05, + "loss": 2.5739, + "step": 27744 + }, + { + "epoch": 0.8227322598819796, + "grad_norm": 0.09024230390787125, + "learning_rate": 7.70738560309075e-05, + "loss": 2.5416, + "step": 27745 + }, + { + "epoch": 0.8227619132342912, + "grad_norm": 0.08735091984272003, + "learning_rate": 7.704876011181127e-05, + "loss": 2.5384, + "step": 27746 + }, + { + "epoch": 0.8227915665866026, + "grad_norm": 0.09237370640039444, + "learning_rate": 7.702366793802085e-05, + "loss": 2.5521, + "step": 27747 + }, + { + "epoch": 0.8228212199389141, + "grad_norm": 0.09035343676805496, + "learning_rate": 7.699857950975847e-05, + "loss": 2.5602, + "step": 27748 + }, + { + "epoch": 0.8228508732912255, + "grad_norm": 0.0910859927535057, + "learning_rate": 7.697349482724625e-05, + "loss": 2.5305, + "step": 27749 + }, + { + "epoch": 0.8228805266435371, + "grad_norm": 0.08185205608606339, + "learning_rate": 7.694841389070633e-05, + "loss": 2.5579, + "step": 27750 + }, + { + "epoch": 0.8229101799958485, + "grad_norm": 0.09070167690515518, + "learning_rate": 7.692333670036089e-05, + "loss": 2.5743, + "step": 27751 + }, + { + "epoch": 0.82293983334816, + "grad_norm": 0.08683329820632935, + "learning_rate": 7.689826325643184e-05, + "loss": 2.5679, + "step": 27752 + }, + { + "epoch": 0.8229694867004715, + "grad_norm": 0.08533193916082382, + "learning_rate": 7.687319355914135e-05, + "loss": 2.5246, + "step": 27753 + }, + { + "epoch": 0.822999140052783, + "grad_norm": 0.0837906002998352, + "learning_rate": 7.684812760871135e-05, + "loss": 2.5398, + "step": 27754 + }, + { + "epoch": 0.8230287934050945, + "grad_norm": 0.08561062067747116, + "learning_rate": 7.682306540536383e-05, + "loss": 2.537, + "step": 27755 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 0.07688350975513458, + "learning_rate": 7.679800694932076e-05, + "loss": 2.5555, + "step": 27756 + }, + { + "epoch": 0.8230881001097174, + "grad_norm": 0.08112245053052902, + "learning_rate": 7.677295224080383e-05, + "loss": 2.5424, + "step": 27757 + }, + { + "epoch": 0.8231177534620289, + "grad_norm": 0.08136212825775146, + "learning_rate": 7.674790128003512e-05, + "loss": 2.5866, + "step": 27758 + }, + { + "epoch": 0.8231474068143404, + "grad_norm": 0.07864200323820114, + "learning_rate": 7.67228540672364e-05, + "loss": 2.5597, + "step": 27759 + }, + { + "epoch": 0.8231770601666518, + "grad_norm": 0.07927316427230835, + "learning_rate": 7.669781060262943e-05, + "loss": 2.5628, + "step": 27760 + }, + { + "epoch": 0.8232067135189634, + "grad_norm": 0.08168130367994308, + "learning_rate": 7.667277088643604e-05, + "loss": 2.5777, + "step": 27761 + }, + { + "epoch": 0.8232363668712748, + "grad_norm": 0.07815255969762802, + "learning_rate": 7.664773491887794e-05, + "loss": 2.519, + "step": 27762 + }, + { + "epoch": 0.8232660202235863, + "grad_norm": 0.0759180560708046, + "learning_rate": 7.66227027001768e-05, + "loss": 2.5328, + "step": 27763 + }, + { + "epoch": 0.8232956735758977, + "grad_norm": 0.0783558189868927, + "learning_rate": 7.659767423055431e-05, + "loss": 2.5673, + "step": 27764 + }, + { + "epoch": 0.8233253269282093, + "grad_norm": 0.07773443311452866, + "learning_rate": 7.657264951023207e-05, + "loss": 2.5405, + "step": 27765 + }, + { + "epoch": 0.8233549802805207, + "grad_norm": 0.08348605781793594, + "learning_rate": 7.654762853943182e-05, + "loss": 2.52, + "step": 27766 + }, + { + "epoch": 0.8233846336328322, + "grad_norm": 0.08514580130577087, + "learning_rate": 7.65226113183749e-05, + "loss": 2.544, + "step": 27767 + }, + { + "epoch": 0.8234142869851436, + "grad_norm": 0.08633647114038467, + "learning_rate": 7.649759784728295e-05, + "loss": 2.5404, + "step": 27768 + }, + { + "epoch": 0.8234439403374552, + "grad_norm": 0.08221235126256943, + "learning_rate": 7.647258812637741e-05, + "loss": 2.5518, + "step": 27769 + }, + { + "epoch": 0.8234735936897666, + "grad_norm": 0.08926527947187424, + "learning_rate": 7.644758215587977e-05, + "loss": 2.5681, + "step": 27770 + }, + { + "epoch": 0.8235032470420781, + "grad_norm": 0.07925540208816528, + "learning_rate": 7.642257993601153e-05, + "loss": 2.5676, + "step": 27771 + }, + { + "epoch": 0.8235329003943895, + "grad_norm": 0.08388473838567734, + "learning_rate": 7.639758146699411e-05, + "loss": 2.5294, + "step": 27772 + }, + { + "epoch": 0.8235625537467011, + "grad_norm": 0.07717081904411316, + "learning_rate": 7.63725867490488e-05, + "loss": 2.5415, + "step": 27773 + }, + { + "epoch": 0.8235922070990126, + "grad_norm": 0.08210337162017822, + "learning_rate": 7.634759578239692e-05, + "loss": 2.5307, + "step": 27774 + }, + { + "epoch": 0.823621860451324, + "grad_norm": 0.08473392575979233, + "learning_rate": 7.632260856725981e-05, + "loss": 2.5366, + "step": 27775 + }, + { + "epoch": 0.8236515138036355, + "grad_norm": 0.07637642323970795, + "learning_rate": 7.629762510385874e-05, + "loss": 2.5807, + "step": 27776 + }, + { + "epoch": 0.823681167155947, + "grad_norm": 0.08241406083106995, + "learning_rate": 7.627264539241508e-05, + "loss": 2.5548, + "step": 27777 + }, + { + "epoch": 0.8237108205082585, + "grad_norm": 0.0776628777384758, + "learning_rate": 7.62476694331497e-05, + "loss": 2.5144, + "step": 27778 + }, + { + "epoch": 0.8237404738605699, + "grad_norm": 0.0804068073630333, + "learning_rate": 7.622269722628394e-05, + "loss": 2.5713, + "step": 27779 + }, + { + "epoch": 0.8237701272128815, + "grad_norm": 0.08083689957857132, + "learning_rate": 7.619772877203895e-05, + "loss": 2.5552, + "step": 27780 + }, + { + "epoch": 0.8237997805651929, + "grad_norm": 0.0785805881023407, + "learning_rate": 7.617276407063584e-05, + "loss": 2.5329, + "step": 27781 + }, + { + "epoch": 0.8238294339175044, + "grad_norm": 0.08452705293893814, + "learning_rate": 7.614780312229564e-05, + "loss": 2.5445, + "step": 27782 + }, + { + "epoch": 0.8238590872698158, + "grad_norm": 0.07762572169303894, + "learning_rate": 7.612284592723928e-05, + "loss": 2.5442, + "step": 27783 + }, + { + "epoch": 0.8238887406221274, + "grad_norm": 0.08165055513381958, + "learning_rate": 7.609789248568799e-05, + "loss": 2.541, + "step": 27784 + }, + { + "epoch": 0.8239183939744388, + "grad_norm": 0.0852019265294075, + "learning_rate": 7.607294279786265e-05, + "loss": 2.5589, + "step": 27785 + }, + { + "epoch": 0.8239480473267503, + "grad_norm": 0.08045156300067902, + "learning_rate": 7.604799686398411e-05, + "loss": 2.5595, + "step": 27786 + }, + { + "epoch": 0.8239777006790617, + "grad_norm": 0.08107981830835342, + "learning_rate": 7.602305468427345e-05, + "loss": 2.5458, + "step": 27787 + }, + { + "epoch": 0.8240073540313733, + "grad_norm": 0.08964449912309647, + "learning_rate": 7.599811625895137e-05, + "loss": 2.5452, + "step": 27788 + }, + { + "epoch": 0.8240370073836847, + "grad_norm": 0.07817826420068741, + "learning_rate": 7.597318158823868e-05, + "loss": 2.5545, + "step": 27789 + }, + { + "epoch": 0.8240666607359962, + "grad_norm": 0.07964810729026794, + "learning_rate": 7.594825067235628e-05, + "loss": 2.5551, + "step": 27790 + }, + { + "epoch": 0.8240963140883076, + "grad_norm": 0.08333682268857956, + "learning_rate": 7.592332351152493e-05, + "loss": 2.5609, + "step": 27791 + }, + { + "epoch": 0.8241259674406192, + "grad_norm": 0.08002304285764694, + "learning_rate": 7.589840010596527e-05, + "loss": 2.5044, + "step": 27792 + }, + { + "epoch": 0.8241556207929306, + "grad_norm": 0.0821748748421669, + "learning_rate": 7.587348045589815e-05, + "loss": 2.5352, + "step": 27793 + }, + { + "epoch": 0.8241852741452421, + "grad_norm": 0.08764541894197464, + "learning_rate": 7.58485645615441e-05, + "loss": 2.579, + "step": 27794 + }, + { + "epoch": 0.8242149274975537, + "grad_norm": 0.08333912491798401, + "learning_rate": 7.582365242312389e-05, + "loss": 2.5373, + "step": 27795 + }, + { + "epoch": 0.8242445808498651, + "grad_norm": 0.08533263951539993, + "learning_rate": 7.579874404085785e-05, + "loss": 2.5527, + "step": 27796 + }, + { + "epoch": 0.8242742342021766, + "grad_norm": 0.08123729377985, + "learning_rate": 7.57738394149669e-05, + "loss": 2.5762, + "step": 27797 + }, + { + "epoch": 0.824303887554488, + "grad_norm": 0.08768227696418762, + "learning_rate": 7.574893854567155e-05, + "loss": 2.5543, + "step": 27798 + }, + { + "epoch": 0.8243335409067996, + "grad_norm": 0.08605056256055832, + "learning_rate": 7.572404143319201e-05, + "loss": 2.504, + "step": 27799 + }, + { + "epoch": 0.824363194259111, + "grad_norm": 0.08161899447441101, + "learning_rate": 7.569914807774896e-05, + "loss": 2.5673, + "step": 27800 + }, + { + "epoch": 0.8243928476114225, + "grad_norm": 0.08331047743558884, + "learning_rate": 7.567425847956278e-05, + "loss": 2.5563, + "step": 27801 + }, + { + "epoch": 0.8244225009637339, + "grad_norm": 0.07921446114778519, + "learning_rate": 7.564937263885385e-05, + "loss": 2.5317, + "step": 27802 + }, + { + "epoch": 0.8244521543160455, + "grad_norm": 0.07849210500717163, + "learning_rate": 7.562449055584254e-05, + "loss": 2.5759, + "step": 27803 + }, + { + "epoch": 0.8244818076683569, + "grad_norm": 0.08263323456048965, + "learning_rate": 7.559961223074924e-05, + "loss": 2.5706, + "step": 27804 + }, + { + "epoch": 0.8245114610206684, + "grad_norm": 0.07704199850559235, + "learning_rate": 7.557473766379424e-05, + "loss": 2.551, + "step": 27805 + }, + { + "epoch": 0.8245411143729798, + "grad_norm": 0.08641036599874496, + "learning_rate": 7.554986685519776e-05, + "loss": 2.5149, + "step": 27806 + }, + { + "epoch": 0.8245707677252914, + "grad_norm": 0.08723500370979309, + "learning_rate": 7.552499980518007e-05, + "loss": 2.568, + "step": 27807 + }, + { + "epoch": 0.8246004210776028, + "grad_norm": 0.08415228873491287, + "learning_rate": 7.550013651396137e-05, + "loss": 2.5534, + "step": 27808 + }, + { + "epoch": 0.8246300744299143, + "grad_norm": 0.09426688402891159, + "learning_rate": 7.547527698176182e-05, + "loss": 2.5659, + "step": 27809 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 0.08258892595767975, + "learning_rate": 7.545042120880158e-05, + "loss": 2.5512, + "step": 27810 + }, + { + "epoch": 0.8246893811345373, + "grad_norm": 0.08701816201210022, + "learning_rate": 7.542556919530075e-05, + "loss": 2.5236, + "step": 27811 + }, + { + "epoch": 0.8247190344868487, + "grad_norm": 0.09242136776447296, + "learning_rate": 7.540072094147932e-05, + "loss": 2.5414, + "step": 27812 + }, + { + "epoch": 0.8247486878391602, + "grad_norm": 0.08480532467365265, + "learning_rate": 7.537587644755745e-05, + "loss": 2.5771, + "step": 27813 + }, + { + "epoch": 0.8247783411914716, + "grad_norm": 0.08801327645778656, + "learning_rate": 7.535103571375501e-05, + "loss": 2.574, + "step": 27814 + }, + { + "epoch": 0.8248079945437832, + "grad_norm": 0.09368783980607986, + "learning_rate": 7.532619874029212e-05, + "loss": 2.5527, + "step": 27815 + }, + { + "epoch": 0.8248376478960947, + "grad_norm": 0.08124805986881256, + "learning_rate": 7.530136552738859e-05, + "loss": 2.5739, + "step": 27816 + }, + { + "epoch": 0.8248673012484061, + "grad_norm": 0.0858476534485817, + "learning_rate": 7.527653607526435e-05, + "loss": 2.5745, + "step": 27817 + }, + { + "epoch": 0.8248969546007177, + "grad_norm": 0.07863189280033112, + "learning_rate": 7.52517103841393e-05, + "loss": 2.5238, + "step": 27818 + }, + { + "epoch": 0.8249266079530291, + "grad_norm": 0.08988704532384872, + "learning_rate": 7.522688845423325e-05, + "loss": 2.5522, + "step": 27819 + }, + { + "epoch": 0.8249562613053406, + "grad_norm": 0.08298427611589432, + "learning_rate": 7.520207028576609e-05, + "loss": 2.574, + "step": 27820 + }, + { + "epoch": 0.824985914657652, + "grad_norm": 0.09153024852275848, + "learning_rate": 7.51772558789574e-05, + "loss": 2.5425, + "step": 27821 + }, + { + "epoch": 0.8250155680099636, + "grad_norm": 0.08991828560829163, + "learning_rate": 7.515244523402708e-05, + "loss": 2.5243, + "step": 27822 + }, + { + "epoch": 0.825045221362275, + "grad_norm": 0.08492305129766464, + "learning_rate": 7.51276383511948e-05, + "loss": 2.5197, + "step": 27823 + }, + { + "epoch": 0.8250748747145865, + "grad_norm": 0.09099957346916199, + "learning_rate": 7.510283523068023e-05, + "loss": 2.5813, + "step": 27824 + }, + { + "epoch": 0.8251045280668979, + "grad_norm": 0.08735773712396622, + "learning_rate": 7.507803587270295e-05, + "loss": 2.5624, + "step": 27825 + }, + { + "epoch": 0.8251341814192095, + "grad_norm": 0.08897524327039719, + "learning_rate": 7.505324027748262e-05, + "loss": 2.5245, + "step": 27826 + }, + { + "epoch": 0.8251638347715209, + "grad_norm": 0.08821941912174225, + "learning_rate": 7.502844844523876e-05, + "loss": 2.585, + "step": 27827 + }, + { + "epoch": 0.8251934881238324, + "grad_norm": 0.09195344150066376, + "learning_rate": 7.500366037619095e-05, + "loss": 2.5478, + "step": 27828 + }, + { + "epoch": 0.8252231414761438, + "grad_norm": 0.09054284542798996, + "learning_rate": 7.497887607055864e-05, + "loss": 2.5296, + "step": 27829 + }, + { + "epoch": 0.8252527948284554, + "grad_norm": 0.0873793289065361, + "learning_rate": 7.495409552856137e-05, + "loss": 2.5309, + "step": 27830 + }, + { + "epoch": 0.8252824481807668, + "grad_norm": 0.08312137424945831, + "learning_rate": 7.492931875041858e-05, + "loss": 2.551, + "step": 27831 + }, + { + "epoch": 0.8253121015330783, + "grad_norm": 0.08475108444690704, + "learning_rate": 7.490454573634969e-05, + "loss": 2.5291, + "step": 27832 + }, + { + "epoch": 0.8253417548853897, + "grad_norm": 0.07859160751104355, + "learning_rate": 7.48797764865739e-05, + "loss": 2.4969, + "step": 27833 + }, + { + "epoch": 0.8253714082377013, + "grad_norm": 0.08888164162635803, + "learning_rate": 7.48550110013107e-05, + "loss": 2.5513, + "step": 27834 + }, + { + "epoch": 0.8254010615900127, + "grad_norm": 0.07749616354703903, + "learning_rate": 7.483024928077919e-05, + "loss": 2.5559, + "step": 27835 + }, + { + "epoch": 0.8254307149423242, + "grad_norm": 0.08472109586000443, + "learning_rate": 7.480549132519898e-05, + "loss": 2.565, + "step": 27836 + }, + { + "epoch": 0.8254603682946358, + "grad_norm": 0.07917874306440353, + "learning_rate": 7.47807371347891e-05, + "loss": 2.575, + "step": 27837 + }, + { + "epoch": 0.8254900216469472, + "grad_norm": 0.08804032951593399, + "learning_rate": 7.475598670976874e-05, + "loss": 2.5568, + "step": 27838 + }, + { + "epoch": 0.8255196749992587, + "grad_norm": 0.09963975101709366, + "learning_rate": 7.47312400503572e-05, + "loss": 2.5372, + "step": 27839 + }, + { + "epoch": 0.8255493283515701, + "grad_norm": 0.07893345504999161, + "learning_rate": 7.470649715677347e-05, + "loss": 2.5045, + "step": 27840 + }, + { + "epoch": 0.8255789817038817, + "grad_norm": 0.09497124701738358, + "learning_rate": 7.468175802923666e-05, + "loss": 2.5534, + "step": 27841 + }, + { + "epoch": 0.8256086350561931, + "grad_norm": 0.08450398594141006, + "learning_rate": 7.465702266796597e-05, + "loss": 2.5456, + "step": 27842 + }, + { + "epoch": 0.8256382884085046, + "grad_norm": 0.08797577768564224, + "learning_rate": 7.463229107318042e-05, + "loss": 2.5724, + "step": 27843 + }, + { + "epoch": 0.825667941760816, + "grad_norm": 0.08500885218381882, + "learning_rate": 7.460756324509888e-05, + "loss": 2.5094, + "step": 27844 + }, + { + "epoch": 0.8256975951131276, + "grad_norm": 0.08530274033546448, + "learning_rate": 7.458283918394033e-05, + "loss": 2.562, + "step": 27845 + }, + { + "epoch": 0.825727248465439, + "grad_norm": 0.09018591791391373, + "learning_rate": 7.455811888992376e-05, + "loss": 2.5614, + "step": 27846 + }, + { + "epoch": 0.8257569018177505, + "grad_norm": 0.08629880845546722, + "learning_rate": 7.453340236326811e-05, + "loss": 2.5501, + "step": 27847 + }, + { + "epoch": 0.8257865551700619, + "grad_norm": 0.08653245866298676, + "learning_rate": 7.450868960419211e-05, + "loss": 2.5418, + "step": 27848 + }, + { + "epoch": 0.8258162085223735, + "grad_norm": 0.08431710302829742, + "learning_rate": 7.448398061291472e-05, + "loss": 2.5804, + "step": 27849 + }, + { + "epoch": 0.8258458618746849, + "grad_norm": 0.08227308094501495, + "learning_rate": 7.44592753896548e-05, + "loss": 2.533, + "step": 27850 + }, + { + "epoch": 0.8258755152269964, + "grad_norm": 0.08653318881988525, + "learning_rate": 7.443457393463105e-05, + "loss": 2.4981, + "step": 27851 + }, + { + "epoch": 0.8259051685793078, + "grad_norm": 0.0813743993639946, + "learning_rate": 7.440987624806217e-05, + "loss": 2.5852, + "step": 27852 + }, + { + "epoch": 0.8259348219316194, + "grad_norm": 0.08543210476636887, + "learning_rate": 7.4385182330167e-05, + "loss": 2.5603, + "step": 27853 + }, + { + "epoch": 0.8259644752839308, + "grad_norm": 0.08060859888792038, + "learning_rate": 7.436049218116397e-05, + "loss": 2.5584, + "step": 27854 + }, + { + "epoch": 0.8259941286362423, + "grad_norm": 0.0809411033987999, + "learning_rate": 7.433580580127186e-05, + "loss": 2.5658, + "step": 27855 + }, + { + "epoch": 0.8260237819885538, + "grad_norm": 0.08573208749294281, + "learning_rate": 7.431112319070926e-05, + "loss": 2.5345, + "step": 27856 + }, + { + "epoch": 0.8260534353408653, + "grad_norm": 0.08054228872060776, + "learning_rate": 7.428644434969472e-05, + "loss": 2.5568, + "step": 27857 + }, + { + "epoch": 0.8260830886931768, + "grad_norm": 0.08909463882446289, + "learning_rate": 7.42617692784468e-05, + "loss": 2.5569, + "step": 27858 + }, + { + "epoch": 0.8261127420454882, + "grad_norm": 0.08506869524717331, + "learning_rate": 7.423709797718397e-05, + "loss": 2.5651, + "step": 27859 + }, + { + "epoch": 0.8261423953977998, + "grad_norm": 0.082826629281044, + "learning_rate": 7.421243044612475e-05, + "loss": 2.5257, + "step": 27860 + }, + { + "epoch": 0.8261720487501112, + "grad_norm": 0.10234127938747406, + "learning_rate": 7.418776668548738e-05, + "loss": 2.536, + "step": 27861 + }, + { + "epoch": 0.8262017021024227, + "grad_norm": 0.08652283996343613, + "learning_rate": 7.416310669549059e-05, + "loss": 2.5114, + "step": 27862 + }, + { + "epoch": 0.8262313554547341, + "grad_norm": 0.09622803330421448, + "learning_rate": 7.41384504763527e-05, + "loss": 2.5395, + "step": 27863 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 0.09475400298833847, + "learning_rate": 7.411379802829176e-05, + "loss": 2.5654, + "step": 27864 + }, + { + "epoch": 0.8262906621593571, + "grad_norm": 0.09541591256856918, + "learning_rate": 7.408914935152628e-05, + "loss": 2.5475, + "step": 27865 + }, + { + "epoch": 0.8263203155116686, + "grad_norm": 0.0902591124176979, + "learning_rate": 7.40645044462745e-05, + "loss": 2.5144, + "step": 27866 + }, + { + "epoch": 0.82634996886398, + "grad_norm": 0.08754497766494751, + "learning_rate": 7.403986331275459e-05, + "loss": 2.5781, + "step": 27867 + }, + { + "epoch": 0.8263796222162916, + "grad_norm": 0.09124338626861572, + "learning_rate": 7.401522595118487e-05, + "loss": 2.5873, + "step": 27868 + }, + { + "epoch": 0.826409275568603, + "grad_norm": 0.08161831647157669, + "learning_rate": 7.39905923617834e-05, + "loss": 2.5516, + "step": 27869 + }, + { + "epoch": 0.8264389289209145, + "grad_norm": 0.08224470168352127, + "learning_rate": 7.396596254476839e-05, + "loss": 2.5381, + "step": 27870 + }, + { + "epoch": 0.826468582273226, + "grad_norm": 0.10483723878860474, + "learning_rate": 7.39413365003579e-05, + "loss": 2.5245, + "step": 27871 + }, + { + "epoch": 0.8264982356255375, + "grad_norm": 0.07847931981086731, + "learning_rate": 7.391671422877e-05, + "loss": 2.5588, + "step": 27872 + }, + { + "epoch": 0.8265278889778489, + "grad_norm": 0.0868406742811203, + "learning_rate": 7.38920957302227e-05, + "loss": 2.5404, + "step": 27873 + }, + { + "epoch": 0.8265575423301604, + "grad_norm": 0.08997313678264618, + "learning_rate": 7.386748100493407e-05, + "loss": 2.5395, + "step": 27874 + }, + { + "epoch": 0.8265871956824719, + "grad_norm": 0.08401201665401459, + "learning_rate": 7.384287005312207e-05, + "loss": 2.5762, + "step": 27875 + }, + { + "epoch": 0.8266168490347834, + "grad_norm": 0.08466837555170059, + "learning_rate": 7.381826287500454e-05, + "loss": 2.5639, + "step": 27876 + }, + { + "epoch": 0.8266465023870948, + "grad_norm": 0.08138193935155869, + "learning_rate": 7.379365947079946e-05, + "loss": 2.5457, + "step": 27877 + }, + { + "epoch": 0.8266761557394063, + "grad_norm": 0.08253800868988037, + "learning_rate": 7.376905984072473e-05, + "loss": 2.5453, + "step": 27878 + }, + { + "epoch": 0.8267058090917179, + "grad_norm": 0.08504468947649002, + "learning_rate": 7.374446398499812e-05, + "loss": 2.5618, + "step": 27879 + }, + { + "epoch": 0.8267354624440293, + "grad_norm": 0.08151964843273163, + "learning_rate": 7.371987190383745e-05, + "loss": 2.5793, + "step": 27880 + }, + { + "epoch": 0.8267651157963408, + "grad_norm": 0.08354277163743973, + "learning_rate": 7.369528359746042e-05, + "loss": 2.5851, + "step": 27881 + }, + { + "epoch": 0.8267947691486522, + "grad_norm": 0.08223674446344376, + "learning_rate": 7.367069906608486e-05, + "loss": 2.567, + "step": 27882 + }, + { + "epoch": 0.8268244225009638, + "grad_norm": 0.0838279277086258, + "learning_rate": 7.364611830992846e-05, + "loss": 2.5753, + "step": 27883 + }, + { + "epoch": 0.8268540758532752, + "grad_norm": 0.08198244869709015, + "learning_rate": 7.36215413292089e-05, + "loss": 2.562, + "step": 27884 + }, + { + "epoch": 0.8268837292055867, + "grad_norm": 0.078848697245121, + "learning_rate": 7.359696812414374e-05, + "loss": 2.5705, + "step": 27885 + }, + { + "epoch": 0.8269133825578981, + "grad_norm": 0.08072778582572937, + "learning_rate": 7.357239869495058e-05, + "loss": 2.5289, + "step": 27886 + }, + { + "epoch": 0.8269430359102097, + "grad_norm": 0.07774906605482101, + "learning_rate": 7.354783304184708e-05, + "loss": 2.538, + "step": 27887 + }, + { + "epoch": 0.8269726892625211, + "grad_norm": 0.0792272686958313, + "learning_rate": 7.35232711650507e-05, + "loss": 2.5324, + "step": 27888 + }, + { + "epoch": 0.8270023426148326, + "grad_norm": 0.08380778878927231, + "learning_rate": 7.349871306477896e-05, + "loss": 2.5382, + "step": 27889 + }, + { + "epoch": 0.827031995967144, + "grad_norm": 0.08380930125713348, + "learning_rate": 7.34741587412493e-05, + "loss": 2.5659, + "step": 27890 + }, + { + "epoch": 0.8270616493194556, + "grad_norm": 0.07721792161464691, + "learning_rate": 7.344960819467922e-05, + "loss": 2.5576, + "step": 27891 + }, + { + "epoch": 0.827091302671767, + "grad_norm": 0.08883710205554962, + "learning_rate": 7.342506142528605e-05, + "loss": 2.5684, + "step": 27892 + }, + { + "epoch": 0.8271209560240785, + "grad_norm": 0.0803992748260498, + "learning_rate": 7.340051843328715e-05, + "loss": 2.5299, + "step": 27893 + }, + { + "epoch": 0.82715060937639, + "grad_norm": 0.07957199215888977, + "learning_rate": 7.337597921889993e-05, + "loss": 2.5056, + "step": 27894 + }, + { + "epoch": 0.8271802627287015, + "grad_norm": 0.07908262312412262, + "learning_rate": 7.335144378234165e-05, + "loss": 2.5293, + "step": 27895 + }, + { + "epoch": 0.8272099160810129, + "grad_norm": 0.08478336781263351, + "learning_rate": 7.332691212382952e-05, + "loss": 2.514, + "step": 27896 + }, + { + "epoch": 0.8272395694333244, + "grad_norm": 0.07534103840589523, + "learning_rate": 7.330238424358088e-05, + "loss": 2.5497, + "step": 27897 + }, + { + "epoch": 0.827269222785636, + "grad_norm": 0.07179895043373108, + "learning_rate": 7.327786014181293e-05, + "loss": 2.5174, + "step": 27898 + }, + { + "epoch": 0.8272988761379474, + "grad_norm": 0.0848441869020462, + "learning_rate": 7.325333981874271e-05, + "loss": 2.5413, + "step": 27899 + }, + { + "epoch": 0.8273285294902589, + "grad_norm": 0.07928230613470078, + "learning_rate": 7.322882327458724e-05, + "loss": 2.5335, + "step": 27900 + }, + { + "epoch": 0.8273581828425703, + "grad_norm": 0.08088269829750061, + "learning_rate": 7.320431050956394e-05, + "loss": 2.5613, + "step": 27901 + }, + { + "epoch": 0.8273878361948819, + "grad_norm": 0.08375250548124313, + "learning_rate": 7.317980152388975e-05, + "loss": 2.5584, + "step": 27902 + }, + { + "epoch": 0.8274174895471933, + "grad_norm": 0.07919039577245712, + "learning_rate": 7.315529631778167e-05, + "loss": 2.5709, + "step": 27903 + }, + { + "epoch": 0.8274471428995048, + "grad_norm": 0.0848379135131836, + "learning_rate": 7.313079489145669e-05, + "loss": 2.5271, + "step": 27904 + }, + { + "epoch": 0.8274767962518162, + "grad_norm": 0.07850828021764755, + "learning_rate": 7.310629724513179e-05, + "loss": 2.5116, + "step": 27905 + }, + { + "epoch": 0.8275064496041278, + "grad_norm": 0.08776675909757614, + "learning_rate": 7.308180337902392e-05, + "loss": 2.5362, + "step": 27906 + }, + { + "epoch": 0.8275361029564392, + "grad_norm": 0.08591672033071518, + "learning_rate": 7.305731329334996e-05, + "loss": 2.5675, + "step": 27907 + }, + { + "epoch": 0.8275657563087507, + "grad_norm": 0.08210179209709167, + "learning_rate": 7.303282698832691e-05, + "loss": 2.5773, + "step": 27908 + }, + { + "epoch": 0.8275954096610622, + "grad_norm": 0.0822451114654541, + "learning_rate": 7.300834446417131e-05, + "loss": 2.583, + "step": 27909 + }, + { + "epoch": 0.8276250630133737, + "grad_norm": 0.08589857071638107, + "learning_rate": 7.29838657211001e-05, + "loss": 2.5653, + "step": 27910 + }, + { + "epoch": 0.8276547163656851, + "grad_norm": 0.08163398504257202, + "learning_rate": 7.295939075933012e-05, + "loss": 2.5396, + "step": 27911 + }, + { + "epoch": 0.8276843697179966, + "grad_norm": 0.08209419995546341, + "learning_rate": 7.2934919579078e-05, + "loss": 2.5404, + "step": 27912 + }, + { + "epoch": 0.8277140230703081, + "grad_norm": 0.08617404848337173, + "learning_rate": 7.291045218056036e-05, + "loss": 2.5222, + "step": 27913 + }, + { + "epoch": 0.8277436764226196, + "grad_norm": 0.08126159012317657, + "learning_rate": 7.288598856399408e-05, + "loss": 2.5369, + "step": 27914 + }, + { + "epoch": 0.827773329774931, + "grad_norm": 0.07822462171316147, + "learning_rate": 7.286152872959567e-05, + "loss": 2.5726, + "step": 27915 + }, + { + "epoch": 0.8278029831272425, + "grad_norm": 0.08109325170516968, + "learning_rate": 7.283707267758177e-05, + "loss": 2.5276, + "step": 27916 + }, + { + "epoch": 0.827832636479554, + "grad_norm": 0.07721675932407379, + "learning_rate": 7.281262040816894e-05, + "loss": 2.5408, + "step": 27917 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 0.07905492186546326, + "learning_rate": 7.278817192157361e-05, + "loss": 2.5648, + "step": 27918 + }, + { + "epoch": 0.827891943184177, + "grad_norm": 0.07971730083227158, + "learning_rate": 7.27637272180125e-05, + "loss": 2.5433, + "step": 27919 + }, + { + "epoch": 0.8279215965364884, + "grad_norm": 0.08282548934221268, + "learning_rate": 7.273928629770182e-05, + "loss": 2.5809, + "step": 27920 + }, + { + "epoch": 0.8279512498888, + "grad_norm": 0.08237580955028534, + "learning_rate": 7.271484916085808e-05, + "loss": 2.5438, + "step": 27921 + }, + { + "epoch": 0.8279809032411114, + "grad_norm": 0.08711167424917221, + "learning_rate": 7.269041580769769e-05, + "loss": 2.5545, + "step": 27922 + }, + { + "epoch": 0.8280105565934229, + "grad_norm": 0.07923593372106552, + "learning_rate": 7.266598623843701e-05, + "loss": 2.5794, + "step": 27923 + }, + { + "epoch": 0.8280402099457344, + "grad_norm": 0.0937085822224617, + "learning_rate": 7.26415604532924e-05, + "loss": 2.5539, + "step": 27924 + }, + { + "epoch": 0.8280698632980459, + "grad_norm": 0.08019324392080307, + "learning_rate": 7.261713845247998e-05, + "loss": 2.5423, + "step": 27925 + }, + { + "epoch": 0.8280995166503573, + "grad_norm": 0.08216874301433563, + "learning_rate": 7.259272023621627e-05, + "loss": 2.5673, + "step": 27926 + }, + { + "epoch": 0.8281291700026688, + "grad_norm": 0.08885882049798965, + "learning_rate": 7.25683058047174e-05, + "loss": 2.5586, + "step": 27927 + }, + { + "epoch": 0.8281588233549803, + "grad_norm": 0.08666583150625229, + "learning_rate": 7.254389515819959e-05, + "loss": 2.5669, + "step": 27928 + }, + { + "epoch": 0.8281884767072918, + "grad_norm": 0.08954637497663498, + "learning_rate": 7.251948829687905e-05, + "loss": 2.5152, + "step": 27929 + }, + { + "epoch": 0.8282181300596032, + "grad_norm": 0.08026910573244095, + "learning_rate": 7.249508522097164e-05, + "loss": 2.5585, + "step": 27930 + }, + { + "epoch": 0.8282477834119147, + "grad_norm": 0.09637220948934555, + "learning_rate": 7.247068593069373e-05, + "loss": 2.5095, + "step": 27931 + }, + { + "epoch": 0.8282774367642262, + "grad_norm": 0.08522073179483414, + "learning_rate": 7.244629042626122e-05, + "loss": 2.5441, + "step": 27932 + }, + { + "epoch": 0.8283070901165377, + "grad_norm": 0.08152616024017334, + "learning_rate": 7.242189870789017e-05, + "loss": 2.5727, + "step": 27933 + }, + { + "epoch": 0.8283367434688491, + "grad_norm": 0.08914364874362946, + "learning_rate": 7.239751077579665e-05, + "loss": 2.5624, + "step": 27934 + }, + { + "epoch": 0.8283663968211606, + "grad_norm": 0.0875137597322464, + "learning_rate": 7.237312663019657e-05, + "loss": 2.5245, + "step": 27935 + }, + { + "epoch": 0.8283960501734721, + "grad_norm": 0.08495903760194778, + "learning_rate": 7.234874627130584e-05, + "loss": 2.5448, + "step": 27936 + }, + { + "epoch": 0.8284257035257836, + "grad_norm": 0.08932528644800186, + "learning_rate": 7.232436969934036e-05, + "loss": 2.546, + "step": 27937 + }, + { + "epoch": 0.828455356878095, + "grad_norm": 0.0892152488231659, + "learning_rate": 7.229999691451594e-05, + "loss": 2.5247, + "step": 27938 + }, + { + "epoch": 0.8284850102304065, + "grad_norm": 0.08449256420135498, + "learning_rate": 7.227562791704862e-05, + "loss": 2.5817, + "step": 27939 + }, + { + "epoch": 0.8285146635827181, + "grad_norm": 0.09180334955453873, + "learning_rate": 7.225126270715393e-05, + "loss": 2.5232, + "step": 27940 + }, + { + "epoch": 0.8285443169350295, + "grad_norm": 0.092736154794693, + "learning_rate": 7.222690128504777e-05, + "loss": 2.5492, + "step": 27941 + }, + { + "epoch": 0.828573970287341, + "grad_norm": 0.08822590112686157, + "learning_rate": 7.220254365094575e-05, + "loss": 2.538, + "step": 27942 + }, + { + "epoch": 0.8286036236396525, + "grad_norm": 0.09934534132480621, + "learning_rate": 7.217818980506369e-05, + "loss": 2.5682, + "step": 27943 + }, + { + "epoch": 0.828633276991964, + "grad_norm": 0.08094330877065659, + "learning_rate": 7.215383974761719e-05, + "loss": 2.5461, + "step": 27944 + }, + { + "epoch": 0.8286629303442754, + "grad_norm": 0.0964893028140068, + "learning_rate": 7.212949347882187e-05, + "loss": 2.559, + "step": 27945 + }, + { + "epoch": 0.8286925836965869, + "grad_norm": 0.09472894668579102, + "learning_rate": 7.210515099889336e-05, + "loss": 2.5507, + "step": 27946 + }, + { + "epoch": 0.8287222370488984, + "grad_norm": 0.09189755469560623, + "learning_rate": 7.208081230804714e-05, + "loss": 2.5837, + "step": 27947 + }, + { + "epoch": 0.8287518904012099, + "grad_norm": 0.09321091324090958, + "learning_rate": 7.205647740649879e-05, + "loss": 2.5734, + "step": 27948 + }, + { + "epoch": 0.8287815437535213, + "grad_norm": 0.08455263823270798, + "learning_rate": 7.203214629446381e-05, + "loss": 2.5622, + "step": 27949 + }, + { + "epoch": 0.8288111971058328, + "grad_norm": 0.10212162882089615, + "learning_rate": 7.200781897215763e-05, + "loss": 2.5458, + "step": 27950 + }, + { + "epoch": 0.8288408504581443, + "grad_norm": 0.08152973651885986, + "learning_rate": 7.198349543979565e-05, + "loss": 2.5435, + "step": 27951 + }, + { + "epoch": 0.8288705038104558, + "grad_norm": 0.0929327979683876, + "learning_rate": 7.195917569759331e-05, + "loss": 2.575, + "step": 27952 + }, + { + "epoch": 0.8289001571627672, + "grad_norm": 0.0825035572052002, + "learning_rate": 7.193485974576592e-05, + "loss": 2.5258, + "step": 27953 + }, + { + "epoch": 0.8289298105150787, + "grad_norm": 0.09231219440698624, + "learning_rate": 7.191054758452886e-05, + "loss": 2.522, + "step": 27954 + }, + { + "epoch": 0.8289594638673902, + "grad_norm": 0.09518544375896454, + "learning_rate": 7.188623921409731e-05, + "loss": 2.5481, + "step": 27955 + }, + { + "epoch": 0.8289891172197017, + "grad_norm": 0.09090188890695572, + "learning_rate": 7.186193463468666e-05, + "loss": 2.5731, + "step": 27956 + }, + { + "epoch": 0.8290187705720131, + "grad_norm": 0.09610210359096527, + "learning_rate": 7.183763384651204e-05, + "loss": 2.5903, + "step": 27957 + }, + { + "epoch": 0.8290484239243247, + "grad_norm": 0.0780160129070282, + "learning_rate": 7.181333684978869e-05, + "loss": 2.5662, + "step": 27958 + }, + { + "epoch": 0.8290780772766361, + "grad_norm": 0.09700217843055725, + "learning_rate": 7.178904364473176e-05, + "loss": 2.5674, + "step": 27959 + }, + { + "epoch": 0.8291077306289476, + "grad_norm": 0.08435401320457458, + "learning_rate": 7.176475423155632e-05, + "loss": 2.5533, + "step": 27960 + }, + { + "epoch": 0.8291373839812591, + "grad_norm": 0.08415143191814423, + "learning_rate": 7.174046861047745e-05, + "loss": 2.5621, + "step": 27961 + }, + { + "epoch": 0.8291670373335706, + "grad_norm": 0.0949651300907135, + "learning_rate": 7.171618678171027e-05, + "loss": 2.5564, + "step": 27962 + }, + { + "epoch": 0.8291966906858821, + "grad_norm": 0.08543223887681961, + "learning_rate": 7.169190874546988e-05, + "loss": 2.5685, + "step": 27963 + }, + { + "epoch": 0.8292263440381935, + "grad_norm": 0.0906984880566597, + "learning_rate": 7.166763450197095e-05, + "loss": 2.5155, + "step": 27964 + }, + { + "epoch": 0.829255997390505, + "grad_norm": 0.09641718119382858, + "learning_rate": 7.164336405142874e-05, + "loss": 2.5421, + "step": 27965 + }, + { + "epoch": 0.8292856507428165, + "grad_norm": 0.07883605360984802, + "learning_rate": 7.161909739405809e-05, + "loss": 2.5004, + "step": 27966 + }, + { + "epoch": 0.829315304095128, + "grad_norm": 0.09302396327257156, + "learning_rate": 7.159483453007382e-05, + "loss": 2.5656, + "step": 27967 + }, + { + "epoch": 0.8293449574474394, + "grad_norm": 0.09220528602600098, + "learning_rate": 7.157057545969087e-05, + "loss": 2.5577, + "step": 27968 + }, + { + "epoch": 0.8293746107997509, + "grad_norm": 0.07729524374008179, + "learning_rate": 7.154632018312396e-05, + "loss": 2.5223, + "step": 27969 + }, + { + "epoch": 0.8294042641520624, + "grad_norm": 0.08755458891391754, + "learning_rate": 7.152206870058797e-05, + "loss": 2.5777, + "step": 27970 + }, + { + "epoch": 0.8294339175043739, + "grad_norm": 0.08001513034105301, + "learning_rate": 7.149782101229757e-05, + "loss": 2.5552, + "step": 27971 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 0.08177512139081955, + "learning_rate": 7.147357711846758e-05, + "loss": 2.5867, + "step": 27972 + }, + { + "epoch": 0.8294932242089968, + "grad_norm": 0.08933252096176147, + "learning_rate": 7.144933701931255e-05, + "loss": 2.5689, + "step": 27973 + }, + { + "epoch": 0.8295228775613083, + "grad_norm": 0.08932346105575562, + "learning_rate": 7.142510071504737e-05, + "loss": 2.5531, + "step": 27974 + }, + { + "epoch": 0.8295525309136198, + "grad_norm": 0.08585971593856812, + "learning_rate": 7.140086820588632e-05, + "loss": 2.5396, + "step": 27975 + }, + { + "epoch": 0.8295821842659312, + "grad_norm": 0.08149078488349915, + "learning_rate": 7.13766394920442e-05, + "loss": 2.5378, + "step": 27976 + }, + { + "epoch": 0.8296118376182428, + "grad_norm": 0.08821490406990051, + "learning_rate": 7.135241457373537e-05, + "loss": 2.5432, + "step": 27977 + }, + { + "epoch": 0.8296414909705542, + "grad_norm": 0.07807689905166626, + "learning_rate": 7.132819345117459e-05, + "loss": 2.5721, + "step": 27978 + }, + { + "epoch": 0.8296711443228657, + "grad_norm": 0.08810776472091675, + "learning_rate": 7.130397612457629e-05, + "loss": 2.537, + "step": 27979 + }, + { + "epoch": 0.8297007976751771, + "grad_norm": 0.07837703078985214, + "learning_rate": 7.127976259415481e-05, + "loss": 2.5509, + "step": 27980 + }, + { + "epoch": 0.8297304510274887, + "grad_norm": 0.08611667156219482, + "learning_rate": 7.125555286012465e-05, + "loss": 2.5578, + "step": 27981 + }, + { + "epoch": 0.8297601043798002, + "grad_norm": 0.08516906946897507, + "learning_rate": 7.123134692270012e-05, + "loss": 2.5466, + "step": 27982 + }, + { + "epoch": 0.8297897577321116, + "grad_norm": 0.07932628691196442, + "learning_rate": 7.120714478209567e-05, + "loss": 2.5464, + "step": 27983 + }, + { + "epoch": 0.8298194110844231, + "grad_norm": 0.08314052224159241, + "learning_rate": 7.118294643852562e-05, + "loss": 2.5706, + "step": 27984 + }, + { + "epoch": 0.8298490644367346, + "grad_norm": 0.0800475925207138, + "learning_rate": 7.115875189220411e-05, + "loss": 2.5434, + "step": 27985 + }, + { + "epoch": 0.8298787177890461, + "grad_norm": 0.07996980845928192, + "learning_rate": 7.113456114334543e-05, + "loss": 2.5333, + "step": 27986 + }, + { + "epoch": 0.8299083711413575, + "grad_norm": 0.08008802682161331, + "learning_rate": 7.111037419216382e-05, + "loss": 2.5492, + "step": 27987 + }, + { + "epoch": 0.829938024493669, + "grad_norm": 0.07891040295362473, + "learning_rate": 7.108619103887349e-05, + "loss": 2.5394, + "step": 27988 + }, + { + "epoch": 0.8299676778459805, + "grad_norm": 0.08121553808450699, + "learning_rate": 7.106201168368858e-05, + "loss": 2.5732, + "step": 27989 + }, + { + "epoch": 0.829997331198292, + "grad_norm": 0.07872901856899261, + "learning_rate": 7.103783612682302e-05, + "loss": 2.5441, + "step": 27990 + }, + { + "epoch": 0.8300269845506034, + "grad_norm": 0.07694754749536514, + "learning_rate": 7.101366436849122e-05, + "loss": 2.5641, + "step": 27991 + }, + { + "epoch": 0.830056637902915, + "grad_norm": 0.0774960070848465, + "learning_rate": 7.0989496408907e-05, + "loss": 2.5125, + "step": 27992 + }, + { + "epoch": 0.8300862912552264, + "grad_norm": 0.07973120361566544, + "learning_rate": 7.096533224828444e-05, + "loss": 2.5989, + "step": 27993 + }, + { + "epoch": 0.8301159446075379, + "grad_norm": 0.0782187208533287, + "learning_rate": 7.094117188683752e-05, + "loss": 2.5616, + "step": 27994 + }, + { + "epoch": 0.8301455979598493, + "grad_norm": 0.08357774466276169, + "learning_rate": 7.091701532478029e-05, + "loss": 2.5658, + "step": 27995 + }, + { + "epoch": 0.8301752513121609, + "grad_norm": 0.08118720352649689, + "learning_rate": 7.089286256232641e-05, + "loss": 2.5074, + "step": 27996 + }, + { + "epoch": 0.8302049046644723, + "grad_norm": 0.07563512027263641, + "learning_rate": 7.086871359968988e-05, + "loss": 2.5543, + "step": 27997 + }, + { + "epoch": 0.8302345580167838, + "grad_norm": 0.07725822180509567, + "learning_rate": 7.08445684370846e-05, + "loss": 2.5746, + "step": 27998 + }, + { + "epoch": 0.8302642113690952, + "grad_norm": 0.07775796204805374, + "learning_rate": 7.08204270747243e-05, + "loss": 2.5208, + "step": 27999 + }, + { + "epoch": 0.8302938647214068, + "grad_norm": 0.07520225644111633, + "learning_rate": 7.079628951282274e-05, + "loss": 2.5425, + "step": 28000 + }, + { + "epoch": 0.8303235180737182, + "grad_norm": 0.07619307935237885, + "learning_rate": 7.07721557515938e-05, + "loss": 2.5334, + "step": 28001 + }, + { + "epoch": 0.8303531714260297, + "grad_norm": 0.07363628596067429, + "learning_rate": 7.0748025791251e-05, + "loss": 2.5537, + "step": 28002 + }, + { + "epoch": 0.8303828247783412, + "grad_norm": 0.07864069938659668, + "learning_rate": 7.072389963200804e-05, + "loss": 2.5463, + "step": 28003 + }, + { + "epoch": 0.8304124781306527, + "grad_norm": 0.07578066736459732, + "learning_rate": 7.069977727407878e-05, + "loss": 2.5213, + "step": 28004 + }, + { + "epoch": 0.8304421314829642, + "grad_norm": 0.08316849917173386, + "learning_rate": 7.067565871767673e-05, + "loss": 2.5475, + "step": 28005 + }, + { + "epoch": 0.8304717848352756, + "grad_norm": 0.07970891147851944, + "learning_rate": 7.065154396301538e-05, + "loss": 2.5534, + "step": 28006 + }, + { + "epoch": 0.8305014381875871, + "grad_norm": 0.0824662297964096, + "learning_rate": 7.062743301030822e-05, + "loss": 2.5521, + "step": 28007 + }, + { + "epoch": 0.8305310915398986, + "grad_norm": 0.08627897500991821, + "learning_rate": 7.060332585976892e-05, + "loss": 2.561, + "step": 28008 + }, + { + "epoch": 0.8305607448922101, + "grad_norm": 0.07761795073747635, + "learning_rate": 7.057922251161081e-05, + "loss": 2.5192, + "step": 28009 + }, + { + "epoch": 0.8305903982445215, + "grad_norm": 0.0842847153544426, + "learning_rate": 7.055512296604744e-05, + "loss": 2.5428, + "step": 28010 + }, + { + "epoch": 0.830620051596833, + "grad_norm": 0.08379825949668884, + "learning_rate": 7.053102722329214e-05, + "loss": 2.5628, + "step": 28011 + }, + { + "epoch": 0.8306497049491445, + "grad_norm": 0.08049886673688889, + "learning_rate": 7.050693528355834e-05, + "loss": 2.5695, + "step": 28012 + }, + { + "epoch": 0.830679358301456, + "grad_norm": 0.07655824720859528, + "learning_rate": 7.048284714705932e-05, + "loss": 2.5594, + "step": 28013 + }, + { + "epoch": 0.8307090116537674, + "grad_norm": 0.07909002155065536, + "learning_rate": 7.045876281400842e-05, + "loss": 2.534, + "step": 28014 + }, + { + "epoch": 0.830738665006079, + "grad_norm": 0.0777243822813034, + "learning_rate": 7.043468228461891e-05, + "loss": 2.5517, + "step": 28015 + }, + { + "epoch": 0.8307683183583904, + "grad_norm": 0.08355670422315598, + "learning_rate": 7.0410605559104e-05, + "loss": 2.5669, + "step": 28016 + }, + { + "epoch": 0.8307979717107019, + "grad_norm": 0.08301018178462982, + "learning_rate": 7.038653263767697e-05, + "loss": 2.5808, + "step": 28017 + }, + { + "epoch": 0.8308276250630133, + "grad_norm": 0.0804351344704628, + "learning_rate": 7.036246352055092e-05, + "loss": 2.5476, + "step": 28018 + }, + { + "epoch": 0.8308572784153249, + "grad_norm": 0.07997539639472961, + "learning_rate": 7.033839820793897e-05, + "loss": 2.5781, + "step": 28019 + }, + { + "epoch": 0.8308869317676363, + "grad_norm": 0.08261138200759888, + "learning_rate": 7.031433670005428e-05, + "loss": 2.5657, + "step": 28020 + }, + { + "epoch": 0.8309165851199478, + "grad_norm": 0.08984733372926712, + "learning_rate": 7.029027899710989e-05, + "loss": 2.5424, + "step": 28021 + }, + { + "epoch": 0.8309462384722592, + "grad_norm": 0.08155360072851181, + "learning_rate": 7.026622509931879e-05, + "loss": 2.5395, + "step": 28022 + }, + { + "epoch": 0.8309758918245708, + "grad_norm": 0.08885027468204498, + "learning_rate": 7.024217500689412e-05, + "loss": 2.534, + "step": 28023 + }, + { + "epoch": 0.8310055451768823, + "grad_norm": 0.08350168168544769, + "learning_rate": 7.021812872004868e-05, + "loss": 2.5994, + "step": 28024 + }, + { + "epoch": 0.8310351985291937, + "grad_norm": 0.09043698757886887, + "learning_rate": 7.019408623899553e-05, + "loss": 2.5724, + "step": 28025 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 0.08540742099285126, + "learning_rate": 7.017004756394746e-05, + "loss": 2.5485, + "step": 28026 + }, + { + "epoch": 0.8310945052338167, + "grad_norm": 0.08279763162136078, + "learning_rate": 7.014601269511745e-05, + "loss": 2.5549, + "step": 28027 + }, + { + "epoch": 0.8311241585861282, + "grad_norm": 0.08410350233316422, + "learning_rate": 7.012198163271827e-05, + "loss": 2.5393, + "step": 28028 + }, + { + "epoch": 0.8311538119384396, + "grad_norm": 0.09246283769607544, + "learning_rate": 7.009795437696276e-05, + "loss": 2.5698, + "step": 28029 + }, + { + "epoch": 0.8311834652907512, + "grad_norm": 0.09290248155593872, + "learning_rate": 7.007393092806363e-05, + "loss": 2.5373, + "step": 28030 + }, + { + "epoch": 0.8312131186430626, + "grad_norm": 0.0834922045469284, + "learning_rate": 7.004991128623361e-05, + "loss": 2.5317, + "step": 28031 + }, + { + "epoch": 0.8312427719953741, + "grad_norm": 0.09334271401166916, + "learning_rate": 7.002589545168548e-05, + "loss": 2.5487, + "step": 28032 + }, + { + "epoch": 0.8312724253476855, + "grad_norm": 0.08268251270055771, + "learning_rate": 7.000188342463182e-05, + "loss": 2.5379, + "step": 28033 + }, + { + "epoch": 0.8313020786999971, + "grad_norm": 0.08363005518913269, + "learning_rate": 6.997787520528526e-05, + "loss": 2.5242, + "step": 28034 + }, + { + "epoch": 0.8313317320523085, + "grad_norm": 0.08509977161884308, + "learning_rate": 6.995387079385845e-05, + "loss": 2.5401, + "step": 28035 + }, + { + "epoch": 0.83136138540462, + "grad_norm": 0.08178696781396866, + "learning_rate": 6.992987019056396e-05, + "loss": 2.5901, + "step": 28036 + }, + { + "epoch": 0.8313910387569314, + "grad_norm": 0.0838160291314125, + "learning_rate": 6.990587339561427e-05, + "loss": 2.5498, + "step": 28037 + }, + { + "epoch": 0.831420692109243, + "grad_norm": 0.08017481118440628, + "learning_rate": 6.988188040922189e-05, + "loss": 2.5515, + "step": 28038 + }, + { + "epoch": 0.8314503454615544, + "grad_norm": 0.08378361165523529, + "learning_rate": 6.985789123159942e-05, + "loss": 2.5647, + "step": 28039 + }, + { + "epoch": 0.8314799988138659, + "grad_norm": 0.08602892607450485, + "learning_rate": 6.983390586295902e-05, + "loss": 2.5559, + "step": 28040 + }, + { + "epoch": 0.8315096521661773, + "grad_norm": 0.0832146629691124, + "learning_rate": 6.980992430351324e-05, + "loss": 2.5197, + "step": 28041 + }, + { + "epoch": 0.8315393055184889, + "grad_norm": 0.08218935132026672, + "learning_rate": 6.978594655347426e-05, + "loss": 2.5271, + "step": 28042 + }, + { + "epoch": 0.8315689588708003, + "grad_norm": 0.0857343003153801, + "learning_rate": 6.976197261305472e-05, + "loss": 2.5391, + "step": 28043 + }, + { + "epoch": 0.8315986122231118, + "grad_norm": 0.07915220409631729, + "learning_rate": 6.973800248246676e-05, + "loss": 2.5714, + "step": 28044 + }, + { + "epoch": 0.8316282655754234, + "grad_norm": 0.08436058461666107, + "learning_rate": 6.971403616192262e-05, + "loss": 2.5578, + "step": 28045 + }, + { + "epoch": 0.8316579189277348, + "grad_norm": 0.08902484178543091, + "learning_rate": 6.96900736516346e-05, + "loss": 2.5668, + "step": 28046 + }, + { + "epoch": 0.8316875722800463, + "grad_norm": 0.08321389555931091, + "learning_rate": 6.96661149518148e-05, + "loss": 2.5604, + "step": 28047 + }, + { + "epoch": 0.8317172256323577, + "grad_norm": 0.07999390363693237, + "learning_rate": 6.964216006267543e-05, + "loss": 2.5702, + "step": 28048 + }, + { + "epoch": 0.8317468789846693, + "grad_norm": 0.0794108435511589, + "learning_rate": 6.961820898442861e-05, + "loss": 2.5433, + "step": 28049 + }, + { + "epoch": 0.8317765323369807, + "grad_norm": 0.08205892890691757, + "learning_rate": 6.95942617172865e-05, + "loss": 2.5598, + "step": 28050 + }, + { + "epoch": 0.8318061856892922, + "grad_norm": 0.08479300141334534, + "learning_rate": 6.957031826146098e-05, + "loss": 2.5549, + "step": 28051 + }, + { + "epoch": 0.8318358390416036, + "grad_norm": 0.07748722285032272, + "learning_rate": 6.954637861716423e-05, + "loss": 2.5275, + "step": 28052 + }, + { + "epoch": 0.8318654923939152, + "grad_norm": 0.0847855806350708, + "learning_rate": 6.952244278460812e-05, + "loss": 2.5331, + "step": 28053 + }, + { + "epoch": 0.8318951457462266, + "grad_norm": 0.0799374058842659, + "learning_rate": 6.94985107640047e-05, + "loss": 2.5656, + "step": 28054 + }, + { + "epoch": 0.8319247990985381, + "grad_norm": 0.08527809381484985, + "learning_rate": 6.947458255556576e-05, + "loss": 2.5486, + "step": 28055 + }, + { + "epoch": 0.8319544524508495, + "grad_norm": 0.08647970855236053, + "learning_rate": 6.945065815950336e-05, + "loss": 2.5604, + "step": 28056 + }, + { + "epoch": 0.8319841058031611, + "grad_norm": 0.08281126618385315, + "learning_rate": 6.94267375760293e-05, + "loss": 2.5392, + "step": 28057 + }, + { + "epoch": 0.8320137591554725, + "grad_norm": 0.0794491171836853, + "learning_rate": 6.940282080535543e-05, + "loss": 2.5741, + "step": 28058 + }, + { + "epoch": 0.832043412507784, + "grad_norm": 0.08231189101934433, + "learning_rate": 6.937890784769341e-05, + "loss": 2.5565, + "step": 28059 + }, + { + "epoch": 0.8320730658600954, + "grad_norm": 0.09191194176673889, + "learning_rate": 6.935499870325524e-05, + "loss": 2.5465, + "step": 28060 + }, + { + "epoch": 0.832102719212407, + "grad_norm": 0.07789599895477295, + "learning_rate": 6.933109337225236e-05, + "loss": 2.5543, + "step": 28061 + }, + { + "epoch": 0.8321323725647184, + "grad_norm": 0.07998525351285934, + "learning_rate": 6.930719185489659e-05, + "loss": 2.5264, + "step": 28062 + }, + { + "epoch": 0.8321620259170299, + "grad_norm": 0.08686003088951111, + "learning_rate": 6.928329415139956e-05, + "loss": 2.5661, + "step": 28063 + }, + { + "epoch": 0.8321916792693413, + "grad_norm": 0.09148234128952026, + "learning_rate": 6.925940026197287e-05, + "loss": 2.5472, + "step": 28064 + }, + { + "epoch": 0.8322213326216529, + "grad_norm": 0.08000414073467255, + "learning_rate": 6.923551018682811e-05, + "loss": 2.5424, + "step": 28065 + }, + { + "epoch": 0.8322509859739644, + "grad_norm": 0.08984559774398804, + "learning_rate": 6.921162392617686e-05, + "loss": 2.5622, + "step": 28066 + }, + { + "epoch": 0.8322806393262758, + "grad_norm": 0.08298852294683456, + "learning_rate": 6.918774148023066e-05, + "loss": 2.5561, + "step": 28067 + }, + { + "epoch": 0.8323102926785874, + "grad_norm": 0.08057206124067307, + "learning_rate": 6.916386284920078e-05, + "loss": 2.5339, + "step": 28068 + }, + { + "epoch": 0.8323399460308988, + "grad_norm": 0.09075082838535309, + "learning_rate": 6.913998803329902e-05, + "loss": 2.565, + "step": 28069 + }, + { + "epoch": 0.8323695993832103, + "grad_norm": 0.08125279098749161, + "learning_rate": 6.911611703273663e-05, + "loss": 2.5424, + "step": 28070 + }, + { + "epoch": 0.8323992527355217, + "grad_norm": 0.07720007002353668, + "learning_rate": 6.909224984772505e-05, + "loss": 2.5423, + "step": 28071 + }, + { + "epoch": 0.8324289060878333, + "grad_norm": 0.08957362174987793, + "learning_rate": 6.906838647847547e-05, + "loss": 2.5551, + "step": 28072 + }, + { + "epoch": 0.8324585594401447, + "grad_norm": 0.07694463431835175, + "learning_rate": 6.904452692519925e-05, + "loss": 2.5664, + "step": 28073 + }, + { + "epoch": 0.8324882127924562, + "grad_norm": 0.09163416177034378, + "learning_rate": 6.902067118810779e-05, + "loss": 2.5358, + "step": 28074 + }, + { + "epoch": 0.8325178661447676, + "grad_norm": 0.0834541767835617, + "learning_rate": 6.899681926741219e-05, + "loss": 2.5263, + "step": 28075 + }, + { + "epoch": 0.8325475194970792, + "grad_norm": 0.08251503109931946, + "learning_rate": 6.89729711633238e-05, + "loss": 2.5791, + "step": 28076 + }, + { + "epoch": 0.8325771728493906, + "grad_norm": 0.08137653768062592, + "learning_rate": 6.89491268760537e-05, + "loss": 2.5184, + "step": 28077 + }, + { + "epoch": 0.8326068262017021, + "grad_norm": 0.08177658170461655, + "learning_rate": 6.892528640581308e-05, + "loss": 2.5416, + "step": 28078 + }, + { + "epoch": 0.8326364795540135, + "grad_norm": 0.0886300802230835, + "learning_rate": 6.890144975281305e-05, + "loss": 2.5557, + "step": 28079 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 0.08312626928091049, + "learning_rate": 6.887761691726468e-05, + "loss": 2.5628, + "step": 28080 + }, + { + "epoch": 0.8326957862586365, + "grad_norm": 0.08068151026964188, + "learning_rate": 6.885378789937901e-05, + "loss": 2.5719, + "step": 28081 + }, + { + "epoch": 0.832725439610948, + "grad_norm": 0.0805879607796669, + "learning_rate": 6.882996269936703e-05, + "loss": 2.5609, + "step": 28082 + }, + { + "epoch": 0.8327550929632594, + "grad_norm": 0.08512561023235321, + "learning_rate": 6.880614131743978e-05, + "loss": 2.5327, + "step": 28083 + }, + { + "epoch": 0.832784746315571, + "grad_norm": 0.08186011761426926, + "learning_rate": 6.878232375380817e-05, + "loss": 2.5316, + "step": 28084 + }, + { + "epoch": 0.8328143996678824, + "grad_norm": 0.08228462189435959, + "learning_rate": 6.875851000868305e-05, + "loss": 2.5518, + "step": 28085 + }, + { + "epoch": 0.8328440530201939, + "grad_norm": 0.08431176096200943, + "learning_rate": 6.873470008227539e-05, + "loss": 2.5527, + "step": 28086 + }, + { + "epoch": 0.8328737063725055, + "grad_norm": 0.08710230141878128, + "learning_rate": 6.871089397479596e-05, + "loss": 2.5689, + "step": 28087 + }, + { + "epoch": 0.8329033597248169, + "grad_norm": 0.08142029494047165, + "learning_rate": 6.868709168645559e-05, + "loss": 2.5742, + "step": 28088 + }, + { + "epoch": 0.8329330130771284, + "grad_norm": 0.08555218577384949, + "learning_rate": 6.866329321746505e-05, + "loss": 2.5385, + "step": 28089 + }, + { + "epoch": 0.8329626664294398, + "grad_norm": 0.08428510278463364, + "learning_rate": 6.863949856803509e-05, + "loss": 2.5183, + "step": 28090 + }, + { + "epoch": 0.8329923197817514, + "grad_norm": 0.08235716819763184, + "learning_rate": 6.861570773837644e-05, + "loss": 2.5237, + "step": 28091 + }, + { + "epoch": 0.8330219731340628, + "grad_norm": 0.0827028751373291, + "learning_rate": 6.859192072869974e-05, + "loss": 2.5753, + "step": 28092 + }, + { + "epoch": 0.8330516264863743, + "grad_norm": 0.091961570084095, + "learning_rate": 6.85681375392156e-05, + "loss": 2.5413, + "step": 28093 + }, + { + "epoch": 0.8330812798386857, + "grad_norm": 0.0803006961941719, + "learning_rate": 6.854435817013472e-05, + "loss": 2.5382, + "step": 28094 + }, + { + "epoch": 0.8331109331909973, + "grad_norm": 0.08736208826303482, + "learning_rate": 6.852058262166755e-05, + "loss": 2.5585, + "step": 28095 + }, + { + "epoch": 0.8331405865433087, + "grad_norm": 0.07386403530836105, + "learning_rate": 6.849681089402471e-05, + "loss": 2.5807, + "step": 28096 + }, + { + "epoch": 0.8331702398956202, + "grad_norm": 0.08361860364675522, + "learning_rate": 6.847304298741664e-05, + "loss": 2.5459, + "step": 28097 + }, + { + "epoch": 0.8331998932479316, + "grad_norm": 0.0905470997095108, + "learning_rate": 6.844927890205389e-05, + "loss": 2.5116, + "step": 28098 + }, + { + "epoch": 0.8332295466002432, + "grad_norm": 0.08082076907157898, + "learning_rate": 6.842551863814678e-05, + "loss": 2.5562, + "step": 28099 + }, + { + "epoch": 0.8332591999525546, + "grad_norm": 0.08579805493354797, + "learning_rate": 6.840176219590582e-05, + "loss": 2.537, + "step": 28100 + }, + { + "epoch": 0.8332888533048661, + "grad_norm": 0.08360287547111511, + "learning_rate": 6.837800957554136e-05, + "loss": 2.5547, + "step": 28101 + }, + { + "epoch": 0.8333185066571775, + "grad_norm": 0.07787200063467026, + "learning_rate": 6.83542607772637e-05, + "loss": 2.5722, + "step": 28102 + }, + { + "epoch": 0.8333481600094891, + "grad_norm": 0.09237837046384811, + "learning_rate": 6.833051580128319e-05, + "loss": 2.5243, + "step": 28103 + }, + { + "epoch": 0.8333778133618005, + "grad_norm": 0.08059361577033997, + "learning_rate": 6.830677464780999e-05, + "loss": 2.5323, + "step": 28104 + }, + { + "epoch": 0.833407466714112, + "grad_norm": 0.07944497466087341, + "learning_rate": 6.828303731705454e-05, + "loss": 2.5533, + "step": 28105 + }, + { + "epoch": 0.8334371200664236, + "grad_norm": 0.09639180451631546, + "learning_rate": 6.825930380922668e-05, + "loss": 2.5422, + "step": 28106 + }, + { + "epoch": 0.833466773418735, + "grad_norm": 0.08704377710819244, + "learning_rate": 6.82355741245369e-05, + "loss": 2.5239, + "step": 28107 + }, + { + "epoch": 0.8334964267710465, + "grad_norm": 0.0863032191991806, + "learning_rate": 6.821184826319521e-05, + "loss": 2.5459, + "step": 28108 + }, + { + "epoch": 0.8335260801233579, + "grad_norm": 0.09535456448793411, + "learning_rate": 6.818812622541176e-05, + "loss": 2.5403, + "step": 28109 + }, + { + "epoch": 0.8335557334756695, + "grad_norm": 0.09091828018426895, + "learning_rate": 6.816440801139657e-05, + "loss": 2.5291, + "step": 28110 + }, + { + "epoch": 0.8335853868279809, + "grad_norm": 0.08631984889507294, + "learning_rate": 6.81406936213596e-05, + "loss": 2.5645, + "step": 28111 + }, + { + "epoch": 0.8336150401802924, + "grad_norm": 0.09012258052825928, + "learning_rate": 6.811698305551095e-05, + "loss": 2.5712, + "step": 28112 + }, + { + "epoch": 0.8336446935326038, + "grad_norm": 0.08533181995153427, + "learning_rate": 6.80932763140606e-05, + "loss": 2.5753, + "step": 28113 + }, + { + "epoch": 0.8336743468849154, + "grad_norm": 0.08505380898714066, + "learning_rate": 6.806957339721837e-05, + "loss": 2.5402, + "step": 28114 + }, + { + "epoch": 0.8337040002372268, + "grad_norm": 0.08145426958799362, + "learning_rate": 6.804587430519433e-05, + "loss": 2.5677, + "step": 28115 + }, + { + "epoch": 0.8337336535895383, + "grad_norm": 0.08791518211364746, + "learning_rate": 6.802217903819808e-05, + "loss": 2.547, + "step": 28116 + }, + { + "epoch": 0.8337633069418497, + "grad_norm": 0.07766323536634445, + "learning_rate": 6.799848759643962e-05, + "loss": 2.5704, + "step": 28117 + }, + { + "epoch": 0.8337929602941613, + "grad_norm": 0.08526305854320526, + "learning_rate": 6.797479998012867e-05, + "loss": 2.5805, + "step": 28118 + }, + { + "epoch": 0.8338226136464727, + "grad_norm": 0.08540001511573792, + "learning_rate": 6.795111618947497e-05, + "loss": 2.5303, + "step": 28119 + }, + { + "epoch": 0.8338522669987842, + "grad_norm": 0.08001808077096939, + "learning_rate": 6.792743622468833e-05, + "loss": 2.554, + "step": 28120 + }, + { + "epoch": 0.8338819203510957, + "grad_norm": 0.07784827798604965, + "learning_rate": 6.790376008597848e-05, + "loss": 2.5596, + "step": 28121 + }, + { + "epoch": 0.8339115737034072, + "grad_norm": 0.08909463882446289, + "learning_rate": 6.788008777355498e-05, + "loss": 2.5614, + "step": 28122 + }, + { + "epoch": 0.8339412270557186, + "grad_norm": 0.0866047814488411, + "learning_rate": 6.785641928762743e-05, + "loss": 2.5522, + "step": 28123 + }, + { + "epoch": 0.8339708804080301, + "grad_norm": 0.08406800776720047, + "learning_rate": 6.78327546284055e-05, + "loss": 2.5681, + "step": 28124 + }, + { + "epoch": 0.8340005337603416, + "grad_norm": 0.08475732058286667, + "learning_rate": 6.780909379609874e-05, + "loss": 2.522, + "step": 28125 + }, + { + "epoch": 0.8340301871126531, + "grad_norm": 0.10963514447212219, + "learning_rate": 6.77854367909167e-05, + "loss": 2.5539, + "step": 28126 + }, + { + "epoch": 0.8340598404649646, + "grad_norm": 0.08607769757509232, + "learning_rate": 6.776178361306872e-05, + "loss": 2.5508, + "step": 28127 + }, + { + "epoch": 0.834089493817276, + "grad_norm": 0.08423535525798798, + "learning_rate": 6.773813426276431e-05, + "loss": 2.5577, + "step": 28128 + }, + { + "epoch": 0.8341191471695876, + "grad_norm": 0.0854782685637474, + "learning_rate": 6.771448874021297e-05, + "loss": 2.5117, + "step": 28129 + }, + { + "epoch": 0.834148800521899, + "grad_norm": 0.08612974733114243, + "learning_rate": 6.7690847045624e-05, + "loss": 2.5395, + "step": 28130 + }, + { + "epoch": 0.8341784538742105, + "grad_norm": 0.07970238476991653, + "learning_rate": 6.766720917920677e-05, + "loss": 2.5563, + "step": 28131 + }, + { + "epoch": 0.8342081072265219, + "grad_norm": 0.09047529846429825, + "learning_rate": 6.764357514117053e-05, + "loss": 2.5519, + "step": 28132 + }, + { + "epoch": 0.8342377605788335, + "grad_norm": 0.08580932021141052, + "learning_rate": 6.761994493172474e-05, + "loss": 2.5701, + "step": 28133 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 0.09234164655208588, + "learning_rate": 6.759631855107856e-05, + "loss": 2.572, + "step": 28134 + }, + { + "epoch": 0.8342970672834564, + "grad_norm": 0.08500513434410095, + "learning_rate": 6.757269599944115e-05, + "loss": 2.5491, + "step": 28135 + }, + { + "epoch": 0.8343267206357678, + "grad_norm": 0.08479518443346024, + "learning_rate": 6.754907727702193e-05, + "loss": 2.5596, + "step": 28136 + }, + { + "epoch": 0.8343563739880794, + "grad_norm": 0.08824732154607773, + "learning_rate": 6.752546238402973e-05, + "loss": 2.5663, + "step": 28137 + }, + { + "epoch": 0.8343860273403908, + "grad_norm": 0.08845145255327225, + "learning_rate": 6.750185132067376e-05, + "loss": 2.5206, + "step": 28138 + }, + { + "epoch": 0.8344156806927023, + "grad_norm": 0.08524901419878006, + "learning_rate": 6.747824408716318e-05, + "loss": 2.5421, + "step": 28139 + }, + { + "epoch": 0.8344453340450138, + "grad_norm": 0.08233461529016495, + "learning_rate": 6.745464068370694e-05, + "loss": 2.5652, + "step": 28140 + }, + { + "epoch": 0.8344749873973253, + "grad_norm": 0.08051524311304092, + "learning_rate": 6.743104111051412e-05, + "loss": 2.5773, + "step": 28141 + }, + { + "epoch": 0.8345046407496367, + "grad_norm": 0.08478283137083054, + "learning_rate": 6.74074453677937e-05, + "loss": 2.546, + "step": 28142 + }, + { + "epoch": 0.8345342941019482, + "grad_norm": 0.07592121511697769, + "learning_rate": 6.73838534557546e-05, + "loss": 2.5751, + "step": 28143 + }, + { + "epoch": 0.8345639474542597, + "grad_norm": 0.08188773691654205, + "learning_rate": 6.736026537460577e-05, + "loss": 2.5347, + "step": 28144 + }, + { + "epoch": 0.8345936008065712, + "grad_norm": 0.0842200294137001, + "learning_rate": 6.733668112455588e-05, + "loss": 2.5524, + "step": 28145 + }, + { + "epoch": 0.8346232541588826, + "grad_norm": 0.07902850955724716, + "learning_rate": 6.731310070581409e-05, + "loss": 2.5322, + "step": 28146 + }, + { + "epoch": 0.8346529075111941, + "grad_norm": 0.0827416479587555, + "learning_rate": 6.728952411858913e-05, + "loss": 2.5151, + "step": 28147 + }, + { + "epoch": 0.8346825608635057, + "grad_norm": 0.08062174171209335, + "learning_rate": 6.726595136308967e-05, + "loss": 2.5612, + "step": 28148 + }, + { + "epoch": 0.8347122142158171, + "grad_norm": 0.08603044599294662, + "learning_rate": 6.72423824395244e-05, + "loss": 2.5603, + "step": 28149 + }, + { + "epoch": 0.8347418675681286, + "grad_norm": 0.08072637021541595, + "learning_rate": 6.72188173481022e-05, + "loss": 2.5412, + "step": 28150 + }, + { + "epoch": 0.83477152092044, + "grad_norm": 0.08151989430189133, + "learning_rate": 6.71952560890316e-05, + "loss": 2.5447, + "step": 28151 + }, + { + "epoch": 0.8348011742727516, + "grad_norm": 0.09518909454345703, + "learning_rate": 6.717169866252132e-05, + "loss": 2.5381, + "step": 28152 + }, + { + "epoch": 0.834830827625063, + "grad_norm": 0.07555902749300003, + "learning_rate": 6.714814506877992e-05, + "loss": 2.5611, + "step": 28153 + }, + { + "epoch": 0.8348604809773745, + "grad_norm": 0.08556210994720459, + "learning_rate": 6.712459530801601e-05, + "loss": 2.5352, + "step": 28154 + }, + { + "epoch": 0.834890134329686, + "grad_norm": 0.088331438601017, + "learning_rate": 6.710104938043815e-05, + "loss": 2.5317, + "step": 28155 + }, + { + "epoch": 0.8349197876819975, + "grad_norm": 0.07846089452505112, + "learning_rate": 6.707750728625472e-05, + "loss": 2.5552, + "step": 28156 + }, + { + "epoch": 0.8349494410343089, + "grad_norm": 0.08219438046216965, + "learning_rate": 6.705396902567434e-05, + "loss": 2.5479, + "step": 28157 + }, + { + "epoch": 0.8349790943866204, + "grad_norm": 0.08130879700183868, + "learning_rate": 6.703043459890534e-05, + "loss": 2.5713, + "step": 28158 + }, + { + "epoch": 0.8350087477389319, + "grad_norm": 0.0882597491145134, + "learning_rate": 6.700690400615622e-05, + "loss": 2.5491, + "step": 28159 + }, + { + "epoch": 0.8350384010912434, + "grad_norm": 0.08171690255403519, + "learning_rate": 6.698337724763521e-05, + "loss": 2.5553, + "step": 28160 + }, + { + "epoch": 0.8350680544435548, + "grad_norm": 0.08127214759588242, + "learning_rate": 6.695985432355078e-05, + "loss": 2.5453, + "step": 28161 + }, + { + "epoch": 0.8350977077958663, + "grad_norm": 0.0784970223903656, + "learning_rate": 6.693633523411114e-05, + "loss": 2.5644, + "step": 28162 + }, + { + "epoch": 0.8351273611481778, + "grad_norm": 0.08466978371143341, + "learning_rate": 6.691281997952459e-05, + "loss": 2.5544, + "step": 28163 + }, + { + "epoch": 0.8351570145004893, + "grad_norm": 0.08203576505184174, + "learning_rate": 6.688930855999936e-05, + "loss": 2.5209, + "step": 28164 + }, + { + "epoch": 0.8351866678528007, + "grad_norm": 0.08274821192026138, + "learning_rate": 6.686580097574363e-05, + "loss": 2.5196, + "step": 28165 + }, + { + "epoch": 0.8352163212051122, + "grad_norm": 0.08097146451473236, + "learning_rate": 6.684229722696561e-05, + "loss": 2.555, + "step": 28166 + }, + { + "epoch": 0.8352459745574237, + "grad_norm": 0.07927919179201126, + "learning_rate": 6.681879731387341e-05, + "loss": 2.5511, + "step": 28167 + }, + { + "epoch": 0.8352756279097352, + "grad_norm": 0.07736577093601227, + "learning_rate": 6.679530123667505e-05, + "loss": 2.568, + "step": 28168 + }, + { + "epoch": 0.8353052812620467, + "grad_norm": 0.08388395607471466, + "learning_rate": 6.677180899557873e-05, + "loss": 2.5685, + "step": 28169 + }, + { + "epoch": 0.8353349346143581, + "grad_norm": 0.07890678197145462, + "learning_rate": 6.674832059079244e-05, + "loss": 2.5323, + "step": 28170 + }, + { + "epoch": 0.8353645879666697, + "grad_norm": 0.0843963623046875, + "learning_rate": 6.672483602252399e-05, + "loss": 2.5606, + "step": 28171 + }, + { + "epoch": 0.8353942413189811, + "grad_norm": 0.078375443816185, + "learning_rate": 6.670135529098154e-05, + "loss": 2.5621, + "step": 28172 + }, + { + "epoch": 0.8354238946712926, + "grad_norm": 0.08825481683015823, + "learning_rate": 6.667787839637301e-05, + "loss": 2.5308, + "step": 28173 + }, + { + "epoch": 0.835453548023604, + "grad_norm": 0.0802115947008133, + "learning_rate": 6.665440533890621e-05, + "loss": 2.5588, + "step": 28174 + }, + { + "epoch": 0.8354832013759156, + "grad_norm": 0.07976716011762619, + "learning_rate": 6.663093611878907e-05, + "loss": 2.532, + "step": 28175 + }, + { + "epoch": 0.835512854728227, + "grad_norm": 0.07634499669075012, + "learning_rate": 6.66074707362293e-05, + "loss": 2.525, + "step": 28176 + }, + { + "epoch": 0.8355425080805385, + "grad_norm": 0.07790937274694443, + "learning_rate": 6.658400919143487e-05, + "loss": 2.5378, + "step": 28177 + }, + { + "epoch": 0.83557216143285, + "grad_norm": 0.07969872653484344, + "learning_rate": 6.656055148461337e-05, + "loss": 2.5376, + "step": 28178 + }, + { + "epoch": 0.8356018147851615, + "grad_norm": 0.07878591120243073, + "learning_rate": 6.65370976159726e-05, + "loss": 2.556, + "step": 28179 + }, + { + "epoch": 0.8356314681374729, + "grad_norm": 0.08178545534610748, + "learning_rate": 6.65136475857202e-05, + "loss": 2.5736, + "step": 28180 + }, + { + "epoch": 0.8356611214897844, + "grad_norm": 0.07763779163360596, + "learning_rate": 6.649020139406403e-05, + "loss": 2.5402, + "step": 28181 + }, + { + "epoch": 0.8356907748420959, + "grad_norm": 0.08250412344932556, + "learning_rate": 6.64667590412114e-05, + "loss": 2.5457, + "step": 28182 + }, + { + "epoch": 0.8357204281944074, + "grad_norm": 0.09184274822473526, + "learning_rate": 6.644332052737001e-05, + "loss": 2.5142, + "step": 28183 + }, + { + "epoch": 0.8357500815467188, + "grad_norm": 0.08185942471027374, + "learning_rate": 6.641988585274739e-05, + "loss": 2.5351, + "step": 28184 + }, + { + "epoch": 0.8357797348990303, + "grad_norm": 0.0811569094657898, + "learning_rate": 6.639645501755115e-05, + "loss": 2.5652, + "step": 28185 + }, + { + "epoch": 0.8358093882513418, + "grad_norm": 0.08299396932125092, + "learning_rate": 6.637302802198875e-05, + "loss": 2.5501, + "step": 28186 + }, + { + "epoch": 0.8358390416036533, + "grad_norm": 0.08122909069061279, + "learning_rate": 6.634960486626763e-05, + "loss": 2.5593, + "step": 28187 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 0.07891065627336502, + "learning_rate": 6.632618555059517e-05, + "loss": 2.552, + "step": 28188 + }, + { + "epoch": 0.8358983483082763, + "grad_norm": 0.07883331924676895, + "learning_rate": 6.630277007517876e-05, + "loss": 2.5589, + "step": 28189 + }, + { + "epoch": 0.8359280016605878, + "grad_norm": 0.07743699103593826, + "learning_rate": 6.627935844022582e-05, + "loss": 2.5559, + "step": 28190 + }, + { + "epoch": 0.8359576550128992, + "grad_norm": 0.07903831452131271, + "learning_rate": 6.62559506459437e-05, + "loss": 2.5289, + "step": 28191 + }, + { + "epoch": 0.8359873083652107, + "grad_norm": 0.07942944020032883, + "learning_rate": 6.623254669253948e-05, + "loss": 2.5466, + "step": 28192 + }, + { + "epoch": 0.8360169617175222, + "grad_norm": 0.07638352364301682, + "learning_rate": 6.620914658022048e-05, + "loss": 2.5478, + "step": 28193 + }, + { + "epoch": 0.8360466150698337, + "grad_norm": 0.08085820078849792, + "learning_rate": 6.6185750309194e-05, + "loss": 2.5829, + "step": 28194 + }, + { + "epoch": 0.8360762684221451, + "grad_norm": 0.07578164339065552, + "learning_rate": 6.616235787966713e-05, + "loss": 2.5251, + "step": 28195 + }, + { + "epoch": 0.8361059217744566, + "grad_norm": 0.08693811297416687, + "learning_rate": 6.613896929184705e-05, + "loss": 2.558, + "step": 28196 + }, + { + "epoch": 0.8361355751267681, + "grad_norm": 0.08345595747232437, + "learning_rate": 6.611558454594074e-05, + "loss": 2.5162, + "step": 28197 + }, + { + "epoch": 0.8361652284790796, + "grad_norm": 0.08439682424068451, + "learning_rate": 6.609220364215551e-05, + "loss": 2.5438, + "step": 28198 + }, + { + "epoch": 0.836194881831391, + "grad_norm": 0.0852520689368248, + "learning_rate": 6.606882658069834e-05, + "loss": 2.5684, + "step": 28199 + }, + { + "epoch": 0.8362245351837025, + "grad_norm": 0.08211573213338852, + "learning_rate": 6.60454533617762e-05, + "loss": 2.526, + "step": 28200 + }, + { + "epoch": 0.836254188536014, + "grad_norm": 0.08609890192747116, + "learning_rate": 6.602208398559601e-05, + "loss": 2.5111, + "step": 28201 + }, + { + "epoch": 0.8362838418883255, + "grad_norm": 0.0897536352276802, + "learning_rate": 6.599871845236488e-05, + "loss": 2.5594, + "step": 28202 + }, + { + "epoch": 0.8363134952406369, + "grad_norm": 0.08448707312345505, + "learning_rate": 6.59753567622895e-05, + "loss": 2.5776, + "step": 28203 + }, + { + "epoch": 0.8363431485929484, + "grad_norm": 0.08208032697439194, + "learning_rate": 6.595199891557685e-05, + "loss": 2.5716, + "step": 28204 + }, + { + "epoch": 0.8363728019452599, + "grad_norm": 0.09026756882667542, + "learning_rate": 6.592864491243373e-05, + "loss": 2.5542, + "step": 28205 + }, + { + "epoch": 0.8364024552975714, + "grad_norm": 0.08174452930688858, + "learning_rate": 6.590529475306695e-05, + "loss": 2.5556, + "step": 28206 + }, + { + "epoch": 0.8364321086498828, + "grad_norm": 0.08344515413045883, + "learning_rate": 6.58819484376833e-05, + "loss": 2.5281, + "step": 28207 + }, + { + "epoch": 0.8364617620021944, + "grad_norm": 0.08698134124279022, + "learning_rate": 6.585860596648952e-05, + "loss": 2.5205, + "step": 28208 + }, + { + "epoch": 0.8364914153545058, + "grad_norm": 0.07680737227201462, + "learning_rate": 6.58352673396923e-05, + "loss": 2.5203, + "step": 28209 + }, + { + "epoch": 0.8365210687068173, + "grad_norm": 0.08309978991746902, + "learning_rate": 6.581193255749823e-05, + "loss": 2.5446, + "step": 28210 + }, + { + "epoch": 0.8365507220591288, + "grad_norm": 0.08168736845254898, + "learning_rate": 6.578860162011413e-05, + "loss": 2.5489, + "step": 28211 + }, + { + "epoch": 0.8365803754114403, + "grad_norm": 0.08006376028060913, + "learning_rate": 6.576527452774656e-05, + "loss": 2.5551, + "step": 28212 + }, + { + "epoch": 0.8366100287637518, + "grad_norm": 0.08186715096235275, + "learning_rate": 6.574195128060196e-05, + "loss": 2.5504, + "step": 28213 + }, + { + "epoch": 0.8366396821160632, + "grad_norm": 0.08565355837345123, + "learning_rate": 6.571863187888688e-05, + "loss": 2.5626, + "step": 28214 + }, + { + "epoch": 0.8366693354683747, + "grad_norm": 0.08475169539451599, + "learning_rate": 6.56953163228079e-05, + "loss": 2.5755, + "step": 28215 + }, + { + "epoch": 0.8366989888206862, + "grad_norm": 0.09144461899995804, + "learning_rate": 6.567200461257145e-05, + "loss": 2.5743, + "step": 28216 + }, + { + "epoch": 0.8367286421729977, + "grad_norm": 0.07842621952295303, + "learning_rate": 6.564869674838386e-05, + "loss": 2.5357, + "step": 28217 + }, + { + "epoch": 0.8367582955253091, + "grad_norm": 0.08765462785959244, + "learning_rate": 6.562539273045171e-05, + "loss": 2.5108, + "step": 28218 + }, + { + "epoch": 0.8367879488776206, + "grad_norm": 0.08815152943134308, + "learning_rate": 6.560209255898126e-05, + "loss": 2.5356, + "step": 28219 + }, + { + "epoch": 0.8368176022299321, + "grad_norm": 0.08624693006277084, + "learning_rate": 6.557879623417878e-05, + "loss": 2.5636, + "step": 28220 + }, + { + "epoch": 0.8368472555822436, + "grad_norm": 0.09215929359197617, + "learning_rate": 6.55555037562507e-05, + "loss": 2.5502, + "step": 28221 + }, + { + "epoch": 0.836876908934555, + "grad_norm": 0.08039246499538422, + "learning_rate": 6.553221512540314e-05, + "loss": 2.5101, + "step": 28222 + }, + { + "epoch": 0.8369065622868666, + "grad_norm": 0.08482858538627625, + "learning_rate": 6.550893034184241e-05, + "loss": 2.5766, + "step": 28223 + }, + { + "epoch": 0.836936215639178, + "grad_norm": 0.09392338991165161, + "learning_rate": 6.548564940577467e-05, + "loss": 2.5412, + "step": 28224 + }, + { + "epoch": 0.8369658689914895, + "grad_norm": 0.0888427346944809, + "learning_rate": 6.546237231740614e-05, + "loss": 2.5641, + "step": 28225 + }, + { + "epoch": 0.8369955223438009, + "grad_norm": 0.0808548554778099, + "learning_rate": 6.543909907694284e-05, + "loss": 2.5966, + "step": 28226 + }, + { + "epoch": 0.8370251756961125, + "grad_norm": 0.08404754102230072, + "learning_rate": 6.541582968459092e-05, + "loss": 2.5506, + "step": 28227 + }, + { + "epoch": 0.8370548290484239, + "grad_norm": 0.08832374960184097, + "learning_rate": 6.539256414055644e-05, + "loss": 2.5438, + "step": 28228 + }, + { + "epoch": 0.8370844824007354, + "grad_norm": 0.07939404249191284, + "learning_rate": 6.536930244504541e-05, + "loss": 2.5418, + "step": 28229 + }, + { + "epoch": 0.8371141357530468, + "grad_norm": 0.08599812537431717, + "learning_rate": 6.534604459826377e-05, + "loss": 2.5721, + "step": 28230 + }, + { + "epoch": 0.8371437891053584, + "grad_norm": 0.09225832670927048, + "learning_rate": 6.532279060041757e-05, + "loss": 2.5644, + "step": 28231 + }, + { + "epoch": 0.8371734424576699, + "grad_norm": 0.08069887012243271, + "learning_rate": 6.52995404517126e-05, + "loss": 2.5252, + "step": 28232 + }, + { + "epoch": 0.8372030958099813, + "grad_norm": 0.08501877635717392, + "learning_rate": 6.527629415235486e-05, + "loss": 2.5317, + "step": 28233 + }, + { + "epoch": 0.8372327491622928, + "grad_norm": 0.08836743235588074, + "learning_rate": 6.52530517025502e-05, + "loss": 2.5485, + "step": 28234 + }, + { + "epoch": 0.8372624025146043, + "grad_norm": 0.08442992717027664, + "learning_rate": 6.522981310250431e-05, + "loss": 2.5721, + "step": 28235 + }, + { + "epoch": 0.8372920558669158, + "grad_norm": 0.08742942661046982, + "learning_rate": 6.520657835242311e-05, + "loss": 2.544, + "step": 28236 + }, + { + "epoch": 0.8373217092192272, + "grad_norm": 0.08017374575138092, + "learning_rate": 6.518334745251225e-05, + "loss": 2.5177, + "step": 28237 + }, + { + "epoch": 0.8373513625715387, + "grad_norm": 0.09229543805122375, + "learning_rate": 6.516012040297747e-05, + "loss": 2.538, + "step": 28238 + }, + { + "epoch": 0.8373810159238502, + "grad_norm": 0.07892946898937225, + "learning_rate": 6.513689720402449e-05, + "loss": 2.5646, + "step": 28239 + }, + { + "epoch": 0.8374106692761617, + "grad_norm": 0.08192646503448486, + "learning_rate": 6.511367785585893e-05, + "loss": 2.5804, + "step": 28240 + }, + { + "epoch": 0.8374403226284731, + "grad_norm": 0.08241148293018341, + "learning_rate": 6.509046235868637e-05, + "loss": 2.5059, + "step": 28241 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 0.08417729288339615, + "learning_rate": 6.506725071271246e-05, + "loss": 2.537, + "step": 28242 + }, + { + "epoch": 0.8374996293330961, + "grad_norm": 0.07756032794713974, + "learning_rate": 6.50440429181427e-05, + "loss": 2.5248, + "step": 28243 + }, + { + "epoch": 0.8375292826854076, + "grad_norm": 0.08358053117990494, + "learning_rate": 6.502083897518258e-05, + "loss": 2.5205, + "step": 28244 + }, + { + "epoch": 0.837558936037719, + "grad_norm": 0.08144988119602203, + "learning_rate": 6.49976388840376e-05, + "loss": 2.5366, + "step": 28245 + }, + { + "epoch": 0.8375885893900306, + "grad_norm": 0.08281505852937698, + "learning_rate": 6.49744426449132e-05, + "loss": 2.5042, + "step": 28246 + }, + { + "epoch": 0.837618242742342, + "grad_norm": 0.09040003269910812, + "learning_rate": 6.495125025801485e-05, + "loss": 2.5825, + "step": 28247 + }, + { + "epoch": 0.8376478960946535, + "grad_norm": 0.08339160680770874, + "learning_rate": 6.492806172354782e-05, + "loss": 2.5111, + "step": 28248 + }, + { + "epoch": 0.8376775494469649, + "grad_norm": 0.08831463754177094, + "learning_rate": 6.490487704171733e-05, + "loss": 2.5266, + "step": 28249 + }, + { + "epoch": 0.8377072027992765, + "grad_norm": 0.08064741641283035, + "learning_rate": 6.488169621272894e-05, + "loss": 2.5243, + "step": 28250 + }, + { + "epoch": 0.8377368561515879, + "grad_norm": 0.08491538465023041, + "learning_rate": 6.485851923678781e-05, + "loss": 2.5612, + "step": 28251 + }, + { + "epoch": 0.8377665095038994, + "grad_norm": 0.07811827957630157, + "learning_rate": 6.483534611409919e-05, + "loss": 2.5314, + "step": 28252 + }, + { + "epoch": 0.837796162856211, + "grad_norm": 0.08290720731019974, + "learning_rate": 6.481217684486829e-05, + "loss": 2.5269, + "step": 28253 + }, + { + "epoch": 0.8378258162085224, + "grad_norm": 0.08026041090488434, + "learning_rate": 6.478901142930027e-05, + "loss": 2.5166, + "step": 28254 + }, + { + "epoch": 0.8378554695608339, + "grad_norm": 0.08435192704200745, + "learning_rate": 6.47658498676002e-05, + "loss": 2.5203, + "step": 28255 + }, + { + "epoch": 0.8378851229131453, + "grad_norm": 0.08235233277082443, + "learning_rate": 6.474269215997331e-05, + "loss": 2.5281, + "step": 28256 + }, + { + "epoch": 0.8379147762654569, + "grad_norm": 0.08771882951259613, + "learning_rate": 6.471953830662463e-05, + "loss": 2.576, + "step": 28257 + }, + { + "epoch": 0.8379444296177683, + "grad_norm": 0.08486613631248474, + "learning_rate": 6.46963883077591e-05, + "loss": 2.5732, + "step": 28258 + }, + { + "epoch": 0.8379740829700798, + "grad_norm": 0.08474878966808319, + "learning_rate": 6.467324216358179e-05, + "loss": 2.5453, + "step": 28259 + }, + { + "epoch": 0.8380037363223912, + "grad_norm": 0.08403675258159637, + "learning_rate": 6.465009987429759e-05, + "loss": 2.5433, + "step": 28260 + }, + { + "epoch": 0.8380333896747028, + "grad_norm": 0.08293835818767548, + "learning_rate": 6.462696144011149e-05, + "loss": 2.5786, + "step": 28261 + }, + { + "epoch": 0.8380630430270142, + "grad_norm": 0.08221393823623657, + "learning_rate": 6.460382686122828e-05, + "loss": 2.5257, + "step": 28262 + }, + { + "epoch": 0.8380926963793257, + "grad_norm": 0.07951665669679642, + "learning_rate": 6.4580696137853e-05, + "loss": 2.5405, + "step": 28263 + }, + { + "epoch": 0.8381223497316371, + "grad_norm": 0.08045629411935806, + "learning_rate": 6.455756927019046e-05, + "loss": 2.5572, + "step": 28264 + }, + { + "epoch": 0.8381520030839487, + "grad_norm": 0.07919087260961533, + "learning_rate": 6.453444625844535e-05, + "loss": 2.5291, + "step": 28265 + }, + { + "epoch": 0.8381816564362601, + "grad_norm": 0.08482063561677933, + "learning_rate": 6.451132710282243e-05, + "loss": 2.5205, + "step": 28266 + }, + { + "epoch": 0.8382113097885716, + "grad_norm": 0.07975316047668457, + "learning_rate": 6.448821180352659e-05, + "loss": 2.5852, + "step": 28267 + }, + { + "epoch": 0.838240963140883, + "grad_norm": 0.08039490878582001, + "learning_rate": 6.44651003607623e-05, + "loss": 2.5423, + "step": 28268 + }, + { + "epoch": 0.8382706164931946, + "grad_norm": 0.08333488553762436, + "learning_rate": 6.444199277473428e-05, + "loss": 2.5455, + "step": 28269 + }, + { + "epoch": 0.838300269845506, + "grad_norm": 0.07870888710021973, + "learning_rate": 6.441888904564725e-05, + "loss": 2.5606, + "step": 28270 + }, + { + "epoch": 0.8383299231978175, + "grad_norm": 0.0802178606390953, + "learning_rate": 6.439578917370564e-05, + "loss": 2.5604, + "step": 28271 + }, + { + "epoch": 0.8383595765501289, + "grad_norm": 0.07681336998939514, + "learning_rate": 6.437269315911409e-05, + "loss": 2.5753, + "step": 28272 + }, + { + "epoch": 0.8383892299024405, + "grad_norm": 0.08295835554599762, + "learning_rate": 6.434960100207716e-05, + "loss": 2.548, + "step": 28273 + }, + { + "epoch": 0.838418883254752, + "grad_norm": 0.07896241545677185, + "learning_rate": 6.432651270279926e-05, + "loss": 2.5449, + "step": 28274 + }, + { + "epoch": 0.8384485366070634, + "grad_norm": 0.08868057280778885, + "learning_rate": 6.430342826148477e-05, + "loss": 2.5442, + "step": 28275 + }, + { + "epoch": 0.838478189959375, + "grad_norm": 0.08548980951309204, + "learning_rate": 6.428034767833835e-05, + "loss": 2.5575, + "step": 28276 + }, + { + "epoch": 0.8385078433116864, + "grad_norm": 0.08363836258649826, + "learning_rate": 6.425727095356421e-05, + "loss": 2.5829, + "step": 28277 + }, + { + "epoch": 0.8385374966639979, + "grad_norm": 0.08336257934570312, + "learning_rate": 6.42341980873668e-05, + "loss": 2.5439, + "step": 28278 + }, + { + "epoch": 0.8385671500163093, + "grad_norm": 0.08476033061742783, + "learning_rate": 6.42111290799503e-05, + "loss": 2.5957, + "step": 28279 + }, + { + "epoch": 0.8385968033686209, + "grad_norm": 0.0804189071059227, + "learning_rate": 6.41880639315191e-05, + "loss": 2.5815, + "step": 28280 + }, + { + "epoch": 0.8386264567209323, + "grad_norm": 0.08421333879232407, + "learning_rate": 6.416500264227731e-05, + "loss": 2.5845, + "step": 28281 + }, + { + "epoch": 0.8386561100732438, + "grad_norm": 0.07868514955043793, + "learning_rate": 6.414194521242928e-05, + "loss": 2.5236, + "step": 28282 + }, + { + "epoch": 0.8386857634255552, + "grad_norm": 0.07862066477537155, + "learning_rate": 6.411889164217916e-05, + "loss": 2.5427, + "step": 28283 + }, + { + "epoch": 0.8387154167778668, + "grad_norm": 0.08324279636144638, + "learning_rate": 6.409584193173101e-05, + "loss": 2.5979, + "step": 28284 + }, + { + "epoch": 0.8387450701301782, + "grad_norm": 0.07979349792003632, + "learning_rate": 6.407279608128907e-05, + "loss": 2.5593, + "step": 28285 + }, + { + "epoch": 0.8387747234824897, + "grad_norm": 0.08368194848299026, + "learning_rate": 6.40497540910573e-05, + "loss": 2.5486, + "step": 28286 + }, + { + "epoch": 0.8388043768348011, + "grad_norm": 0.07339291274547577, + "learning_rate": 6.402671596123972e-05, + "loss": 2.5241, + "step": 28287 + }, + { + "epoch": 0.8388340301871127, + "grad_norm": 0.08648192882537842, + "learning_rate": 6.400368169204057e-05, + "loss": 2.5358, + "step": 28288 + }, + { + "epoch": 0.8388636835394241, + "grad_norm": 0.0800001472234726, + "learning_rate": 6.398065128366359e-05, + "loss": 2.5771, + "step": 28289 + }, + { + "epoch": 0.8388933368917356, + "grad_norm": 0.0917600616812706, + "learning_rate": 6.395762473631273e-05, + "loss": 2.5691, + "step": 28290 + }, + { + "epoch": 0.838922990244047, + "grad_norm": 0.08765662461519241, + "learning_rate": 6.393460205019202e-05, + "loss": 2.5488, + "step": 28291 + }, + { + "epoch": 0.8389526435963586, + "grad_norm": 0.08187439292669296, + "learning_rate": 6.391158322550522e-05, + "loss": 2.5521, + "step": 28292 + }, + { + "epoch": 0.83898229694867, + "grad_norm": 0.0822111964225769, + "learning_rate": 6.38885682624562e-05, + "loss": 2.5175, + "step": 28293 + }, + { + "epoch": 0.8390119503009815, + "grad_norm": 0.07997837662696838, + "learning_rate": 6.386555716124875e-05, + "loss": 2.5683, + "step": 28294 + }, + { + "epoch": 0.839041603653293, + "grad_norm": 0.08166877180337906, + "learning_rate": 6.384254992208671e-05, + "loss": 2.5271, + "step": 28295 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 0.0877140611410141, + "learning_rate": 6.38195465451737e-05, + "loss": 2.553, + "step": 28296 + }, + { + "epoch": 0.839100910357916, + "grad_norm": 0.08816223591566086, + "learning_rate": 6.379654703071353e-05, + "loss": 2.596, + "step": 28297 + }, + { + "epoch": 0.8391305637102274, + "grad_norm": 0.08268880844116211, + "learning_rate": 6.377355137890972e-05, + "loss": 2.593, + "step": 28298 + }, + { + "epoch": 0.839160217062539, + "grad_norm": 0.09590582549571991, + "learning_rate": 6.375055958996606e-05, + "loss": 2.5443, + "step": 28299 + }, + { + "epoch": 0.8391898704148504, + "grad_norm": 0.09042404592037201, + "learning_rate": 6.372757166408605e-05, + "loss": 2.5697, + "step": 28300 + }, + { + "epoch": 0.8392195237671619, + "grad_norm": 0.08314763754606247, + "learning_rate": 6.37045876014733e-05, + "loss": 2.5532, + "step": 28301 + }, + { + "epoch": 0.8392491771194733, + "grad_norm": 0.07573296874761581, + "learning_rate": 6.368160740233132e-05, + "loss": 2.5587, + "step": 28302 + }, + { + "epoch": 0.8392788304717849, + "grad_norm": 0.09737174957990646, + "learning_rate": 6.36586310668636e-05, + "loss": 2.5544, + "step": 28303 + }, + { + "epoch": 0.8393084838240963, + "grad_norm": 0.08051611483097076, + "learning_rate": 6.36356585952736e-05, + "loss": 2.5873, + "step": 28304 + }, + { + "epoch": 0.8393381371764078, + "grad_norm": 0.09451011568307877, + "learning_rate": 6.36126899877647e-05, + "loss": 2.547, + "step": 28305 + }, + { + "epoch": 0.8393677905287192, + "grad_norm": 0.08127801865339279, + "learning_rate": 6.358972524454037e-05, + "loss": 2.586, + "step": 28306 + }, + { + "epoch": 0.8393974438810308, + "grad_norm": 0.08025629818439484, + "learning_rate": 6.356676436580394e-05, + "loss": 2.544, + "step": 28307 + }, + { + "epoch": 0.8394270972333422, + "grad_norm": 0.09452445805072784, + "learning_rate": 6.354380735175869e-05, + "loss": 2.5638, + "step": 28308 + }, + { + "epoch": 0.8394567505856537, + "grad_norm": 0.08103039115667343, + "learning_rate": 6.352085420260794e-05, + "loss": 2.5681, + "step": 28309 + }, + { + "epoch": 0.8394864039379651, + "grad_norm": 0.08481330424547195, + "learning_rate": 6.349790491855501e-05, + "loss": 2.5425, + "step": 28310 + }, + { + "epoch": 0.8395160572902767, + "grad_norm": 0.08045166730880737, + "learning_rate": 6.347495949980297e-05, + "loss": 2.5413, + "step": 28311 + }, + { + "epoch": 0.8395457106425881, + "grad_norm": 0.0889936238527298, + "learning_rate": 6.345201794655525e-05, + "loss": 2.5385, + "step": 28312 + }, + { + "epoch": 0.8395753639948996, + "grad_norm": 0.08027011901140213, + "learning_rate": 6.342908025901461e-05, + "loss": 2.5332, + "step": 28313 + }, + { + "epoch": 0.839605017347211, + "grad_norm": 0.0787392109632492, + "learning_rate": 6.340614643738457e-05, + "loss": 2.5203, + "step": 28314 + }, + { + "epoch": 0.8396346706995226, + "grad_norm": 0.0851709395647049, + "learning_rate": 6.338321648186795e-05, + "loss": 2.5675, + "step": 28315 + }, + { + "epoch": 0.8396643240518341, + "grad_norm": 0.07545744627714157, + "learning_rate": 6.336029039266794e-05, + "loss": 2.5494, + "step": 28316 + }, + { + "epoch": 0.8396939774041455, + "grad_norm": 0.08789586275815964, + "learning_rate": 6.333736816998753e-05, + "loss": 2.549, + "step": 28317 + }, + { + "epoch": 0.8397236307564571, + "grad_norm": 0.08328331261873245, + "learning_rate": 6.331444981402968e-05, + "loss": 2.5368, + "step": 28318 + }, + { + "epoch": 0.8397532841087685, + "grad_norm": 0.0900120884180069, + "learning_rate": 6.329153532499726e-05, + "loss": 2.572, + "step": 28319 + }, + { + "epoch": 0.83978293746108, + "grad_norm": 0.0877745971083641, + "learning_rate": 6.32686247030933e-05, + "loss": 2.5519, + "step": 28320 + }, + { + "epoch": 0.8398125908133914, + "grad_norm": 0.0884728878736496, + "learning_rate": 6.324571794852063e-05, + "loss": 2.5585, + "step": 28321 + }, + { + "epoch": 0.839842244165703, + "grad_norm": 0.08438824862241745, + "learning_rate": 6.322281506148215e-05, + "loss": 2.5268, + "step": 28322 + }, + { + "epoch": 0.8398718975180144, + "grad_norm": 0.08058634400367737, + "learning_rate": 6.319991604218062e-05, + "loss": 2.4853, + "step": 28323 + }, + { + "epoch": 0.8399015508703259, + "grad_norm": 0.08894401043653488, + "learning_rate": 6.317702089081879e-05, + "loss": 2.5564, + "step": 28324 + }, + { + "epoch": 0.8399312042226373, + "grad_norm": 0.08073264360427856, + "learning_rate": 6.315412960759936e-05, + "loss": 2.5679, + "step": 28325 + }, + { + "epoch": 0.8399608575749489, + "grad_norm": 0.09030678868293762, + "learning_rate": 6.313124219272498e-05, + "loss": 2.567, + "step": 28326 + }, + { + "epoch": 0.8399905109272603, + "grad_norm": 0.08188693970441818, + "learning_rate": 6.310835864639858e-05, + "loss": 2.5643, + "step": 28327 + }, + { + "epoch": 0.8400201642795718, + "grad_norm": 0.07932790368795395, + "learning_rate": 6.308547896882266e-05, + "loss": 2.547, + "step": 28328 + }, + { + "epoch": 0.8400498176318832, + "grad_norm": 0.07884159684181213, + "learning_rate": 6.306260316019985e-05, + "loss": 2.5584, + "step": 28329 + }, + { + "epoch": 0.8400794709841948, + "grad_norm": 0.08585549145936966, + "learning_rate": 6.303973122073265e-05, + "loss": 2.5655, + "step": 28330 + }, + { + "epoch": 0.8401091243365062, + "grad_norm": 0.0766851082444191, + "learning_rate": 6.30168631506236e-05, + "loss": 2.5318, + "step": 28331 + }, + { + "epoch": 0.8401387776888177, + "grad_norm": 0.08367161452770233, + "learning_rate": 6.29939989500753e-05, + "loss": 2.5405, + "step": 28332 + }, + { + "epoch": 0.8401684310411291, + "grad_norm": 0.07890161871910095, + "learning_rate": 6.297113861929022e-05, + "loss": 2.5425, + "step": 28333 + }, + { + "epoch": 0.8401980843934407, + "grad_norm": 0.08331286162137985, + "learning_rate": 6.294828215847059e-05, + "loss": 2.5773, + "step": 28334 + }, + { + "epoch": 0.8402277377457522, + "grad_norm": 0.07934078574180603, + "learning_rate": 6.292542956781899e-05, + "loss": 2.5405, + "step": 28335 + }, + { + "epoch": 0.8402573910980636, + "grad_norm": 0.0800401046872139, + "learning_rate": 6.29025808475377e-05, + "loss": 2.5187, + "step": 28336 + }, + { + "epoch": 0.8402870444503752, + "grad_norm": 0.07960831373929977, + "learning_rate": 6.28797359978291e-05, + "loss": 2.5633, + "step": 28337 + }, + { + "epoch": 0.8403166978026866, + "grad_norm": 0.07891847938299179, + "learning_rate": 6.285689501889546e-05, + "loss": 2.5103, + "step": 28338 + }, + { + "epoch": 0.8403463511549981, + "grad_norm": 0.0835135206580162, + "learning_rate": 6.283405791093893e-05, + "loss": 2.5672, + "step": 28339 + }, + { + "epoch": 0.8403760045073095, + "grad_norm": 0.0793510228395462, + "learning_rate": 6.2811224674162e-05, + "loss": 2.5565, + "step": 28340 + }, + { + "epoch": 0.8404056578596211, + "grad_norm": 0.08608882129192352, + "learning_rate": 6.278839530876667e-05, + "loss": 2.5574, + "step": 28341 + }, + { + "epoch": 0.8404353112119325, + "grad_norm": 0.07785630226135254, + "learning_rate": 6.276556981495518e-05, + "loss": 2.5398, + "step": 28342 + }, + { + "epoch": 0.840464964564244, + "grad_norm": 0.07615367323160172, + "learning_rate": 6.274274819292975e-05, + "loss": 2.5725, + "step": 28343 + }, + { + "epoch": 0.8404946179165554, + "grad_norm": 0.07597047835588455, + "learning_rate": 6.271993044289215e-05, + "loss": 2.5703, + "step": 28344 + }, + { + "epoch": 0.840524271268867, + "grad_norm": 0.07881749421358109, + "learning_rate": 6.269711656504467e-05, + "loss": 2.5371, + "step": 28345 + }, + { + "epoch": 0.8405539246211784, + "grad_norm": 0.08157122880220413, + "learning_rate": 6.267430655958934e-05, + "loss": 2.5253, + "step": 28346 + }, + { + "epoch": 0.8405835779734899, + "grad_norm": 0.08234691619873047, + "learning_rate": 6.265150042672802e-05, + "loss": 2.524, + "step": 28347 + }, + { + "epoch": 0.8406132313258013, + "grad_norm": 0.07850892841815948, + "learning_rate": 6.262869816666277e-05, + "loss": 2.5561, + "step": 28348 + }, + { + "epoch": 0.8406428846781129, + "grad_norm": 0.08331206440925598, + "learning_rate": 6.260589977959546e-05, + "loss": 2.5728, + "step": 28349 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 0.08031506836414337, + "learning_rate": 6.258310526572797e-05, + "loss": 2.5262, + "step": 28350 + }, + { + "epoch": 0.8407021913827358, + "grad_norm": 0.08534227311611176, + "learning_rate": 6.256031462526219e-05, + "loss": 2.5715, + "step": 28351 + }, + { + "epoch": 0.8407318447350473, + "grad_norm": 0.08102278411388397, + "learning_rate": 6.253752785839977e-05, + "loss": 2.5501, + "step": 28352 + }, + { + "epoch": 0.8407614980873588, + "grad_norm": 0.0868019163608551, + "learning_rate": 6.251474496534277e-05, + "loss": 2.5464, + "step": 28353 + }, + { + "epoch": 0.8407911514396702, + "grad_norm": 0.07912930846214294, + "learning_rate": 6.249196594629286e-05, + "loss": 2.5395, + "step": 28354 + }, + { + "epoch": 0.8408208047919817, + "grad_norm": 0.08004936575889587, + "learning_rate": 6.246919080145164e-05, + "loss": 2.5859, + "step": 28355 + }, + { + "epoch": 0.8408504581442933, + "grad_norm": 0.08076636493206024, + "learning_rate": 6.244641953102081e-05, + "loss": 2.5514, + "step": 28356 + }, + { + "epoch": 0.8408801114966047, + "grad_norm": 0.08666381984949112, + "learning_rate": 6.242365213520201e-05, + "loss": 2.5575, + "step": 28357 + }, + { + "epoch": 0.8409097648489162, + "grad_norm": 0.08332183957099915, + "learning_rate": 6.24008886141969e-05, + "loss": 2.5761, + "step": 28358 + }, + { + "epoch": 0.8409394182012276, + "grad_norm": 0.08356668055057526, + "learning_rate": 6.237812896820705e-05, + "loss": 2.5473, + "step": 28359 + }, + { + "epoch": 0.8409690715535392, + "grad_norm": 0.08916033804416656, + "learning_rate": 6.235537319743401e-05, + "loss": 2.5619, + "step": 28360 + }, + { + "epoch": 0.8409987249058506, + "grad_norm": 0.0812428742647171, + "learning_rate": 6.233262130207923e-05, + "loss": 2.5395, + "step": 28361 + }, + { + "epoch": 0.8410283782581621, + "grad_norm": 0.08565213531255722, + "learning_rate": 6.230987328234423e-05, + "loss": 2.5754, + "step": 28362 + }, + { + "epoch": 0.8410580316104735, + "grad_norm": 0.08170013874769211, + "learning_rate": 6.228712913843037e-05, + "loss": 2.5566, + "step": 28363 + }, + { + "epoch": 0.8410876849627851, + "grad_norm": 0.08467981219291687, + "learning_rate": 6.226438887053915e-05, + "loss": 2.5127, + "step": 28364 + }, + { + "epoch": 0.8411173383150965, + "grad_norm": 0.07801251113414764, + "learning_rate": 6.224165247887192e-05, + "loss": 2.5512, + "step": 28365 + }, + { + "epoch": 0.841146991667408, + "grad_norm": 0.08379226922988892, + "learning_rate": 6.221891996363e-05, + "loss": 2.5653, + "step": 28366 + }, + { + "epoch": 0.8411766450197194, + "grad_norm": 0.08604753017425537, + "learning_rate": 6.21961913250147e-05, + "loss": 2.5621, + "step": 28367 + }, + { + "epoch": 0.841206298372031, + "grad_norm": 0.07877267897129059, + "learning_rate": 6.21734665632272e-05, + "loss": 2.5903, + "step": 28368 + }, + { + "epoch": 0.8412359517243424, + "grad_norm": 0.0866093784570694, + "learning_rate": 6.215074567846885e-05, + "loss": 2.5385, + "step": 28369 + }, + { + "epoch": 0.8412656050766539, + "grad_norm": 0.08104761689901352, + "learning_rate": 6.212802867094081e-05, + "loss": 2.5609, + "step": 28370 + }, + { + "epoch": 0.8412952584289654, + "grad_norm": 0.08208076655864716, + "learning_rate": 6.210531554084426e-05, + "loss": 2.5333, + "step": 28371 + }, + { + "epoch": 0.8413249117812769, + "grad_norm": 0.08682411164045334, + "learning_rate": 6.208260628838025e-05, + "loss": 2.5565, + "step": 28372 + }, + { + "epoch": 0.8413545651335883, + "grad_norm": 0.08080681413412094, + "learning_rate": 6.205990091374997e-05, + "loss": 2.5344, + "step": 28373 + }, + { + "epoch": 0.8413842184858998, + "grad_norm": 0.07942083477973938, + "learning_rate": 6.203719941715441e-05, + "loss": 2.5594, + "step": 28374 + }, + { + "epoch": 0.8414138718382113, + "grad_norm": 0.08088413625955582, + "learning_rate": 6.201450179879465e-05, + "loss": 2.5411, + "step": 28375 + }, + { + "epoch": 0.8414435251905228, + "grad_norm": 0.08120256662368774, + "learning_rate": 6.199180805887167e-05, + "loss": 2.5732, + "step": 28376 + }, + { + "epoch": 0.8414731785428343, + "grad_norm": 0.07514099776744843, + "learning_rate": 6.196911819758638e-05, + "loss": 2.5186, + "step": 28377 + }, + { + "epoch": 0.8415028318951457, + "grad_norm": 0.0803665891289711, + "learning_rate": 6.194643221513974e-05, + "loss": 2.5285, + "step": 28378 + }, + { + "epoch": 0.8415324852474573, + "grad_norm": 0.072859987616539, + "learning_rate": 6.192375011173263e-05, + "loss": 2.5689, + "step": 28379 + }, + { + "epoch": 0.8415621385997687, + "grad_norm": 0.07975160330533981, + "learning_rate": 6.190107188756594e-05, + "loss": 2.5077, + "step": 28380 + }, + { + "epoch": 0.8415917919520802, + "grad_norm": 0.08191528916358948, + "learning_rate": 6.187839754284041e-05, + "loss": 2.5228, + "step": 28381 + }, + { + "epoch": 0.8416214453043916, + "grad_norm": 0.07611504942178726, + "learning_rate": 6.185572707775688e-05, + "loss": 2.5426, + "step": 28382 + }, + { + "epoch": 0.8416510986567032, + "grad_norm": 0.08360863476991653, + "learning_rate": 6.183306049251614e-05, + "loss": 2.5378, + "step": 28383 + }, + { + "epoch": 0.8416807520090146, + "grad_norm": 0.08393201231956482, + "learning_rate": 6.181039778731878e-05, + "loss": 2.5358, + "step": 28384 + }, + { + "epoch": 0.8417104053613261, + "grad_norm": 0.0815705731511116, + "learning_rate": 6.178773896236562e-05, + "loss": 2.5644, + "step": 28385 + }, + { + "epoch": 0.8417400587136376, + "grad_norm": 0.08156194537878036, + "learning_rate": 6.176508401785725e-05, + "loss": 2.5568, + "step": 28386 + }, + { + "epoch": 0.8417697120659491, + "grad_norm": 0.08879338949918747, + "learning_rate": 6.174243295399429e-05, + "loss": 2.536, + "step": 28387 + }, + { + "epoch": 0.8417993654182605, + "grad_norm": 0.08753800392150879, + "learning_rate": 6.171978577097736e-05, + "loss": 2.554, + "step": 28388 + }, + { + "epoch": 0.841829018770572, + "grad_norm": 0.0883094072341919, + "learning_rate": 6.169714246900693e-05, + "loss": 2.5141, + "step": 28389 + }, + { + "epoch": 0.8418586721228835, + "grad_norm": 0.08308493345975876, + "learning_rate": 6.167450304828348e-05, + "loss": 2.5485, + "step": 28390 + }, + { + "epoch": 0.841888325475195, + "grad_norm": 0.08348710834980011, + "learning_rate": 6.165186750900747e-05, + "loss": 2.5851, + "step": 28391 + }, + { + "epoch": 0.8419179788275064, + "grad_norm": 0.08035168796777725, + "learning_rate": 6.162923585137947e-05, + "loss": 2.5732, + "step": 28392 + }, + { + "epoch": 0.8419476321798179, + "grad_norm": 0.09500046819448471, + "learning_rate": 6.160660807559986e-05, + "loss": 2.5, + "step": 28393 + }, + { + "epoch": 0.8419772855321294, + "grad_norm": 0.08278291672468185, + "learning_rate": 6.1583984181869e-05, + "loss": 2.5687, + "step": 28394 + }, + { + "epoch": 0.8420069388844409, + "grad_norm": 0.08735796809196472, + "learning_rate": 6.156136417038721e-05, + "loss": 2.5528, + "step": 28395 + }, + { + "epoch": 0.8420365922367523, + "grad_norm": 0.07970394194126129, + "learning_rate": 6.153874804135479e-05, + "loss": 2.525, + "step": 28396 + }, + { + "epoch": 0.8420662455890638, + "grad_norm": 0.07755900919437408, + "learning_rate": 6.151613579497207e-05, + "loss": 2.5336, + "step": 28397 + }, + { + "epoch": 0.8420958989413754, + "grad_norm": 0.08119477331638336, + "learning_rate": 6.149352743143916e-05, + "loss": 2.5595, + "step": 28398 + }, + { + "epoch": 0.8421255522936868, + "grad_norm": 0.0793156698346138, + "learning_rate": 6.147092295095647e-05, + "loss": 2.5105, + "step": 28399 + }, + { + "epoch": 0.8421552056459983, + "grad_norm": 0.0777663141489029, + "learning_rate": 6.144832235372389e-05, + "loss": 2.5344, + "step": 28400 + }, + { + "epoch": 0.8421848589983097, + "grad_norm": 0.08366326987743378, + "learning_rate": 6.14257256399417e-05, + "loss": 2.5097, + "step": 28401 + }, + { + "epoch": 0.8422145123506213, + "grad_norm": 0.08856505900621414, + "learning_rate": 6.140313280981002e-05, + "loss": 2.5709, + "step": 28402 + }, + { + "epoch": 0.8422441657029327, + "grad_norm": 0.0850321501493454, + "learning_rate": 6.138054386352888e-05, + "loss": 2.5137, + "step": 28403 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 0.09151601791381836, + "learning_rate": 6.135795880129819e-05, + "loss": 2.513, + "step": 28404 + }, + { + "epoch": 0.8423034724075557, + "grad_norm": 0.0826292335987091, + "learning_rate": 6.13353776233182e-05, + "loss": 2.5229, + "step": 28405 + }, + { + "epoch": 0.8423331257598672, + "grad_norm": 0.09311605989933014, + "learning_rate": 6.13128003297887e-05, + "loss": 2.5352, + "step": 28406 + }, + { + "epoch": 0.8423627791121786, + "grad_norm": 0.07933381199836731, + "learning_rate": 6.129022692090969e-05, + "loss": 2.5596, + "step": 28407 + }, + { + "epoch": 0.8423924324644901, + "grad_norm": 0.08700663596391678, + "learning_rate": 6.126765739688095e-05, + "loss": 2.5214, + "step": 28408 + }, + { + "epoch": 0.8424220858168016, + "grad_norm": 0.08687908202409744, + "learning_rate": 6.12450917579026e-05, + "loss": 2.5466, + "step": 28409 + }, + { + "epoch": 0.8424517391691131, + "grad_norm": 0.09090783447027206, + "learning_rate": 6.122253000417417e-05, + "loss": 2.5646, + "step": 28410 + }, + { + "epoch": 0.8424813925214245, + "grad_norm": 0.08874083310365677, + "learning_rate": 6.119997213589551e-05, + "loss": 2.5707, + "step": 28411 + }, + { + "epoch": 0.842511045873736, + "grad_norm": 0.08064234256744385, + "learning_rate": 6.117741815326638e-05, + "loss": 2.568, + "step": 28412 + }, + { + "epoch": 0.8425406992260475, + "grad_norm": 0.09042876213788986, + "learning_rate": 6.115486805648663e-05, + "loss": 2.5571, + "step": 28413 + }, + { + "epoch": 0.842570352578359, + "grad_norm": 0.08408968895673752, + "learning_rate": 6.113232184575579e-05, + "loss": 2.5042, + "step": 28414 + }, + { + "epoch": 0.8426000059306704, + "grad_norm": 0.07670949399471283, + "learning_rate": 6.110977952127355e-05, + "loss": 2.5557, + "step": 28415 + }, + { + "epoch": 0.842629659282982, + "grad_norm": 0.09476186335086823, + "learning_rate": 6.10872410832396e-05, + "loss": 2.5765, + "step": 28416 + }, + { + "epoch": 0.8426593126352934, + "grad_norm": 0.07627097517251968, + "learning_rate": 6.106470653185331e-05, + "loss": 2.5023, + "step": 28417 + }, + { + "epoch": 0.8426889659876049, + "grad_norm": 0.08903635293245316, + "learning_rate": 6.104217586731453e-05, + "loss": 2.5502, + "step": 28418 + }, + { + "epoch": 0.8427186193399164, + "grad_norm": 0.07848227024078369, + "learning_rate": 6.101964908982266e-05, + "loss": 2.5421, + "step": 28419 + }, + { + "epoch": 0.8427482726922279, + "grad_norm": 0.08275312930345535, + "learning_rate": 6.09971261995772e-05, + "loss": 2.5514, + "step": 28420 + }, + { + "epoch": 0.8427779260445394, + "grad_norm": 0.09506332874298096, + "learning_rate": 6.0974607196777446e-05, + "loss": 2.5581, + "step": 28421 + }, + { + "epoch": 0.8428075793968508, + "grad_norm": 0.07889754325151443, + "learning_rate": 6.095209208162289e-05, + "loss": 2.5485, + "step": 28422 + }, + { + "epoch": 0.8428372327491623, + "grad_norm": 0.08272197097539902, + "learning_rate": 6.092958085431294e-05, + "loss": 2.5318, + "step": 28423 + }, + { + "epoch": 0.8428668861014738, + "grad_norm": 0.09348582476377487, + "learning_rate": 6.0907073515046926e-05, + "loss": 2.5633, + "step": 28424 + }, + { + "epoch": 0.8428965394537853, + "grad_norm": 0.07484471797943115, + "learning_rate": 6.0884570064024145e-05, + "loss": 2.5369, + "step": 28425 + }, + { + "epoch": 0.8429261928060967, + "grad_norm": 0.07891461998224258, + "learning_rate": 6.086207050144382e-05, + "loss": 2.572, + "step": 28426 + }, + { + "epoch": 0.8429558461584082, + "grad_norm": 0.0836050808429718, + "learning_rate": 6.0839574827505295e-05, + "loss": 2.5432, + "step": 28427 + }, + { + "epoch": 0.8429854995107197, + "grad_norm": 0.07853230834007263, + "learning_rate": 6.0817083042407685e-05, + "loss": 2.5403, + "step": 28428 + }, + { + "epoch": 0.8430151528630312, + "grad_norm": 0.0804758369922638, + "learning_rate": 6.079459514635022e-05, + "loss": 2.5274, + "step": 28429 + }, + { + "epoch": 0.8430448062153426, + "grad_norm": 0.0815642699599266, + "learning_rate": 6.077211113953196e-05, + "loss": 2.5428, + "step": 28430 + }, + { + "epoch": 0.8430744595676541, + "grad_norm": 0.08856286108493805, + "learning_rate": 6.0749631022152083e-05, + "loss": 2.5379, + "step": 28431 + }, + { + "epoch": 0.8431041129199656, + "grad_norm": 0.0830165222287178, + "learning_rate": 6.07271547944096e-05, + "loss": 2.5413, + "step": 28432 + }, + { + "epoch": 0.8431337662722771, + "grad_norm": 0.08181401342153549, + "learning_rate": 6.070468245650357e-05, + "loss": 2.5313, + "step": 28433 + }, + { + "epoch": 0.8431634196245885, + "grad_norm": 0.08910015970468521, + "learning_rate": 6.0682214008633e-05, + "loss": 2.5594, + "step": 28434 + }, + { + "epoch": 0.8431930729769, + "grad_norm": 0.08303822576999664, + "learning_rate": 6.065974945099684e-05, + "loss": 2.5253, + "step": 28435 + }, + { + "epoch": 0.8432227263292115, + "grad_norm": 0.08360143005847931, + "learning_rate": 6.0637288783793986e-05, + "loss": 2.5524, + "step": 28436 + }, + { + "epoch": 0.843252379681523, + "grad_norm": 0.08548454940319061, + "learning_rate": 6.061483200722334e-05, + "loss": 2.5518, + "step": 28437 + }, + { + "epoch": 0.8432820330338344, + "grad_norm": 0.08095206320285797, + "learning_rate": 6.05923791214838e-05, + "loss": 2.5448, + "step": 28438 + }, + { + "epoch": 0.843311686386146, + "grad_norm": 0.08160751312971115, + "learning_rate": 6.056993012677414e-05, + "loss": 2.5657, + "step": 28439 + }, + { + "epoch": 0.8433413397384575, + "grad_norm": 0.08301667124032974, + "learning_rate": 6.054748502329321e-05, + "loss": 2.5267, + "step": 28440 + }, + { + "epoch": 0.8433709930907689, + "grad_norm": 0.08137273788452148, + "learning_rate": 6.052504381123969e-05, + "loss": 2.5646, + "step": 28441 + }, + { + "epoch": 0.8434006464430804, + "grad_norm": 0.0797005295753479, + "learning_rate": 6.050260649081235e-05, + "loss": 2.5701, + "step": 28442 + }, + { + "epoch": 0.8434302997953919, + "grad_norm": 0.08454956114292145, + "learning_rate": 6.048017306220988e-05, + "loss": 2.5475, + "step": 28443 + }, + { + "epoch": 0.8434599531477034, + "grad_norm": 0.0754864364862442, + "learning_rate": 6.045774352563094e-05, + "loss": 2.5429, + "step": 28444 + }, + { + "epoch": 0.8434896065000148, + "grad_norm": 0.08540458977222443, + "learning_rate": 6.04353178812741e-05, + "loss": 2.5373, + "step": 28445 + }, + { + "epoch": 0.8435192598523263, + "grad_norm": 0.07719312608242035, + "learning_rate": 6.041289612933798e-05, + "loss": 2.5095, + "step": 28446 + }, + { + "epoch": 0.8435489132046378, + "grad_norm": 0.08413797616958618, + "learning_rate": 6.0390478270021144e-05, + "loss": 2.4951, + "step": 28447 + }, + { + "epoch": 0.8435785665569493, + "grad_norm": 0.07824373245239258, + "learning_rate": 6.036806430352204e-05, + "loss": 2.5305, + "step": 28448 + }, + { + "epoch": 0.8436082199092607, + "grad_norm": 0.08193697780370712, + "learning_rate": 6.0345654230039235e-05, + "loss": 2.5653, + "step": 28449 + }, + { + "epoch": 0.8436378732615722, + "grad_norm": 0.08727386593818665, + "learning_rate": 6.032324804977108e-05, + "loss": 2.5503, + "step": 28450 + }, + { + "epoch": 0.8436675266138837, + "grad_norm": 0.07804469764232635, + "learning_rate": 6.030084576291606e-05, + "loss": 2.5654, + "step": 28451 + }, + { + "epoch": 0.8436971799661952, + "grad_norm": 0.08836201578378677, + "learning_rate": 6.027844736967253e-05, + "loss": 2.5168, + "step": 28452 + }, + { + "epoch": 0.8437268333185066, + "grad_norm": 0.07309778034687042, + "learning_rate": 6.025605287023877e-05, + "loss": 2.5199, + "step": 28453 + }, + { + "epoch": 0.8437564866708182, + "grad_norm": 0.07918112725019455, + "learning_rate": 6.0233662264813306e-05, + "loss": 2.5621, + "step": 28454 + }, + { + "epoch": 0.8437861400231296, + "grad_norm": 0.08508069068193436, + "learning_rate": 6.0211275553594126e-05, + "loss": 2.5408, + "step": 28455 + }, + { + "epoch": 0.8438157933754411, + "grad_norm": 0.08380405604839325, + "learning_rate": 6.018889273677952e-05, + "loss": 2.5646, + "step": 28456 + }, + { + "epoch": 0.8438454467277525, + "grad_norm": 0.09183946996927261, + "learning_rate": 6.016651381456778e-05, + "loss": 2.5799, + "step": 28457 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 0.08613113313913345, + "learning_rate": 6.014413878715713e-05, + "loss": 2.5467, + "step": 28458 + }, + { + "epoch": 0.8439047534323755, + "grad_norm": 0.08763042092323303, + "learning_rate": 6.012176765474564e-05, + "loss": 2.5444, + "step": 28459 + }, + { + "epoch": 0.843934406784687, + "grad_norm": 0.08768387883901596, + "learning_rate": 6.009940041753137e-05, + "loss": 2.5728, + "step": 28460 + }, + { + "epoch": 0.8439640601369985, + "grad_norm": 0.08252235502004623, + "learning_rate": 6.007703707571238e-05, + "loss": 2.5141, + "step": 28461 + }, + { + "epoch": 0.84399371348931, + "grad_norm": 0.08920988440513611, + "learning_rate": 6.0054677629486795e-05, + "loss": 2.5571, + "step": 28462 + }, + { + "epoch": 0.8440233668416215, + "grad_norm": 0.07477790862321854, + "learning_rate": 6.003232207905251e-05, + "loss": 2.5111, + "step": 28463 + }, + { + "epoch": 0.8440530201939329, + "grad_norm": 0.08112139254808426, + "learning_rate": 6.0009970424607704e-05, + "loss": 2.5177, + "step": 28464 + }, + { + "epoch": 0.8440826735462444, + "grad_norm": 0.0950334221124649, + "learning_rate": 5.998762266634999e-05, + "loss": 2.5133, + "step": 28465 + }, + { + "epoch": 0.8441123268985559, + "grad_norm": 0.08021342009305954, + "learning_rate": 5.996527880447739e-05, + "loss": 2.5409, + "step": 28466 + }, + { + "epoch": 0.8441419802508674, + "grad_norm": 0.07783304154872894, + "learning_rate": 5.994293883918778e-05, + "loss": 2.5223, + "step": 28467 + }, + { + "epoch": 0.8441716336031788, + "grad_norm": 0.08689429610967636, + "learning_rate": 5.99206027706789e-05, + "loss": 2.536, + "step": 28468 + }, + { + "epoch": 0.8442012869554903, + "grad_norm": 0.0852355808019638, + "learning_rate": 5.989827059914871e-05, + "loss": 2.5559, + "step": 28469 + }, + { + "epoch": 0.8442309403078018, + "grad_norm": 0.08171380311250687, + "learning_rate": 5.9875942324794874e-05, + "loss": 2.5552, + "step": 28470 + }, + { + "epoch": 0.8442605936601133, + "grad_norm": 0.07450931519269943, + "learning_rate": 5.985361794781513e-05, + "loss": 2.5528, + "step": 28471 + }, + { + "epoch": 0.8442902470124247, + "grad_norm": 0.07871808111667633, + "learning_rate": 5.9831297468407156e-05, + "loss": 2.5098, + "step": 28472 + }, + { + "epoch": 0.8443199003647363, + "grad_norm": 0.07860644161701202, + "learning_rate": 5.980898088676856e-05, + "loss": 2.5654, + "step": 28473 + }, + { + "epoch": 0.8443495537170477, + "grad_norm": 0.08035009354352951, + "learning_rate": 5.978666820309703e-05, + "loss": 2.5278, + "step": 28474 + }, + { + "epoch": 0.8443792070693592, + "grad_norm": 0.07980496436357498, + "learning_rate": 5.976435941759018e-05, + "loss": 2.5602, + "step": 28475 + }, + { + "epoch": 0.8444088604216706, + "grad_norm": 0.07607915252447128, + "learning_rate": 5.97420545304454e-05, + "loss": 2.583, + "step": 28476 + }, + { + "epoch": 0.8444385137739822, + "grad_norm": 0.08106119930744171, + "learning_rate": 5.971975354186032e-05, + "loss": 2.57, + "step": 28477 + }, + { + "epoch": 0.8444681671262936, + "grad_norm": 0.08048080652952194, + "learning_rate": 5.9697456452032395e-05, + "loss": 2.5578, + "step": 28478 + }, + { + "epoch": 0.8444978204786051, + "grad_norm": 0.07984154671430588, + "learning_rate": 5.967516326115907e-05, + "loss": 2.5575, + "step": 28479 + }, + { + "epoch": 0.8445274738309165, + "grad_norm": 0.08218168467283249, + "learning_rate": 5.965287396943775e-05, + "loss": 2.5653, + "step": 28480 + }, + { + "epoch": 0.8445571271832281, + "grad_norm": 0.0760575458407402, + "learning_rate": 5.963058857706572e-05, + "loss": 2.5368, + "step": 28481 + }, + { + "epoch": 0.8445867805355396, + "grad_norm": 0.08002834767103195, + "learning_rate": 5.960830708424048e-05, + "loss": 2.5486, + "step": 28482 + }, + { + "epoch": 0.844616433887851, + "grad_norm": 0.09251540899276733, + "learning_rate": 5.958602949115932e-05, + "loss": 2.5497, + "step": 28483 + }, + { + "epoch": 0.8446460872401625, + "grad_norm": 0.07614263892173767, + "learning_rate": 5.9563755798019424e-05, + "loss": 2.5297, + "step": 28484 + }, + { + "epoch": 0.844675740592474, + "grad_norm": 0.08237828314304352, + "learning_rate": 5.954148600501818e-05, + "loss": 2.5735, + "step": 28485 + }, + { + "epoch": 0.8447053939447855, + "grad_norm": 0.09247417747974396, + "learning_rate": 5.951922011235261e-05, + "loss": 2.5542, + "step": 28486 + }, + { + "epoch": 0.8447350472970969, + "grad_norm": 0.07426775246858597, + "learning_rate": 5.949695812021994e-05, + "loss": 2.568, + "step": 28487 + }, + { + "epoch": 0.8447647006494085, + "grad_norm": 0.08991944044828415, + "learning_rate": 5.947470002881733e-05, + "loss": 2.5646, + "step": 28488 + }, + { + "epoch": 0.8447943540017199, + "grad_norm": 0.08248737454414368, + "learning_rate": 5.9452445838341864e-05, + "loss": 2.5285, + "step": 28489 + }, + { + "epoch": 0.8448240073540314, + "grad_norm": 0.07772057503461838, + "learning_rate": 5.943019554899059e-05, + "loss": 2.5379, + "step": 28490 + }, + { + "epoch": 0.8448536607063428, + "grad_norm": 0.09367379546165466, + "learning_rate": 5.940794916096054e-05, + "loss": 2.5344, + "step": 28491 + }, + { + "epoch": 0.8448833140586544, + "grad_norm": 0.07727932929992676, + "learning_rate": 5.938570667444876e-05, + "loss": 2.5614, + "step": 28492 + }, + { + "epoch": 0.8449129674109658, + "grad_norm": 0.08100175112485886, + "learning_rate": 5.936346808965215e-05, + "loss": 2.5437, + "step": 28493 + }, + { + "epoch": 0.8449426207632773, + "grad_norm": 0.08986695855855942, + "learning_rate": 5.934123340676756e-05, + "loss": 2.4868, + "step": 28494 + }, + { + "epoch": 0.8449722741155887, + "grad_norm": 0.08199873566627502, + "learning_rate": 5.9319002625992156e-05, + "loss": 2.5334, + "step": 28495 + }, + { + "epoch": 0.8450019274679003, + "grad_norm": 0.08709672838449478, + "learning_rate": 5.929677574752268e-05, + "loss": 2.5454, + "step": 28496 + }, + { + "epoch": 0.8450315808202117, + "grad_norm": 0.08462746441364288, + "learning_rate": 5.92745527715558e-05, + "loss": 2.5278, + "step": 28497 + }, + { + "epoch": 0.8450612341725232, + "grad_norm": 0.07423512637615204, + "learning_rate": 5.925233369828836e-05, + "loss": 2.5437, + "step": 28498 + }, + { + "epoch": 0.8450908875248346, + "grad_norm": 0.08783547580242157, + "learning_rate": 5.923011852791721e-05, + "loss": 2.5754, + "step": 28499 + }, + { + "epoch": 0.8451205408771462, + "grad_norm": 0.07940222322940826, + "learning_rate": 5.9207907260639016e-05, + "loss": 2.5674, + "step": 28500 + }, + { + "epoch": 0.8451501942294576, + "grad_norm": 0.0767088383436203, + "learning_rate": 5.918569989665046e-05, + "loss": 2.5876, + "step": 28501 + }, + { + "epoch": 0.8451798475817691, + "grad_norm": 0.08619140088558197, + "learning_rate": 5.9163496436148214e-05, + "loss": 2.5313, + "step": 28502 + }, + { + "epoch": 0.8452095009340806, + "grad_norm": 0.074671171605587, + "learning_rate": 5.914129687932884e-05, + "loss": 2.5355, + "step": 28503 + }, + { + "epoch": 0.8452391542863921, + "grad_norm": 0.0837213322520256, + "learning_rate": 5.911910122638897e-05, + "loss": 2.5318, + "step": 28504 + }, + { + "epoch": 0.8452688076387036, + "grad_norm": 0.082680843770504, + "learning_rate": 5.909690947752511e-05, + "loss": 2.5498, + "step": 28505 + }, + { + "epoch": 0.845298460991015, + "grad_norm": 0.08106744289398193, + "learning_rate": 5.907472163293387e-05, + "loss": 2.5654, + "step": 28506 + }, + { + "epoch": 0.8453281143433266, + "grad_norm": 0.0813339501619339, + "learning_rate": 5.9052537692811604e-05, + "loss": 2.5653, + "step": 28507 + }, + { + "epoch": 0.845357767695638, + "grad_norm": 0.07668686658143997, + "learning_rate": 5.903035765735476e-05, + "loss": 2.5401, + "step": 28508 + }, + { + "epoch": 0.8453874210479495, + "grad_norm": 0.08371642976999283, + "learning_rate": 5.900818152675985e-05, + "loss": 2.534, + "step": 28509 + }, + { + "epoch": 0.8454170744002609, + "grad_norm": 0.0808674544095993, + "learning_rate": 5.898600930122316e-05, + "loss": 2.5164, + "step": 28510 + }, + { + "epoch": 0.8454467277525725, + "grad_norm": 0.07742446660995483, + "learning_rate": 5.89638409809411e-05, + "loss": 2.5218, + "step": 28511 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 0.08423037081956863, + "learning_rate": 5.894167656610988e-05, + "loss": 2.5294, + "step": 28512 + }, + { + "epoch": 0.8455060344571954, + "grad_norm": 0.07812671363353729, + "learning_rate": 5.891951605692586e-05, + "loss": 2.5508, + "step": 28513 + }, + { + "epoch": 0.8455356878095068, + "grad_norm": 0.08260250091552734, + "learning_rate": 5.8897359453585206e-05, + "loss": 2.5443, + "step": 28514 + }, + { + "epoch": 0.8455653411618184, + "grad_norm": 0.08015841990709305, + "learning_rate": 5.887520675628416e-05, + "loss": 2.5444, + "step": 28515 + }, + { + "epoch": 0.8455949945141298, + "grad_norm": 0.07684442400932312, + "learning_rate": 5.8853057965218895e-05, + "loss": 2.5295, + "step": 28516 + }, + { + "epoch": 0.8456246478664413, + "grad_norm": 0.08359933644533157, + "learning_rate": 5.883091308058547e-05, + "loss": 2.5501, + "step": 28517 + }, + { + "epoch": 0.8456543012187527, + "grad_norm": 0.08340384066104889, + "learning_rate": 5.880877210258007e-05, + "loss": 2.5503, + "step": 28518 + }, + { + "epoch": 0.8456839545710643, + "grad_norm": 0.07677633315324783, + "learning_rate": 5.878663503139886e-05, + "loss": 2.5606, + "step": 28519 + }, + { + "epoch": 0.8457136079233757, + "grad_norm": 0.08824989199638367, + "learning_rate": 5.8764501867237474e-05, + "loss": 2.5516, + "step": 28520 + }, + { + "epoch": 0.8457432612756872, + "grad_norm": 0.08305279165506363, + "learning_rate": 5.8742372610292306e-05, + "loss": 2.5207, + "step": 28521 + }, + { + "epoch": 0.8457729146279986, + "grad_norm": 0.0760842114686966, + "learning_rate": 5.872024726075914e-05, + "loss": 2.5258, + "step": 28522 + }, + { + "epoch": 0.8458025679803102, + "grad_norm": 0.08280117809772491, + "learning_rate": 5.8698125818833934e-05, + "loss": 2.5687, + "step": 28523 + }, + { + "epoch": 0.8458322213326217, + "grad_norm": 0.0792049691081047, + "learning_rate": 5.8676008284712535e-05, + "loss": 2.5429, + "step": 28524 + }, + { + "epoch": 0.8458618746849331, + "grad_norm": 0.08450858294963837, + "learning_rate": 5.865389465859089e-05, + "loss": 2.5386, + "step": 28525 + }, + { + "epoch": 0.8458915280372447, + "grad_norm": 0.07659585773944855, + "learning_rate": 5.8631784940664734e-05, + "loss": 2.5296, + "step": 28526 + }, + { + "epoch": 0.8459211813895561, + "grad_norm": 0.07717278599739075, + "learning_rate": 5.8609679131129914e-05, + "loss": 2.5483, + "step": 28527 + }, + { + "epoch": 0.8459508347418676, + "grad_norm": 0.08054696768522263, + "learning_rate": 5.8587577230182096e-05, + "loss": 2.5673, + "step": 28528 + }, + { + "epoch": 0.845980488094179, + "grad_norm": 0.07480882853269577, + "learning_rate": 5.856547923801708e-05, + "loss": 2.4912, + "step": 28529 + }, + { + "epoch": 0.8460101414464906, + "grad_norm": 0.08212162554264069, + "learning_rate": 5.8543385154830655e-05, + "loss": 2.5456, + "step": 28530 + }, + { + "epoch": 0.846039794798802, + "grad_norm": 0.078998863697052, + "learning_rate": 5.852129498081815e-05, + "loss": 2.5466, + "step": 28531 + }, + { + "epoch": 0.8460694481511135, + "grad_norm": 0.08402295410633087, + "learning_rate": 5.849920871617542e-05, + "loss": 2.5385, + "step": 28532 + }, + { + "epoch": 0.8460991015034249, + "grad_norm": 0.07597187161445618, + "learning_rate": 5.8477126361097864e-05, + "loss": 2.5793, + "step": 28533 + }, + { + "epoch": 0.8461287548557365, + "grad_norm": 0.08264555782079697, + "learning_rate": 5.8455047915781215e-05, + "loss": 2.5428, + "step": 28534 + }, + { + "epoch": 0.8461584082080479, + "grad_norm": 0.07394962757825851, + "learning_rate": 5.8432973380420915e-05, + "loss": 2.5417, + "step": 28535 + }, + { + "epoch": 0.8461880615603594, + "grad_norm": 0.08026216924190521, + "learning_rate": 5.8410902755212494e-05, + "loss": 2.5316, + "step": 28536 + }, + { + "epoch": 0.8462177149126708, + "grad_norm": 0.08907875418663025, + "learning_rate": 5.8388836040351224e-05, + "loss": 2.566, + "step": 28537 + }, + { + "epoch": 0.8462473682649824, + "grad_norm": 0.0769876092672348, + "learning_rate": 5.8366773236032674e-05, + "loss": 2.5428, + "step": 28538 + }, + { + "epoch": 0.8462770216172938, + "grad_norm": 0.08127667754888535, + "learning_rate": 5.834471434245214e-05, + "loss": 2.5025, + "step": 28539 + }, + { + "epoch": 0.8463066749696053, + "grad_norm": 0.07655750960111618, + "learning_rate": 5.832265935980507e-05, + "loss": 2.5429, + "step": 28540 + }, + { + "epoch": 0.8463363283219167, + "grad_norm": 0.07980941236019135, + "learning_rate": 5.83006082882866e-05, + "loss": 2.5642, + "step": 28541 + }, + { + "epoch": 0.8463659816742283, + "grad_norm": 0.08222541958093643, + "learning_rate": 5.827856112809199e-05, + "loss": 2.5184, + "step": 28542 + }, + { + "epoch": 0.8463956350265398, + "grad_norm": 0.08227168023586273, + "learning_rate": 5.82565178794166e-05, + "loss": 2.5393, + "step": 28543 + }, + { + "epoch": 0.8464252883788512, + "grad_norm": 0.08137627691030502, + "learning_rate": 5.823447854245556e-05, + "loss": 2.4736, + "step": 28544 + }, + { + "epoch": 0.8464549417311628, + "grad_norm": 0.08343975991010666, + "learning_rate": 5.8212443117404035e-05, + "loss": 2.5174, + "step": 28545 + }, + { + "epoch": 0.8464845950834742, + "grad_norm": 0.07428674399852753, + "learning_rate": 5.819041160445704e-05, + "loss": 2.5431, + "step": 28546 + }, + { + "epoch": 0.8465142484357857, + "grad_norm": 0.07823147624731064, + "learning_rate": 5.816838400380986e-05, + "loss": 2.5675, + "step": 28547 + }, + { + "epoch": 0.8465439017880971, + "grad_norm": 0.07391686737537384, + "learning_rate": 5.814636031565751e-05, + "loss": 2.5657, + "step": 28548 + }, + { + "epoch": 0.8465735551404087, + "grad_norm": 0.0770612582564354, + "learning_rate": 5.8124340540194996e-05, + "loss": 2.5501, + "step": 28549 + }, + { + "epoch": 0.8466032084927201, + "grad_norm": 0.07985253632068634, + "learning_rate": 5.810232467761728e-05, + "loss": 2.5736, + "step": 28550 + }, + { + "epoch": 0.8466328618450316, + "grad_norm": 0.07756864279508591, + "learning_rate": 5.8080312728119476e-05, + "loss": 2.5491, + "step": 28551 + }, + { + "epoch": 0.846662515197343, + "grad_norm": 0.08457330614328384, + "learning_rate": 5.805830469189621e-05, + "loss": 2.5672, + "step": 28552 + }, + { + "epoch": 0.8466921685496546, + "grad_norm": 0.08509331196546555, + "learning_rate": 5.8036300569142497e-05, + "loss": 2.5456, + "step": 28553 + }, + { + "epoch": 0.846721821901966, + "grad_norm": 0.07987245172262192, + "learning_rate": 5.801430036005323e-05, + "loss": 2.552, + "step": 28554 + }, + { + "epoch": 0.8467514752542775, + "grad_norm": 0.07979844510555267, + "learning_rate": 5.7992304064823196e-05, + "loss": 2.5418, + "step": 28555 + }, + { + "epoch": 0.8467811286065889, + "grad_norm": 0.08539026230573654, + "learning_rate": 5.797031168364719e-05, + "loss": 2.5265, + "step": 28556 + }, + { + "epoch": 0.8468107819589005, + "grad_norm": 0.07893847674131393, + "learning_rate": 5.7948323216719944e-05, + "loss": 2.5483, + "step": 28557 + }, + { + "epoch": 0.8468404353112119, + "grad_norm": 0.08957935124635696, + "learning_rate": 5.7926338664236134e-05, + "loss": 2.547, + "step": 28558 + }, + { + "epoch": 0.8468700886635234, + "grad_norm": 0.07830128818750381, + "learning_rate": 5.7904358026390436e-05, + "loss": 2.5518, + "step": 28559 + }, + { + "epoch": 0.8468997420158348, + "grad_norm": 0.08204519748687744, + "learning_rate": 5.7882381303377584e-05, + "loss": 2.5482, + "step": 28560 + }, + { + "epoch": 0.8469293953681464, + "grad_norm": 0.08040811866521835, + "learning_rate": 5.7860408495392255e-05, + "loss": 2.5576, + "step": 28561 + }, + { + "epoch": 0.8469590487204578, + "grad_norm": 0.07750140130519867, + "learning_rate": 5.78384396026288e-05, + "loss": 2.57, + "step": 28562 + }, + { + "epoch": 0.8469887020727693, + "grad_norm": 0.08213681727647781, + "learning_rate": 5.781647462528189e-05, + "loss": 2.5353, + "step": 28563 + }, + { + "epoch": 0.8470183554250809, + "grad_norm": 0.08400559425354004, + "learning_rate": 5.779451356354593e-05, + "loss": 2.5398, + "step": 28564 + }, + { + "epoch": 0.8470480087773923, + "grad_norm": 0.07502532005310059, + "learning_rate": 5.777255641761553e-05, + "loss": 2.5522, + "step": 28565 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 0.08520331233739853, + "learning_rate": 5.775060318768499e-05, + "loss": 2.5028, + "step": 28566 + }, + { + "epoch": 0.8471073154820152, + "grad_norm": 0.08061093837022781, + "learning_rate": 5.772865387394877e-05, + "loss": 2.5694, + "step": 28567 + }, + { + "epoch": 0.8471369688343268, + "grad_norm": 0.08195158839225769, + "learning_rate": 5.770670847660126e-05, + "loss": 2.5257, + "step": 28568 + }, + { + "epoch": 0.8471666221866382, + "grad_norm": 0.08431418985128403, + "learning_rate": 5.76847669958368e-05, + "loss": 2.5619, + "step": 28569 + }, + { + "epoch": 0.8471962755389497, + "grad_norm": 0.08264113962650299, + "learning_rate": 5.766282943184958e-05, + "loss": 2.5538, + "step": 28570 + }, + { + "epoch": 0.8472259288912611, + "grad_norm": 0.07372044771909714, + "learning_rate": 5.764089578483395e-05, + "loss": 2.5681, + "step": 28571 + }, + { + "epoch": 0.8472555822435727, + "grad_norm": 0.08671692758798599, + "learning_rate": 5.761896605498418e-05, + "loss": 2.5494, + "step": 28572 + }, + { + "epoch": 0.8472852355958841, + "grad_norm": 0.08093402534723282, + "learning_rate": 5.7597040242494346e-05, + "loss": 2.578, + "step": 28573 + }, + { + "epoch": 0.8473148889481956, + "grad_norm": 0.08407527953386307, + "learning_rate": 5.757511834755863e-05, + "loss": 2.5336, + "step": 28574 + }, + { + "epoch": 0.847344542300507, + "grad_norm": 0.08297213912010193, + "learning_rate": 5.7553200370371204e-05, + "loss": 2.5216, + "step": 28575 + }, + { + "epoch": 0.8473741956528186, + "grad_norm": 0.07871187478303909, + "learning_rate": 5.75312863111262e-05, + "loss": 2.5531, + "step": 28576 + }, + { + "epoch": 0.84740384900513, + "grad_norm": 0.09063544869422913, + "learning_rate": 5.75093761700175e-05, + "loss": 2.5587, + "step": 28577 + }, + { + "epoch": 0.8474335023574415, + "grad_norm": 0.07928559929132462, + "learning_rate": 5.74874699472393e-05, + "loss": 2.5353, + "step": 28578 + }, + { + "epoch": 0.847463155709753, + "grad_norm": 0.08015365898609161, + "learning_rate": 5.746556764298549e-05, + "loss": 2.5369, + "step": 28579 + }, + { + "epoch": 0.8474928090620645, + "grad_norm": 0.0898091197013855, + "learning_rate": 5.7443669257450035e-05, + "loss": 2.5523, + "step": 28580 + }, + { + "epoch": 0.8475224624143759, + "grad_norm": 0.07751277089118958, + "learning_rate": 5.742177479082683e-05, + "loss": 2.5297, + "step": 28581 + }, + { + "epoch": 0.8475521157666874, + "grad_norm": 0.07497543096542358, + "learning_rate": 5.739988424330983e-05, + "loss": 2.5363, + "step": 28582 + }, + { + "epoch": 0.8475817691189989, + "grad_norm": 0.08569585531949997, + "learning_rate": 5.737799761509277e-05, + "loss": 2.5574, + "step": 28583 + }, + { + "epoch": 0.8476114224713104, + "grad_norm": 0.07853926718235016, + "learning_rate": 5.735611490636955e-05, + "loss": 2.5303, + "step": 28584 + }, + { + "epoch": 0.8476410758236219, + "grad_norm": 0.07949843257665634, + "learning_rate": 5.733423611733391e-05, + "loss": 2.5666, + "step": 28585 + }, + { + "epoch": 0.8476707291759333, + "grad_norm": 0.08777850866317749, + "learning_rate": 5.7312361248179566e-05, + "loss": 2.5529, + "step": 28586 + }, + { + "epoch": 0.8477003825282449, + "grad_norm": 0.07920669764280319, + "learning_rate": 5.729049029910027e-05, + "loss": 2.5446, + "step": 28587 + }, + { + "epoch": 0.8477300358805563, + "grad_norm": 0.08269446343183517, + "learning_rate": 5.7268623270289696e-05, + "loss": 2.5769, + "step": 28588 + }, + { + "epoch": 0.8477596892328678, + "grad_norm": 0.08073015511035919, + "learning_rate": 5.7246760161941416e-05, + "loss": 2.5481, + "step": 28589 + }, + { + "epoch": 0.8477893425851792, + "grad_norm": 0.07374690473079681, + "learning_rate": 5.722490097424909e-05, + "loss": 2.5264, + "step": 28590 + }, + { + "epoch": 0.8478189959374908, + "grad_norm": 0.08572117984294891, + "learning_rate": 5.720304570740625e-05, + "loss": 2.5539, + "step": 28591 + }, + { + "epoch": 0.8478486492898022, + "grad_norm": 0.07994713634252548, + "learning_rate": 5.718119436160646e-05, + "loss": 2.5683, + "step": 28592 + }, + { + "epoch": 0.8478783026421137, + "grad_norm": 0.08258718252182007, + "learning_rate": 5.715934693704322e-05, + "loss": 2.5828, + "step": 28593 + }, + { + "epoch": 0.8479079559944251, + "grad_norm": 0.08073531091213226, + "learning_rate": 5.713750343390994e-05, + "loss": 2.5628, + "step": 28594 + }, + { + "epoch": 0.8479376093467367, + "grad_norm": 0.0820784792304039, + "learning_rate": 5.711566385240025e-05, + "loss": 2.5403, + "step": 28595 + }, + { + "epoch": 0.8479672626990481, + "grad_norm": 0.08202948421239853, + "learning_rate": 5.70938281927072e-05, + "loss": 2.534, + "step": 28596 + }, + { + "epoch": 0.8479969160513596, + "grad_norm": 0.08081845939159393, + "learning_rate": 5.7071996455024365e-05, + "loss": 2.5096, + "step": 28597 + }, + { + "epoch": 0.848026569403671, + "grad_norm": 0.07885456830263138, + "learning_rate": 5.705016863954493e-05, + "loss": 2.6012, + "step": 28598 + }, + { + "epoch": 0.8480562227559826, + "grad_norm": 0.08045108616352081, + "learning_rate": 5.7028344746462345e-05, + "loss": 2.5897, + "step": 28599 + }, + { + "epoch": 0.848085876108294, + "grad_norm": 0.08644010871648788, + "learning_rate": 5.700652477596985e-05, + "loss": 2.5947, + "step": 28600 + }, + { + "epoch": 0.8481155294606055, + "grad_norm": 0.07979071885347366, + "learning_rate": 5.6984708728260556e-05, + "loss": 2.5785, + "step": 28601 + }, + { + "epoch": 0.848145182812917, + "grad_norm": 0.08625088632106781, + "learning_rate": 5.696289660352777e-05, + "loss": 2.5545, + "step": 28602 + }, + { + "epoch": 0.8481748361652285, + "grad_norm": 0.07896819710731506, + "learning_rate": 5.694108840196455e-05, + "loss": 2.5209, + "step": 28603 + }, + { + "epoch": 0.8482044895175399, + "grad_norm": 0.07777533680200577, + "learning_rate": 5.691928412376407e-05, + "loss": 2.5794, + "step": 28604 + }, + { + "epoch": 0.8482341428698514, + "grad_norm": 0.07939160615205765, + "learning_rate": 5.689748376911935e-05, + "loss": 2.5678, + "step": 28605 + }, + { + "epoch": 0.848263796222163, + "grad_norm": 0.07853452861309052, + "learning_rate": 5.687568733822357e-05, + "loss": 2.5734, + "step": 28606 + }, + { + "epoch": 0.8482934495744744, + "grad_norm": 0.08548286557197571, + "learning_rate": 5.685389483126957e-05, + "loss": 2.5206, + "step": 28607 + }, + { + "epoch": 0.8483231029267859, + "grad_norm": 0.08168423920869827, + "learning_rate": 5.6832106248450366e-05, + "loss": 2.557, + "step": 28608 + }, + { + "epoch": 0.8483527562790973, + "grad_norm": 0.08005677163600922, + "learning_rate": 5.681032158995897e-05, + "loss": 2.593, + "step": 28609 + }, + { + "epoch": 0.8483824096314089, + "grad_norm": 0.08321884274482727, + "learning_rate": 5.678854085598822e-05, + "loss": 2.5639, + "step": 28610 + }, + { + "epoch": 0.8484120629837203, + "grad_norm": 0.0810302346944809, + "learning_rate": 5.6766764046730924e-05, + "loss": 2.5585, + "step": 28611 + }, + { + "epoch": 0.8484417163360318, + "grad_norm": 0.07432423532009125, + "learning_rate": 5.674499116238008e-05, + "loss": 2.5273, + "step": 28612 + }, + { + "epoch": 0.8484713696883432, + "grad_norm": 0.07512518763542175, + "learning_rate": 5.6723222203128475e-05, + "loss": 2.5234, + "step": 28613 + }, + { + "epoch": 0.8485010230406548, + "grad_norm": 0.07969272136688232, + "learning_rate": 5.6701457169168805e-05, + "loss": 2.5729, + "step": 28614 + }, + { + "epoch": 0.8485306763929662, + "grad_norm": 0.07908128201961517, + "learning_rate": 5.667969606069379e-05, + "loss": 2.5335, + "step": 28615 + }, + { + "epoch": 0.8485603297452777, + "grad_norm": 0.0804985910654068, + "learning_rate": 5.665793887789633e-05, + "loss": 2.5497, + "step": 28616 + }, + { + "epoch": 0.8485899830975892, + "grad_norm": 0.07989826053380966, + "learning_rate": 5.663618562096878e-05, + "loss": 2.5413, + "step": 28617 + }, + { + "epoch": 0.8486196364499007, + "grad_norm": 0.07611957937479019, + "learning_rate": 5.6614436290103875e-05, + "loss": 2.5414, + "step": 28618 + }, + { + "epoch": 0.8486492898022121, + "grad_norm": 0.08743678033351898, + "learning_rate": 5.659269088549429e-05, + "loss": 2.5416, + "step": 28619 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 0.07951163500547409, + "learning_rate": 5.657094940733254e-05, + "loss": 2.5623, + "step": 28620 + }, + { + "epoch": 0.8487085965068351, + "grad_norm": 0.08658552169799805, + "learning_rate": 5.654921185581114e-05, + "loss": 2.5323, + "step": 28621 + }, + { + "epoch": 0.8487382498591466, + "grad_norm": 0.08834484964609146, + "learning_rate": 5.652747823112253e-05, + "loss": 2.5644, + "step": 28622 + }, + { + "epoch": 0.848767903211458, + "grad_norm": 0.08141371607780457, + "learning_rate": 5.65057485334593e-05, + "loss": 2.5409, + "step": 28623 + }, + { + "epoch": 0.8487975565637695, + "grad_norm": 0.08324376493692398, + "learning_rate": 5.648402276301362e-05, + "loss": 2.554, + "step": 28624 + }, + { + "epoch": 0.848827209916081, + "grad_norm": 0.08280331641435623, + "learning_rate": 5.646230091997823e-05, + "loss": 2.5257, + "step": 28625 + }, + { + "epoch": 0.8488568632683925, + "grad_norm": 0.07663344591856003, + "learning_rate": 5.644058300454524e-05, + "loss": 2.5607, + "step": 28626 + }, + { + "epoch": 0.848886516620704, + "grad_norm": 0.07521098107099533, + "learning_rate": 5.641886901690713e-05, + "loss": 2.548, + "step": 28627 + }, + { + "epoch": 0.8489161699730154, + "grad_norm": 0.08159013837575912, + "learning_rate": 5.6397158957256e-05, + "loss": 2.5618, + "step": 28628 + }, + { + "epoch": 0.848945823325327, + "grad_norm": 0.08071747422218323, + "learning_rate": 5.6375452825784155e-05, + "loss": 2.5344, + "step": 28629 + }, + { + "epoch": 0.8489754766776384, + "grad_norm": 0.07901778072118759, + "learning_rate": 5.6353750622683775e-05, + "loss": 2.5165, + "step": 28630 + }, + { + "epoch": 0.8490051300299499, + "grad_norm": 0.08052079379558563, + "learning_rate": 5.633205234814715e-05, + "loss": 2.5503, + "step": 28631 + }, + { + "epoch": 0.8490347833822613, + "grad_norm": 0.07785303145647049, + "learning_rate": 5.631035800236633e-05, + "loss": 2.6057, + "step": 28632 + }, + { + "epoch": 0.8490644367345729, + "grad_norm": 0.08625838160514832, + "learning_rate": 5.628866758553347e-05, + "loss": 2.5702, + "step": 28633 + }, + { + "epoch": 0.8490940900868843, + "grad_norm": 0.08139918744564056, + "learning_rate": 5.6266981097840616e-05, + "loss": 2.5359, + "step": 28634 + }, + { + "epoch": 0.8491237434391958, + "grad_norm": 0.08067936450242996, + "learning_rate": 5.624529853947979e-05, + "loss": 2.5598, + "step": 28635 + }, + { + "epoch": 0.8491533967915073, + "grad_norm": 0.0824287012219429, + "learning_rate": 5.622361991064301e-05, + "loss": 2.5702, + "step": 28636 + }, + { + "epoch": 0.8491830501438188, + "grad_norm": 0.0782003179192543, + "learning_rate": 5.620194521152228e-05, + "loss": 2.5478, + "step": 28637 + }, + { + "epoch": 0.8492127034961302, + "grad_norm": 0.07601689547300339, + "learning_rate": 5.618027444230944e-05, + "loss": 2.5205, + "step": 28638 + }, + { + "epoch": 0.8492423568484417, + "grad_norm": 0.08154960721731186, + "learning_rate": 5.615860760319652e-05, + "loss": 2.5649, + "step": 28639 + }, + { + "epoch": 0.8492720102007532, + "grad_norm": 0.07787565141916275, + "learning_rate": 5.6136944694375304e-05, + "loss": 2.5132, + "step": 28640 + }, + { + "epoch": 0.8493016635530647, + "grad_norm": 0.08264867216348648, + "learning_rate": 5.611528571603758e-05, + "loss": 2.5708, + "step": 28641 + }, + { + "epoch": 0.8493313169053761, + "grad_norm": 0.08037188649177551, + "learning_rate": 5.609363066837525e-05, + "loss": 2.5357, + "step": 28642 + }, + { + "epoch": 0.8493609702576876, + "grad_norm": 0.08458172529935837, + "learning_rate": 5.607197955158e-05, + "loss": 2.5347, + "step": 28643 + }, + { + "epoch": 0.8493906236099991, + "grad_norm": 0.0820879340171814, + "learning_rate": 5.605033236584356e-05, + "loss": 2.5767, + "step": 28644 + }, + { + "epoch": 0.8494202769623106, + "grad_norm": 0.08495714515447617, + "learning_rate": 5.602868911135761e-05, + "loss": 2.5631, + "step": 28645 + }, + { + "epoch": 0.849449930314622, + "grad_norm": 0.07635920494794846, + "learning_rate": 5.600704978831389e-05, + "loss": 2.5158, + "step": 28646 + }, + { + "epoch": 0.8494795836669335, + "grad_norm": 0.08179407566785812, + "learning_rate": 5.598541439690391e-05, + "loss": 2.548, + "step": 28647 + }, + { + "epoch": 0.8495092370192451, + "grad_norm": 0.07583872228860855, + "learning_rate": 5.59637829373193e-05, + "loss": 2.5353, + "step": 28648 + }, + { + "epoch": 0.8495388903715565, + "grad_norm": 0.08012627065181732, + "learning_rate": 5.594215540975162e-05, + "loss": 2.5527, + "step": 28649 + }, + { + "epoch": 0.849568543723868, + "grad_norm": 0.07387713342905045, + "learning_rate": 5.592053181439233e-05, + "loss": 2.5377, + "step": 28650 + }, + { + "epoch": 0.8495981970761795, + "grad_norm": 0.07913031429052353, + "learning_rate": 5.5898912151433e-05, + "loss": 2.5693, + "step": 28651 + }, + { + "epoch": 0.849627850428491, + "grad_norm": 0.07536378502845764, + "learning_rate": 5.5877296421065035e-05, + "loss": 2.568, + "step": 28652 + }, + { + "epoch": 0.8496575037808024, + "grad_norm": 0.07464919984340668, + "learning_rate": 5.585568462347984e-05, + "loss": 2.5368, + "step": 28653 + }, + { + "epoch": 0.8496871571331139, + "grad_norm": 0.07880686968564987, + "learning_rate": 5.5834076758868814e-05, + "loss": 2.5218, + "step": 28654 + }, + { + "epoch": 0.8497168104854254, + "grad_norm": 0.07302338629961014, + "learning_rate": 5.5812472827423245e-05, + "loss": 2.543, + "step": 28655 + }, + { + "epoch": 0.8497464638377369, + "grad_norm": 0.0835915133357048, + "learning_rate": 5.579087282933448e-05, + "loss": 2.5332, + "step": 28656 + }, + { + "epoch": 0.8497761171900483, + "grad_norm": 0.07328356057405472, + "learning_rate": 5.576927676479376e-05, + "loss": 2.5642, + "step": 28657 + }, + { + "epoch": 0.8498057705423598, + "grad_norm": 0.07977645099163055, + "learning_rate": 5.574768463399238e-05, + "loss": 2.5503, + "step": 28658 + }, + { + "epoch": 0.8498354238946713, + "grad_norm": 0.08221221715211868, + "learning_rate": 5.572609643712151e-05, + "loss": 2.5193, + "step": 28659 + }, + { + "epoch": 0.8498650772469828, + "grad_norm": 0.07439029216766357, + "learning_rate": 5.570451217437228e-05, + "loss": 2.5161, + "step": 28660 + }, + { + "epoch": 0.8498947305992942, + "grad_norm": 0.08201482892036438, + "learning_rate": 5.568293184593598e-05, + "loss": 2.5514, + "step": 28661 + }, + { + "epoch": 0.8499243839516057, + "grad_norm": 0.08642113953828812, + "learning_rate": 5.5661355452003404e-05, + "loss": 2.5149, + "step": 28662 + }, + { + "epoch": 0.8499540373039172, + "grad_norm": 0.07869737595319748, + "learning_rate": 5.563978299276584e-05, + "loss": 2.5709, + "step": 28663 + }, + { + "epoch": 0.8499836906562287, + "grad_norm": 0.08351825177669525, + "learning_rate": 5.561821446841431e-05, + "loss": 2.5327, + "step": 28664 + }, + { + "epoch": 0.8500133440085401, + "grad_norm": 0.07820825278759003, + "learning_rate": 5.559664987913976e-05, + "loss": 2.5508, + "step": 28665 + }, + { + "epoch": 0.8500429973608516, + "grad_norm": 0.0785360261797905, + "learning_rate": 5.557508922513316e-05, + "loss": 2.5035, + "step": 28666 + }, + { + "epoch": 0.8500726507131631, + "grad_norm": 0.08076316863298416, + "learning_rate": 5.555353250658546e-05, + "loss": 2.5216, + "step": 28667 + }, + { + "epoch": 0.8501023040654746, + "grad_norm": 0.07801511138677597, + "learning_rate": 5.553197972368745e-05, + "loss": 2.5433, + "step": 28668 + }, + { + "epoch": 0.8501319574177861, + "grad_norm": 0.07940289378166199, + "learning_rate": 5.55104308766301e-05, + "loss": 2.5455, + "step": 28669 + }, + { + "epoch": 0.8501616107700976, + "grad_norm": 0.07811034470796585, + "learning_rate": 5.54888859656042e-05, + "loss": 2.5481, + "step": 28670 + }, + { + "epoch": 0.8501912641224091, + "grad_norm": 0.07971734553575516, + "learning_rate": 5.5467344990800585e-05, + "loss": 2.5506, + "step": 28671 + }, + { + "epoch": 0.8502209174747205, + "grad_norm": 0.08336254954338074, + "learning_rate": 5.544580795240983e-05, + "loss": 2.5647, + "step": 28672 + }, + { + "epoch": 0.850250570827032, + "grad_norm": 0.07545316964387894, + "learning_rate": 5.542427485062273e-05, + "loss": 2.5282, + "step": 28673 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 0.07704342156648636, + "learning_rate": 5.540274568563003e-05, + "loss": 2.5702, + "step": 28674 + }, + { + "epoch": 0.850309877531655, + "grad_norm": 0.08483294397592545, + "learning_rate": 5.538122045762217e-05, + "loss": 2.5473, + "step": 28675 + }, + { + "epoch": 0.8503395308839664, + "grad_norm": 0.07628723978996277, + "learning_rate": 5.5359699166790066e-05, + "loss": 2.5565, + "step": 28676 + }, + { + "epoch": 0.8503691842362779, + "grad_norm": 0.08019664138555527, + "learning_rate": 5.533818181332417e-05, + "loss": 2.5813, + "step": 28677 + }, + { + "epoch": 0.8503988375885894, + "grad_norm": 0.07589091360569, + "learning_rate": 5.531666839741495e-05, + "loss": 2.5031, + "step": 28678 + }, + { + "epoch": 0.8504284909409009, + "grad_norm": 0.07280362397432327, + "learning_rate": 5.529515891925302e-05, + "loss": 2.5236, + "step": 28679 + }, + { + "epoch": 0.8504581442932123, + "grad_norm": 0.07748319953680038, + "learning_rate": 5.5273653379028735e-05, + "loss": 2.5181, + "step": 28680 + }, + { + "epoch": 0.8504877976455238, + "grad_norm": 0.08095583319664001, + "learning_rate": 5.52521517769326e-05, + "loss": 2.5261, + "step": 28681 + }, + { + "epoch": 0.8505174509978353, + "grad_norm": 0.08584816753864288, + "learning_rate": 5.5230654113155084e-05, + "loss": 2.5277, + "step": 28682 + }, + { + "epoch": 0.8505471043501468, + "grad_norm": 0.08162059634923935, + "learning_rate": 5.520916038788642e-05, + "loss": 2.5333, + "step": 28683 + }, + { + "epoch": 0.8505767577024582, + "grad_norm": 0.08280570060014725, + "learning_rate": 5.518767060131696e-05, + "loss": 2.5349, + "step": 28684 + }, + { + "epoch": 0.8506064110547698, + "grad_norm": 0.07456585764884949, + "learning_rate": 5.5166184753637e-05, + "loss": 2.5272, + "step": 28685 + }, + { + "epoch": 0.8506360644070812, + "grad_norm": 0.07739535719156265, + "learning_rate": 5.514470284503686e-05, + "loss": 2.5299, + "step": 28686 + }, + { + "epoch": 0.8506657177593927, + "grad_norm": 0.07980059832334518, + "learning_rate": 5.5123224875706754e-05, + "loss": 2.5432, + "step": 28687 + }, + { + "epoch": 0.8506953711117041, + "grad_norm": 0.07356300950050354, + "learning_rate": 5.510175084583674e-05, + "loss": 2.5774, + "step": 28688 + }, + { + "epoch": 0.8507250244640157, + "grad_norm": 0.07564858347177505, + "learning_rate": 5.508028075561716e-05, + "loss": 2.5192, + "step": 28689 + }, + { + "epoch": 0.8507546778163272, + "grad_norm": 0.07925286889076233, + "learning_rate": 5.5058814605238096e-05, + "loss": 2.5486, + "step": 28690 + }, + { + "epoch": 0.8507843311686386, + "grad_norm": 0.07349803298711777, + "learning_rate": 5.503735239488955e-05, + "loss": 2.5538, + "step": 28691 + }, + { + "epoch": 0.8508139845209501, + "grad_norm": 0.08847767859697342, + "learning_rate": 5.5015894124761766e-05, + "loss": 2.5175, + "step": 28692 + }, + { + "epoch": 0.8508436378732616, + "grad_norm": 0.07800030708312988, + "learning_rate": 5.4994439795044535e-05, + "loss": 2.5286, + "step": 28693 + }, + { + "epoch": 0.8508732912255731, + "grad_norm": 0.08338995277881622, + "learning_rate": 5.4972989405927875e-05, + "loss": 2.538, + "step": 28694 + }, + { + "epoch": 0.8509029445778845, + "grad_norm": 0.07575064897537231, + "learning_rate": 5.4951542957601856e-05, + "loss": 2.5587, + "step": 28695 + }, + { + "epoch": 0.850932597930196, + "grad_norm": 0.07699909061193466, + "learning_rate": 5.493010045025626e-05, + "loss": 2.5447, + "step": 28696 + }, + { + "epoch": 0.8509622512825075, + "grad_norm": 0.08074299991130829, + "learning_rate": 5.4908661884081e-05, + "loss": 2.5261, + "step": 28697 + }, + { + "epoch": 0.850991904634819, + "grad_norm": 0.08659268915653229, + "learning_rate": 5.488722725926598e-05, + "loss": 2.5606, + "step": 28698 + }, + { + "epoch": 0.8510215579871304, + "grad_norm": 0.07366284728050232, + "learning_rate": 5.486579657600099e-05, + "loss": 2.5005, + "step": 28699 + }, + { + "epoch": 0.851051211339442, + "grad_norm": 0.08839154988527298, + "learning_rate": 5.484436983447572e-05, + "loss": 2.5213, + "step": 28700 + }, + { + "epoch": 0.8510808646917534, + "grad_norm": 0.07899145781993866, + "learning_rate": 5.482294703487989e-05, + "loss": 2.5505, + "step": 28701 + }, + { + "epoch": 0.8511105180440649, + "grad_norm": 0.07824952155351639, + "learning_rate": 5.480152817740336e-05, + "loss": 2.5351, + "step": 28702 + }, + { + "epoch": 0.8511401713963763, + "grad_norm": 0.08597349375486374, + "learning_rate": 5.478011326223587e-05, + "loss": 2.5391, + "step": 28703 + }, + { + "epoch": 0.8511698247486879, + "grad_norm": 0.08190663903951645, + "learning_rate": 5.475870228956675e-05, + "loss": 2.5204, + "step": 28704 + }, + { + "epoch": 0.8511994781009993, + "grad_norm": 0.08836600184440613, + "learning_rate": 5.473729525958571e-05, + "loss": 2.5748, + "step": 28705 + }, + { + "epoch": 0.8512291314533108, + "grad_norm": 0.0958533063530922, + "learning_rate": 5.4715892172482404e-05, + "loss": 2.5848, + "step": 28706 + }, + { + "epoch": 0.8512587848056222, + "grad_norm": 0.08114736527204514, + "learning_rate": 5.469449302844631e-05, + "loss": 2.5468, + "step": 28707 + }, + { + "epoch": 0.8512884381579338, + "grad_norm": 0.083012156188488, + "learning_rate": 5.467309782766688e-05, + "loss": 2.5355, + "step": 28708 + }, + { + "epoch": 0.8513180915102452, + "grad_norm": 0.0860368013381958, + "learning_rate": 5.4651706570333637e-05, + "loss": 2.5638, + "step": 28709 + }, + { + "epoch": 0.8513477448625567, + "grad_norm": 0.08806879818439484, + "learning_rate": 5.463031925663598e-05, + "loss": 2.5632, + "step": 28710 + }, + { + "epoch": 0.8513773982148682, + "grad_norm": 0.08364274352788925, + "learning_rate": 5.4608935886763245e-05, + "loss": 2.5509, + "step": 28711 + }, + { + "epoch": 0.8514070515671797, + "grad_norm": 0.08612271398305893, + "learning_rate": 5.4587556460904906e-05, + "loss": 2.5473, + "step": 28712 + }, + { + "epoch": 0.8514367049194912, + "grad_norm": 0.08271550387144089, + "learning_rate": 5.456618097925015e-05, + "loss": 2.5768, + "step": 28713 + }, + { + "epoch": 0.8514663582718026, + "grad_norm": 0.07718218863010406, + "learning_rate": 5.454480944198836e-05, + "loss": 2.5271, + "step": 28714 + }, + { + "epoch": 0.8514960116241141, + "grad_norm": 0.08242157846689224, + "learning_rate": 5.452344184930869e-05, + "loss": 2.566, + "step": 28715 + }, + { + "epoch": 0.8515256649764256, + "grad_norm": 0.08390339463949203, + "learning_rate": 5.450207820140046e-05, + "loss": 2.5559, + "step": 28716 + }, + { + "epoch": 0.8515553183287371, + "grad_norm": 0.07944200932979584, + "learning_rate": 5.4480718498452764e-05, + "loss": 2.5248, + "step": 28717 + }, + { + "epoch": 0.8515849716810485, + "grad_norm": 0.07845073938369751, + "learning_rate": 5.445936274065477e-05, + "loss": 2.5742, + "step": 28718 + }, + { + "epoch": 0.85161462503336, + "grad_norm": 0.08242326974868774, + "learning_rate": 5.443801092819567e-05, + "loss": 2.535, + "step": 28719 + }, + { + "epoch": 0.8516442783856715, + "grad_norm": 0.08294326812028885, + "learning_rate": 5.441666306126436e-05, + "loss": 2.5531, + "step": 28720 + }, + { + "epoch": 0.851673931737983, + "grad_norm": 0.08099059760570526, + "learning_rate": 5.439531914005002e-05, + "loss": 2.5395, + "step": 28721 + }, + { + "epoch": 0.8517035850902944, + "grad_norm": 0.07750170677900314, + "learning_rate": 5.437397916474168e-05, + "loss": 2.5738, + "step": 28722 + }, + { + "epoch": 0.851733238442606, + "grad_norm": 0.08350138366222382, + "learning_rate": 5.435264313552818e-05, + "loss": 2.5417, + "step": 28723 + }, + { + "epoch": 0.8517628917949174, + "grad_norm": 0.08091738820075989, + "learning_rate": 5.433131105259853e-05, + "loss": 2.5488, + "step": 28724 + }, + { + "epoch": 0.8517925451472289, + "grad_norm": 0.08154696226119995, + "learning_rate": 5.430998291614159e-05, + "loss": 2.5545, + "step": 28725 + }, + { + "epoch": 0.8518221984995403, + "grad_norm": 0.08042189478874207, + "learning_rate": 5.428865872634631e-05, + "loss": 2.5712, + "step": 28726 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.08031868934631348, + "learning_rate": 5.426733848340143e-05, + "loss": 2.5614, + "step": 28727 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 0.08275981992483139, + "learning_rate": 5.42460221874958e-05, + "loss": 2.5281, + "step": 28728 + }, + { + "epoch": 0.8519111585564748, + "grad_norm": 0.08199574053287506, + "learning_rate": 5.422470983881811e-05, + "loss": 2.5824, + "step": 28729 + }, + { + "epoch": 0.8519408119087862, + "grad_norm": 0.08529409766197205, + "learning_rate": 5.420340143755714e-05, + "loss": 2.5891, + "step": 28730 + }, + { + "epoch": 0.8519704652610978, + "grad_norm": 0.08322937041521072, + "learning_rate": 5.418209698390164e-05, + "loss": 2.5345, + "step": 28731 + }, + { + "epoch": 0.8520001186134093, + "grad_norm": 0.07534948736429214, + "learning_rate": 5.416079647804012e-05, + "loss": 2.5672, + "step": 28732 + }, + { + "epoch": 0.8520297719657207, + "grad_norm": 0.08213591575622559, + "learning_rate": 5.4139499920161316e-05, + "loss": 2.5609, + "step": 28733 + }, + { + "epoch": 0.8520594253180322, + "grad_norm": 0.08241865038871765, + "learning_rate": 5.411820731045375e-05, + "loss": 2.5551, + "step": 28734 + }, + { + "epoch": 0.8520890786703437, + "grad_norm": 0.07929795235395432, + "learning_rate": 5.409691864910599e-05, + "loss": 2.5667, + "step": 28735 + }, + { + "epoch": 0.8521187320226552, + "grad_norm": 0.0800456628203392, + "learning_rate": 5.4075633936306545e-05, + "loss": 2.5494, + "step": 28736 + }, + { + "epoch": 0.8521483853749666, + "grad_norm": 0.08094155043363571, + "learning_rate": 5.4054353172243994e-05, + "loss": 2.5197, + "step": 28737 + }, + { + "epoch": 0.8521780387272782, + "grad_norm": 0.08139771223068237, + "learning_rate": 5.4033076357106635e-05, + "loss": 2.5472, + "step": 28738 + }, + { + "epoch": 0.8522076920795896, + "grad_norm": 0.08868846297264099, + "learning_rate": 5.401180349108292e-05, + "loss": 2.5377, + "step": 28739 + }, + { + "epoch": 0.8522373454319011, + "grad_norm": 0.08109723776578903, + "learning_rate": 5.399053457436115e-05, + "loss": 2.5297, + "step": 28740 + }, + { + "epoch": 0.8522669987842125, + "grad_norm": 0.0809343084692955, + "learning_rate": 5.396926960712983e-05, + "loss": 2.5344, + "step": 28741 + }, + { + "epoch": 0.8522966521365241, + "grad_norm": 0.08081565052270889, + "learning_rate": 5.3948008589577155e-05, + "loss": 2.5483, + "step": 28742 + }, + { + "epoch": 0.8523263054888355, + "grad_norm": 0.08379554748535156, + "learning_rate": 5.392675152189147e-05, + "loss": 2.5465, + "step": 28743 + }, + { + "epoch": 0.852355958841147, + "grad_norm": 0.07820355147123337, + "learning_rate": 5.390549840426101e-05, + "loss": 2.5733, + "step": 28744 + }, + { + "epoch": 0.8523856121934584, + "grad_norm": 0.08722971379756927, + "learning_rate": 5.388424923687385e-05, + "loss": 2.5562, + "step": 28745 + }, + { + "epoch": 0.85241526554577, + "grad_norm": 0.07806457579135895, + "learning_rate": 5.3863004019918285e-05, + "loss": 2.5177, + "step": 28746 + }, + { + "epoch": 0.8524449188980814, + "grad_norm": 0.07531008124351501, + "learning_rate": 5.384176275358249e-05, + "loss": 2.5448, + "step": 28747 + }, + { + "epoch": 0.8524745722503929, + "grad_norm": 0.08767509460449219, + "learning_rate": 5.382052543805438e-05, + "loss": 2.5394, + "step": 28748 + }, + { + "epoch": 0.8525042256027043, + "grad_norm": 0.08141084760427475, + "learning_rate": 5.379929207352208e-05, + "loss": 2.5537, + "step": 28749 + }, + { + "epoch": 0.8525338789550159, + "grad_norm": 0.08213862776756287, + "learning_rate": 5.377806266017365e-05, + "loss": 2.5222, + "step": 28750 + }, + { + "epoch": 0.8525635323073274, + "grad_norm": 0.08711922913789749, + "learning_rate": 5.375683719819707e-05, + "loss": 2.561, + "step": 28751 + }, + { + "epoch": 0.8525931856596388, + "grad_norm": 0.07892433553934097, + "learning_rate": 5.373561568778029e-05, + "loss": 2.5488, + "step": 28752 + }, + { + "epoch": 0.8526228390119504, + "grad_norm": 0.08320026844739914, + "learning_rate": 5.371439812911111e-05, + "loss": 2.5457, + "step": 28753 + }, + { + "epoch": 0.8526524923642618, + "grad_norm": 0.07901917397975922, + "learning_rate": 5.3693184522377645e-05, + "loss": 2.5458, + "step": 28754 + }, + { + "epoch": 0.8526821457165733, + "grad_norm": 0.08108626306056976, + "learning_rate": 5.367197486776765e-05, + "loss": 2.5499, + "step": 28755 + }, + { + "epoch": 0.8527117990688847, + "grad_norm": 0.0772503986954689, + "learning_rate": 5.365076916546896e-05, + "loss": 2.5584, + "step": 28756 + }, + { + "epoch": 0.8527414524211963, + "grad_norm": 0.07917366176843643, + "learning_rate": 5.3629567415669264e-05, + "loss": 2.5969, + "step": 28757 + }, + { + "epoch": 0.8527711057735077, + "grad_norm": 0.08129365742206573, + "learning_rate": 5.360836961855653e-05, + "loss": 2.5645, + "step": 28758 + }, + { + "epoch": 0.8528007591258192, + "grad_norm": 0.0779869556427002, + "learning_rate": 5.3587175774318156e-05, + "loss": 2.5191, + "step": 28759 + }, + { + "epoch": 0.8528304124781306, + "grad_norm": 0.07725077122449875, + "learning_rate": 5.356598588314199e-05, + "loss": 2.5568, + "step": 28760 + }, + { + "epoch": 0.8528600658304422, + "grad_norm": 0.0835103914141655, + "learning_rate": 5.354479994521566e-05, + "loss": 2.5773, + "step": 28761 + }, + { + "epoch": 0.8528897191827536, + "grad_norm": 0.07872803509235382, + "learning_rate": 5.352361796072675e-05, + "loss": 2.5527, + "step": 28762 + }, + { + "epoch": 0.8529193725350651, + "grad_norm": 0.08418457955121994, + "learning_rate": 5.350243992986281e-05, + "loss": 2.5591, + "step": 28763 + }, + { + "epoch": 0.8529490258873765, + "grad_norm": 0.07929026335477829, + "learning_rate": 5.3481265852811436e-05, + "loss": 2.5284, + "step": 28764 + }, + { + "epoch": 0.8529786792396881, + "grad_norm": 0.07767212390899658, + "learning_rate": 5.346009572976007e-05, + "loss": 2.5123, + "step": 28765 + }, + { + "epoch": 0.8530083325919995, + "grad_norm": 0.08506613224744797, + "learning_rate": 5.343892956089613e-05, + "loss": 2.5417, + "step": 28766 + }, + { + "epoch": 0.853037985944311, + "grad_norm": 0.07756637036800385, + "learning_rate": 5.341776734640719e-05, + "loss": 2.5481, + "step": 28767 + }, + { + "epoch": 0.8530676392966224, + "grad_norm": 0.07922524958848953, + "learning_rate": 5.3396609086480696e-05, + "loss": 2.5469, + "step": 28768 + }, + { + "epoch": 0.853097292648934, + "grad_norm": 0.08108837157487869, + "learning_rate": 5.337545478130379e-05, + "loss": 2.538, + "step": 28769 + }, + { + "epoch": 0.8531269460012454, + "grad_norm": 0.08401147276163101, + "learning_rate": 5.3354304431063926e-05, + "loss": 2.5234, + "step": 28770 + }, + { + "epoch": 0.8531565993535569, + "grad_norm": 0.07709815353155136, + "learning_rate": 5.333315803594829e-05, + "loss": 2.5111, + "step": 28771 + }, + { + "epoch": 0.8531862527058685, + "grad_norm": 0.08680792897939682, + "learning_rate": 5.3312015596144236e-05, + "loss": 2.5451, + "step": 28772 + }, + { + "epoch": 0.8532159060581799, + "grad_norm": 0.08451178669929504, + "learning_rate": 5.3290877111839e-05, + "loss": 2.5648, + "step": 28773 + }, + { + "epoch": 0.8532455594104914, + "grad_norm": 0.08450299501419067, + "learning_rate": 5.32697425832197e-05, + "loss": 2.5542, + "step": 28774 + }, + { + "epoch": 0.8532752127628028, + "grad_norm": 0.08215568959712982, + "learning_rate": 5.3248612010473484e-05, + "loss": 2.5192, + "step": 28775 + }, + { + "epoch": 0.8533048661151144, + "grad_norm": 0.07945528626441956, + "learning_rate": 5.3227485393787525e-05, + "loss": 2.5222, + "step": 28776 + }, + { + "epoch": 0.8533345194674258, + "grad_norm": 0.07513034343719482, + "learning_rate": 5.3206362733348836e-05, + "loss": 2.538, + "step": 28777 + }, + { + "epoch": 0.8533641728197373, + "grad_norm": 0.0814477726817131, + "learning_rate": 5.3185244029344495e-05, + "loss": 2.551, + "step": 28778 + }, + { + "epoch": 0.8533938261720487, + "grad_norm": 0.07693392783403397, + "learning_rate": 5.316412928196157e-05, + "loss": 2.5123, + "step": 28779 + }, + { + "epoch": 0.8534234795243603, + "grad_norm": 0.07944103330373764, + "learning_rate": 5.314301849138692e-05, + "loss": 2.5525, + "step": 28780 + }, + { + "epoch": 0.8534531328766717, + "grad_norm": 0.0843672901391983, + "learning_rate": 5.3121911657807556e-05, + "loss": 2.5663, + "step": 28781 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 0.08124389499425888, + "learning_rate": 5.3100808781410384e-05, + "loss": 2.5992, + "step": 28782 + }, + { + "epoch": 0.8535124395812946, + "grad_norm": 0.07952994108200073, + "learning_rate": 5.307970986238225e-05, + "loss": 2.5185, + "step": 28783 + }, + { + "epoch": 0.8535420929336062, + "grad_norm": 0.0869787409901619, + "learning_rate": 5.305861490091002e-05, + "loss": 2.5699, + "step": 28784 + }, + { + "epoch": 0.8535717462859176, + "grad_norm": 0.08184193074703217, + "learning_rate": 5.303752389718042e-05, + "loss": 2.5135, + "step": 28785 + }, + { + "epoch": 0.8536013996382291, + "grad_norm": 0.07759802043437958, + "learning_rate": 5.30164368513803e-05, + "loss": 2.5521, + "step": 28786 + }, + { + "epoch": 0.8536310529905405, + "grad_norm": 0.07682034373283386, + "learning_rate": 5.299535376369635e-05, + "loss": 2.5967, + "step": 28787 + }, + { + "epoch": 0.8536607063428521, + "grad_norm": 0.08261812478303909, + "learning_rate": 5.2974274634315314e-05, + "loss": 2.5223, + "step": 28788 + }, + { + "epoch": 0.8536903596951635, + "grad_norm": 0.07860292494297028, + "learning_rate": 5.295319946342375e-05, + "loss": 2.5614, + "step": 28789 + }, + { + "epoch": 0.853720013047475, + "grad_norm": 0.07282102853059769, + "learning_rate": 5.293212825120835e-05, + "loss": 2.5467, + "step": 28790 + }, + { + "epoch": 0.8537496663997864, + "grad_norm": 0.08054891973733902, + "learning_rate": 5.29110609978557e-05, + "loss": 2.52, + "step": 28791 + }, + { + "epoch": 0.853779319752098, + "grad_norm": 0.07884884625673294, + "learning_rate": 5.288999770355235e-05, + "loss": 2.5234, + "step": 28792 + }, + { + "epoch": 0.8538089731044095, + "grad_norm": 0.07651228457689285, + "learning_rate": 5.286893836848483e-05, + "loss": 2.5567, + "step": 28793 + }, + { + "epoch": 0.8538386264567209, + "grad_norm": 0.08337510377168655, + "learning_rate": 5.284788299283955e-05, + "loss": 2.5255, + "step": 28794 + }, + { + "epoch": 0.8538682798090325, + "grad_norm": 0.0814841240644455, + "learning_rate": 5.282683157680307e-05, + "loss": 2.5253, + "step": 28795 + }, + { + "epoch": 0.8538979331613439, + "grad_norm": 0.0756082758307457, + "learning_rate": 5.280578412056175e-05, + "loss": 2.5361, + "step": 28796 + }, + { + "epoch": 0.8539275865136554, + "grad_norm": 0.08356533944606781, + "learning_rate": 5.2784740624301994e-05, + "loss": 2.5418, + "step": 28797 + }, + { + "epoch": 0.8539572398659668, + "grad_norm": 0.08731387555599213, + "learning_rate": 5.2763701088210045e-05, + "loss": 2.5099, + "step": 28798 + }, + { + "epoch": 0.8539868932182784, + "grad_norm": 0.07687431573867798, + "learning_rate": 5.2742665512472365e-05, + "loss": 2.5685, + "step": 28799 + }, + { + "epoch": 0.8540165465705898, + "grad_norm": 0.0811733677983284, + "learning_rate": 5.272163389727514e-05, + "loss": 2.5342, + "step": 28800 + }, + { + "epoch": 0.8540461999229013, + "grad_norm": 0.08177157491445541, + "learning_rate": 5.2700606242804594e-05, + "loss": 2.5686, + "step": 28801 + }, + { + "epoch": 0.8540758532752127, + "grad_norm": 0.08264882117509842, + "learning_rate": 5.267958254924698e-05, + "loss": 2.554, + "step": 28802 + }, + { + "epoch": 0.8541055066275243, + "grad_norm": 0.07746566087007523, + "learning_rate": 5.2658562816788545e-05, + "loss": 2.5525, + "step": 28803 + }, + { + "epoch": 0.8541351599798357, + "grad_norm": 0.07549934834241867, + "learning_rate": 5.2637547045615185e-05, + "loss": 2.4966, + "step": 28804 + }, + { + "epoch": 0.8541648133321472, + "grad_norm": 0.0779355838894844, + "learning_rate": 5.2616535235913085e-05, + "loss": 2.5039, + "step": 28805 + }, + { + "epoch": 0.8541944666844586, + "grad_norm": 0.08184551447629929, + "learning_rate": 5.259552738786844e-05, + "loss": 2.5426, + "step": 28806 + }, + { + "epoch": 0.8542241200367702, + "grad_norm": 0.07561972737312317, + "learning_rate": 5.2574523501667134e-05, + "loss": 2.5648, + "step": 28807 + }, + { + "epoch": 0.8542537733890816, + "grad_norm": 0.07983986288309097, + "learning_rate": 5.255352357749532e-05, + "loss": 2.5476, + "step": 28808 + }, + { + "epoch": 0.8542834267413931, + "grad_norm": 0.07670600712299347, + "learning_rate": 5.253252761553878e-05, + "loss": 2.5505, + "step": 28809 + }, + { + "epoch": 0.8543130800937045, + "grad_norm": 0.08005604147911072, + "learning_rate": 5.25115356159836e-05, + "loss": 2.5527, + "step": 28810 + }, + { + "epoch": 0.8543427334460161, + "grad_norm": 0.07153765857219696, + "learning_rate": 5.2490547579015504e-05, + "loss": 2.5538, + "step": 28811 + }, + { + "epoch": 0.8543723867983275, + "grad_norm": 0.07752565294504166, + "learning_rate": 5.246956350482046e-05, + "loss": 2.548, + "step": 28812 + }, + { + "epoch": 0.854402040150639, + "grad_norm": 0.0791163519024849, + "learning_rate": 5.2448583393584326e-05, + "loss": 2.5252, + "step": 28813 + }, + { + "epoch": 0.8544316935029506, + "grad_norm": 0.07482237368822098, + "learning_rate": 5.2427607245492725e-05, + "loss": 2.5372, + "step": 28814 + }, + { + "epoch": 0.854461346855262, + "grad_norm": 0.0800793245434761, + "learning_rate": 5.240663506073151e-05, + "loss": 2.5707, + "step": 28815 + }, + { + "epoch": 0.8544910002075735, + "grad_norm": 0.08075135946273804, + "learning_rate": 5.238566683948631e-05, + "loss": 2.5633, + "step": 28816 + }, + { + "epoch": 0.8545206535598849, + "grad_norm": 0.07918576896190643, + "learning_rate": 5.236470258194292e-05, + "loss": 2.5198, + "step": 28817 + }, + { + "epoch": 0.8545503069121965, + "grad_norm": 0.08665626496076584, + "learning_rate": 5.234374228828681e-05, + "loss": 2.5762, + "step": 28818 + }, + { + "epoch": 0.8545799602645079, + "grad_norm": 0.07823614776134491, + "learning_rate": 5.2322785958703764e-05, + "loss": 2.5347, + "step": 28819 + }, + { + "epoch": 0.8546096136168194, + "grad_norm": 0.07334988564252853, + "learning_rate": 5.2301833593379314e-05, + "loss": 2.5505, + "step": 28820 + }, + { + "epoch": 0.8546392669691308, + "grad_norm": 0.08472925424575806, + "learning_rate": 5.228088519249902e-05, + "loss": 2.5604, + "step": 28821 + }, + { + "epoch": 0.8546689203214424, + "grad_norm": 0.0802416205406189, + "learning_rate": 5.2259940756248295e-05, + "loss": 2.5357, + "step": 28822 + }, + { + "epoch": 0.8546985736737538, + "grad_norm": 0.07513678073883057, + "learning_rate": 5.223900028481271e-05, + "loss": 2.5484, + "step": 28823 + }, + { + "epoch": 0.8547282270260653, + "grad_norm": 0.08060875535011292, + "learning_rate": 5.2218063778377565e-05, + "loss": 2.5596, + "step": 28824 + }, + { + "epoch": 0.8547578803783767, + "grad_norm": 0.07564463466405869, + "learning_rate": 5.219713123712838e-05, + "loss": 2.5227, + "step": 28825 + }, + { + "epoch": 0.8547875337306883, + "grad_norm": 0.07893229275941849, + "learning_rate": 5.2176202661250394e-05, + "loss": 2.5268, + "step": 28826 + }, + { + "epoch": 0.8548171870829997, + "grad_norm": 0.0754174068570137, + "learning_rate": 5.215527805092901e-05, + "loss": 2.5311, + "step": 28827 + }, + { + "epoch": 0.8548468404353112, + "grad_norm": 0.0739981159567833, + "learning_rate": 5.213435740634953e-05, + "loss": 2.5209, + "step": 28828 + }, + { + "epoch": 0.8548764937876226, + "grad_norm": 0.07843151688575745, + "learning_rate": 5.211344072769719e-05, + "loss": 2.5501, + "step": 28829 + }, + { + "epoch": 0.8549061471399342, + "grad_norm": 0.08133430033922195, + "learning_rate": 5.209252801515718e-05, + "loss": 2.5452, + "step": 28830 + }, + { + "epoch": 0.8549358004922456, + "grad_norm": 0.08047499507665634, + "learning_rate": 5.207161926891468e-05, + "loss": 2.5792, + "step": 28831 + }, + { + "epoch": 0.8549654538445571, + "grad_norm": 0.08371331542730331, + "learning_rate": 5.205071448915494e-05, + "loss": 2.5406, + "step": 28832 + }, + { + "epoch": 0.8549951071968686, + "grad_norm": 0.07886746525764465, + "learning_rate": 5.202981367606302e-05, + "loss": 2.5212, + "step": 28833 + }, + { + "epoch": 0.8550247605491801, + "grad_norm": 0.07910817116498947, + "learning_rate": 5.200891682982406e-05, + "loss": 2.5931, + "step": 28834 + }, + { + "epoch": 0.8550544139014916, + "grad_norm": 0.07682736217975616, + "learning_rate": 5.198802395062296e-05, + "loss": 2.5417, + "step": 28835 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 0.07663486897945404, + "learning_rate": 5.196713503864481e-05, + "loss": 2.5655, + "step": 28836 + }, + { + "epoch": 0.8551137206061146, + "grad_norm": 0.08201590925455093, + "learning_rate": 5.194625009407461e-05, + "loss": 2.5325, + "step": 28837 + }, + { + "epoch": 0.855143373958426, + "grad_norm": 0.07753206789493561, + "learning_rate": 5.192536911709722e-05, + "loss": 2.5698, + "step": 28838 + }, + { + "epoch": 0.8551730273107375, + "grad_norm": 0.07826706767082214, + "learning_rate": 5.190449210789766e-05, + "loss": 2.5296, + "step": 28839 + }, + { + "epoch": 0.8552026806630489, + "grad_norm": 0.09031572937965393, + "learning_rate": 5.188361906666067e-05, + "loss": 2.5153, + "step": 28840 + }, + { + "epoch": 0.8552323340153605, + "grad_norm": 0.08435675501823425, + "learning_rate": 5.1862749993571214e-05, + "loss": 2.5562, + "step": 28841 + }, + { + "epoch": 0.8552619873676719, + "grad_norm": 0.07427453249692917, + "learning_rate": 5.184188488881397e-05, + "loss": 2.5118, + "step": 28842 + }, + { + "epoch": 0.8552916407199834, + "grad_norm": 0.08065982908010483, + "learning_rate": 5.182102375257369e-05, + "loss": 2.5783, + "step": 28843 + }, + { + "epoch": 0.8553212940722948, + "grad_norm": 0.07881225645542145, + "learning_rate": 5.180016658503539e-05, + "loss": 2.5393, + "step": 28844 + }, + { + "epoch": 0.8553509474246064, + "grad_norm": 0.08148413896560669, + "learning_rate": 5.177931338638342e-05, + "loss": 2.5381, + "step": 28845 + }, + { + "epoch": 0.8553806007769178, + "grad_norm": 0.08253645896911621, + "learning_rate": 5.1758464156802574e-05, + "loss": 2.5447, + "step": 28846 + }, + { + "epoch": 0.8554102541292293, + "grad_norm": 0.08754269033670425, + "learning_rate": 5.173761889647749e-05, + "loss": 2.5764, + "step": 28847 + }, + { + "epoch": 0.8554399074815408, + "grad_norm": 0.08310125023126602, + "learning_rate": 5.171677760559268e-05, + "loss": 2.5396, + "step": 28848 + }, + { + "epoch": 0.8554695608338523, + "grad_norm": 0.07756157219409943, + "learning_rate": 5.169594028433283e-05, + "loss": 2.5342, + "step": 28849 + }, + { + "epoch": 0.8554992141861637, + "grad_norm": 0.0868501141667366, + "learning_rate": 5.16751069328823e-05, + "loss": 2.5654, + "step": 28850 + }, + { + "epoch": 0.8555288675384752, + "grad_norm": 0.08230778574943542, + "learning_rate": 5.165427755142571e-05, + "loss": 2.5386, + "step": 28851 + }, + { + "epoch": 0.8555585208907867, + "grad_norm": 0.0775914266705513, + "learning_rate": 5.163345214014742e-05, + "loss": 2.5038, + "step": 28852 + }, + { + "epoch": 0.8555881742430982, + "grad_norm": 0.0838107019662857, + "learning_rate": 5.161263069923189e-05, + "loss": 2.5513, + "step": 28853 + }, + { + "epoch": 0.8556178275954096, + "grad_norm": 0.07901373505592346, + "learning_rate": 5.159181322886347e-05, + "loss": 2.5112, + "step": 28854 + }, + { + "epoch": 0.8556474809477211, + "grad_norm": 0.08486982434988022, + "learning_rate": 5.157099972922652e-05, + "loss": 2.5224, + "step": 28855 + }, + { + "epoch": 0.8556771343000327, + "grad_norm": 0.08559694141149521, + "learning_rate": 5.155019020050533e-05, + "loss": 2.5414, + "step": 28856 + }, + { + "epoch": 0.8557067876523441, + "grad_norm": 0.08670850843191147, + "learning_rate": 5.152938464288415e-05, + "loss": 2.5379, + "step": 28857 + }, + { + "epoch": 0.8557364410046556, + "grad_norm": 0.09018174558877945, + "learning_rate": 5.1508583056547266e-05, + "loss": 2.509, + "step": 28858 + }, + { + "epoch": 0.855766094356967, + "grad_norm": 0.08724970370531082, + "learning_rate": 5.148778544167887e-05, + "loss": 2.5719, + "step": 28859 + }, + { + "epoch": 0.8557957477092786, + "grad_norm": 0.08968064188957214, + "learning_rate": 5.1466991798463145e-05, + "loss": 2.5742, + "step": 28860 + }, + { + "epoch": 0.85582540106159, + "grad_norm": 0.093843474984169, + "learning_rate": 5.1446202127084176e-05, + "loss": 2.5471, + "step": 28861 + }, + { + "epoch": 0.8558550544139015, + "grad_norm": 0.0792151615023613, + "learning_rate": 5.142541642772608e-05, + "loss": 2.5244, + "step": 28862 + }, + { + "epoch": 0.855884707766213, + "grad_norm": 0.08043905347585678, + "learning_rate": 5.140463470057294e-05, + "loss": 2.5742, + "step": 28863 + }, + { + "epoch": 0.8559143611185245, + "grad_norm": 0.08852769434452057, + "learning_rate": 5.138385694580871e-05, + "loss": 2.4998, + "step": 28864 + }, + { + "epoch": 0.8559440144708359, + "grad_norm": 0.08144263923168182, + "learning_rate": 5.136308316361749e-05, + "loss": 2.5517, + "step": 28865 + }, + { + "epoch": 0.8559736678231474, + "grad_norm": 0.07830922305583954, + "learning_rate": 5.1342313354183154e-05, + "loss": 2.5406, + "step": 28866 + }, + { + "epoch": 0.8560033211754589, + "grad_norm": 0.09515143930912018, + "learning_rate": 5.132154751768964e-05, + "loss": 2.5131, + "step": 28867 + }, + { + "epoch": 0.8560329745277704, + "grad_norm": 0.0812729224562645, + "learning_rate": 5.130078565432089e-05, + "loss": 2.5216, + "step": 28868 + }, + { + "epoch": 0.8560626278800818, + "grad_norm": 0.07631179690361023, + "learning_rate": 5.128002776426055e-05, + "loss": 2.5393, + "step": 28869 + }, + { + "epoch": 0.8560922812323933, + "grad_norm": 0.15705707669258118, + "learning_rate": 5.125927384769269e-05, + "loss": 2.5336, + "step": 28870 + }, + { + "epoch": 0.8561219345847048, + "grad_norm": 0.08429823070764542, + "learning_rate": 5.123852390480099e-05, + "loss": 2.542, + "step": 28871 + }, + { + "epoch": 0.8561515879370163, + "grad_norm": 0.08023486286401749, + "learning_rate": 5.121777793576915e-05, + "loss": 2.5467, + "step": 28872 + }, + { + "epoch": 0.8561812412893277, + "grad_norm": 0.07957357168197632, + "learning_rate": 5.1197035940780955e-05, + "loss": 2.5367, + "step": 28873 + }, + { + "epoch": 0.8562108946416392, + "grad_norm": 0.14656184613704681, + "learning_rate": 5.117629792002004e-05, + "loss": 2.56, + "step": 28874 + }, + { + "epoch": 0.8562405479939507, + "grad_norm": 0.07923031598329544, + "learning_rate": 5.115556387367004e-05, + "loss": 2.5008, + "step": 28875 + }, + { + "epoch": 0.8562702013462622, + "grad_norm": 0.08056268095970154, + "learning_rate": 5.113483380191458e-05, + "loss": 2.5254, + "step": 28876 + }, + { + "epoch": 0.8562998546985737, + "grad_norm": 0.08090148866176605, + "learning_rate": 5.1114107704937185e-05, + "loss": 2.5427, + "step": 28877 + }, + { + "epoch": 0.8563295080508851, + "grad_norm": 0.0796017274260521, + "learning_rate": 5.109338558292143e-05, + "loss": 2.5593, + "step": 28878 + }, + { + "epoch": 0.8563591614031967, + "grad_norm": 0.08237259089946747, + "learning_rate": 5.107266743605088e-05, + "loss": 2.5649, + "step": 28879 + }, + { + "epoch": 0.8563888147555081, + "grad_norm": 0.08314159512519836, + "learning_rate": 5.105195326450884e-05, + "loss": 2.5537, + "step": 28880 + }, + { + "epoch": 0.8564184681078196, + "grad_norm": 0.07952114939689636, + "learning_rate": 5.103124306847884e-05, + "loss": 2.5603, + "step": 28881 + }, + { + "epoch": 0.856448121460131, + "grad_norm": 0.0772591382265091, + "learning_rate": 5.1010536848144106e-05, + "loss": 2.5432, + "step": 28882 + }, + { + "epoch": 0.8564777748124426, + "grad_norm": 0.08212956041097641, + "learning_rate": 5.0989834603688226e-05, + "loss": 2.5055, + "step": 28883 + }, + { + "epoch": 0.856507428164754, + "grad_norm": 0.08288168907165527, + "learning_rate": 5.096913633529449e-05, + "loss": 2.5304, + "step": 28884 + }, + { + "epoch": 0.8565370815170655, + "grad_norm": 0.0784020647406578, + "learning_rate": 5.094844204314608e-05, + "loss": 2.5421, + "step": 28885 + }, + { + "epoch": 0.856566734869377, + "grad_norm": 0.08128125220537186, + "learning_rate": 5.092775172742631e-05, + "loss": 2.6025, + "step": 28886 + }, + { + "epoch": 0.8565963882216885, + "grad_norm": 0.08572317659854889, + "learning_rate": 5.0907065388318355e-05, + "loss": 2.4952, + "step": 28887 + }, + { + "epoch": 0.8566260415739999, + "grad_norm": 0.08807623386383057, + "learning_rate": 5.088638302600546e-05, + "loss": 2.5245, + "step": 28888 + }, + { + "epoch": 0.8566556949263114, + "grad_norm": 0.0796351507306099, + "learning_rate": 5.0865704640670806e-05, + "loss": 2.5455, + "step": 28889 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 0.08537889271974564, + "learning_rate": 5.084503023249737e-05, + "loss": 2.5712, + "step": 28890 + }, + { + "epoch": 0.8567150016309344, + "grad_norm": 0.08156954497098923, + "learning_rate": 5.0824359801668216e-05, + "loss": 2.5381, + "step": 28891 + }, + { + "epoch": 0.8567446549832458, + "grad_norm": 0.0823088064789772, + "learning_rate": 5.080369334836649e-05, + "loss": 2.5768, + "step": 28892 + }, + { + "epoch": 0.8567743083355573, + "grad_norm": 0.08290308713912964, + "learning_rate": 5.0783030872775194e-05, + "loss": 2.5613, + "step": 28893 + }, + { + "epoch": 0.8568039616878688, + "grad_norm": 0.07585231214761734, + "learning_rate": 5.0762372375077245e-05, + "loss": 2.5823, + "step": 28894 + }, + { + "epoch": 0.8568336150401803, + "grad_norm": 0.07840199023485184, + "learning_rate": 5.0741717855455506e-05, + "loss": 2.5506, + "step": 28895 + }, + { + "epoch": 0.8568632683924917, + "grad_norm": 0.08723090589046478, + "learning_rate": 5.072106731409304e-05, + "loss": 2.5458, + "step": 28896 + }, + { + "epoch": 0.8568929217448032, + "grad_norm": 0.08061035722494125, + "learning_rate": 5.0700420751172705e-05, + "loss": 2.5235, + "step": 28897 + }, + { + "epoch": 0.8569225750971148, + "grad_norm": 0.08246085792779922, + "learning_rate": 5.067977816687719e-05, + "loss": 2.5585, + "step": 28898 + }, + { + "epoch": 0.8569522284494262, + "grad_norm": 0.08759362250566483, + "learning_rate": 5.06591395613894e-05, + "loss": 2.5322, + "step": 28899 + }, + { + "epoch": 0.8569818818017377, + "grad_norm": 0.07414290308952332, + "learning_rate": 5.0638504934892135e-05, + "loss": 2.5648, + "step": 28900 + }, + { + "epoch": 0.8570115351540492, + "grad_norm": 0.08553827553987503, + "learning_rate": 5.0617874287567974e-05, + "loss": 2.5666, + "step": 28901 + }, + { + "epoch": 0.8570411885063607, + "grad_norm": 0.08293857425451279, + "learning_rate": 5.059724761959966e-05, + "loss": 2.53, + "step": 28902 + }, + { + "epoch": 0.8570708418586721, + "grad_norm": 0.07656043767929077, + "learning_rate": 5.057662493116988e-05, + "loss": 2.598, + "step": 28903 + }, + { + "epoch": 0.8571004952109836, + "grad_norm": 0.08081886172294617, + "learning_rate": 5.05560062224612e-05, + "loss": 2.5361, + "step": 28904 + }, + { + "epoch": 0.8571301485632951, + "grad_norm": 0.08331288397312164, + "learning_rate": 5.0535391493656215e-05, + "loss": 2.556, + "step": 28905 + }, + { + "epoch": 0.8571598019156066, + "grad_norm": 0.07862341403961182, + "learning_rate": 5.051478074493748e-05, + "loss": 2.5573, + "step": 28906 + }, + { + "epoch": 0.857189455267918, + "grad_norm": 0.07890986651182175, + "learning_rate": 5.049417397648759e-05, + "loss": 2.5544, + "step": 28907 + }, + { + "epoch": 0.8572191086202295, + "grad_norm": 0.07698801904916763, + "learning_rate": 5.0473571188488776e-05, + "loss": 2.5438, + "step": 28908 + }, + { + "epoch": 0.857248761972541, + "grad_norm": 0.0762876570224762, + "learning_rate": 5.0452972381123785e-05, + "loss": 2.5596, + "step": 28909 + }, + { + "epoch": 0.8572784153248525, + "grad_norm": 0.07340525835752487, + "learning_rate": 5.0432377554574973e-05, + "loss": 2.5645, + "step": 28910 + }, + { + "epoch": 0.8573080686771639, + "grad_norm": 0.07520344853401184, + "learning_rate": 5.041178670902452e-05, + "loss": 2.5399, + "step": 28911 + }, + { + "epoch": 0.8573377220294754, + "grad_norm": 0.07804537564516068, + "learning_rate": 5.039119984465484e-05, + "loss": 2.5473, + "step": 28912 + }, + { + "epoch": 0.8573673753817869, + "grad_norm": 0.07523702830076218, + "learning_rate": 5.0370616961648295e-05, + "loss": 2.541, + "step": 28913 + }, + { + "epoch": 0.8573970287340984, + "grad_norm": 0.07395943999290466, + "learning_rate": 5.035003806018712e-05, + "loss": 2.5292, + "step": 28914 + }, + { + "epoch": 0.8574266820864098, + "grad_norm": 0.08076604455709457, + "learning_rate": 5.032946314045356e-05, + "loss": 2.5489, + "step": 28915 + }, + { + "epoch": 0.8574563354387214, + "grad_norm": 0.07886023074388504, + "learning_rate": 5.030889220262974e-05, + "loss": 2.5309, + "step": 28916 + }, + { + "epoch": 0.8574859887910328, + "grad_norm": 0.0779896154999733, + "learning_rate": 5.028832524689791e-05, + "loss": 2.5303, + "step": 28917 + }, + { + "epoch": 0.8575156421433443, + "grad_norm": 0.07574684172868729, + "learning_rate": 5.02677622734401e-05, + "loss": 2.542, + "step": 28918 + }, + { + "epoch": 0.8575452954956558, + "grad_norm": 0.08448474109172821, + "learning_rate": 5.024720328243848e-05, + "loss": 2.5193, + "step": 28919 + }, + { + "epoch": 0.8575749488479673, + "grad_norm": 0.083407923579216, + "learning_rate": 5.0226648274075083e-05, + "loss": 2.5551, + "step": 28920 + }, + { + "epoch": 0.8576046022002788, + "grad_norm": 0.07640884071588516, + "learning_rate": 5.020609724853192e-05, + "loss": 2.5422, + "step": 28921 + }, + { + "epoch": 0.8576342555525902, + "grad_norm": 0.0825410932302475, + "learning_rate": 5.018555020599097e-05, + "loss": 2.5679, + "step": 28922 + }, + { + "epoch": 0.8576639089049017, + "grad_norm": 0.08106665313243866, + "learning_rate": 5.016500714663419e-05, + "loss": 2.5673, + "step": 28923 + }, + { + "epoch": 0.8576935622572132, + "grad_norm": 0.07730436325073242, + "learning_rate": 5.0144468070643435e-05, + "loss": 2.5454, + "step": 28924 + }, + { + "epoch": 0.8577232156095247, + "grad_norm": 0.08233710378408432, + "learning_rate": 5.012393297820067e-05, + "loss": 2.5753, + "step": 28925 + }, + { + "epoch": 0.8577528689618361, + "grad_norm": 0.07224730402231216, + "learning_rate": 5.01034018694877e-05, + "loss": 2.5505, + "step": 28926 + }, + { + "epoch": 0.8577825223141476, + "grad_norm": 0.07511578500270844, + "learning_rate": 5.008287474468631e-05, + "loss": 2.5438, + "step": 28927 + }, + { + "epoch": 0.8578121756664591, + "grad_norm": 0.0774565115571022, + "learning_rate": 5.0062351603978316e-05, + "loss": 2.5343, + "step": 28928 + }, + { + "epoch": 0.8578418290187706, + "grad_norm": 0.07418090105056763, + "learning_rate": 5.004183244754546e-05, + "loss": 2.5643, + "step": 28929 + }, + { + "epoch": 0.857871482371082, + "grad_norm": 0.07785146683454514, + "learning_rate": 5.002131727556936e-05, + "loss": 2.5547, + "step": 28930 + }, + { + "epoch": 0.8579011357233935, + "grad_norm": 0.07209862023591995, + "learning_rate": 5.000080608823171e-05, + "loss": 2.5551, + "step": 28931 + }, + { + "epoch": 0.857930789075705, + "grad_norm": 0.07290523499250412, + "learning_rate": 4.9980298885714205e-05, + "loss": 2.5656, + "step": 28932 + }, + { + "epoch": 0.8579604424280165, + "grad_norm": 0.07645750045776367, + "learning_rate": 4.9959795668198416e-05, + "loss": 2.5637, + "step": 28933 + }, + { + "epoch": 0.8579900957803279, + "grad_norm": 0.07841897010803223, + "learning_rate": 4.993929643586587e-05, + "loss": 2.5784, + "step": 28934 + }, + { + "epoch": 0.8580197491326395, + "grad_norm": 0.08117052912712097, + "learning_rate": 4.9918801188898074e-05, + "loss": 2.5608, + "step": 28935 + }, + { + "epoch": 0.8580494024849509, + "grad_norm": 0.0790378749370575, + "learning_rate": 4.989830992747657e-05, + "loss": 2.5743, + "step": 28936 + }, + { + "epoch": 0.8580790558372624, + "grad_norm": 0.07628703862428665, + "learning_rate": 4.987782265178281e-05, + "loss": 2.5185, + "step": 28937 + }, + { + "epoch": 0.8581087091895738, + "grad_norm": 0.07964885234832764, + "learning_rate": 4.985733936199815e-05, + "loss": 2.5507, + "step": 28938 + }, + { + "epoch": 0.8581383625418854, + "grad_norm": 0.07710780203342438, + "learning_rate": 4.983686005830407e-05, + "loss": 2.5209, + "step": 28939 + }, + { + "epoch": 0.8581680158941969, + "grad_norm": 0.07936210930347443, + "learning_rate": 4.981638474088179e-05, + "loss": 2.5671, + "step": 28940 + }, + { + "epoch": 0.8581976692465083, + "grad_norm": 0.07918640226125717, + "learning_rate": 4.979591340991274e-05, + "loss": 2.5431, + "step": 28941 + }, + { + "epoch": 0.8582273225988198, + "grad_norm": 0.08268076926469803, + "learning_rate": 4.9775446065578155e-05, + "loss": 2.5633, + "step": 28942 + }, + { + "epoch": 0.8582569759511313, + "grad_norm": 0.08050794154405594, + "learning_rate": 4.975498270805928e-05, + "loss": 2.5293, + "step": 28943 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 0.07817699015140533, + "learning_rate": 4.973452333753742e-05, + "loss": 2.5511, + "step": 28944 + }, + { + "epoch": 0.8583162826557542, + "grad_norm": 0.07802984118461609, + "learning_rate": 4.9714067954193534e-05, + "loss": 2.5611, + "step": 28945 + }, + { + "epoch": 0.8583459360080657, + "grad_norm": 0.07611889392137527, + "learning_rate": 4.9693616558208866e-05, + "loss": 2.5256, + "step": 28946 + }, + { + "epoch": 0.8583755893603772, + "grad_norm": 0.07994291186332703, + "learning_rate": 4.9673169149764444e-05, + "loss": 2.528, + "step": 28947 + }, + { + "epoch": 0.8584052427126887, + "grad_norm": 0.07241040468215942, + "learning_rate": 4.965272572904145e-05, + "loss": 2.5272, + "step": 28948 + }, + { + "epoch": 0.8584348960650001, + "grad_norm": 0.07361050695180893, + "learning_rate": 4.963228629622091e-05, + "loss": 2.5462, + "step": 28949 + }, + { + "epoch": 0.8584645494173117, + "grad_norm": 0.0778416246175766, + "learning_rate": 4.961185085148379e-05, + "loss": 2.5384, + "step": 28950 + }, + { + "epoch": 0.8584942027696231, + "grad_norm": 0.07825420796871185, + "learning_rate": 4.9591419395010996e-05, + "loss": 2.5693, + "step": 28951 + }, + { + "epoch": 0.8585238561219346, + "grad_norm": 0.07735340297222137, + "learning_rate": 4.957099192698355e-05, + "loss": 2.5452, + "step": 28952 + }, + { + "epoch": 0.858553509474246, + "grad_norm": 0.0795486718416214, + "learning_rate": 4.955056844758221e-05, + "loss": 2.5771, + "step": 28953 + }, + { + "epoch": 0.8585831628265576, + "grad_norm": 0.07679007202386856, + "learning_rate": 4.953014895698799e-05, + "loss": 2.584, + "step": 28954 + }, + { + "epoch": 0.858612816178869, + "grad_norm": 0.08163385093212128, + "learning_rate": 4.950973345538168e-05, + "loss": 2.5453, + "step": 28955 + }, + { + "epoch": 0.8586424695311805, + "grad_norm": 0.07699138671159744, + "learning_rate": 4.948932194294387e-05, + "loss": 2.5469, + "step": 28956 + }, + { + "epoch": 0.8586721228834919, + "grad_norm": 0.07536877691745758, + "learning_rate": 4.946891441985552e-05, + "loss": 2.498, + "step": 28957 + }, + { + "epoch": 0.8587017762358035, + "grad_norm": 0.0747716873884201, + "learning_rate": 4.944851088629721e-05, + "loss": 2.4936, + "step": 28958 + }, + { + "epoch": 0.858731429588115, + "grad_norm": 0.11397277563810349, + "learning_rate": 4.942811134244968e-05, + "loss": 2.5521, + "step": 28959 + }, + { + "epoch": 0.8587610829404264, + "grad_norm": 0.07846515625715256, + "learning_rate": 4.940771578849351e-05, + "loss": 2.5492, + "step": 28960 + }, + { + "epoch": 0.8587907362927379, + "grad_norm": 0.07646111398935318, + "learning_rate": 4.93873242246094e-05, + "loss": 2.5605, + "step": 28961 + }, + { + "epoch": 0.8588203896450494, + "grad_norm": 0.0741039365530014, + "learning_rate": 4.936693665097791e-05, + "loss": 2.567, + "step": 28962 + }, + { + "epoch": 0.8588500429973609, + "grad_norm": 0.077946737408638, + "learning_rate": 4.934655306777952e-05, + "loss": 2.5048, + "step": 28963 + }, + { + "epoch": 0.8588796963496723, + "grad_norm": 0.07780389487743378, + "learning_rate": 4.93261734751948e-05, + "loss": 2.5355, + "step": 28964 + }, + { + "epoch": 0.8589093497019838, + "grad_norm": 0.07828207314014435, + "learning_rate": 4.9305797873404224e-05, + "loss": 2.5262, + "step": 28965 + }, + { + "epoch": 0.8589390030542953, + "grad_norm": 0.07507593929767609, + "learning_rate": 4.9285426262588086e-05, + "loss": 2.5409, + "step": 28966 + }, + { + "epoch": 0.8589686564066068, + "grad_norm": 0.07883662730455399, + "learning_rate": 4.926505864292691e-05, + "loss": 2.558, + "step": 28967 + }, + { + "epoch": 0.8589983097589182, + "grad_norm": 0.08084825426340103, + "learning_rate": 4.9244695014600936e-05, + "loss": 2.5484, + "step": 28968 + }, + { + "epoch": 0.8590279631112298, + "grad_norm": 0.07807087898254395, + "learning_rate": 4.9224335377790584e-05, + "loss": 2.5414, + "step": 28969 + }, + { + "epoch": 0.8590576164635412, + "grad_norm": 0.07938718795776367, + "learning_rate": 4.9203979732676155e-05, + "loss": 2.5574, + "step": 28970 + }, + { + "epoch": 0.8590872698158527, + "grad_norm": 0.08168406784534454, + "learning_rate": 4.9183628079437824e-05, + "loss": 2.5696, + "step": 28971 + }, + { + "epoch": 0.8591169231681641, + "grad_norm": 0.07800580561161041, + "learning_rate": 4.916328041825585e-05, + "loss": 2.511, + "step": 28972 + }, + { + "epoch": 0.8591465765204757, + "grad_norm": 0.07853421568870544, + "learning_rate": 4.914293674931031e-05, + "loss": 2.5411, + "step": 28973 + }, + { + "epoch": 0.8591762298727871, + "grad_norm": 0.08117132633924484, + "learning_rate": 4.912259707278155e-05, + "loss": 2.5606, + "step": 28974 + }, + { + "epoch": 0.8592058832250986, + "grad_norm": 0.08058097213506699, + "learning_rate": 4.91022613888496e-05, + "loss": 2.5431, + "step": 28975 + }, + { + "epoch": 0.85923553657741, + "grad_norm": 0.07968439161777496, + "learning_rate": 4.9081929697694596e-05, + "loss": 2.5081, + "step": 28976 + }, + { + "epoch": 0.8592651899297216, + "grad_norm": 0.07919161766767502, + "learning_rate": 4.906160199949644e-05, + "loss": 2.507, + "step": 28977 + }, + { + "epoch": 0.859294843282033, + "grad_norm": 0.08207039535045624, + "learning_rate": 4.9041278294435165e-05, + "loss": 2.5567, + "step": 28978 + }, + { + "epoch": 0.8593244966343445, + "grad_norm": 0.08234930783510208, + "learning_rate": 4.902095858269079e-05, + "loss": 2.5612, + "step": 28979 + }, + { + "epoch": 0.859354149986656, + "grad_norm": 0.08581424504518509, + "learning_rate": 4.900064286444328e-05, + "loss": 2.5896, + "step": 28980 + }, + { + "epoch": 0.8593838033389675, + "grad_norm": 0.07570485770702362, + "learning_rate": 4.8980331139872435e-05, + "loss": 2.4967, + "step": 28981 + }, + { + "epoch": 0.859413456691279, + "grad_norm": 0.07807715982198715, + "learning_rate": 4.896002340915823e-05, + "loss": 2.5276, + "step": 28982 + }, + { + "epoch": 0.8594431100435904, + "grad_norm": 0.08543428033590317, + "learning_rate": 4.8939719672480396e-05, + "loss": 2.5495, + "step": 28983 + }, + { + "epoch": 0.859472763395902, + "grad_norm": 0.07434920966625214, + "learning_rate": 4.891941993001875e-05, + "loss": 2.5193, + "step": 28984 + }, + { + "epoch": 0.8595024167482134, + "grad_norm": 0.078957200050354, + "learning_rate": 4.889912418195308e-05, + "loss": 2.5414, + "step": 28985 + }, + { + "epoch": 0.8595320701005249, + "grad_norm": 0.0820876806974411, + "learning_rate": 4.887883242846314e-05, + "loss": 2.5378, + "step": 28986 + }, + { + "epoch": 0.8595617234528363, + "grad_norm": 0.08294474333524704, + "learning_rate": 4.88585446697285e-05, + "loss": 2.5563, + "step": 28987 + }, + { + "epoch": 0.8595913768051479, + "grad_norm": 0.07548677921295166, + "learning_rate": 4.883826090592891e-05, + "loss": 2.5641, + "step": 28988 + }, + { + "epoch": 0.8596210301574593, + "grad_norm": 0.08865105360746384, + "learning_rate": 4.8817981137244004e-05, + "loss": 2.5582, + "step": 28989 + }, + { + "epoch": 0.8596506835097708, + "grad_norm": 0.08346979320049286, + "learning_rate": 4.8797705363853305e-05, + "loss": 2.5713, + "step": 28990 + }, + { + "epoch": 0.8596803368620822, + "grad_norm": 0.07631093263626099, + "learning_rate": 4.877743358593634e-05, + "loss": 2.5675, + "step": 28991 + }, + { + "epoch": 0.8597099902143938, + "grad_norm": 0.08070012927055359, + "learning_rate": 4.875716580367268e-05, + "loss": 2.5431, + "step": 28992 + }, + { + "epoch": 0.8597396435667052, + "grad_norm": 0.08061686903238297, + "learning_rate": 4.873690201724174e-05, + "loss": 2.5469, + "step": 28993 + }, + { + "epoch": 0.8597692969190167, + "grad_norm": 0.08361824601888657, + "learning_rate": 4.871664222682304e-05, + "loss": 2.5384, + "step": 28994 + }, + { + "epoch": 0.8597989502713281, + "grad_norm": 0.0784183219075203, + "learning_rate": 4.8696386432595886e-05, + "loss": 2.5543, + "step": 28995 + }, + { + "epoch": 0.8598286036236397, + "grad_norm": 0.08448700606822968, + "learning_rate": 4.867613463473969e-05, + "loss": 2.5359, + "step": 28996 + }, + { + "epoch": 0.8598582569759511, + "grad_norm": 0.08094191551208496, + "learning_rate": 4.865588683343386e-05, + "loss": 2.5642, + "step": 28997 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 0.07854209840297699, + "learning_rate": 4.8635643028857535e-05, + "loss": 2.5664, + "step": 28998 + }, + { + "epoch": 0.859917563680574, + "grad_norm": 0.08470916748046875, + "learning_rate": 4.861540322119012e-05, + "loss": 2.5313, + "step": 28999 + }, + { + "epoch": 0.8599472170328856, + "grad_norm": 0.0905274897813797, + "learning_rate": 4.859516741061082e-05, + "loss": 2.5868, + "step": 29000 + }, + { + "epoch": 0.8599768703851971, + "grad_norm": 0.0836472362279892, + "learning_rate": 4.857493559729875e-05, + "loss": 2.5347, + "step": 29001 + }, + { + "epoch": 0.8600065237375085, + "grad_norm": 0.0769970640540123, + "learning_rate": 4.8554707781433116e-05, + "loss": 2.5106, + "step": 29002 + }, + { + "epoch": 0.86003617708982, + "grad_norm": 0.08403066545724869, + "learning_rate": 4.853448396319304e-05, + "loss": 2.5562, + "step": 29003 + }, + { + "epoch": 0.8600658304421315, + "grad_norm": 0.08378318697214127, + "learning_rate": 4.8514264142757556e-05, + "loss": 2.5718, + "step": 29004 + }, + { + "epoch": 0.860095483794443, + "grad_norm": 0.07863938063383102, + "learning_rate": 4.849404832030579e-05, + "loss": 2.5313, + "step": 29005 + }, + { + "epoch": 0.8601251371467544, + "grad_norm": 0.07489649951457977, + "learning_rate": 4.8473836496016777e-05, + "loss": 2.5163, + "step": 29006 + }, + { + "epoch": 0.860154790499066, + "grad_norm": 0.08486030250787735, + "learning_rate": 4.845362867006942e-05, + "loss": 2.5241, + "step": 29007 + }, + { + "epoch": 0.8601844438513774, + "grad_norm": 0.08077660948038101, + "learning_rate": 4.843342484264263e-05, + "loss": 2.51, + "step": 29008 + }, + { + "epoch": 0.8602140972036889, + "grad_norm": 0.07446681708097458, + "learning_rate": 4.841322501391543e-05, + "loss": 2.5311, + "step": 29009 + }, + { + "epoch": 0.8602437505560003, + "grad_norm": 0.08003182709217072, + "learning_rate": 4.839302918406674e-05, + "loss": 2.5389, + "step": 29010 + }, + { + "epoch": 0.8602734039083119, + "grad_norm": 0.07793790847063065, + "learning_rate": 4.8372837353275135e-05, + "loss": 2.5302, + "step": 29011 + }, + { + "epoch": 0.8603030572606233, + "grad_norm": 0.0772763267159462, + "learning_rate": 4.8352649521719525e-05, + "loss": 2.5589, + "step": 29012 + }, + { + "epoch": 0.8603327106129348, + "grad_norm": 0.07941833138465881, + "learning_rate": 4.833246568957883e-05, + "loss": 2.5259, + "step": 29013 + }, + { + "epoch": 0.8603623639652462, + "grad_norm": 0.07618371397256851, + "learning_rate": 4.831228585703168e-05, + "loss": 2.5417, + "step": 29014 + }, + { + "epoch": 0.8603920173175578, + "grad_norm": 0.07441233098506927, + "learning_rate": 4.8292110024256766e-05, + "loss": 2.5487, + "step": 29015 + }, + { + "epoch": 0.8604216706698692, + "grad_norm": 0.08231884241104126, + "learning_rate": 4.827193819143272e-05, + "loss": 2.5606, + "step": 29016 + }, + { + "epoch": 0.8604513240221807, + "grad_norm": 0.07594693452119827, + "learning_rate": 4.8251770358738235e-05, + "loss": 2.5556, + "step": 29017 + }, + { + "epoch": 0.8604809773744921, + "grad_norm": 0.07453438639640808, + "learning_rate": 4.823160652635189e-05, + "loss": 2.5813, + "step": 29018 + }, + { + "epoch": 0.8605106307268037, + "grad_norm": 0.07741294801235199, + "learning_rate": 4.8211446694452155e-05, + "loss": 2.5577, + "step": 29019 + }, + { + "epoch": 0.8605402840791151, + "grad_norm": 0.08150335401296616, + "learning_rate": 4.819129086321772e-05, + "loss": 2.5337, + "step": 29020 + }, + { + "epoch": 0.8605699374314266, + "grad_norm": 0.07321707904338837, + "learning_rate": 4.817113903282688e-05, + "loss": 2.559, + "step": 29021 + }, + { + "epoch": 0.8605995907837382, + "grad_norm": 0.08342210203409195, + "learning_rate": 4.8150991203458116e-05, + "loss": 2.5721, + "step": 29022 + }, + { + "epoch": 0.8606292441360496, + "grad_norm": 0.07830207794904709, + "learning_rate": 4.813084737528995e-05, + "loss": 2.5813, + "step": 29023 + }, + { + "epoch": 0.8606588974883611, + "grad_norm": 0.07120928913354874, + "learning_rate": 4.8110707548500555e-05, + "loss": 2.525, + "step": 29024 + }, + { + "epoch": 0.8606885508406725, + "grad_norm": 0.08883439749479294, + "learning_rate": 4.809057172326853e-05, + "loss": 2.5412, + "step": 29025 + }, + { + "epoch": 0.8607182041929841, + "grad_norm": 0.07310060411691666, + "learning_rate": 4.807043989977206e-05, + "loss": 2.557, + "step": 29026 + }, + { + "epoch": 0.8607478575452955, + "grad_norm": 0.0795045793056488, + "learning_rate": 4.8050312078189384e-05, + "loss": 2.5473, + "step": 29027 + }, + { + "epoch": 0.860777510897607, + "grad_norm": 0.08432937413454056, + "learning_rate": 4.8030188258698757e-05, + "loss": 2.526, + "step": 29028 + }, + { + "epoch": 0.8608071642499184, + "grad_norm": 0.07864054292440414, + "learning_rate": 4.8010068441478426e-05, + "loss": 2.5533, + "step": 29029 + }, + { + "epoch": 0.86083681760223, + "grad_norm": 0.08196604251861572, + "learning_rate": 4.798995262670653e-05, + "loss": 2.5628, + "step": 29030 + }, + { + "epoch": 0.8608664709545414, + "grad_norm": 0.08255642652511597, + "learning_rate": 4.7969840814561294e-05, + "loss": 2.5537, + "step": 29031 + }, + { + "epoch": 0.8608961243068529, + "grad_norm": 0.07984725385904312, + "learning_rate": 4.79497330052206e-05, + "loss": 2.5591, + "step": 29032 + }, + { + "epoch": 0.8609257776591643, + "grad_norm": 0.08513389527797699, + "learning_rate": 4.792962919886262e-05, + "loss": 2.5795, + "step": 29033 + }, + { + "epoch": 0.8609554310114759, + "grad_norm": 0.08092613518238068, + "learning_rate": 4.790952939566534e-05, + "loss": 2.5463, + "step": 29034 + }, + { + "epoch": 0.8609850843637873, + "grad_norm": 0.07817114144563675, + "learning_rate": 4.7889433595806776e-05, + "loss": 2.5578, + "step": 29035 + }, + { + "epoch": 0.8610147377160988, + "grad_norm": 0.07907368987798691, + "learning_rate": 4.7869341799464896e-05, + "loss": 2.5692, + "step": 29036 + }, + { + "epoch": 0.8610443910684102, + "grad_norm": 0.07981406152248383, + "learning_rate": 4.7849254006817555e-05, + "loss": 2.559, + "step": 29037 + }, + { + "epoch": 0.8610740444207218, + "grad_norm": 0.0762820914387703, + "learning_rate": 4.782917021804273e-05, + "loss": 2.5266, + "step": 29038 + }, + { + "epoch": 0.8611036977730332, + "grad_norm": 0.0784597098827362, + "learning_rate": 4.780909043331821e-05, + "loss": 2.5266, + "step": 29039 + }, + { + "epoch": 0.8611333511253447, + "grad_norm": 0.07927998900413513, + "learning_rate": 4.7789014652821815e-05, + "loss": 2.5461, + "step": 29040 + }, + { + "epoch": 0.8611630044776561, + "grad_norm": 0.07734987139701843, + "learning_rate": 4.776894287673139e-05, + "loss": 2.5479, + "step": 29041 + }, + { + "epoch": 0.8611926578299677, + "grad_norm": 0.07921890914440155, + "learning_rate": 4.7748875105224577e-05, + "loss": 2.5564, + "step": 29042 + }, + { + "epoch": 0.8612223111822792, + "grad_norm": 0.07483173906803131, + "learning_rate": 4.772881133847906e-05, + "loss": 2.5696, + "step": 29043 + }, + { + "epoch": 0.8612519645345906, + "grad_norm": 0.08002122491598129, + "learning_rate": 4.770875157667254e-05, + "loss": 2.5602, + "step": 29044 + }, + { + "epoch": 0.8612816178869022, + "grad_norm": 0.07359615713357925, + "learning_rate": 4.768869581998264e-05, + "loss": 2.526, + "step": 29045 + }, + { + "epoch": 0.8613112712392136, + "grad_norm": 0.08126366883516312, + "learning_rate": 4.766864406858701e-05, + "loss": 2.5657, + "step": 29046 + }, + { + "epoch": 0.8613409245915251, + "grad_norm": 0.07482010871171951, + "learning_rate": 4.764859632266316e-05, + "loss": 2.5522, + "step": 29047 + }, + { + "epoch": 0.8613705779438365, + "grad_norm": 0.07887861877679825, + "learning_rate": 4.7628552582388686e-05, + "loss": 2.5342, + "step": 29048 + }, + { + "epoch": 0.8614002312961481, + "grad_norm": 0.07647351920604706, + "learning_rate": 4.760851284794099e-05, + "loss": 2.5332, + "step": 29049 + }, + { + "epoch": 0.8614298846484595, + "grad_norm": 0.08234751969575882, + "learning_rate": 4.758847711949749e-05, + "loss": 2.5378, + "step": 29050 + }, + { + "epoch": 0.861459538000771, + "grad_norm": 0.077696792781353, + "learning_rate": 4.7568445397235704e-05, + "loss": 2.5225, + "step": 29051 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 0.08390182256698608, + "learning_rate": 4.7548417681333164e-05, + "loss": 2.5303, + "step": 29052 + }, + { + "epoch": 0.861518844705394, + "grad_norm": 0.07587558031082153, + "learning_rate": 4.7528393971966897e-05, + "loss": 2.5453, + "step": 29053 + }, + { + "epoch": 0.8615484980577054, + "grad_norm": 0.07709146291017532, + "learning_rate": 4.750837426931443e-05, + "loss": 2.5169, + "step": 29054 + }, + { + "epoch": 0.8615781514100169, + "grad_norm": 0.08313744515180588, + "learning_rate": 4.748835857355293e-05, + "loss": 2.5483, + "step": 29055 + }, + { + "epoch": 0.8616078047623283, + "grad_norm": 0.07547255605459213, + "learning_rate": 4.746834688485974e-05, + "loss": 2.5267, + "step": 29056 + }, + { + "epoch": 0.8616374581146399, + "grad_norm": 0.08794920891523361, + "learning_rate": 4.744833920341196e-05, + "loss": 2.5215, + "step": 29057 + }, + { + "epoch": 0.8616671114669513, + "grad_norm": 0.08877770602703094, + "learning_rate": 4.74283355293868e-05, + "loss": 2.5533, + "step": 29058 + }, + { + "epoch": 0.8616967648192628, + "grad_norm": 0.07676227390766144, + "learning_rate": 4.740833586296145e-05, + "loss": 2.5485, + "step": 29059 + }, + { + "epoch": 0.8617264181715742, + "grad_norm": 0.10502652823925018, + "learning_rate": 4.7388340204312984e-05, + "loss": 2.5474, + "step": 29060 + }, + { + "epoch": 0.8617560715238858, + "grad_norm": 0.081606425344944, + "learning_rate": 4.7368348553618434e-05, + "loss": 2.5443, + "step": 29061 + }, + { + "epoch": 0.8617857248761972, + "grad_norm": 0.07953278720378876, + "learning_rate": 4.7348360911054875e-05, + "loss": 2.5373, + "step": 29062 + }, + { + "epoch": 0.8618153782285087, + "grad_norm": 0.09811492264270782, + "learning_rate": 4.732837727679923e-05, + "loss": 2.5572, + "step": 29063 + }, + { + "epoch": 0.8618450315808203, + "grad_norm": 0.08165176957845688, + "learning_rate": 4.730839765102857e-05, + "loss": 2.5296, + "step": 29064 + }, + { + "epoch": 0.8618746849331317, + "grad_norm": 0.0813521221280098, + "learning_rate": 4.72884220339197e-05, + "loss": 2.5541, + "step": 29065 + }, + { + "epoch": 0.8619043382854432, + "grad_norm": 0.08998388051986694, + "learning_rate": 4.726845042564959e-05, + "loss": 2.5262, + "step": 29066 + }, + { + "epoch": 0.8619339916377546, + "grad_norm": 0.07884985953569412, + "learning_rate": 4.724848282639505e-05, + "loss": 2.5824, + "step": 29067 + }, + { + "epoch": 0.8619636449900662, + "grad_norm": 0.08227263391017914, + "learning_rate": 4.722851923633287e-05, + "loss": 2.577, + "step": 29068 + }, + { + "epoch": 0.8619932983423776, + "grad_norm": 0.0835011675953865, + "learning_rate": 4.720855965563992e-05, + "loss": 2.5635, + "step": 29069 + }, + { + "epoch": 0.8620229516946891, + "grad_norm": 0.0778987854719162, + "learning_rate": 4.718860408449288e-05, + "loss": 2.5382, + "step": 29070 + }, + { + "epoch": 0.8620526050470005, + "grad_norm": 0.07526417076587677, + "learning_rate": 4.7168652523068456e-05, + "loss": 2.5179, + "step": 29071 + }, + { + "epoch": 0.8620822583993121, + "grad_norm": 0.07456163316965103, + "learning_rate": 4.714870497154333e-05, + "loss": 2.5628, + "step": 29072 + }, + { + "epoch": 0.8621119117516235, + "grad_norm": 0.07799050211906433, + "learning_rate": 4.71287614300942e-05, + "loss": 2.5631, + "step": 29073 + }, + { + "epoch": 0.862141565103935, + "grad_norm": 0.07443896681070328, + "learning_rate": 4.710882189889759e-05, + "loss": 2.53, + "step": 29074 + }, + { + "epoch": 0.8621712184562464, + "grad_norm": 0.07920265942811966, + "learning_rate": 4.7088886378130243e-05, + "loss": 2.5306, + "step": 29075 + }, + { + "epoch": 0.862200871808558, + "grad_norm": 0.07809815555810928, + "learning_rate": 4.706895486796831e-05, + "loss": 2.5488, + "step": 29076 + }, + { + "epoch": 0.8622305251608694, + "grad_norm": 0.07697352766990662, + "learning_rate": 4.704902736858863e-05, + "loss": 2.551, + "step": 29077 + }, + { + "epoch": 0.8622601785131809, + "grad_norm": 0.07448828220367432, + "learning_rate": 4.702910388016757e-05, + "loss": 2.5604, + "step": 29078 + }, + { + "epoch": 0.8622898318654924, + "grad_norm": 0.07418840378522873, + "learning_rate": 4.7009184402881545e-05, + "loss": 2.5427, + "step": 29079 + }, + { + "epoch": 0.8623194852178039, + "grad_norm": 0.08222221583127975, + "learning_rate": 4.698926893690692e-05, + "loss": 2.5089, + "step": 29080 + }, + { + "epoch": 0.8623491385701153, + "grad_norm": 0.07683765143156052, + "learning_rate": 4.6969357482420094e-05, + "loss": 2.5432, + "step": 29081 + }, + { + "epoch": 0.8623787919224268, + "grad_norm": 0.08129051327705383, + "learning_rate": 4.6949450039597386e-05, + "loss": 2.5256, + "step": 29082 + }, + { + "epoch": 0.8624084452747383, + "grad_norm": 0.0838344544172287, + "learning_rate": 4.692954660861509e-05, + "loss": 2.5158, + "step": 29083 + }, + { + "epoch": 0.8624380986270498, + "grad_norm": 0.08247588574886322, + "learning_rate": 4.690964718964935e-05, + "loss": 2.5413, + "step": 29084 + }, + { + "epoch": 0.8624677519793613, + "grad_norm": 0.07591218501329422, + "learning_rate": 4.688975178287652e-05, + "loss": 2.5238, + "step": 29085 + }, + { + "epoch": 0.8624974053316727, + "grad_norm": 0.08679280430078506, + "learning_rate": 4.68698603884728e-05, + "loss": 2.5439, + "step": 29086 + }, + { + "epoch": 0.8625270586839843, + "grad_norm": 0.07557403296232224, + "learning_rate": 4.684997300661409e-05, + "loss": 2.5655, + "step": 29087 + }, + { + "epoch": 0.8625567120362957, + "grad_norm": 0.077324777841568, + "learning_rate": 4.6830089637476705e-05, + "loss": 2.5421, + "step": 29088 + }, + { + "epoch": 0.8625863653886072, + "grad_norm": 0.08374935388565063, + "learning_rate": 4.681021028123656e-05, + "loss": 2.5259, + "step": 29089 + }, + { + "epoch": 0.8626160187409186, + "grad_norm": 0.07787773013114929, + "learning_rate": 4.679033493806989e-05, + "loss": 2.5448, + "step": 29090 + }, + { + "epoch": 0.8626456720932302, + "grad_norm": 0.07655949890613556, + "learning_rate": 4.677046360815257e-05, + "loss": 2.5704, + "step": 29091 + }, + { + "epoch": 0.8626753254455416, + "grad_norm": 0.08179941773414612, + "learning_rate": 4.675059629166062e-05, + "loss": 2.565, + "step": 29092 + }, + { + "epoch": 0.8627049787978531, + "grad_norm": 0.0830114483833313, + "learning_rate": 4.6730732988769956e-05, + "loss": 2.5171, + "step": 29093 + }, + { + "epoch": 0.8627346321501645, + "grad_norm": 0.07766487449407578, + "learning_rate": 4.671087369965643e-05, + "loss": 2.5499, + "step": 29094 + }, + { + "epoch": 0.8627642855024761, + "grad_norm": 0.07692769169807434, + "learning_rate": 4.6691018424495915e-05, + "loss": 2.5392, + "step": 29095 + }, + { + "epoch": 0.8627939388547875, + "grad_norm": 0.0776946023106575, + "learning_rate": 4.667116716346431e-05, + "loss": 2.5432, + "step": 29096 + }, + { + "epoch": 0.862823592207099, + "grad_norm": 0.0774713084101677, + "learning_rate": 4.6651319916737264e-05, + "loss": 2.5404, + "step": 29097 + }, + { + "epoch": 0.8628532455594105, + "grad_norm": 0.07996232807636261, + "learning_rate": 4.6631476684490635e-05, + "loss": 2.5504, + "step": 29098 + }, + { + "epoch": 0.862882898911722, + "grad_norm": 0.07519582659006119, + "learning_rate": 4.661163746690006e-05, + "loss": 2.5427, + "step": 29099 + }, + { + "epoch": 0.8629125522640334, + "grad_norm": 0.07221348583698273, + "learning_rate": 4.6591802264141284e-05, + "loss": 2.5536, + "step": 29100 + }, + { + "epoch": 0.8629422056163449, + "grad_norm": 0.08023280650377274, + "learning_rate": 4.6571971076389885e-05, + "loss": 2.542, + "step": 29101 + }, + { + "epoch": 0.8629718589686564, + "grad_norm": 0.07402972877025604, + "learning_rate": 4.655214390382145e-05, + "loss": 2.5589, + "step": 29102 + }, + { + "epoch": 0.8630015123209679, + "grad_norm": 0.08191076666116714, + "learning_rate": 4.653232074661168e-05, + "loss": 2.5719, + "step": 29103 + }, + { + "epoch": 0.8630311656732793, + "grad_norm": 0.0755571499466896, + "learning_rate": 4.6512501604936086e-05, + "loss": 2.5309, + "step": 29104 + }, + { + "epoch": 0.8630608190255908, + "grad_norm": 0.07863347977399826, + "learning_rate": 4.649268647897009e-05, + "loss": 2.558, + "step": 29105 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 0.07668273150920868, + "learning_rate": 4.6472875368889223e-05, + "loss": 2.5325, + "step": 29106 + }, + { + "epoch": 0.8631201257302138, + "grad_norm": 0.0788547471165657, + "learning_rate": 4.6453068274869005e-05, + "loss": 2.5257, + "step": 29107 + }, + { + "epoch": 0.8631497790825253, + "grad_norm": 0.07620546221733093, + "learning_rate": 4.6433265197084576e-05, + "loss": 2.5241, + "step": 29108 + }, + { + "epoch": 0.8631794324348367, + "grad_norm": 0.07777771353721619, + "learning_rate": 4.6413466135711455e-05, + "loss": 2.5509, + "step": 29109 + }, + { + "epoch": 0.8632090857871483, + "grad_norm": 0.0817137211561203, + "learning_rate": 4.639367109092496e-05, + "loss": 2.559, + "step": 29110 + }, + { + "epoch": 0.8632387391394597, + "grad_norm": 0.07696975022554398, + "learning_rate": 4.637388006290039e-05, + "loss": 2.5472, + "step": 29111 + }, + { + "epoch": 0.8632683924917712, + "grad_norm": 0.08243345469236374, + "learning_rate": 4.635409305181293e-05, + "loss": 2.5587, + "step": 29112 + }, + { + "epoch": 0.8632980458440827, + "grad_norm": 0.0755138024687767, + "learning_rate": 4.633431005783789e-05, + "loss": 2.5577, + "step": 29113 + }, + { + "epoch": 0.8633276991963942, + "grad_norm": 0.08235746622085571, + "learning_rate": 4.6314531081150354e-05, + "loss": 2.5359, + "step": 29114 + }, + { + "epoch": 0.8633573525487056, + "grad_norm": 0.07942819595336914, + "learning_rate": 4.629475612192546e-05, + "loss": 2.5592, + "step": 29115 + }, + { + "epoch": 0.8633870059010171, + "grad_norm": 0.07676630467176437, + "learning_rate": 4.6274985180338457e-05, + "loss": 2.5745, + "step": 29116 + }, + { + "epoch": 0.8634166592533286, + "grad_norm": 0.07463502883911133, + "learning_rate": 4.625521825656442e-05, + "loss": 2.5142, + "step": 29117 + }, + { + "epoch": 0.8634463126056401, + "grad_norm": 0.08837681263685226, + "learning_rate": 4.623545535077822e-05, + "loss": 2.5432, + "step": 29118 + }, + { + "epoch": 0.8634759659579515, + "grad_norm": 0.07582497596740723, + "learning_rate": 4.621569646315499e-05, + "loss": 2.5567, + "step": 29119 + }, + { + "epoch": 0.863505619310263, + "grad_norm": 0.07471922785043716, + "learning_rate": 4.619594159386964e-05, + "loss": 2.5389, + "step": 29120 + }, + { + "epoch": 0.8635352726625745, + "grad_norm": 0.08039640635251999, + "learning_rate": 4.617619074309709e-05, + "loss": 2.5566, + "step": 29121 + }, + { + "epoch": 0.863564926014886, + "grad_norm": 0.07345971465110779, + "learning_rate": 4.6156443911012316e-05, + "loss": 2.5521, + "step": 29122 + }, + { + "epoch": 0.8635945793671974, + "grad_norm": 0.08173826336860657, + "learning_rate": 4.6136701097790114e-05, + "loss": 2.5353, + "step": 29123 + }, + { + "epoch": 0.8636242327195089, + "grad_norm": 0.0781908705830574, + "learning_rate": 4.6116962303605294e-05, + "loss": 2.568, + "step": 29124 + }, + { + "epoch": 0.8636538860718204, + "grad_norm": 0.08109859377145767, + "learning_rate": 4.609722752863271e-05, + "loss": 2.5581, + "step": 29125 + }, + { + "epoch": 0.8636835394241319, + "grad_norm": 0.07589557766914368, + "learning_rate": 4.607749677304712e-05, + "loss": 2.5277, + "step": 29126 + }, + { + "epoch": 0.8637131927764434, + "grad_norm": 0.08233379572629929, + "learning_rate": 4.605777003702316e-05, + "loss": 2.5758, + "step": 29127 + }, + { + "epoch": 0.8637428461287548, + "grad_norm": 0.07367761433124542, + "learning_rate": 4.6038047320735575e-05, + "loss": 2.5431, + "step": 29128 + }, + { + "epoch": 0.8637724994810664, + "grad_norm": 0.08665472269058228, + "learning_rate": 4.601832862435906e-05, + "loss": 2.543, + "step": 29129 + }, + { + "epoch": 0.8638021528333778, + "grad_norm": 0.07813210785388947, + "learning_rate": 4.5998613948068094e-05, + "loss": 2.5461, + "step": 29130 + }, + { + "epoch": 0.8638318061856893, + "grad_norm": 0.07797527313232422, + "learning_rate": 4.597890329203735e-05, + "loss": 2.5325, + "step": 29131 + }, + { + "epoch": 0.8638614595380008, + "grad_norm": 0.0807792991399765, + "learning_rate": 4.595919665644138e-05, + "loss": 2.5399, + "step": 29132 + }, + { + "epoch": 0.8638911128903123, + "grad_norm": 0.07468398660421371, + "learning_rate": 4.5939494041454646e-05, + "loss": 2.5342, + "step": 29133 + }, + { + "epoch": 0.8639207662426237, + "grad_norm": 0.07415100932121277, + "learning_rate": 4.591979544725161e-05, + "loss": 2.5317, + "step": 29134 + }, + { + "epoch": 0.8639504195949352, + "grad_norm": 0.08215146511793137, + "learning_rate": 4.590010087400676e-05, + "loss": 2.5709, + "step": 29135 + }, + { + "epoch": 0.8639800729472467, + "grad_norm": 0.07541543990373611, + "learning_rate": 4.588041032189438e-05, + "loss": 2.543, + "step": 29136 + }, + { + "epoch": 0.8640097262995582, + "grad_norm": 0.07937734574079514, + "learning_rate": 4.586072379108902e-05, + "loss": 2.5197, + "step": 29137 + }, + { + "epoch": 0.8640393796518696, + "grad_norm": 0.07600654661655426, + "learning_rate": 4.584104128176486e-05, + "loss": 2.5251, + "step": 29138 + }, + { + "epoch": 0.8640690330041811, + "grad_norm": 0.0777982622385025, + "learning_rate": 4.582136279409621e-05, + "loss": 2.5479, + "step": 29139 + }, + { + "epoch": 0.8640986863564926, + "grad_norm": 0.0812687873840332, + "learning_rate": 4.580168832825737e-05, + "loss": 2.5484, + "step": 29140 + }, + { + "epoch": 0.8641283397088041, + "grad_norm": 0.08396095037460327, + "learning_rate": 4.578201788442254e-05, + "loss": 2.5597, + "step": 29141 + }, + { + "epoch": 0.8641579930611155, + "grad_norm": 0.08382996171712875, + "learning_rate": 4.5762351462765914e-05, + "loss": 2.5303, + "step": 29142 + }, + { + "epoch": 0.864187646413427, + "grad_norm": 0.07695972919464111, + "learning_rate": 4.574268906346163e-05, + "loss": 2.5368, + "step": 29143 + }, + { + "epoch": 0.8642172997657385, + "grad_norm": 0.0767691507935524, + "learning_rate": 4.572303068668382e-05, + "loss": 2.5391, + "step": 29144 + }, + { + "epoch": 0.86424695311805, + "grad_norm": 0.07786605507135391, + "learning_rate": 4.5703376332606516e-05, + "loss": 2.5446, + "step": 29145 + }, + { + "epoch": 0.8642766064703614, + "grad_norm": 0.0822899267077446, + "learning_rate": 4.568372600140386e-05, + "loss": 2.5256, + "step": 29146 + }, + { + "epoch": 0.864306259822673, + "grad_norm": 0.08130139112472534, + "learning_rate": 4.566407969324976e-05, + "loss": 2.5321, + "step": 29147 + }, + { + "epoch": 0.8643359131749845, + "grad_norm": 0.07160184532403946, + "learning_rate": 4.5644437408318196e-05, + "loss": 2.5527, + "step": 29148 + }, + { + "epoch": 0.8643655665272959, + "grad_norm": 0.07898570597171783, + "learning_rate": 4.5624799146783134e-05, + "loss": 2.5325, + "step": 29149 + }, + { + "epoch": 0.8643952198796074, + "grad_norm": 0.08950883895158768, + "learning_rate": 4.560516490881844e-05, + "loss": 2.5598, + "step": 29150 + }, + { + "epoch": 0.8644248732319189, + "grad_norm": 0.08085040748119354, + "learning_rate": 4.558553469459814e-05, + "loss": 2.5327, + "step": 29151 + }, + { + "epoch": 0.8644545265842304, + "grad_norm": 0.07537408173084259, + "learning_rate": 4.556590850429587e-05, + "loss": 2.5607, + "step": 29152 + }, + { + "epoch": 0.8644841799365418, + "grad_norm": 0.0812607854604721, + "learning_rate": 4.554628633808544e-05, + "loss": 2.5334, + "step": 29153 + }, + { + "epoch": 0.8645138332888533, + "grad_norm": 0.07924771308898926, + "learning_rate": 4.552666819614054e-05, + "loss": 2.5065, + "step": 29154 + }, + { + "epoch": 0.8645434866411648, + "grad_norm": 0.07996023446321487, + "learning_rate": 4.550705407863515e-05, + "loss": 2.5336, + "step": 29155 + }, + { + "epoch": 0.8645731399934763, + "grad_norm": 0.08047635108232498, + "learning_rate": 4.548744398574273e-05, + "loss": 2.5575, + "step": 29156 + }, + { + "epoch": 0.8646027933457877, + "grad_norm": 0.0776657983660698, + "learning_rate": 4.546783791763709e-05, + "loss": 2.5544, + "step": 29157 + }, + { + "epoch": 0.8646324466980992, + "grad_norm": 0.07661444693803787, + "learning_rate": 4.544823587449171e-05, + "loss": 2.563, + "step": 29158 + }, + { + "epoch": 0.8646621000504107, + "grad_norm": 0.08224781602621078, + "learning_rate": 4.5428637856480274e-05, + "loss": 2.5202, + "step": 29159 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 0.07707357406616211, + "learning_rate": 4.54090438637762e-05, + "loss": 2.5092, + "step": 29160 + }, + { + "epoch": 0.8647214067550336, + "grad_norm": 0.07688573002815247, + "learning_rate": 4.538945389655314e-05, + "loss": 2.5432, + "step": 29161 + }, + { + "epoch": 0.8647510601073451, + "grad_norm": 0.08203060179948807, + "learning_rate": 4.536986795498454e-05, + "loss": 2.5667, + "step": 29162 + }, + { + "epoch": 0.8647807134596566, + "grad_norm": 0.078451007604599, + "learning_rate": 4.5350286039243735e-05, + "loss": 2.5525, + "step": 29163 + }, + { + "epoch": 0.8648103668119681, + "grad_norm": 0.08150646090507507, + "learning_rate": 4.533070814950424e-05, + "loss": 2.5757, + "step": 29164 + }, + { + "epoch": 0.8648400201642795, + "grad_norm": 0.08163373917341232, + "learning_rate": 4.53111342859393e-05, + "loss": 2.508, + "step": 29165 + }, + { + "epoch": 0.864869673516591, + "grad_norm": 0.07866725325584412, + "learning_rate": 4.529156444872229e-05, + "loss": 2.5489, + "step": 29166 + }, + { + "epoch": 0.8648993268689026, + "grad_norm": 0.08118835091590881, + "learning_rate": 4.52719986380265e-05, + "loss": 2.5502, + "step": 29167 + }, + { + "epoch": 0.864928980221214, + "grad_norm": 0.08092829585075378, + "learning_rate": 4.5252436854025244e-05, + "loss": 2.5532, + "step": 29168 + }, + { + "epoch": 0.8649586335735255, + "grad_norm": 0.08166132867336273, + "learning_rate": 4.5232879096891775e-05, + "loss": 2.5202, + "step": 29169 + }, + { + "epoch": 0.864988286925837, + "grad_norm": 0.08083614706993103, + "learning_rate": 4.521332536679917e-05, + "loss": 2.5656, + "step": 29170 + }, + { + "epoch": 0.8650179402781485, + "grad_norm": 0.07914159446954727, + "learning_rate": 4.5193775663920686e-05, + "loss": 2.5185, + "step": 29171 + }, + { + "epoch": 0.8650475936304599, + "grad_norm": 0.07588058710098267, + "learning_rate": 4.51742299884294e-05, + "loss": 2.5307, + "step": 29172 + }, + { + "epoch": 0.8650772469827714, + "grad_norm": 0.07667236775159836, + "learning_rate": 4.515468834049835e-05, + "loss": 2.5328, + "step": 29173 + }, + { + "epoch": 0.8651069003350829, + "grad_norm": 0.08023789525032043, + "learning_rate": 4.5135150720300614e-05, + "loss": 2.5517, + "step": 29174 + }, + { + "epoch": 0.8651365536873944, + "grad_norm": 0.08164618909358978, + "learning_rate": 4.5115617128009154e-05, + "loss": 2.5154, + "step": 29175 + }, + { + "epoch": 0.8651662070397058, + "grad_norm": 0.08269007503986359, + "learning_rate": 4.5096087563797016e-05, + "loss": 2.5622, + "step": 29176 + }, + { + "epoch": 0.8651958603920173, + "grad_norm": 0.079959936439991, + "learning_rate": 4.507656202783705e-05, + "loss": 2.5829, + "step": 29177 + }, + { + "epoch": 0.8652255137443288, + "grad_norm": 0.0802207887172699, + "learning_rate": 4.5057040520302295e-05, + "loss": 2.542, + "step": 29178 + }, + { + "epoch": 0.8652551670966403, + "grad_norm": 0.07989931106567383, + "learning_rate": 4.503752304136549e-05, + "loss": 2.5397, + "step": 29179 + }, + { + "epoch": 0.8652848204489517, + "grad_norm": 0.07861512899398804, + "learning_rate": 4.501800959119939e-05, + "loss": 2.5433, + "step": 29180 + }, + { + "epoch": 0.8653144738012633, + "grad_norm": 0.08010692894458771, + "learning_rate": 4.499850016997708e-05, + "loss": 2.5699, + "step": 29181 + }, + { + "epoch": 0.8653441271535747, + "grad_norm": 0.07637704908847809, + "learning_rate": 4.4978994777871095e-05, + "loss": 2.5819, + "step": 29182 + }, + { + "epoch": 0.8653737805058862, + "grad_norm": 0.07678981125354767, + "learning_rate": 4.495949341505434e-05, + "loss": 2.5351, + "step": 29183 + }, + { + "epoch": 0.8654034338581976, + "grad_norm": 0.08446606248617172, + "learning_rate": 4.4939996081699244e-05, + "loss": 2.537, + "step": 29184 + }, + { + "epoch": 0.8654330872105092, + "grad_norm": 0.0765262320637703, + "learning_rate": 4.49205027779786e-05, + "loss": 2.5613, + "step": 29185 + }, + { + "epoch": 0.8654627405628206, + "grad_norm": 0.07214190065860748, + "learning_rate": 4.4901013504065056e-05, + "loss": 2.517, + "step": 29186 + }, + { + "epoch": 0.8654923939151321, + "grad_norm": 0.07918606698513031, + "learning_rate": 4.488152826013114e-05, + "loss": 2.5329, + "step": 29187 + }, + { + "epoch": 0.8655220472674436, + "grad_norm": 0.08318088948726654, + "learning_rate": 4.4862047046349386e-05, + "loss": 2.5969, + "step": 29188 + }, + { + "epoch": 0.8655517006197551, + "grad_norm": 0.07814714312553406, + "learning_rate": 4.484256986289237e-05, + "loss": 2.5047, + "step": 29189 + }, + { + "epoch": 0.8655813539720666, + "grad_norm": 0.08390418440103531, + "learning_rate": 4.482309670993251e-05, + "loss": 2.5768, + "step": 29190 + }, + { + "epoch": 0.865611007324378, + "grad_norm": 0.08245618641376495, + "learning_rate": 4.4803627587642226e-05, + "loss": 2.5132, + "step": 29191 + }, + { + "epoch": 0.8656406606766895, + "grad_norm": 0.08037915080785751, + "learning_rate": 4.4784162496193994e-05, + "loss": 2.5273, + "step": 29192 + }, + { + "epoch": 0.865670314029001, + "grad_norm": 0.08325432240962982, + "learning_rate": 4.4764701435760116e-05, + "loss": 2.5613, + "step": 29193 + }, + { + "epoch": 0.8656999673813125, + "grad_norm": 0.08019323647022247, + "learning_rate": 4.474524440651295e-05, + "loss": 2.5107, + "step": 29194 + }, + { + "epoch": 0.8657296207336239, + "grad_norm": 0.08122263848781586, + "learning_rate": 4.472579140862476e-05, + "loss": 2.557, + "step": 29195 + }, + { + "epoch": 0.8657592740859354, + "grad_norm": 0.08021347969770432, + "learning_rate": 4.47063424422679e-05, + "loss": 2.5653, + "step": 29196 + }, + { + "epoch": 0.8657889274382469, + "grad_norm": 0.07591301947832108, + "learning_rate": 4.468689750761446e-05, + "loss": 2.5547, + "step": 29197 + }, + { + "epoch": 0.8658185807905584, + "grad_norm": 0.07739479094743729, + "learning_rate": 4.466745660483673e-05, + "loss": 2.5458, + "step": 29198 + }, + { + "epoch": 0.8658482341428698, + "grad_norm": 0.07975504547357559, + "learning_rate": 4.464801973410687e-05, + "loss": 2.5535, + "step": 29199 + }, + { + "epoch": 0.8658778874951814, + "grad_norm": 0.07596459239721298, + "learning_rate": 4.46285868955969e-05, + "loss": 2.5465, + "step": 29200 + }, + { + "epoch": 0.8659075408474928, + "grad_norm": 0.08350212872028351, + "learning_rate": 4.460915808947896e-05, + "loss": 2.5616, + "step": 29201 + }, + { + "epoch": 0.8659371941998043, + "grad_norm": 0.07708977907896042, + "learning_rate": 4.458973331592508e-05, + "loss": 2.5224, + "step": 29202 + }, + { + "epoch": 0.8659668475521157, + "grad_norm": 0.07877163589000702, + "learning_rate": 4.45703125751073e-05, + "loss": 2.5569, + "step": 29203 + }, + { + "epoch": 0.8659965009044273, + "grad_norm": 0.07874856889247894, + "learning_rate": 4.4550895867197624e-05, + "loss": 2.5601, + "step": 29204 + }, + { + "epoch": 0.8660261542567387, + "grad_norm": 0.07763756811618805, + "learning_rate": 4.453148319236788e-05, + "loss": 2.5353, + "step": 29205 + }, + { + "epoch": 0.8660558076090502, + "grad_norm": 0.08310450613498688, + "learning_rate": 4.451207455079004e-05, + "loss": 2.5528, + "step": 29206 + }, + { + "epoch": 0.8660854609613616, + "grad_norm": 0.07714119553565979, + "learning_rate": 4.4492669942635954e-05, + "loss": 2.5386, + "step": 29207 + }, + { + "epoch": 0.8661151143136732, + "grad_norm": 0.07595662772655487, + "learning_rate": 4.44732693680775e-05, + "loss": 2.5143, + "step": 29208 + }, + { + "epoch": 0.8661447676659847, + "grad_norm": 0.08076457679271698, + "learning_rate": 4.4453872827286415e-05, + "loss": 2.5265, + "step": 29209 + }, + { + "epoch": 0.8661744210182961, + "grad_norm": 0.0773443877696991, + "learning_rate": 4.443448032043451e-05, + "loss": 2.5351, + "step": 29210 + }, + { + "epoch": 0.8662040743706076, + "grad_norm": 0.07604661583900452, + "learning_rate": 4.441509184769343e-05, + "loss": 2.5585, + "step": 29211 + }, + { + "epoch": 0.8662337277229191, + "grad_norm": 0.08023300766944885, + "learning_rate": 4.4395707409234934e-05, + "loss": 2.5491, + "step": 29212 + }, + { + "epoch": 0.8662633810752306, + "grad_norm": 0.0800430029630661, + "learning_rate": 4.437632700523064e-05, + "loss": 2.5238, + "step": 29213 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 0.07678764313459396, + "learning_rate": 4.435695063585221e-05, + "loss": 2.5477, + "step": 29214 + }, + { + "epoch": 0.8663226877798536, + "grad_norm": 0.08783420920372009, + "learning_rate": 4.433757830127116e-05, + "loss": 2.5397, + "step": 29215 + }, + { + "epoch": 0.866352341132165, + "grad_norm": 0.07912370562553406, + "learning_rate": 4.4318210001659086e-05, + "loss": 2.5676, + "step": 29216 + }, + { + "epoch": 0.8663819944844765, + "grad_norm": 0.07905762642621994, + "learning_rate": 4.4298845737187566e-05, + "loss": 2.5163, + "step": 29217 + }, + { + "epoch": 0.8664116478367879, + "grad_norm": 0.09316452592611313, + "learning_rate": 4.42794855080278e-05, + "loss": 2.572, + "step": 29218 + }, + { + "epoch": 0.8664413011890995, + "grad_norm": 0.08419287949800491, + "learning_rate": 4.426012931435147e-05, + "loss": 2.545, + "step": 29219 + }, + { + "epoch": 0.8664709545414109, + "grad_norm": 0.07445476204156876, + "learning_rate": 4.424077715632996e-05, + "loss": 2.556, + "step": 29220 + }, + { + "epoch": 0.8665006078937224, + "grad_norm": 0.07676433026790619, + "learning_rate": 4.422142903413462e-05, + "loss": 2.5716, + "step": 29221 + }, + { + "epoch": 0.8665302612460338, + "grad_norm": 0.0754227340221405, + "learning_rate": 4.420208494793671e-05, + "loss": 2.5548, + "step": 29222 + }, + { + "epoch": 0.8665599145983454, + "grad_norm": 0.07427939772605896, + "learning_rate": 4.4182744897907576e-05, + "loss": 2.5286, + "step": 29223 + }, + { + "epoch": 0.8665895679506568, + "grad_norm": 0.07337208092212677, + "learning_rate": 4.416340888421855e-05, + "loss": 2.4974, + "step": 29224 + }, + { + "epoch": 0.8666192213029683, + "grad_norm": 0.07348685711622238, + "learning_rate": 4.4144076907040696e-05, + "loss": 2.5471, + "step": 29225 + }, + { + "epoch": 0.8666488746552797, + "grad_norm": 0.07787565141916275, + "learning_rate": 4.412474896654534e-05, + "loss": 2.5501, + "step": 29226 + }, + { + "epoch": 0.8666785280075913, + "grad_norm": 0.07562001049518585, + "learning_rate": 4.4105425062903656e-05, + "loss": 2.5436, + "step": 29227 + }, + { + "epoch": 0.8667081813599027, + "grad_norm": 0.07986703515052795, + "learning_rate": 4.408610519628659e-05, + "loss": 2.5375, + "step": 29228 + }, + { + "epoch": 0.8667378347122142, + "grad_norm": 0.07829353958368301, + "learning_rate": 4.406678936686531e-05, + "loss": 2.5828, + "step": 29229 + }, + { + "epoch": 0.8667674880645257, + "grad_norm": 0.07861440628767014, + "learning_rate": 4.404747757481087e-05, + "loss": 2.5276, + "step": 29230 + }, + { + "epoch": 0.8667971414168372, + "grad_norm": 0.0764346569776535, + "learning_rate": 4.4028169820294226e-05, + "loss": 2.5142, + "step": 29231 + }, + { + "epoch": 0.8668267947691487, + "grad_norm": 0.0791112557053566, + "learning_rate": 4.400886610348648e-05, + "loss": 2.5651, + "step": 29232 + }, + { + "epoch": 0.8668564481214601, + "grad_norm": 0.07570105046033859, + "learning_rate": 4.3989566424558483e-05, + "loss": 2.5547, + "step": 29233 + }, + { + "epoch": 0.8668861014737717, + "grad_norm": 0.07882081717252731, + "learning_rate": 4.397027078368116e-05, + "loss": 2.5616, + "step": 29234 + }, + { + "epoch": 0.8669157548260831, + "grad_norm": 0.07870037853717804, + "learning_rate": 4.395097918102531e-05, + "loss": 2.5487, + "step": 29235 + }, + { + "epoch": 0.8669454081783946, + "grad_norm": 0.08086703717708588, + "learning_rate": 4.393169161676192e-05, + "loss": 2.5358, + "step": 29236 + }, + { + "epoch": 0.866975061530706, + "grad_norm": 0.07444898039102554, + "learning_rate": 4.391240809106162e-05, + "loss": 2.5208, + "step": 29237 + }, + { + "epoch": 0.8670047148830176, + "grad_norm": 0.07409945875406265, + "learning_rate": 4.389312860409533e-05, + "loss": 2.5448, + "step": 29238 + }, + { + "epoch": 0.867034368235329, + "grad_norm": 0.076853908598423, + "learning_rate": 4.387385315603359e-05, + "loss": 2.5319, + "step": 29239 + }, + { + "epoch": 0.8670640215876405, + "grad_norm": 0.08202332258224487, + "learning_rate": 4.385458174704715e-05, + "loss": 2.56, + "step": 29240 + }, + { + "epoch": 0.8670936749399519, + "grad_norm": 0.07531898468732834, + "learning_rate": 4.38353143773067e-05, + "loss": 2.5408, + "step": 29241 + }, + { + "epoch": 0.8671233282922635, + "grad_norm": 0.07495979964733124, + "learning_rate": 4.3816051046982886e-05, + "loss": 2.5835, + "step": 29242 + }, + { + "epoch": 0.8671529816445749, + "grad_norm": 0.07740862667560577, + "learning_rate": 4.3796791756246176e-05, + "loss": 2.5312, + "step": 29243 + }, + { + "epoch": 0.8671826349968864, + "grad_norm": 0.0723203718662262, + "learning_rate": 4.377753650526711e-05, + "loss": 2.5529, + "step": 29244 + }, + { + "epoch": 0.8672122883491978, + "grad_norm": 0.07881062477827072, + "learning_rate": 4.375828529421638e-05, + "loss": 2.5872, + "step": 29245 + }, + { + "epoch": 0.8672419417015094, + "grad_norm": 0.07593617588281631, + "learning_rate": 4.3739038123264354e-05, + "loss": 2.5471, + "step": 29246 + }, + { + "epoch": 0.8672715950538208, + "grad_norm": 0.07509779185056686, + "learning_rate": 4.371979499258144e-05, + "loss": 2.5558, + "step": 29247 + }, + { + "epoch": 0.8673012484061323, + "grad_norm": 0.07203761488199234, + "learning_rate": 4.3700555902338124e-05, + "loss": 2.5077, + "step": 29248 + }, + { + "epoch": 0.8673309017584437, + "grad_norm": 0.07752453535795212, + "learning_rate": 4.368132085270465e-05, + "loss": 2.5261, + "step": 29249 + }, + { + "epoch": 0.8673605551107553, + "grad_norm": 0.08202186226844788, + "learning_rate": 4.3662089843851396e-05, + "loss": 2.5421, + "step": 29250 + }, + { + "epoch": 0.8673902084630668, + "grad_norm": 0.0754542350769043, + "learning_rate": 4.3642862875948705e-05, + "loss": 2.5424, + "step": 29251 + }, + { + "epoch": 0.8674198618153782, + "grad_norm": 0.08178656548261642, + "learning_rate": 4.362363994916674e-05, + "loss": 2.5466, + "step": 29252 + }, + { + "epoch": 0.8674495151676898, + "grad_norm": 0.08266546577215195, + "learning_rate": 4.360442106367579e-05, + "loss": 2.5521, + "step": 29253 + }, + { + "epoch": 0.8674791685200012, + "grad_norm": 0.07965591549873352, + "learning_rate": 4.3585206219646066e-05, + "loss": 2.5238, + "step": 29254 + }, + { + "epoch": 0.8675088218723127, + "grad_norm": 0.0727016031742096, + "learning_rate": 4.356599541724765e-05, + "loss": 2.5404, + "step": 29255 + }, + { + "epoch": 0.8675384752246241, + "grad_norm": 0.07862622290849686, + "learning_rate": 4.354678865665068e-05, + "loss": 2.5831, + "step": 29256 + }, + { + "epoch": 0.8675681285769357, + "grad_norm": 0.07968320697546005, + "learning_rate": 4.3527585938025196e-05, + "loss": 2.5415, + "step": 29257 + }, + { + "epoch": 0.8675977819292471, + "grad_norm": 0.0756332278251648, + "learning_rate": 4.350838726154138e-05, + "loss": 2.549, + "step": 29258 + }, + { + "epoch": 0.8676274352815586, + "grad_norm": 0.07394953817129135, + "learning_rate": 4.3489192627369225e-05, + "loss": 2.5378, + "step": 29259 + }, + { + "epoch": 0.86765708863387, + "grad_norm": 0.08074090629816055, + "learning_rate": 4.3470002035678526e-05, + "loss": 2.5386, + "step": 29260 + }, + { + "epoch": 0.8676867419861816, + "grad_norm": 0.08032235503196716, + "learning_rate": 4.3450815486639315e-05, + "loss": 2.5607, + "step": 29261 + }, + { + "epoch": 0.867716395338493, + "grad_norm": 0.07508566230535507, + "learning_rate": 4.343163298042152e-05, + "loss": 2.5454, + "step": 29262 + }, + { + "epoch": 0.8677460486908045, + "grad_norm": 0.07475986331701279, + "learning_rate": 4.3412454517194996e-05, + "loss": 2.5491, + "step": 29263 + }, + { + "epoch": 0.8677757020431159, + "grad_norm": 0.08579301834106445, + "learning_rate": 4.339328009712951e-05, + "loss": 2.5407, + "step": 29264 + }, + { + "epoch": 0.8678053553954275, + "grad_norm": 0.08074580878019333, + "learning_rate": 4.337410972039496e-05, + "loss": 2.5467, + "step": 29265 + }, + { + "epoch": 0.8678350087477389, + "grad_norm": 0.07925388216972351, + "learning_rate": 4.3354943387161005e-05, + "loss": 2.5274, + "step": 29266 + }, + { + "epoch": 0.8678646621000504, + "grad_norm": 0.081407330930233, + "learning_rate": 4.333578109759745e-05, + "loss": 2.5329, + "step": 29267 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 0.07825875282287598, + "learning_rate": 4.3316622851873935e-05, + "loss": 2.5445, + "step": 29268 + }, + { + "epoch": 0.8679239688046734, + "grad_norm": 0.08227569609880447, + "learning_rate": 4.329746865016004e-05, + "loss": 2.5345, + "step": 29269 + }, + { + "epoch": 0.8679536221569848, + "grad_norm": 0.08077972382307053, + "learning_rate": 4.327831849262554e-05, + "loss": 2.5255, + "step": 29270 + }, + { + "epoch": 0.8679832755092963, + "grad_norm": 0.07408777624368668, + "learning_rate": 4.325917237943988e-05, + "loss": 2.5371, + "step": 29271 + }, + { + "epoch": 0.8680129288616079, + "grad_norm": 0.07689346373081207, + "learning_rate": 4.324003031077267e-05, + "loss": 2.5456, + "step": 29272 + }, + { + "epoch": 0.8680425822139193, + "grad_norm": 0.08123950660228729, + "learning_rate": 4.322089228679338e-05, + "loss": 2.5407, + "step": 29273 + }, + { + "epoch": 0.8680722355662308, + "grad_norm": 0.08117479085922241, + "learning_rate": 4.320175830767148e-05, + "loss": 2.5463, + "step": 29274 + }, + { + "epoch": 0.8681018889185422, + "grad_norm": 0.08147584646940231, + "learning_rate": 4.3182628373576447e-05, + "loss": 2.5745, + "step": 29275 + }, + { + "epoch": 0.8681315422708538, + "grad_norm": 0.08092321455478668, + "learning_rate": 4.316350248467765e-05, + "loss": 2.5562, + "step": 29276 + }, + { + "epoch": 0.8681611956231652, + "grad_norm": 0.0887000560760498, + "learning_rate": 4.314438064114445e-05, + "loss": 2.5252, + "step": 29277 + }, + { + "epoch": 0.8681908489754767, + "grad_norm": 0.08186599612236023, + "learning_rate": 4.3125262843146165e-05, + "loss": 2.5418, + "step": 29278 + }, + { + "epoch": 0.8682205023277881, + "grad_norm": 0.08628956973552704, + "learning_rate": 4.310614909085209e-05, + "loss": 2.5308, + "step": 29279 + }, + { + "epoch": 0.8682501556800997, + "grad_norm": 0.07685600221157074, + "learning_rate": 4.308703938443148e-05, + "loss": 2.5229, + "step": 29280 + }, + { + "epoch": 0.8682798090324111, + "grad_norm": 0.08350758254528046, + "learning_rate": 4.30679337240536e-05, + "loss": 2.536, + "step": 29281 + }, + { + "epoch": 0.8683094623847226, + "grad_norm": 0.07607658952474594, + "learning_rate": 4.3048832109887584e-05, + "loss": 2.5406, + "step": 29282 + }, + { + "epoch": 0.868339115737034, + "grad_norm": 0.07348001003265381, + "learning_rate": 4.302973454210263e-05, + "loss": 2.5676, + "step": 29283 + }, + { + "epoch": 0.8683687690893456, + "grad_norm": 0.08153868466615677, + "learning_rate": 4.301064102086777e-05, + "loss": 2.5466, + "step": 29284 + }, + { + "epoch": 0.868398422441657, + "grad_norm": 0.0723540335893631, + "learning_rate": 4.299155154635215e-05, + "loss": 2.5229, + "step": 29285 + }, + { + "epoch": 0.8684280757939685, + "grad_norm": 0.07895821332931519, + "learning_rate": 4.297246611872474e-05, + "loss": 2.5684, + "step": 29286 + }, + { + "epoch": 0.8684577291462799, + "grad_norm": 0.07544318586587906, + "learning_rate": 4.295338473815463e-05, + "loss": 2.5485, + "step": 29287 + }, + { + "epoch": 0.8684873824985915, + "grad_norm": 0.07616911828517914, + "learning_rate": 4.2934307404810746e-05, + "loss": 2.5841, + "step": 29288 + }, + { + "epoch": 0.8685170358509029, + "grad_norm": 0.07657936960458755, + "learning_rate": 4.2915234118862004e-05, + "loss": 2.5359, + "step": 29289 + }, + { + "epoch": 0.8685466892032144, + "grad_norm": 0.07851628959178925, + "learning_rate": 4.289616488047737e-05, + "loss": 2.5495, + "step": 29290 + }, + { + "epoch": 0.8685763425555258, + "grad_norm": 0.08100633323192596, + "learning_rate": 4.287709968982562e-05, + "loss": 2.534, + "step": 29291 + }, + { + "epoch": 0.8686059959078374, + "grad_norm": 0.0766700729727745, + "learning_rate": 4.2858038547075595e-05, + "loss": 2.5732, + "step": 29292 + }, + { + "epoch": 0.8686356492601489, + "grad_norm": 0.0773850753903389, + "learning_rate": 4.2838981452396234e-05, + "loss": 2.54, + "step": 29293 + }, + { + "epoch": 0.8686653026124603, + "grad_norm": 0.08210474997758865, + "learning_rate": 4.2819928405956054e-05, + "loss": 2.5325, + "step": 29294 + }, + { + "epoch": 0.8686949559647719, + "grad_norm": 0.08226916939020157, + "learning_rate": 4.280087940792388e-05, + "loss": 2.569, + "step": 29295 + }, + { + "epoch": 0.8687246093170833, + "grad_norm": 0.0784764289855957, + "learning_rate": 4.278183445846834e-05, + "loss": 2.5218, + "step": 29296 + }, + { + "epoch": 0.8687542626693948, + "grad_norm": 0.07848942279815674, + "learning_rate": 4.2762793557758196e-05, + "loss": 2.5615, + "step": 29297 + }, + { + "epoch": 0.8687839160217062, + "grad_norm": 0.07740426808595657, + "learning_rate": 4.274375670596198e-05, + "loss": 2.5382, + "step": 29298 + }, + { + "epoch": 0.8688135693740178, + "grad_norm": 0.08001505583524704, + "learning_rate": 4.272472390324833e-05, + "loss": 2.5745, + "step": 29299 + }, + { + "epoch": 0.8688432227263292, + "grad_norm": 0.07874500751495361, + "learning_rate": 4.270569514978573e-05, + "loss": 2.5319, + "step": 29300 + }, + { + "epoch": 0.8688728760786407, + "grad_norm": 0.08178669959306717, + "learning_rate": 4.268667044574265e-05, + "loss": 2.5332, + "step": 29301 + }, + { + "epoch": 0.8689025294309521, + "grad_norm": 0.0780249759554863, + "learning_rate": 4.266764979128762e-05, + "loss": 2.5638, + "step": 29302 + }, + { + "epoch": 0.8689321827832637, + "grad_norm": 0.07922513782978058, + "learning_rate": 4.264863318658907e-05, + "loss": 2.549, + "step": 29303 + }, + { + "epoch": 0.8689618361355751, + "grad_norm": 0.07494670152664185, + "learning_rate": 4.2629620631815466e-05, + "loss": 2.5147, + "step": 29304 + }, + { + "epoch": 0.8689914894878866, + "grad_norm": 0.07413026690483093, + "learning_rate": 4.2610612127134964e-05, + "loss": 2.5334, + "step": 29305 + }, + { + "epoch": 0.869021142840198, + "grad_norm": 0.07549001276493073, + "learning_rate": 4.2591607672716026e-05, + "loss": 2.5519, + "step": 29306 + }, + { + "epoch": 0.8690507961925096, + "grad_norm": 0.07392007112503052, + "learning_rate": 4.2572607268726914e-05, + "loss": 2.5499, + "step": 29307 + }, + { + "epoch": 0.869080449544821, + "grad_norm": 0.07248105108737946, + "learning_rate": 4.255361091533588e-05, + "loss": 2.5683, + "step": 29308 + }, + { + "epoch": 0.8691101028971325, + "grad_norm": 0.08311796933412552, + "learning_rate": 4.2534618612711074e-05, + "loss": 2.5715, + "step": 29309 + }, + { + "epoch": 0.869139756249444, + "grad_norm": 0.07535677403211594, + "learning_rate": 4.2515630361020796e-05, + "loss": 2.5271, + "step": 29310 + }, + { + "epoch": 0.8691694096017555, + "grad_norm": 0.07505223155021667, + "learning_rate": 4.249664616043314e-05, + "loss": 2.5323, + "step": 29311 + }, + { + "epoch": 0.8691990629540669, + "grad_norm": 0.07926665991544724, + "learning_rate": 4.247766601111619e-05, + "loss": 2.5497, + "step": 29312 + }, + { + "epoch": 0.8692287163063784, + "grad_norm": 0.07816316932439804, + "learning_rate": 4.245868991323809e-05, + "loss": 2.5211, + "step": 29313 + }, + { + "epoch": 0.86925836965869, + "grad_norm": 0.08232219517230988, + "learning_rate": 4.243971786696688e-05, + "loss": 2.564, + "step": 29314 + }, + { + "epoch": 0.8692880230110014, + "grad_norm": 0.07733162492513657, + "learning_rate": 4.2420749872470464e-05, + "loss": 2.5648, + "step": 29315 + }, + { + "epoch": 0.8693176763633129, + "grad_norm": 0.07848010212182999, + "learning_rate": 4.240178592991678e-05, + "loss": 2.5543, + "step": 29316 + }, + { + "epoch": 0.8693473297156243, + "grad_norm": 0.07750926166772842, + "learning_rate": 4.238282603947385e-05, + "loss": 2.535, + "step": 29317 + }, + { + "epoch": 0.8693769830679359, + "grad_norm": 0.08011862635612488, + "learning_rate": 4.2363870201309605e-05, + "loss": 2.5679, + "step": 29318 + }, + { + "epoch": 0.8694066364202473, + "grad_norm": 0.07637699693441391, + "learning_rate": 4.2344918415591794e-05, + "loss": 2.5182, + "step": 29319 + }, + { + "epoch": 0.8694362897725588, + "grad_norm": 0.0820096880197525, + "learning_rate": 4.232597068248828e-05, + "loss": 2.5275, + "step": 29320 + }, + { + "epoch": 0.8694659431248702, + "grad_norm": 0.08169089257717133, + "learning_rate": 4.2307027002166873e-05, + "loss": 2.5547, + "step": 29321 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 0.0733865424990654, + "learning_rate": 4.228808737479517e-05, + "loss": 2.5343, + "step": 29322 + }, + { + "epoch": 0.8695252498294932, + "grad_norm": 0.07129395753145218, + "learning_rate": 4.2269151800541136e-05, + "loss": 2.5286, + "step": 29323 + }, + { + "epoch": 0.8695549031818047, + "grad_norm": 0.07658976316452026, + "learning_rate": 4.225022027957248e-05, + "loss": 2.5603, + "step": 29324 + }, + { + "epoch": 0.8695845565341161, + "grad_norm": 0.07289545238018036, + "learning_rate": 4.223129281205651e-05, + "loss": 2.5405, + "step": 29325 + }, + { + "epoch": 0.8696142098864277, + "grad_norm": 0.07555510848760605, + "learning_rate": 4.221236939816109e-05, + "loss": 2.5604, + "step": 29326 + }, + { + "epoch": 0.8696438632387391, + "grad_norm": 0.07328113168478012, + "learning_rate": 4.219345003805369e-05, + "loss": 2.522, + "step": 29327 + }, + { + "epoch": 0.8696735165910506, + "grad_norm": 0.07896457612514496, + "learning_rate": 4.217453473190186e-05, + "loss": 2.5463, + "step": 29328 + }, + { + "epoch": 0.869703169943362, + "grad_norm": 0.0747060775756836, + "learning_rate": 4.2155623479873116e-05, + "loss": 2.5044, + "step": 29329 + }, + { + "epoch": 0.8697328232956736, + "grad_norm": 0.07150979340076447, + "learning_rate": 4.213671628213489e-05, + "loss": 2.5534, + "step": 29330 + }, + { + "epoch": 0.869762476647985, + "grad_norm": 0.07107895612716675, + "learning_rate": 4.2117813138854654e-05, + "loss": 2.5681, + "step": 29331 + }, + { + "epoch": 0.8697921300002965, + "grad_norm": 0.07509071379899979, + "learning_rate": 4.209891405019983e-05, + "loss": 2.5309, + "step": 29332 + }, + { + "epoch": 0.869821783352608, + "grad_norm": 0.07534404844045639, + "learning_rate": 4.2080019016337675e-05, + "loss": 2.556, + "step": 29333 + }, + { + "epoch": 0.8698514367049195, + "grad_norm": 0.07528921216726303, + "learning_rate": 4.20611280374355e-05, + "loss": 2.5519, + "step": 29334 + }, + { + "epoch": 0.869881090057231, + "grad_norm": 0.0794948861002922, + "learning_rate": 4.204224111366073e-05, + "loss": 2.5192, + "step": 29335 + }, + { + "epoch": 0.8699107434095424, + "grad_norm": 0.07939038425683975, + "learning_rate": 4.202335824518044e-05, + "loss": 2.5206, + "step": 29336 + }, + { + "epoch": 0.869940396761854, + "grad_norm": 0.0726289227604866, + "learning_rate": 4.2004479432162014e-05, + "loss": 2.587, + "step": 29337 + }, + { + "epoch": 0.8699700501141654, + "grad_norm": 0.07613028585910797, + "learning_rate": 4.198560467477247e-05, + "loss": 2.5645, + "step": 29338 + }, + { + "epoch": 0.8699997034664769, + "grad_norm": 0.08426694571971893, + "learning_rate": 4.196673397317902e-05, + "loss": 2.5739, + "step": 29339 + }, + { + "epoch": 0.8700293568187883, + "grad_norm": 0.07778377085924149, + "learning_rate": 4.1947867327548794e-05, + "loss": 2.5508, + "step": 29340 + }, + { + "epoch": 0.8700590101710999, + "grad_norm": 0.07599549740552902, + "learning_rate": 4.1929004738048784e-05, + "loss": 2.5541, + "step": 29341 + }, + { + "epoch": 0.8700886635234113, + "grad_norm": 0.08370262384414673, + "learning_rate": 4.1910146204846126e-05, + "loss": 2.5309, + "step": 29342 + }, + { + "epoch": 0.8701183168757228, + "grad_norm": 0.07412778586149216, + "learning_rate": 4.1891291728107684e-05, + "loss": 2.5422, + "step": 29343 + }, + { + "epoch": 0.8701479702280343, + "grad_norm": 0.07569531351327896, + "learning_rate": 4.18724413080005e-05, + "loss": 2.5335, + "step": 29344 + }, + { + "epoch": 0.8701776235803458, + "grad_norm": 0.08582575619220734, + "learning_rate": 4.185359494469149e-05, + "loss": 2.5669, + "step": 29345 + }, + { + "epoch": 0.8702072769326572, + "grad_norm": 0.07994279265403748, + "learning_rate": 4.183475263834757e-05, + "loss": 2.5255, + "step": 29346 + }, + { + "epoch": 0.8702369302849687, + "grad_norm": 0.07606803625822067, + "learning_rate": 4.1815914389135514e-05, + "loss": 2.5244, + "step": 29347 + }, + { + "epoch": 0.8702665836372802, + "grad_norm": 0.09201238304376602, + "learning_rate": 4.179708019722223e-05, + "loss": 2.5329, + "step": 29348 + }, + { + "epoch": 0.8702962369895917, + "grad_norm": 0.0843484029173851, + "learning_rate": 4.1778250062774414e-05, + "loss": 2.528, + "step": 29349 + }, + { + "epoch": 0.8703258903419031, + "grad_norm": 0.07616923004388809, + "learning_rate": 4.175942398595889e-05, + "loss": 2.4961, + "step": 29350 + }, + { + "epoch": 0.8703555436942146, + "grad_norm": 0.08377613127231598, + "learning_rate": 4.174060196694224e-05, + "loss": 2.5192, + "step": 29351 + }, + { + "epoch": 0.8703851970465261, + "grad_norm": 0.07565732300281525, + "learning_rate": 4.172178400589127e-05, + "loss": 2.548, + "step": 29352 + }, + { + "epoch": 0.8704148503988376, + "grad_norm": 0.0825032964348793, + "learning_rate": 4.1702970102972584e-05, + "loss": 2.597, + "step": 29353 + }, + { + "epoch": 0.870444503751149, + "grad_norm": 0.07969469577074051, + "learning_rate": 4.1684160258352754e-05, + "loss": 2.5744, + "step": 29354 + }, + { + "epoch": 0.8704741571034605, + "grad_norm": 0.08221877366304398, + "learning_rate": 4.166535447219833e-05, + "loss": 2.5532, + "step": 29355 + }, + { + "epoch": 0.8705038104557721, + "grad_norm": 0.07789237797260284, + "learning_rate": 4.1646552744675884e-05, + "loss": 2.5368, + "step": 29356 + }, + { + "epoch": 0.8705334638080835, + "grad_norm": 0.08003073185682297, + "learning_rate": 4.16277550759519e-05, + "loss": 2.5448, + "step": 29357 + }, + { + "epoch": 0.870563117160395, + "grad_norm": 0.07714734226465225, + "learning_rate": 4.1608961466192807e-05, + "loss": 2.5622, + "step": 29358 + }, + { + "epoch": 0.8705927705127064, + "grad_norm": 0.07814734429121017, + "learning_rate": 4.1590171915565076e-05, + "loss": 2.5615, + "step": 29359 + }, + { + "epoch": 0.870622423865018, + "grad_norm": 0.07690312713384628, + "learning_rate": 4.157138642423502e-05, + "loss": 2.5467, + "step": 29360 + }, + { + "epoch": 0.8706520772173294, + "grad_norm": 0.0809563472867012, + "learning_rate": 4.155260499236896e-05, + "loss": 2.5526, + "step": 29361 + }, + { + "epoch": 0.8706817305696409, + "grad_norm": 0.07650040090084076, + "learning_rate": 4.153382762013336e-05, + "loss": 2.491, + "step": 29362 + }, + { + "epoch": 0.8707113839219524, + "grad_norm": 0.07654201984405518, + "learning_rate": 4.151505430769437e-05, + "loss": 2.5746, + "step": 29363 + }, + { + "epoch": 0.8707410372742639, + "grad_norm": 0.07280359417200089, + "learning_rate": 4.14962850552183e-05, + "loss": 2.5436, + "step": 29364 + }, + { + "epoch": 0.8707706906265753, + "grad_norm": 0.07738835364580154, + "learning_rate": 4.147751986287135e-05, + "loss": 2.517, + "step": 29365 + }, + { + "epoch": 0.8708003439788868, + "grad_norm": 0.07420811057090759, + "learning_rate": 4.1458758730819616e-05, + "loss": 2.5772, + "step": 29366 + }, + { + "epoch": 0.8708299973311983, + "grad_norm": 0.07575665414333344, + "learning_rate": 4.144000165922929e-05, + "loss": 2.5304, + "step": 29367 + }, + { + "epoch": 0.8708596506835098, + "grad_norm": 0.08368715643882751, + "learning_rate": 4.142124864826646e-05, + "loss": 2.559, + "step": 29368 + }, + { + "epoch": 0.8708893040358212, + "grad_norm": 0.07477430999279022, + "learning_rate": 4.140249969809729e-05, + "loss": 2.5324, + "step": 29369 + }, + { + "epoch": 0.8709189573881327, + "grad_norm": 0.07806368917226791, + "learning_rate": 4.138375480888762e-05, + "loss": 2.5458, + "step": 29370 + }, + { + "epoch": 0.8709486107404442, + "grad_norm": 0.07952587306499481, + "learning_rate": 4.13650139808035e-05, + "loss": 2.5392, + "step": 29371 + }, + { + "epoch": 0.8709782640927557, + "grad_norm": 0.07495667040348053, + "learning_rate": 4.134627721401096e-05, + "loss": 2.5448, + "step": 29372 + }, + { + "epoch": 0.8710079174450671, + "grad_norm": 0.07239525020122528, + "learning_rate": 4.132754450867582e-05, + "loss": 2.5084, + "step": 29373 + }, + { + "epoch": 0.8710375707973786, + "grad_norm": 0.07579553872346878, + "learning_rate": 4.130881586496388e-05, + "loss": 2.523, + "step": 29374 + }, + { + "epoch": 0.8710672241496901, + "grad_norm": 0.07876887917518616, + "learning_rate": 4.129009128304123e-05, + "loss": 2.5631, + "step": 29375 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 0.08161145448684692, + "learning_rate": 4.127137076307358e-05, + "loss": 2.5762, + "step": 29376 + }, + { + "epoch": 0.8711265308543131, + "grad_norm": 0.07702232152223587, + "learning_rate": 4.1252654305226676e-05, + "loss": 2.5724, + "step": 29377 + }, + { + "epoch": 0.8711561842066246, + "grad_norm": 0.08355841040611267, + "learning_rate": 4.1233941909666226e-05, + "loss": 2.5343, + "step": 29378 + }, + { + "epoch": 0.8711858375589361, + "grad_norm": 0.0819844901561737, + "learning_rate": 4.121523357655799e-05, + "loss": 2.5537, + "step": 29379 + }, + { + "epoch": 0.8712154909112475, + "grad_norm": 0.07551446557044983, + "learning_rate": 4.119652930606771e-05, + "loss": 2.5718, + "step": 29380 + }, + { + "epoch": 0.871245144263559, + "grad_norm": 0.0756363794207573, + "learning_rate": 4.117782909836082e-05, + "loss": 2.5368, + "step": 29381 + }, + { + "epoch": 0.8712747976158705, + "grad_norm": 0.08171243220567703, + "learning_rate": 4.115913295360302e-05, + "loss": 2.503, + "step": 29382 + }, + { + "epoch": 0.871304450968182, + "grad_norm": 0.07665048539638519, + "learning_rate": 4.114044087195984e-05, + "loss": 2.5761, + "step": 29383 + }, + { + "epoch": 0.8713341043204934, + "grad_norm": 0.0718286857008934, + "learning_rate": 4.1121752853596814e-05, + "loss": 2.5721, + "step": 29384 + }, + { + "epoch": 0.8713637576728049, + "grad_norm": 0.0758177638053894, + "learning_rate": 4.110306889867943e-05, + "loss": 2.552, + "step": 29385 + }, + { + "epoch": 0.8713934110251164, + "grad_norm": 0.0811164528131485, + "learning_rate": 4.108438900737316e-05, + "loss": 2.545, + "step": 29386 + }, + { + "epoch": 0.8714230643774279, + "grad_norm": 0.0797996073961258, + "learning_rate": 4.106571317984325e-05, + "loss": 2.5218, + "step": 29387 + }, + { + "epoch": 0.8714527177297393, + "grad_norm": 0.07942166179418564, + "learning_rate": 4.104704141625537e-05, + "loss": 2.5596, + "step": 29388 + }, + { + "epoch": 0.8714823710820508, + "grad_norm": 0.07343500107526779, + "learning_rate": 4.1028373716774704e-05, + "loss": 2.5185, + "step": 29389 + }, + { + "epoch": 0.8715120244343623, + "grad_norm": 0.08286964893341064, + "learning_rate": 4.100971008156667e-05, + "loss": 2.5392, + "step": 29390 + }, + { + "epoch": 0.8715416777866738, + "grad_norm": 0.08029241859912872, + "learning_rate": 4.0991050510796325e-05, + "loss": 2.5626, + "step": 29391 + }, + { + "epoch": 0.8715713311389852, + "grad_norm": 0.07316453009843826, + "learning_rate": 4.0972395004629015e-05, + "loss": 2.5475, + "step": 29392 + }, + { + "epoch": 0.8716009844912967, + "grad_norm": 0.07651588320732117, + "learning_rate": 4.095374356322995e-05, + "loss": 2.5467, + "step": 29393 + }, + { + "epoch": 0.8716306378436082, + "grad_norm": 0.075511135160923, + "learning_rate": 4.093509618676433e-05, + "loss": 2.5436, + "step": 29394 + }, + { + "epoch": 0.8716602911959197, + "grad_norm": 0.07910162210464478, + "learning_rate": 4.0916452875397135e-05, + "loss": 2.5664, + "step": 29395 + }, + { + "epoch": 0.8716899445482312, + "grad_norm": 0.07723741978406906, + "learning_rate": 4.089781362929362e-05, + "loss": 2.5418, + "step": 29396 + }, + { + "epoch": 0.8717195979005427, + "grad_norm": 0.07640731334686279, + "learning_rate": 4.087917844861877e-05, + "loss": 2.5215, + "step": 29397 + }, + { + "epoch": 0.8717492512528542, + "grad_norm": 0.08168958872556686, + "learning_rate": 4.0860547333537554e-05, + "loss": 2.5923, + "step": 29398 + }, + { + "epoch": 0.8717789046051656, + "grad_norm": 0.0788297951221466, + "learning_rate": 4.084192028421496e-05, + "loss": 2.5696, + "step": 29399 + }, + { + "epoch": 0.8718085579574771, + "grad_norm": 0.07524789124727249, + "learning_rate": 4.082329730081613e-05, + "loss": 2.5171, + "step": 29400 + }, + { + "epoch": 0.8718382113097886, + "grad_norm": 0.08158725500106812, + "learning_rate": 4.080467838350571e-05, + "loss": 2.5188, + "step": 29401 + }, + { + "epoch": 0.8718678646621001, + "grad_norm": 0.08704713732004166, + "learning_rate": 4.078606353244874e-05, + "loss": 2.5709, + "step": 29402 + }, + { + "epoch": 0.8718975180144115, + "grad_norm": 0.07425197213888168, + "learning_rate": 4.0767452747809964e-05, + "loss": 2.5222, + "step": 29403 + }, + { + "epoch": 0.871927171366723, + "grad_norm": 0.08054840564727783, + "learning_rate": 4.0748846029754205e-05, + "loss": 2.5428, + "step": 29404 + }, + { + "epoch": 0.8719568247190345, + "grad_norm": 0.08314453065395355, + "learning_rate": 4.0730243378446273e-05, + "loss": 2.5534, + "step": 29405 + }, + { + "epoch": 0.871986478071346, + "grad_norm": 0.07202122360467911, + "learning_rate": 4.071164479405082e-05, + "loss": 2.5223, + "step": 29406 + }, + { + "epoch": 0.8720161314236574, + "grad_norm": 0.07509004324674606, + "learning_rate": 4.0693050276732645e-05, + "loss": 2.5471, + "step": 29407 + }, + { + "epoch": 0.872045784775969, + "grad_norm": 0.08416510373353958, + "learning_rate": 4.0674459826656295e-05, + "loss": 2.5307, + "step": 29408 + }, + { + "epoch": 0.8720754381282804, + "grad_norm": 0.07689473032951355, + "learning_rate": 4.065587344398653e-05, + "loss": 2.5585, + "step": 29409 + }, + { + "epoch": 0.8721050914805919, + "grad_norm": 0.07676947861909866, + "learning_rate": 4.063729112888775e-05, + "loss": 2.5323, + "step": 29410 + }, + { + "epoch": 0.8721347448329033, + "grad_norm": 0.0826093852519989, + "learning_rate": 4.061871288152469e-05, + "loss": 2.5722, + "step": 29411 + }, + { + "epoch": 0.8721643981852149, + "grad_norm": 0.07786665856838226, + "learning_rate": 4.060013870206175e-05, + "loss": 2.5554, + "step": 29412 + }, + { + "epoch": 0.8721940515375263, + "grad_norm": 0.08021440356969833, + "learning_rate": 4.058156859066342e-05, + "loss": 2.5585, + "step": 29413 + }, + { + "epoch": 0.8722237048898378, + "grad_norm": 0.07914865016937256, + "learning_rate": 4.0563002547494175e-05, + "loss": 2.552, + "step": 29414 + }, + { + "epoch": 0.8722533582421492, + "grad_norm": 0.07764888554811478, + "learning_rate": 4.054444057271839e-05, + "loss": 2.5698, + "step": 29415 + }, + { + "epoch": 0.8722830115944608, + "grad_norm": 0.07611449062824249, + "learning_rate": 4.0525882666500426e-05, + "loss": 2.5476, + "step": 29416 + }, + { + "epoch": 0.8723126649467723, + "grad_norm": 0.08169833570718765, + "learning_rate": 4.0507328829004706e-05, + "loss": 2.5366, + "step": 29417 + }, + { + "epoch": 0.8723423182990837, + "grad_norm": 0.07708290964365005, + "learning_rate": 4.048877906039539e-05, + "loss": 2.5437, + "step": 29418 + }, + { + "epoch": 0.8723719716513952, + "grad_norm": 0.08229146897792816, + "learning_rate": 4.047023336083683e-05, + "loss": 2.5711, + "step": 29419 + }, + { + "epoch": 0.8724016250037067, + "grad_norm": 0.07565054297447205, + "learning_rate": 4.045169173049329e-05, + "loss": 2.5232, + "step": 29420 + }, + { + "epoch": 0.8724312783560182, + "grad_norm": 0.08044075965881348, + "learning_rate": 4.0433154169528805e-05, + "loss": 2.5632, + "step": 29421 + }, + { + "epoch": 0.8724609317083296, + "grad_norm": 0.07995504885911942, + "learning_rate": 4.0414620678107695e-05, + "loss": 2.5101, + "step": 29422 + }, + { + "epoch": 0.8724905850606411, + "grad_norm": 0.07495420426130295, + "learning_rate": 4.0396091256393986e-05, + "loss": 2.5573, + "step": 29423 + }, + { + "epoch": 0.8725202384129526, + "grad_norm": 0.07136008888483047, + "learning_rate": 4.037756590455188e-05, + "loss": 2.5544, + "step": 29424 + }, + { + "epoch": 0.8725498917652641, + "grad_norm": 0.07743575423955917, + "learning_rate": 4.035904462274509e-05, + "loss": 2.5802, + "step": 29425 + }, + { + "epoch": 0.8725795451175755, + "grad_norm": 0.07773937284946442, + "learning_rate": 4.034052741113803e-05, + "loss": 2.556, + "step": 29426 + }, + { + "epoch": 0.872609198469887, + "grad_norm": 0.07677038758993149, + "learning_rate": 4.03220142698944e-05, + "loss": 2.5398, + "step": 29427 + }, + { + "epoch": 0.8726388518221985, + "grad_norm": 0.07558224350214005, + "learning_rate": 4.0303505199178305e-05, + "loss": 2.5243, + "step": 29428 + }, + { + "epoch": 0.87266850517451, + "grad_norm": 0.07888618111610413, + "learning_rate": 4.0285000199153544e-05, + "loss": 2.5954, + "step": 29429 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 0.08999216556549072, + "learning_rate": 4.026649926998405e-05, + "loss": 2.579, + "step": 29430 + }, + { + "epoch": 0.872727811879133, + "grad_norm": 0.07514936476945877, + "learning_rate": 4.024800241183363e-05, + "loss": 2.5133, + "step": 29431 + }, + { + "epoch": 0.8727574652314444, + "grad_norm": 0.08088062703609467, + "learning_rate": 4.022950962486599e-05, + "loss": 2.545, + "step": 29432 + }, + { + "epoch": 0.8727871185837559, + "grad_norm": 0.0826995000243187, + "learning_rate": 4.0211020909245e-05, + "loss": 2.5481, + "step": 29433 + }, + { + "epoch": 0.8728167719360673, + "grad_norm": 0.07433909922838211, + "learning_rate": 4.019253626513436e-05, + "loss": 2.5351, + "step": 29434 + }, + { + "epoch": 0.8728464252883789, + "grad_norm": 0.08441627025604248, + "learning_rate": 4.017405569269783e-05, + "loss": 2.5304, + "step": 29435 + }, + { + "epoch": 0.8728760786406903, + "grad_norm": 0.07896623760461807, + "learning_rate": 4.0155579192098834e-05, + "loss": 2.5425, + "step": 29436 + }, + { + "epoch": 0.8729057319930018, + "grad_norm": 0.0733385682106018, + "learning_rate": 4.013710676350113e-05, + "loss": 2.5608, + "step": 29437 + }, + { + "epoch": 0.8729353853453133, + "grad_norm": 0.07813764363527298, + "learning_rate": 4.0118638407068195e-05, + "loss": 2.5536, + "step": 29438 + }, + { + "epoch": 0.8729650386976248, + "grad_norm": 0.07903548330068588, + "learning_rate": 4.0100174122963795e-05, + "loss": 2.5139, + "step": 29439 + }, + { + "epoch": 0.8729946920499363, + "grad_norm": 0.07746938616037369, + "learning_rate": 4.008171391135124e-05, + "loss": 2.5694, + "step": 29440 + }, + { + "epoch": 0.8730243454022477, + "grad_norm": 0.07841511815786362, + "learning_rate": 4.006325777239406e-05, + "loss": 2.5568, + "step": 29441 + }, + { + "epoch": 0.8730539987545592, + "grad_norm": 0.07798177003860474, + "learning_rate": 4.0044805706255685e-05, + "loss": 2.5061, + "step": 29442 + }, + { + "epoch": 0.8730836521068707, + "grad_norm": 0.07315367460250854, + "learning_rate": 4.0026357713099484e-05, + "loss": 2.5288, + "step": 29443 + }, + { + "epoch": 0.8731133054591822, + "grad_norm": 0.07640428096055984, + "learning_rate": 4.000791379308882e-05, + "loss": 2.504, + "step": 29444 + }, + { + "epoch": 0.8731429588114936, + "grad_norm": 0.07357891649007797, + "learning_rate": 3.998947394638719e-05, + "loss": 2.5241, + "step": 29445 + }, + { + "epoch": 0.8731726121638052, + "grad_norm": 0.07070562988519669, + "learning_rate": 3.9971038173157615e-05, + "loss": 2.5308, + "step": 29446 + }, + { + "epoch": 0.8732022655161166, + "grad_norm": 0.0793648287653923, + "learning_rate": 3.995260647356341e-05, + "loss": 2.5438, + "step": 29447 + }, + { + "epoch": 0.8732319188684281, + "grad_norm": 0.0721527636051178, + "learning_rate": 3.99341788477679e-05, + "loss": 2.5452, + "step": 29448 + }, + { + "epoch": 0.8732615722207395, + "grad_norm": 0.07474792003631592, + "learning_rate": 3.991575529593422e-05, + "loss": 2.5281, + "step": 29449 + }, + { + "epoch": 0.873291225573051, + "grad_norm": 0.0738494023680687, + "learning_rate": 3.989733581822547e-05, + "loss": 2.5391, + "step": 29450 + }, + { + "epoch": 0.8733208789253625, + "grad_norm": 0.07781246304512024, + "learning_rate": 3.9878920414804675e-05, + "loss": 2.5267, + "step": 29451 + }, + { + "epoch": 0.873350532277674, + "grad_norm": 0.07438182085752487, + "learning_rate": 3.986050908583511e-05, + "loss": 2.5516, + "step": 29452 + }, + { + "epoch": 0.8733801856299854, + "grad_norm": 0.07396610081195831, + "learning_rate": 3.984210183147974e-05, + "loss": 2.5432, + "step": 29453 + }, + { + "epoch": 0.873409838982297, + "grad_norm": 0.077072873711586, + "learning_rate": 3.982369865190155e-05, + "loss": 2.5552, + "step": 29454 + }, + { + "epoch": 0.8734394923346084, + "grad_norm": 0.07670972496271133, + "learning_rate": 3.9805299547263475e-05, + "loss": 2.5698, + "step": 29455 + }, + { + "epoch": 0.8734691456869199, + "grad_norm": 0.07724186778068542, + "learning_rate": 3.9786904517728595e-05, + "loss": 2.5404, + "step": 29456 + }, + { + "epoch": 0.8734987990392313, + "grad_norm": 0.07684921473264694, + "learning_rate": 3.9768513563459563e-05, + "loss": 2.5372, + "step": 29457 + }, + { + "epoch": 0.8735284523915429, + "grad_norm": 0.07554096728563309, + "learning_rate": 3.97501266846193e-05, + "loss": 2.5764, + "step": 29458 + }, + { + "epoch": 0.8735581057438544, + "grad_norm": 0.07016123086214066, + "learning_rate": 3.973174388137068e-05, + "loss": 2.5265, + "step": 29459 + }, + { + "epoch": 0.8735877590961658, + "grad_norm": 0.0756833478808403, + "learning_rate": 3.971336515387652e-05, + "loss": 2.5713, + "step": 29460 + }, + { + "epoch": 0.8736174124484773, + "grad_norm": 0.07277212291955948, + "learning_rate": 3.969499050229947e-05, + "loss": 2.5449, + "step": 29461 + }, + { + "epoch": 0.8736470658007888, + "grad_norm": 0.0786782056093216, + "learning_rate": 3.967661992680233e-05, + "loss": 2.543, + "step": 29462 + }, + { + "epoch": 0.8736767191531003, + "grad_norm": 0.07504789531230927, + "learning_rate": 3.96582534275477e-05, + "loss": 2.5494, + "step": 29463 + }, + { + "epoch": 0.8737063725054117, + "grad_norm": 0.07486768811941147, + "learning_rate": 3.963989100469817e-05, + "loss": 2.5431, + "step": 29464 + }, + { + "epoch": 0.8737360258577233, + "grad_norm": 0.07080205529928207, + "learning_rate": 3.9621532658416505e-05, + "loss": 2.5599, + "step": 29465 + }, + { + "epoch": 0.8737656792100347, + "grad_norm": 0.07869158685207367, + "learning_rate": 3.960317838886529e-05, + "loss": 2.5272, + "step": 29466 + }, + { + "epoch": 0.8737953325623462, + "grad_norm": 0.0783253014087677, + "learning_rate": 3.95848281962069e-05, + "loss": 2.5601, + "step": 29467 + }, + { + "epoch": 0.8738249859146576, + "grad_norm": 0.07596361637115479, + "learning_rate": 3.956648208060382e-05, + "loss": 2.5381, + "step": 29468 + }, + { + "epoch": 0.8738546392669692, + "grad_norm": 0.0779070109128952, + "learning_rate": 3.954814004221863e-05, + "loss": 2.5354, + "step": 29469 + }, + { + "epoch": 0.8738842926192806, + "grad_norm": 0.07743804156780243, + "learning_rate": 3.952980208121365e-05, + "loss": 2.5306, + "step": 29470 + }, + { + "epoch": 0.8739139459715921, + "grad_norm": 0.07707946002483368, + "learning_rate": 3.951146819775136e-05, + "loss": 2.5653, + "step": 29471 + }, + { + "epoch": 0.8739435993239035, + "grad_norm": 0.08470811694860458, + "learning_rate": 3.9493138391994034e-05, + "loss": 2.54, + "step": 29472 + }, + { + "epoch": 0.8739732526762151, + "grad_norm": 0.07791080325841904, + "learning_rate": 3.947481266410402e-05, + "loss": 2.5477, + "step": 29473 + }, + { + "epoch": 0.8740029060285265, + "grad_norm": 0.0715622752904892, + "learning_rate": 3.945649101424359e-05, + "loss": 2.5535, + "step": 29474 + }, + { + "epoch": 0.874032559380838, + "grad_norm": 0.07725470513105392, + "learning_rate": 3.9438173442575e-05, + "loss": 2.5219, + "step": 29475 + }, + { + "epoch": 0.8740622127331494, + "grad_norm": 0.07878373563289642, + "learning_rate": 3.94198599492604e-05, + "loss": 2.5121, + "step": 29476 + }, + { + "epoch": 0.874091866085461, + "grad_norm": 0.07960671931505203, + "learning_rate": 3.940155053446198e-05, + "loss": 2.5493, + "step": 29477 + }, + { + "epoch": 0.8741215194377724, + "grad_norm": 0.0793587937951088, + "learning_rate": 3.938324519834197e-05, + "loss": 2.5426, + "step": 29478 + }, + { + "epoch": 0.8741511727900839, + "grad_norm": 0.07934927940368652, + "learning_rate": 3.936494394106232e-05, + "loss": 2.5231, + "step": 29479 + }, + { + "epoch": 0.8741808261423955, + "grad_norm": 0.07897951453924179, + "learning_rate": 3.934664676278515e-05, + "loss": 2.5273, + "step": 29480 + }, + { + "epoch": 0.8742104794947069, + "grad_norm": 0.07442426681518555, + "learning_rate": 3.9328353663672546e-05, + "loss": 2.5254, + "step": 29481 + }, + { + "epoch": 0.8742401328470184, + "grad_norm": 0.07685764878988266, + "learning_rate": 3.931006464388642e-05, + "loss": 2.5624, + "step": 29482 + }, + { + "epoch": 0.8742697861993298, + "grad_norm": 0.07531990110874176, + "learning_rate": 3.929177970358877e-05, + "loss": 2.5291, + "step": 29483 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 0.08087708801031113, + "learning_rate": 3.927349884294146e-05, + "loss": 2.5337, + "step": 29484 + }, + { + "epoch": 0.8743290929039528, + "grad_norm": 0.07368254661560059, + "learning_rate": 3.925522206210641e-05, + "loss": 2.5152, + "step": 29485 + }, + { + "epoch": 0.8743587462562643, + "grad_norm": 0.07820434868335724, + "learning_rate": 3.923694936124544e-05, + "loss": 2.5265, + "step": 29486 + }, + { + "epoch": 0.8743883996085757, + "grad_norm": 0.08044466376304626, + "learning_rate": 3.921868074052037e-05, + "loss": 2.5424, + "step": 29487 + }, + { + "epoch": 0.8744180529608873, + "grad_norm": 0.07480476051568985, + "learning_rate": 3.9200416200093016e-05, + "loss": 2.5304, + "step": 29488 + }, + { + "epoch": 0.8744477063131987, + "grad_norm": 0.0798240676522255, + "learning_rate": 3.918215574012501e-05, + "loss": 2.5336, + "step": 29489 + }, + { + "epoch": 0.8744773596655102, + "grad_norm": 0.07818467915058136, + "learning_rate": 3.916389936077819e-05, + "loss": 2.5624, + "step": 29490 + }, + { + "epoch": 0.8745070130178216, + "grad_norm": 0.07571802288293839, + "learning_rate": 3.914564706221407e-05, + "loss": 2.5307, + "step": 29491 + }, + { + "epoch": 0.8745366663701332, + "grad_norm": 0.07974868267774582, + "learning_rate": 3.912739884459443e-05, + "loss": 2.5557, + "step": 29492 + }, + { + "epoch": 0.8745663197224446, + "grad_norm": 0.07348747551441193, + "learning_rate": 3.9109154708080684e-05, + "loss": 2.5765, + "step": 29493 + }, + { + "epoch": 0.8745959730747561, + "grad_norm": 0.07995016127824783, + "learning_rate": 3.9090914652834544e-05, + "loss": 2.5738, + "step": 29494 + }, + { + "epoch": 0.8746256264270675, + "grad_norm": 0.07417904585599899, + "learning_rate": 3.907267867901748e-05, + "loss": 2.4892, + "step": 29495 + }, + { + "epoch": 0.8746552797793791, + "grad_norm": 0.07611145824193954, + "learning_rate": 3.905444678679093e-05, + "loss": 2.5529, + "step": 29496 + }, + { + "epoch": 0.8746849331316905, + "grad_norm": 0.07565909624099731, + "learning_rate": 3.903621897631637e-05, + "loss": 2.5621, + "step": 29497 + }, + { + "epoch": 0.874714586484002, + "grad_norm": 0.07580526918172836, + "learning_rate": 3.901799524775523e-05, + "loss": 2.5314, + "step": 29498 + }, + { + "epoch": 0.8747442398363134, + "grad_norm": 0.07315603643655777, + "learning_rate": 3.899977560126888e-05, + "loss": 2.561, + "step": 29499 + }, + { + "epoch": 0.874773893188625, + "grad_norm": 0.0775250792503357, + "learning_rate": 3.8981560037018745e-05, + "loss": 2.534, + "step": 29500 + }, + { + "epoch": 0.8748035465409365, + "grad_norm": 0.07533344626426697, + "learning_rate": 3.896334855516587e-05, + "loss": 2.5679, + "step": 29501 + }, + { + "epoch": 0.8748331998932479, + "grad_norm": 0.07776907831430435, + "learning_rate": 3.8945141155871775e-05, + "loss": 2.5252, + "step": 29502 + }, + { + "epoch": 0.8748628532455595, + "grad_norm": 0.07842420786619186, + "learning_rate": 3.892693783929746e-05, + "loss": 2.5489, + "step": 29503 + }, + { + "epoch": 0.8748925065978709, + "grad_norm": 0.07624290883541107, + "learning_rate": 3.8908738605604345e-05, + "loss": 2.5547, + "step": 29504 + }, + { + "epoch": 0.8749221599501824, + "grad_norm": 0.0716981515288353, + "learning_rate": 3.8890543454953465e-05, + "loss": 2.5371, + "step": 29505 + }, + { + "epoch": 0.8749518133024938, + "grad_norm": 0.07740405201911926, + "learning_rate": 3.887235238750597e-05, + "loss": 2.489, + "step": 29506 + }, + { + "epoch": 0.8749814666548054, + "grad_norm": 0.07538507133722305, + "learning_rate": 3.885416540342301e-05, + "loss": 2.5648, + "step": 29507 + }, + { + "epoch": 0.8750111200071168, + "grad_norm": 0.0769173726439476, + "learning_rate": 3.883598250286552e-05, + "loss": 2.5369, + "step": 29508 + }, + { + "epoch": 0.8750407733594283, + "grad_norm": 0.07267671823501587, + "learning_rate": 3.881780368599458e-05, + "loss": 2.5375, + "step": 29509 + }, + { + "epoch": 0.8750704267117397, + "grad_norm": 0.08302045613527298, + "learning_rate": 3.8799628952971124e-05, + "loss": 2.5499, + "step": 29510 + }, + { + "epoch": 0.8751000800640513, + "grad_norm": 0.07105637341737747, + "learning_rate": 3.878145830395624e-05, + "loss": 2.5276, + "step": 29511 + }, + { + "epoch": 0.8751297334163627, + "grad_norm": 0.07810850441455841, + "learning_rate": 3.8763291739110586e-05, + "loss": 2.5298, + "step": 29512 + }, + { + "epoch": 0.8751593867686742, + "grad_norm": 0.07905500382184982, + "learning_rate": 3.874512925859519e-05, + "loss": 2.5558, + "step": 29513 + }, + { + "epoch": 0.8751890401209856, + "grad_norm": 0.07740423083305359, + "learning_rate": 3.872697086257082e-05, + "loss": 2.5409, + "step": 29514 + }, + { + "epoch": 0.8752186934732972, + "grad_norm": 0.07776550203561783, + "learning_rate": 3.8708816551198344e-05, + "loss": 2.5232, + "step": 29515 + }, + { + "epoch": 0.8752483468256086, + "grad_norm": 0.07725877314805984, + "learning_rate": 3.869066632463836e-05, + "loss": 2.5172, + "step": 29516 + }, + { + "epoch": 0.8752780001779201, + "grad_norm": 0.08657562732696533, + "learning_rate": 3.867252018305178e-05, + "loss": 2.5884, + "step": 29517 + }, + { + "epoch": 0.8753076535302315, + "grad_norm": 0.08049514889717102, + "learning_rate": 3.865437812659922e-05, + "loss": 2.5447, + "step": 29518 + }, + { + "epoch": 0.8753373068825431, + "grad_norm": 0.08156053721904755, + "learning_rate": 3.863624015544137e-05, + "loss": 2.5277, + "step": 29519 + }, + { + "epoch": 0.8753669602348545, + "grad_norm": 0.08027438074350357, + "learning_rate": 3.861810626973877e-05, + "loss": 2.5654, + "step": 29520 + }, + { + "epoch": 0.875396613587166, + "grad_norm": 0.07155420631170273, + "learning_rate": 3.859997646965213e-05, + "loss": 2.5383, + "step": 29521 + }, + { + "epoch": 0.8754262669394776, + "grad_norm": 0.07229495048522949, + "learning_rate": 3.858185075534182e-05, + "loss": 2.5551, + "step": 29522 + }, + { + "epoch": 0.875455920291789, + "grad_norm": 0.07709401100873947, + "learning_rate": 3.856372912696837e-05, + "loss": 2.5391, + "step": 29523 + }, + { + "epoch": 0.8754855736441005, + "grad_norm": 0.07799892127513885, + "learning_rate": 3.8545611584692385e-05, + "loss": 2.5431, + "step": 29524 + }, + { + "epoch": 0.8755152269964119, + "grad_norm": 0.07059864699840546, + "learning_rate": 3.852749812867418e-05, + "loss": 2.5223, + "step": 29525 + }, + { + "epoch": 0.8755448803487235, + "grad_norm": 0.08037952333688736, + "learning_rate": 3.850938875907417e-05, + "loss": 2.5183, + "step": 29526 + }, + { + "epoch": 0.8755745337010349, + "grad_norm": 0.08040931075811386, + "learning_rate": 3.849128347605274e-05, + "loss": 2.5746, + "step": 29527 + }, + { + "epoch": 0.8756041870533464, + "grad_norm": 0.07285723090171814, + "learning_rate": 3.847318227977026e-05, + "loss": 2.536, + "step": 29528 + }, + { + "epoch": 0.8756338404056578, + "grad_norm": 0.07820746302604675, + "learning_rate": 3.845508517038682e-05, + "loss": 2.5723, + "step": 29529 + }, + { + "epoch": 0.8756634937579694, + "grad_norm": 0.0785854160785675, + "learning_rate": 3.8436992148062955e-05, + "loss": 2.5807, + "step": 29530 + }, + { + "epoch": 0.8756931471102808, + "grad_norm": 0.0719020813703537, + "learning_rate": 3.8418903212958714e-05, + "loss": 2.5567, + "step": 29531 + }, + { + "epoch": 0.8757228004625923, + "grad_norm": 0.07742864638566971, + "learning_rate": 3.840081836523446e-05, + "loss": 2.5443, + "step": 29532 + }, + { + "epoch": 0.8757524538149037, + "grad_norm": 0.07969150692224503, + "learning_rate": 3.838273760505007e-05, + "loss": 2.5411, + "step": 29533 + }, + { + "epoch": 0.8757821071672153, + "grad_norm": 0.07302290946245193, + "learning_rate": 3.8364660932565744e-05, + "loss": 2.5025, + "step": 29534 + }, + { + "epoch": 0.8758117605195267, + "grad_norm": 0.07763105630874634, + "learning_rate": 3.8346588347941636e-05, + "loss": 2.5307, + "step": 29535 + }, + { + "epoch": 0.8758414138718382, + "grad_norm": 0.07797696441411972, + "learning_rate": 3.832851985133767e-05, + "loss": 2.5051, + "step": 29536 + }, + { + "epoch": 0.8758710672241496, + "grad_norm": 0.07896657288074493, + "learning_rate": 3.831045544291395e-05, + "loss": 2.5233, + "step": 29537 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 0.07734580338001251, + "learning_rate": 3.8292395122830395e-05, + "loss": 2.5339, + "step": 29538 + }, + { + "epoch": 0.8759303739287726, + "grad_norm": 0.07455538958311081, + "learning_rate": 3.8274338891246926e-05, + "loss": 2.5441, + "step": 29539 + }, + { + "epoch": 0.8759600272810841, + "grad_norm": 0.07818391174077988, + "learning_rate": 3.825628674832343e-05, + "loss": 2.4995, + "step": 29540 + }, + { + "epoch": 0.8759896806333956, + "grad_norm": 0.07587561011314392, + "learning_rate": 3.823823869421983e-05, + "loss": 2.5218, + "step": 29541 + }, + { + "epoch": 0.8760193339857071, + "grad_norm": 0.07626011222600937, + "learning_rate": 3.822019472909583e-05, + "loss": 2.541, + "step": 29542 + }, + { + "epoch": 0.8760489873380186, + "grad_norm": 0.07672377675771713, + "learning_rate": 3.8202154853111234e-05, + "loss": 2.5347, + "step": 29543 + }, + { + "epoch": 0.87607864069033, + "grad_norm": 0.0763271376490593, + "learning_rate": 3.818411906642588e-05, + "loss": 2.5408, + "step": 29544 + }, + { + "epoch": 0.8761082940426416, + "grad_norm": 0.07643946260213852, + "learning_rate": 3.8166087369199407e-05, + "loss": 2.5477, + "step": 29545 + }, + { + "epoch": 0.876137947394953, + "grad_norm": 0.0758298933506012, + "learning_rate": 3.8148059761591524e-05, + "loss": 2.5447, + "step": 29546 + }, + { + "epoch": 0.8761676007472645, + "grad_norm": 0.07906241714954376, + "learning_rate": 3.8130036243761826e-05, + "loss": 2.5514, + "step": 29547 + }, + { + "epoch": 0.8761972540995759, + "grad_norm": 0.07756667584180832, + "learning_rate": 3.8112016815869955e-05, + "loss": 2.5616, + "step": 29548 + }, + { + "epoch": 0.8762269074518875, + "grad_norm": 0.08016461879014969, + "learning_rate": 3.8094001478075405e-05, + "loss": 2.5621, + "step": 29549 + }, + { + "epoch": 0.8762565608041989, + "grad_norm": 0.07737082242965698, + "learning_rate": 3.8075990230537816e-05, + "loss": 2.5541, + "step": 29550 + }, + { + "epoch": 0.8762862141565104, + "grad_norm": 0.0761498361825943, + "learning_rate": 3.805798307341657e-05, + "loss": 2.5553, + "step": 29551 + }, + { + "epoch": 0.8763158675088218, + "grad_norm": 0.07713522017002106, + "learning_rate": 3.80399800068712e-05, + "loss": 2.5379, + "step": 29552 + }, + { + "epoch": 0.8763455208611334, + "grad_norm": 0.07890484482049942, + "learning_rate": 3.8021981031061136e-05, + "loss": 2.5365, + "step": 29553 + }, + { + "epoch": 0.8763751742134448, + "grad_norm": 0.07883421331644058, + "learning_rate": 3.8003986146145695e-05, + "loss": 2.5342, + "step": 29554 + }, + { + "epoch": 0.8764048275657563, + "grad_norm": 0.0813206136226654, + "learning_rate": 3.7985995352284254e-05, + "loss": 2.5158, + "step": 29555 + }, + { + "epoch": 0.8764344809180677, + "grad_norm": 0.07960303127765656, + "learning_rate": 3.796800864963612e-05, + "loss": 2.5489, + "step": 29556 + }, + { + "epoch": 0.8764641342703793, + "grad_norm": 0.07446684688329697, + "learning_rate": 3.7950026038360616e-05, + "loss": 2.543, + "step": 29557 + }, + { + "epoch": 0.8764937876226907, + "grad_norm": 0.07630576938390732, + "learning_rate": 3.79320475186169e-05, + "loss": 2.5631, + "step": 29558 + }, + { + "epoch": 0.8765234409750022, + "grad_norm": 0.07335124909877777, + "learning_rate": 3.791407309056422e-05, + "loss": 2.5455, + "step": 29559 + }, + { + "epoch": 0.8765530943273137, + "grad_norm": 0.07755965739488602, + "learning_rate": 3.7896102754361725e-05, + "loss": 2.5891, + "step": 29560 + }, + { + "epoch": 0.8765827476796252, + "grad_norm": 0.07607647776603699, + "learning_rate": 3.787813651016858e-05, + "loss": 2.5537, + "step": 29561 + }, + { + "epoch": 0.8766124010319366, + "grad_norm": 0.07435626536607742, + "learning_rate": 3.7860174358143876e-05, + "loss": 2.6008, + "step": 29562 + }, + { + "epoch": 0.8766420543842481, + "grad_norm": 0.07040418684482574, + "learning_rate": 3.7842216298446643e-05, + "loss": 2.5288, + "step": 29563 + }, + { + "epoch": 0.8766717077365597, + "grad_norm": 0.07047364860773087, + "learning_rate": 3.7824262331235926e-05, + "loss": 2.5659, + "step": 29564 + }, + { + "epoch": 0.8767013610888711, + "grad_norm": 0.07878071814775467, + "learning_rate": 3.7806312456670655e-05, + "loss": 2.5504, + "step": 29565 + }, + { + "epoch": 0.8767310144411826, + "grad_norm": 0.08221281319856644, + "learning_rate": 3.778836667490992e-05, + "loss": 2.5316, + "step": 29566 + }, + { + "epoch": 0.876760667793494, + "grad_norm": 0.07262379676103592, + "learning_rate": 3.77704249861125e-05, + "loss": 2.5576, + "step": 29567 + }, + { + "epoch": 0.8767903211458056, + "grad_norm": 0.08403530716896057, + "learning_rate": 3.775248739043719e-05, + "loss": 2.5477, + "step": 29568 + }, + { + "epoch": 0.876819974498117, + "grad_norm": 0.08113199472427368, + "learning_rate": 3.773455388804303e-05, + "loss": 2.5445, + "step": 29569 + }, + { + "epoch": 0.8768496278504285, + "grad_norm": 0.07851037383079529, + "learning_rate": 3.771662447908874e-05, + "loss": 2.5324, + "step": 29570 + }, + { + "epoch": 0.87687928120274, + "grad_norm": 0.08215490728616714, + "learning_rate": 3.7698699163733084e-05, + "loss": 2.5317, + "step": 29571 + }, + { + "epoch": 0.8769089345550515, + "grad_norm": 0.07957226783037186, + "learning_rate": 3.7680777942134815e-05, + "loss": 2.5544, + "step": 29572 + }, + { + "epoch": 0.8769385879073629, + "grad_norm": 0.07639148086309433, + "learning_rate": 3.766286081445258e-05, + "loss": 2.5359, + "step": 29573 + }, + { + "epoch": 0.8769682412596744, + "grad_norm": 0.07614278048276901, + "learning_rate": 3.764494778084509e-05, + "loss": 2.5268, + "step": 29574 + }, + { + "epoch": 0.8769978946119859, + "grad_norm": 0.07717511802911758, + "learning_rate": 3.762703884147095e-05, + "loss": 2.5258, + "step": 29575 + }, + { + "epoch": 0.8770275479642974, + "grad_norm": 0.07280953228473663, + "learning_rate": 3.760913399648885e-05, + "loss": 2.539, + "step": 29576 + }, + { + "epoch": 0.8770572013166088, + "grad_norm": 0.08084387332201004, + "learning_rate": 3.759123324605712e-05, + "loss": 2.5589, + "step": 29577 + }, + { + "epoch": 0.8770868546689203, + "grad_norm": 0.07469293475151062, + "learning_rate": 3.757333659033441e-05, + "loss": 2.5344, + "step": 29578 + }, + { + "epoch": 0.8771165080212318, + "grad_norm": 0.07362179458141327, + "learning_rate": 3.755544402947919e-05, + "loss": 2.5227, + "step": 29579 + }, + { + "epoch": 0.8771461613735433, + "grad_norm": 0.072316013276577, + "learning_rate": 3.7537555563649796e-05, + "loss": 2.5162, + "step": 29580 + }, + { + "epoch": 0.8771758147258547, + "grad_norm": 0.07233822345733643, + "learning_rate": 3.751967119300481e-05, + "loss": 2.5071, + "step": 29581 + }, + { + "epoch": 0.8772054680781662, + "grad_norm": 0.07432237267494202, + "learning_rate": 3.7501790917702505e-05, + "loss": 2.5166, + "step": 29582 + }, + { + "epoch": 0.8772351214304777, + "grad_norm": 0.07531624287366867, + "learning_rate": 3.748391473790125e-05, + "loss": 2.5482, + "step": 29583 + }, + { + "epoch": 0.8772647747827892, + "grad_norm": 0.07680726051330566, + "learning_rate": 3.746604265375936e-05, + "loss": 2.5813, + "step": 29584 + }, + { + "epoch": 0.8772944281351007, + "grad_norm": 0.0806109830737114, + "learning_rate": 3.7448174665434986e-05, + "loss": 2.5282, + "step": 29585 + }, + { + "epoch": 0.8773240814874121, + "grad_norm": 0.07996901869773865, + "learning_rate": 3.743031077308645e-05, + "loss": 2.5054, + "step": 29586 + }, + { + "epoch": 0.8773537348397237, + "grad_norm": 0.07402198761701584, + "learning_rate": 3.7412450976871956e-05, + "loss": 2.5066, + "step": 29587 + }, + { + "epoch": 0.8773833881920351, + "grad_norm": 0.07761408388614655, + "learning_rate": 3.739459527694961e-05, + "loss": 2.5618, + "step": 29588 + }, + { + "epoch": 0.8774130415443466, + "grad_norm": 0.08158047497272491, + "learning_rate": 3.737674367347743e-05, + "loss": 2.5322, + "step": 29589 + }, + { + "epoch": 0.877442694896658, + "grad_norm": 0.0713350772857666, + "learning_rate": 3.7358896166613645e-05, + "loss": 2.5513, + "step": 29590 + }, + { + "epoch": 0.8774723482489696, + "grad_norm": 0.07404892891645432, + "learning_rate": 3.734105275651628e-05, + "loss": 2.5384, + "step": 29591 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 0.0811929702758789, + "learning_rate": 3.7323213443343276e-05, + "loss": 2.5739, + "step": 29592 + }, + { + "epoch": 0.8775316549535925, + "grad_norm": 0.07817959785461426, + "learning_rate": 3.730537822725249e-05, + "loss": 2.5446, + "step": 29593 + }, + { + "epoch": 0.877561308305904, + "grad_norm": 0.07541222125291824, + "learning_rate": 3.728754710840215e-05, + "loss": 2.5358, + "step": 29594 + }, + { + "epoch": 0.8775909616582155, + "grad_norm": 0.07366330921649933, + "learning_rate": 3.726972008695001e-05, + "loss": 2.5331, + "step": 29595 + }, + { + "epoch": 0.8776206150105269, + "grad_norm": 0.07899738103151321, + "learning_rate": 3.725189716305388e-05, + "loss": 2.5662, + "step": 29596 + }, + { + "epoch": 0.8776502683628384, + "grad_norm": 0.07712262868881226, + "learning_rate": 3.7234078336871755e-05, + "loss": 2.4983, + "step": 29597 + }, + { + "epoch": 0.8776799217151499, + "grad_norm": 0.08688535541296005, + "learning_rate": 3.7216263608561226e-05, + "loss": 2.5211, + "step": 29598 + }, + { + "epoch": 0.8777095750674614, + "grad_norm": 0.07596156746149063, + "learning_rate": 3.719845297828006e-05, + "loss": 2.5067, + "step": 29599 + }, + { + "epoch": 0.8777392284197728, + "grad_norm": 0.07956348359584808, + "learning_rate": 3.718064644618607e-05, + "loss": 2.5505, + "step": 29600 + }, + { + "epoch": 0.8777688817720843, + "grad_norm": 0.07879247516393661, + "learning_rate": 3.716284401243691e-05, + "loss": 2.5044, + "step": 29601 + }, + { + "epoch": 0.8777985351243958, + "grad_norm": 0.08379136770963669, + "learning_rate": 3.7145045677190184e-05, + "loss": 2.5447, + "step": 29602 + }, + { + "epoch": 0.8778281884767073, + "grad_norm": 0.07668830454349518, + "learning_rate": 3.7127251440603526e-05, + "loss": 2.5795, + "step": 29603 + }, + { + "epoch": 0.8778578418290188, + "grad_norm": 0.07771699875593185, + "learning_rate": 3.710946130283455e-05, + "loss": 2.5068, + "step": 29604 + }, + { + "epoch": 0.8778874951813302, + "grad_norm": 0.07463160902261734, + "learning_rate": 3.709167526404072e-05, + "loss": 2.5373, + "step": 29605 + }, + { + "epoch": 0.8779171485336418, + "grad_norm": 0.07923202961683273, + "learning_rate": 3.707389332437949e-05, + "loss": 2.5293, + "step": 29606 + }, + { + "epoch": 0.8779468018859532, + "grad_norm": 0.0745554193854332, + "learning_rate": 3.705611548400844e-05, + "loss": 2.5747, + "step": 29607 + }, + { + "epoch": 0.8779764552382647, + "grad_norm": 0.07368651032447815, + "learning_rate": 3.703834174308507e-05, + "loss": 2.5731, + "step": 29608 + }, + { + "epoch": 0.8780061085905762, + "grad_norm": 0.07659128308296204, + "learning_rate": 3.702057210176657e-05, + "loss": 2.5558, + "step": 29609 + }, + { + "epoch": 0.8780357619428877, + "grad_norm": 0.07152656465768814, + "learning_rate": 3.700280656021032e-05, + "loss": 2.5295, + "step": 29610 + }, + { + "epoch": 0.8780654152951991, + "grad_norm": 0.07724978774785995, + "learning_rate": 3.698504511857376e-05, + "loss": 2.5468, + "step": 29611 + }, + { + "epoch": 0.8780950686475106, + "grad_norm": 0.07277455180883408, + "learning_rate": 3.696728777701408e-05, + "loss": 2.5618, + "step": 29612 + }, + { + "epoch": 0.878124721999822, + "grad_norm": 0.07117500901222229, + "learning_rate": 3.69495345356885e-05, + "loss": 2.5161, + "step": 29613 + }, + { + "epoch": 0.8781543753521336, + "grad_norm": 0.0708998292684555, + "learning_rate": 3.6931785394754335e-05, + "loss": 2.5382, + "step": 29614 + }, + { + "epoch": 0.878184028704445, + "grad_norm": 0.07550730556249619, + "learning_rate": 3.691404035436863e-05, + "loss": 2.5333, + "step": 29615 + }, + { + "epoch": 0.8782136820567565, + "grad_norm": 0.07336056232452393, + "learning_rate": 3.689629941468864e-05, + "loss": 2.5592, + "step": 29616 + }, + { + "epoch": 0.878243335409068, + "grad_norm": 0.07270687818527222, + "learning_rate": 3.687856257587141e-05, + "loss": 2.5463, + "step": 29617 + }, + { + "epoch": 0.8782729887613795, + "grad_norm": 0.07981362193822861, + "learning_rate": 3.6860829838073985e-05, + "loss": 2.5467, + "step": 29618 + }, + { + "epoch": 0.8783026421136909, + "grad_norm": 0.0789145827293396, + "learning_rate": 3.68431012014534e-05, + "loss": 2.5543, + "step": 29619 + }, + { + "epoch": 0.8783322954660024, + "grad_norm": 0.0752682164311409, + "learning_rate": 3.682537666616664e-05, + "loss": 2.5872, + "step": 29620 + }, + { + "epoch": 0.8783619488183139, + "grad_norm": 0.07623613625764847, + "learning_rate": 3.68076562323707e-05, + "loss": 2.5206, + "step": 29621 + }, + { + "epoch": 0.8783916021706254, + "grad_norm": 0.07885224372148514, + "learning_rate": 3.67899399002225e-05, + "loss": 2.5533, + "step": 29622 + }, + { + "epoch": 0.8784212555229368, + "grad_norm": 0.07775728404521942, + "learning_rate": 3.6772227669878867e-05, + "loss": 2.541, + "step": 29623 + }, + { + "epoch": 0.8784509088752483, + "grad_norm": 0.07046408206224442, + "learning_rate": 3.675451954149661e-05, + "loss": 2.5613, + "step": 29624 + }, + { + "epoch": 0.8784805622275599, + "grad_norm": 0.07869119197130203, + "learning_rate": 3.673681551523267e-05, + "loss": 2.5105, + "step": 29625 + }, + { + "epoch": 0.8785102155798713, + "grad_norm": 0.08070387691259384, + "learning_rate": 3.671911559124375e-05, + "loss": 2.5529, + "step": 29626 + }, + { + "epoch": 0.8785398689321828, + "grad_norm": 0.07596386969089508, + "learning_rate": 3.670141976968655e-05, + "loss": 2.5493, + "step": 29627 + }, + { + "epoch": 0.8785695222844943, + "grad_norm": 0.07694417983293533, + "learning_rate": 3.668372805071779e-05, + "loss": 2.5235, + "step": 29628 + }, + { + "epoch": 0.8785991756368058, + "grad_norm": 0.07604589313268661, + "learning_rate": 3.666604043449418e-05, + "loss": 2.5254, + "step": 29629 + }, + { + "epoch": 0.8786288289891172, + "grad_norm": 0.07806001603603363, + "learning_rate": 3.66483569211723e-05, + "loss": 2.5297, + "step": 29630 + }, + { + "epoch": 0.8786584823414287, + "grad_norm": 0.07787342369556427, + "learning_rate": 3.663067751090882e-05, + "loss": 2.5646, + "step": 29631 + }, + { + "epoch": 0.8786881356937402, + "grad_norm": 0.07273875176906586, + "learning_rate": 3.661300220386004e-05, + "loss": 2.542, + "step": 29632 + }, + { + "epoch": 0.8787177890460517, + "grad_norm": 0.07297825813293457, + "learning_rate": 3.6595331000182807e-05, + "loss": 2.5137, + "step": 29633 + }, + { + "epoch": 0.8787474423983631, + "grad_norm": 0.07525058090686798, + "learning_rate": 3.657766390003342e-05, + "loss": 2.5423, + "step": 29634 + }, + { + "epoch": 0.8787770957506746, + "grad_norm": 0.07622143626213074, + "learning_rate": 3.656000090356837e-05, + "loss": 2.501, + "step": 29635 + }, + { + "epoch": 0.8788067491029861, + "grad_norm": 0.07443669438362122, + "learning_rate": 3.6542342010944085e-05, + "loss": 2.5279, + "step": 29636 + }, + { + "epoch": 0.8788364024552976, + "grad_norm": 0.07710952311754227, + "learning_rate": 3.6524687222316886e-05, + "loss": 2.6002, + "step": 29637 + }, + { + "epoch": 0.878866055807609, + "grad_norm": 0.07705667614936829, + "learning_rate": 3.650703653784315e-05, + "loss": 2.5132, + "step": 29638 + }, + { + "epoch": 0.8788957091599205, + "grad_norm": 0.07721523195505142, + "learning_rate": 3.64893899576792e-05, + "loss": 2.5443, + "step": 29639 + }, + { + "epoch": 0.878925362512232, + "grad_norm": 0.0730719119310379, + "learning_rate": 3.6471747481981175e-05, + "loss": 2.5487, + "step": 29640 + }, + { + "epoch": 0.8789550158645435, + "grad_norm": 0.07410816103219986, + "learning_rate": 3.6454109110905465e-05, + "loss": 2.5284, + "step": 29641 + }, + { + "epoch": 0.8789846692168549, + "grad_norm": 0.07539256662130356, + "learning_rate": 3.6436474844608215e-05, + "loss": 2.5435, + "step": 29642 + }, + { + "epoch": 0.8790143225691665, + "grad_norm": 0.07265067845582962, + "learning_rate": 3.6418844683245525e-05, + "loss": 2.5449, + "step": 29643 + }, + { + "epoch": 0.8790439759214779, + "grad_norm": 0.0747944563627243, + "learning_rate": 3.6401218626973485e-05, + "loss": 2.5339, + "step": 29644 + }, + { + "epoch": 0.8790736292737894, + "grad_norm": 0.08408086746931076, + "learning_rate": 3.638359667594815e-05, + "loss": 2.5031, + "step": 29645 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 0.07344073057174683, + "learning_rate": 3.6365978830325775e-05, + "loss": 2.5538, + "step": 29646 + }, + { + "epoch": 0.8791329359784124, + "grad_norm": 0.07775762677192688, + "learning_rate": 3.634836509026218e-05, + "loss": 2.5444, + "step": 29647 + }, + { + "epoch": 0.8791625893307239, + "grad_norm": 0.08360570669174194, + "learning_rate": 3.633075545591347e-05, + "loss": 2.5775, + "step": 29648 + }, + { + "epoch": 0.8791922426830353, + "grad_norm": 0.07969300448894501, + "learning_rate": 3.631314992743545e-05, + "loss": 2.5543, + "step": 29649 + }, + { + "epoch": 0.8792218960353468, + "grad_norm": 0.07350356131792068, + "learning_rate": 3.6295548504984065e-05, + "loss": 2.5745, + "step": 29650 + }, + { + "epoch": 0.8792515493876583, + "grad_norm": 0.08567333221435547, + "learning_rate": 3.627795118871524e-05, + "loss": 2.5679, + "step": 29651 + }, + { + "epoch": 0.8792812027399698, + "grad_norm": 0.08132927119731903, + "learning_rate": 3.6260357978784794e-05, + "loss": 2.5255, + "step": 29652 + }, + { + "epoch": 0.8793108560922812, + "grad_norm": 0.07484602183103561, + "learning_rate": 3.624276887534844e-05, + "loss": 2.574, + "step": 29653 + }, + { + "epoch": 0.8793405094445927, + "grad_norm": 0.08109229803085327, + "learning_rate": 3.622518387856194e-05, + "loss": 2.5475, + "step": 29654 + }, + { + "epoch": 0.8793701627969042, + "grad_norm": 0.08245891332626343, + "learning_rate": 3.620760298858106e-05, + "loss": 2.5672, + "step": 29655 + }, + { + "epoch": 0.8793998161492157, + "grad_norm": 0.08065102994441986, + "learning_rate": 3.619002620556144e-05, + "loss": 2.5396, + "step": 29656 + }, + { + "epoch": 0.8794294695015271, + "grad_norm": 0.07301072776317596, + "learning_rate": 3.617245352965875e-05, + "loss": 2.5475, + "step": 29657 + }, + { + "epoch": 0.8794591228538386, + "grad_norm": 0.07813473045825958, + "learning_rate": 3.615488496102859e-05, + "loss": 2.516, + "step": 29658 + }, + { + "epoch": 0.8794887762061501, + "grad_norm": 0.07521601021289825, + "learning_rate": 3.6137320499826544e-05, + "loss": 2.5689, + "step": 29659 + }, + { + "epoch": 0.8795184295584616, + "grad_norm": 0.077585369348526, + "learning_rate": 3.6119760146208156e-05, + "loss": 2.5672, + "step": 29660 + }, + { + "epoch": 0.879548082910773, + "grad_norm": 0.08129578828811646, + "learning_rate": 3.610220390032892e-05, + "loss": 2.5197, + "step": 29661 + }, + { + "epoch": 0.8795777362630846, + "grad_norm": 0.07611070573329926, + "learning_rate": 3.608465176234432e-05, + "loss": 2.5264, + "step": 29662 + }, + { + "epoch": 0.879607389615396, + "grad_norm": 0.07196205109357834, + "learning_rate": 3.606710373240985e-05, + "loss": 2.5547, + "step": 29663 + }, + { + "epoch": 0.8796370429677075, + "grad_norm": 0.08168245106935501, + "learning_rate": 3.60495598106807e-05, + "loss": 2.5547, + "step": 29664 + }, + { + "epoch": 0.8796666963200189, + "grad_norm": 0.07937517762184143, + "learning_rate": 3.6032019997312315e-05, + "loss": 2.5622, + "step": 29665 + }, + { + "epoch": 0.8796963496723305, + "grad_norm": 0.07696061581373215, + "learning_rate": 3.601448429246007e-05, + "loss": 2.523, + "step": 29666 + }, + { + "epoch": 0.879726003024642, + "grad_norm": 0.07627352327108383, + "learning_rate": 3.599695269627917e-05, + "loss": 2.5353, + "step": 29667 + }, + { + "epoch": 0.8797556563769534, + "grad_norm": 0.07247795909643173, + "learning_rate": 3.5979425208924944e-05, + "loss": 2.5597, + "step": 29668 + }, + { + "epoch": 0.8797853097292649, + "grad_norm": 0.0794181302189827, + "learning_rate": 3.596190183055248e-05, + "loss": 2.569, + "step": 29669 + }, + { + "epoch": 0.8798149630815764, + "grad_norm": 0.07890918850898743, + "learning_rate": 3.5944382561317104e-05, + "loss": 2.5424, + "step": 29670 + }, + { + "epoch": 0.8798446164338879, + "grad_norm": 0.07925400882959366, + "learning_rate": 3.5926867401373744e-05, + "loss": 2.5674, + "step": 29671 + }, + { + "epoch": 0.8798742697861993, + "grad_norm": 0.08091644942760468, + "learning_rate": 3.590935635087777e-05, + "loss": 2.5656, + "step": 29672 + }, + { + "epoch": 0.8799039231385108, + "grad_norm": 0.08020677417516708, + "learning_rate": 3.5891849409984135e-05, + "loss": 2.5215, + "step": 29673 + }, + { + "epoch": 0.8799335764908223, + "grad_norm": 0.07779940962791443, + "learning_rate": 3.5874346578847804e-05, + "loss": 2.5636, + "step": 29674 + }, + { + "epoch": 0.8799632298431338, + "grad_norm": 0.07459893077611923, + "learning_rate": 3.585684785762372e-05, + "loss": 2.5558, + "step": 29675 + }, + { + "epoch": 0.8799928831954452, + "grad_norm": 0.0790339931845665, + "learning_rate": 3.5839353246466976e-05, + "loss": 2.5488, + "step": 29676 + }, + { + "epoch": 0.8800225365477568, + "grad_norm": 0.08547569811344147, + "learning_rate": 3.582186274553245e-05, + "loss": 2.5714, + "step": 29677 + }, + { + "epoch": 0.8800521899000682, + "grad_norm": 0.07858540117740631, + "learning_rate": 3.580437635497497e-05, + "loss": 2.5429, + "step": 29678 + }, + { + "epoch": 0.8800818432523797, + "grad_norm": 0.08206427097320557, + "learning_rate": 3.57868940749494e-05, + "loss": 2.5512, + "step": 29679 + }, + { + "epoch": 0.8801114966046911, + "grad_norm": 0.0796147957444191, + "learning_rate": 3.576941590561061e-05, + "loss": 2.5254, + "step": 29680 + }, + { + "epoch": 0.8801411499570027, + "grad_norm": 0.07847560197114944, + "learning_rate": 3.575194184711328e-05, + "loss": 2.5095, + "step": 29681 + }, + { + "epoch": 0.8801708033093141, + "grad_norm": 0.07866457104682922, + "learning_rate": 3.573447189961221e-05, + "loss": 2.5672, + "step": 29682 + }, + { + "epoch": 0.8802004566616256, + "grad_norm": 0.07787848263978958, + "learning_rate": 3.571700606326211e-05, + "loss": 2.5444, + "step": 29683 + }, + { + "epoch": 0.880230110013937, + "grad_norm": 0.07420419156551361, + "learning_rate": 3.569954433821759e-05, + "loss": 2.5376, + "step": 29684 + }, + { + "epoch": 0.8802597633662486, + "grad_norm": 0.07456807047128677, + "learning_rate": 3.56820867246333e-05, + "loss": 2.519, + "step": 29685 + }, + { + "epoch": 0.88028941671856, + "grad_norm": 0.0821194127202034, + "learning_rate": 3.5664633222663834e-05, + "loss": 2.5758, + "step": 29686 + }, + { + "epoch": 0.8803190700708715, + "grad_norm": 0.07868105173110962, + "learning_rate": 3.5647183832463737e-05, + "loss": 2.5356, + "step": 29687 + }, + { + "epoch": 0.880348723423183, + "grad_norm": 0.07642043381929398, + "learning_rate": 3.5629738554187494e-05, + "loss": 2.5496, + "step": 29688 + }, + { + "epoch": 0.8803783767754945, + "grad_norm": 0.07630961388349533, + "learning_rate": 3.561229738798971e-05, + "loss": 2.5386, + "step": 29689 + }, + { + "epoch": 0.880408030127806, + "grad_norm": 0.08089973032474518, + "learning_rate": 3.55948603340247e-05, + "loss": 2.5324, + "step": 29690 + }, + { + "epoch": 0.8804376834801174, + "grad_norm": 0.07310612499713898, + "learning_rate": 3.55774273924469e-05, + "loss": 2.5111, + "step": 29691 + }, + { + "epoch": 0.880467336832429, + "grad_norm": 0.08057762682437897, + "learning_rate": 3.5559998563410686e-05, + "loss": 2.543, + "step": 29692 + }, + { + "epoch": 0.8804969901847404, + "grad_norm": 0.07570397108793259, + "learning_rate": 3.5542573847070434e-05, + "loss": 2.5448, + "step": 29693 + }, + { + "epoch": 0.8805266435370519, + "grad_norm": 0.07719908654689789, + "learning_rate": 3.5525153243580466e-05, + "loss": 2.5364, + "step": 29694 + }, + { + "epoch": 0.8805562968893633, + "grad_norm": 0.07432456314563751, + "learning_rate": 3.550773675309493e-05, + "loss": 2.5508, + "step": 29695 + }, + { + "epoch": 0.8805859502416749, + "grad_norm": 0.07400438189506531, + "learning_rate": 3.54903243757681e-05, + "loss": 2.5876, + "step": 29696 + }, + { + "epoch": 0.8806156035939863, + "grad_norm": 0.07391750067472458, + "learning_rate": 3.547291611175418e-05, + "loss": 2.5435, + "step": 29697 + }, + { + "epoch": 0.8806452569462978, + "grad_norm": 0.07948819547891617, + "learning_rate": 3.545551196120739e-05, + "loss": 2.5583, + "step": 29698 + }, + { + "epoch": 0.8806749102986092, + "grad_norm": 0.08205408602952957, + "learning_rate": 3.54381119242817e-05, + "loss": 2.5633, + "step": 29699 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 0.07478058338165283, + "learning_rate": 3.542071600113134e-05, + "loss": 2.4963, + "step": 29700 + }, + { + "epoch": 0.8807342170032322, + "grad_norm": 0.08135423809289932, + "learning_rate": 3.540332419191022e-05, + "loss": 2.5254, + "step": 29701 + }, + { + "epoch": 0.8807638703555437, + "grad_norm": 0.08059336990118027, + "learning_rate": 3.5385936496772465e-05, + "loss": 2.5289, + "step": 29702 + }, + { + "epoch": 0.8807935237078551, + "grad_norm": 0.07922912389039993, + "learning_rate": 3.536855291587193e-05, + "loss": 2.5395, + "step": 29703 + }, + { + "epoch": 0.8808231770601667, + "grad_norm": 0.07806725054979324, + "learning_rate": 3.5351173449362675e-05, + "loss": 2.5476, + "step": 29704 + }, + { + "epoch": 0.8808528304124781, + "grad_norm": 0.07737638801336288, + "learning_rate": 3.533379809739851e-05, + "loss": 2.5156, + "step": 29705 + }, + { + "epoch": 0.8808824837647896, + "grad_norm": 0.07372966408729553, + "learning_rate": 3.531642686013331e-05, + "loss": 2.5865, + "step": 29706 + }, + { + "epoch": 0.880912137117101, + "grad_norm": 0.07539153844118118, + "learning_rate": 3.529905973772091e-05, + "loss": 2.5792, + "step": 29707 + }, + { + "epoch": 0.8809417904694126, + "grad_norm": 0.07951575517654419, + "learning_rate": 3.528169673031523e-05, + "loss": 2.5382, + "step": 29708 + }, + { + "epoch": 0.8809714438217241, + "grad_norm": 0.075688935816288, + "learning_rate": 3.526433783806976e-05, + "loss": 2.5222, + "step": 29709 + }, + { + "epoch": 0.8810010971740355, + "grad_norm": 0.07414523512125015, + "learning_rate": 3.524698306113827e-05, + "loss": 2.5717, + "step": 29710 + }, + { + "epoch": 0.881030750526347, + "grad_norm": 0.07988277077674866, + "learning_rate": 3.522963239967464e-05, + "loss": 2.5772, + "step": 29711 + }, + { + "epoch": 0.8810604038786585, + "grad_norm": 0.07596699148416519, + "learning_rate": 3.5212285853832346e-05, + "loss": 2.5738, + "step": 29712 + }, + { + "epoch": 0.88109005723097, + "grad_norm": 0.07791131734848022, + "learning_rate": 3.5194943423765056e-05, + "loss": 2.5429, + "step": 29713 + }, + { + "epoch": 0.8811197105832814, + "grad_norm": 0.0807197093963623, + "learning_rate": 3.517760510962631e-05, + "loss": 2.549, + "step": 29714 + }, + { + "epoch": 0.881149363935593, + "grad_norm": 0.07731819897890091, + "learning_rate": 3.51602709115697e-05, + "loss": 2.5348, + "step": 29715 + }, + { + "epoch": 0.8811790172879044, + "grad_norm": 0.07395854592323303, + "learning_rate": 3.514294082974867e-05, + "loss": 2.5616, + "step": 29716 + }, + { + "epoch": 0.8812086706402159, + "grad_norm": 0.0763600766658783, + "learning_rate": 3.512561486431665e-05, + "loss": 2.5368, + "step": 29717 + }, + { + "epoch": 0.8812383239925273, + "grad_norm": 0.0763021931052208, + "learning_rate": 3.5108293015427226e-05, + "loss": 2.5437, + "step": 29718 + }, + { + "epoch": 0.8812679773448389, + "grad_norm": 0.07317313551902771, + "learning_rate": 3.509097528323357e-05, + "loss": 2.5521, + "step": 29719 + }, + { + "epoch": 0.8812976306971503, + "grad_norm": 0.07236698269844055, + "learning_rate": 3.5073661667889114e-05, + "loss": 2.5374, + "step": 29720 + }, + { + "epoch": 0.8813272840494618, + "grad_norm": 0.07531147450208664, + "learning_rate": 3.5056352169547225e-05, + "loss": 2.5241, + "step": 29721 + }, + { + "epoch": 0.8813569374017732, + "grad_norm": 0.07137561589479446, + "learning_rate": 3.5039046788361117e-05, + "loss": 2.578, + "step": 29722 + }, + { + "epoch": 0.8813865907540848, + "grad_norm": 0.07378615438938141, + "learning_rate": 3.502174552448401e-05, + "loss": 2.5588, + "step": 29723 + }, + { + "epoch": 0.8814162441063962, + "grad_norm": 0.0742737203836441, + "learning_rate": 3.500444837806921e-05, + "loss": 2.555, + "step": 29724 + }, + { + "epoch": 0.8814458974587077, + "grad_norm": 0.07296174019575119, + "learning_rate": 3.498715534926983e-05, + "loss": 2.5797, + "step": 29725 + }, + { + "epoch": 0.8814755508110191, + "grad_norm": 0.07262773811817169, + "learning_rate": 3.4969866438239017e-05, + "loss": 2.5917, + "step": 29726 + }, + { + "epoch": 0.8815052041633307, + "grad_norm": 0.07729731500148773, + "learning_rate": 3.495258164512982e-05, + "loss": 2.5534, + "step": 29727 + }, + { + "epoch": 0.8815348575156421, + "grad_norm": 0.07450661808252335, + "learning_rate": 3.4935300970095455e-05, + "loss": 2.5207, + "step": 29728 + }, + { + "epoch": 0.8815645108679536, + "grad_norm": 0.07340996712446213, + "learning_rate": 3.491802441328879e-05, + "loss": 2.553, + "step": 29729 + }, + { + "epoch": 0.8815941642202652, + "grad_norm": 0.07114844769239426, + "learning_rate": 3.490075197486276e-05, + "loss": 2.5491, + "step": 29730 + }, + { + "epoch": 0.8816238175725766, + "grad_norm": 0.07227375358343124, + "learning_rate": 3.488348365497046e-05, + "loss": 2.5166, + "step": 29731 + }, + { + "epoch": 0.8816534709248881, + "grad_norm": 0.07853101193904877, + "learning_rate": 3.4866219453764725e-05, + "loss": 2.5289, + "step": 29732 + }, + { + "epoch": 0.8816831242771995, + "grad_norm": 0.0744045153260231, + "learning_rate": 3.484895937139848e-05, + "loss": 2.533, + "step": 29733 + }, + { + "epoch": 0.8817127776295111, + "grad_norm": 0.07525446265935898, + "learning_rate": 3.483170340802455e-05, + "loss": 2.5652, + "step": 29734 + }, + { + "epoch": 0.8817424309818225, + "grad_norm": 0.08340106159448624, + "learning_rate": 3.4814451563795703e-05, + "loss": 2.5296, + "step": 29735 + }, + { + "epoch": 0.881772084334134, + "grad_norm": 0.07619617134332657, + "learning_rate": 3.4797203838864644e-05, + "loss": 2.5264, + "step": 29736 + }, + { + "epoch": 0.8818017376864454, + "grad_norm": 0.06962093710899353, + "learning_rate": 3.477996023338431e-05, + "loss": 2.4953, + "step": 29737 + }, + { + "epoch": 0.881831391038757, + "grad_norm": 0.07872670143842697, + "learning_rate": 3.4762720747507247e-05, + "loss": 2.5498, + "step": 29738 + }, + { + "epoch": 0.8818610443910684, + "grad_norm": 0.08035137504339218, + "learning_rate": 3.4745485381386275e-05, + "loss": 2.5358, + "step": 29739 + }, + { + "epoch": 0.8818906977433799, + "grad_norm": 0.07296586036682129, + "learning_rate": 3.472825413517378e-05, + "loss": 2.5266, + "step": 29740 + }, + { + "epoch": 0.8819203510956913, + "grad_norm": 0.07579806447029114, + "learning_rate": 3.4711027009022453e-05, + "loss": 2.4973, + "step": 29741 + }, + { + "epoch": 0.8819500044480029, + "grad_norm": 0.07321228086948395, + "learning_rate": 3.469380400308486e-05, + "loss": 2.5462, + "step": 29742 + }, + { + "epoch": 0.8819796578003143, + "grad_norm": 0.07896863669157028, + "learning_rate": 3.467658511751348e-05, + "loss": 2.5284, + "step": 29743 + }, + { + "epoch": 0.8820093111526258, + "grad_norm": 0.08109797537326813, + "learning_rate": 3.465937035246086e-05, + "loss": 2.5729, + "step": 29744 + }, + { + "epoch": 0.8820389645049372, + "grad_norm": 0.07909097522497177, + "learning_rate": 3.464215970807938e-05, + "loss": 2.5256, + "step": 29745 + }, + { + "epoch": 0.8820686178572488, + "grad_norm": 0.08066685497760773, + "learning_rate": 3.462495318452141e-05, + "loss": 2.5453, + "step": 29746 + }, + { + "epoch": 0.8820982712095602, + "grad_norm": 0.07736899703741074, + "learning_rate": 3.4607750781939394e-05, + "loss": 2.5331, + "step": 29747 + }, + { + "epoch": 0.8821279245618717, + "grad_norm": 0.08186017721891403, + "learning_rate": 3.4590552500485594e-05, + "loss": 2.5553, + "step": 29748 + }, + { + "epoch": 0.8821575779141831, + "grad_norm": 0.08112046867609024, + "learning_rate": 3.457335834031239e-05, + "loss": 2.5421, + "step": 29749 + }, + { + "epoch": 0.8821872312664947, + "grad_norm": 0.07641086727380753, + "learning_rate": 3.455616830157193e-05, + "loss": 2.5364, + "step": 29750 + }, + { + "epoch": 0.8822168846188062, + "grad_norm": 0.075388103723526, + "learning_rate": 3.453898238441655e-05, + "loss": 2.569, + "step": 29751 + }, + { + "epoch": 0.8822465379711176, + "grad_norm": 0.07917451858520508, + "learning_rate": 3.4521800588998345e-05, + "loss": 2.5391, + "step": 29752 + }, + { + "epoch": 0.8822761913234292, + "grad_norm": 0.07906918972730637, + "learning_rate": 3.4504622915469464e-05, + "loss": 2.5366, + "step": 29753 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 0.07106311619281769, + "learning_rate": 3.448744936398207e-05, + "loss": 2.5768, + "step": 29754 + }, + { + "epoch": 0.8823354980280521, + "grad_norm": 0.07951323688030243, + "learning_rate": 3.4470279934688264e-05, + "loss": 2.5581, + "step": 29755 + }, + { + "epoch": 0.8823651513803635, + "grad_norm": 0.07711956650018692, + "learning_rate": 3.445311462773998e-05, + "loss": 2.5432, + "step": 29756 + }, + { + "epoch": 0.8823948047326751, + "grad_norm": 0.07140366733074188, + "learning_rate": 3.443595344328931e-05, + "loss": 2.5289, + "step": 29757 + }, + { + "epoch": 0.8824244580849865, + "grad_norm": 0.07596854120492935, + "learning_rate": 3.441879638148815e-05, + "loss": 2.5406, + "step": 29758 + }, + { + "epoch": 0.882454111437298, + "grad_norm": 0.08064201474189758, + "learning_rate": 3.440164344248847e-05, + "loss": 2.5645, + "step": 29759 + }, + { + "epoch": 0.8824837647896094, + "grad_norm": 0.07392662018537521, + "learning_rate": 3.438449462644222e-05, + "loss": 2.5675, + "step": 29760 + }, + { + "epoch": 0.882513418141921, + "grad_norm": 0.07419824600219727, + "learning_rate": 3.43673499335011e-05, + "loss": 2.5594, + "step": 29761 + }, + { + "epoch": 0.8825430714942324, + "grad_norm": 0.07344198226928711, + "learning_rate": 3.435020936381711e-05, + "loss": 2.5198, + "step": 29762 + }, + { + "epoch": 0.8825727248465439, + "grad_norm": 0.07356822490692139, + "learning_rate": 3.4333072917541896e-05, + "loss": 2.5691, + "step": 29763 + }, + { + "epoch": 0.8826023781988553, + "grad_norm": 0.07266058772802353, + "learning_rate": 3.431594059482723e-05, + "loss": 2.5364, + "step": 29764 + }, + { + "epoch": 0.8826320315511669, + "grad_norm": 0.07470553368330002, + "learning_rate": 3.429881239582488e-05, + "loss": 2.5359, + "step": 29765 + }, + { + "epoch": 0.8826616849034783, + "grad_norm": 0.07196328043937683, + "learning_rate": 3.428168832068646e-05, + "loss": 2.5186, + "step": 29766 + }, + { + "epoch": 0.8826913382557898, + "grad_norm": 0.0752984955906868, + "learning_rate": 3.4264568369563656e-05, + "loss": 2.5577, + "step": 29767 + }, + { + "epoch": 0.8827209916081012, + "grad_norm": 0.07531268149614334, + "learning_rate": 3.424745254260803e-05, + "loss": 2.545, + "step": 29768 + }, + { + "epoch": 0.8827506449604128, + "grad_norm": 0.07315729558467865, + "learning_rate": 3.423034083997112e-05, + "loss": 2.533, + "step": 29769 + }, + { + "epoch": 0.8827802983127242, + "grad_norm": 0.06984668970108032, + "learning_rate": 3.421323326180453e-05, + "loss": 2.5477, + "step": 29770 + }, + { + "epoch": 0.8828099516650357, + "grad_norm": 0.0780259221792221, + "learning_rate": 3.4196129808259705e-05, + "loss": 2.5504, + "step": 29771 + }, + { + "epoch": 0.8828396050173473, + "grad_norm": 0.07282017916440964, + "learning_rate": 3.4179030479488114e-05, + "loss": 2.5441, + "step": 29772 + }, + { + "epoch": 0.8828692583696587, + "grad_norm": 0.07839659601449966, + "learning_rate": 3.416193527564121e-05, + "loss": 2.5519, + "step": 29773 + }, + { + "epoch": 0.8828989117219702, + "grad_norm": 0.07529928535223007, + "learning_rate": 3.4144844196870195e-05, + "loss": 2.5461, + "step": 29774 + }, + { + "epoch": 0.8829285650742816, + "grad_norm": 0.07652132213115692, + "learning_rate": 3.412775724332662e-05, + "loss": 2.5912, + "step": 29775 + }, + { + "epoch": 0.8829582184265932, + "grad_norm": 0.07737105339765549, + "learning_rate": 3.411067441516175e-05, + "loss": 2.5601, + "step": 29776 + }, + { + "epoch": 0.8829878717789046, + "grad_norm": 0.08057309687137604, + "learning_rate": 3.409359571252679e-05, + "loss": 2.5609, + "step": 29777 + }, + { + "epoch": 0.8830175251312161, + "grad_norm": 0.07895723730325699, + "learning_rate": 3.4076521135573026e-05, + "loss": 2.5431, + "step": 29778 + }, + { + "epoch": 0.8830471784835275, + "grad_norm": 0.08268178999423981, + "learning_rate": 3.405945068445165e-05, + "loss": 2.5715, + "step": 29779 + }, + { + "epoch": 0.8830768318358391, + "grad_norm": 0.0772222951054573, + "learning_rate": 3.404238435931378e-05, + "loss": 2.5531, + "step": 29780 + }, + { + "epoch": 0.8831064851881505, + "grad_norm": 0.07705707103013992, + "learning_rate": 3.402532216031062e-05, + "loss": 2.5006, + "step": 29781 + }, + { + "epoch": 0.883136138540462, + "grad_norm": 0.0776849165558815, + "learning_rate": 3.400826408759322e-05, + "loss": 2.5504, + "step": 29782 + }, + { + "epoch": 0.8831657918927734, + "grad_norm": 0.07673827558755875, + "learning_rate": 3.399121014131257e-05, + "loss": 2.5286, + "step": 29783 + }, + { + "epoch": 0.883195445245085, + "grad_norm": 0.0808837041258812, + "learning_rate": 3.3974160321619875e-05, + "loss": 2.5663, + "step": 29784 + }, + { + "epoch": 0.8832250985973964, + "grad_norm": 0.07943279296159744, + "learning_rate": 3.395711462866591e-05, + "loss": 2.543, + "step": 29785 + }, + { + "epoch": 0.8832547519497079, + "grad_norm": 0.08643306791782379, + "learning_rate": 3.394007306260166e-05, + "loss": 2.53, + "step": 29786 + }, + { + "epoch": 0.8832844053020193, + "grad_norm": 0.07981151342391968, + "learning_rate": 3.3923035623578015e-05, + "loss": 2.58, + "step": 29787 + }, + { + "epoch": 0.8833140586543309, + "grad_norm": 0.07508712261915207, + "learning_rate": 3.39060023117459e-05, + "loss": 2.5345, + "step": 29788 + }, + { + "epoch": 0.8833437120066423, + "grad_norm": 0.07588247209787369, + "learning_rate": 3.38889731272562e-05, + "loss": 2.5278, + "step": 29789 + }, + { + "epoch": 0.8833733653589538, + "grad_norm": 0.07793410122394562, + "learning_rate": 3.3871948070259616e-05, + "loss": 2.5528, + "step": 29790 + }, + { + "epoch": 0.8834030187112653, + "grad_norm": 0.07548391073942184, + "learning_rate": 3.385492714090699e-05, + "loss": 2.5135, + "step": 29791 + }, + { + "epoch": 0.8834326720635768, + "grad_norm": 0.08348816633224487, + "learning_rate": 3.383791033934896e-05, + "loss": 2.527, + "step": 29792 + }, + { + "epoch": 0.8834623254158883, + "grad_norm": 0.07706204056739807, + "learning_rate": 3.3820897665736263e-05, + "loss": 2.5316, + "step": 29793 + }, + { + "epoch": 0.8834919787681997, + "grad_norm": 0.08023550361394882, + "learning_rate": 3.380388912021959e-05, + "loss": 2.5217, + "step": 29794 + }, + { + "epoch": 0.8835216321205113, + "grad_norm": 0.08339395374059677, + "learning_rate": 3.378688470294944e-05, + "loss": 2.5354, + "step": 29795 + }, + { + "epoch": 0.8835512854728227, + "grad_norm": 0.08207431435585022, + "learning_rate": 3.376988441407647e-05, + "loss": 2.5141, + "step": 29796 + }, + { + "epoch": 0.8835809388251342, + "grad_norm": 0.07411077618598938, + "learning_rate": 3.375288825375117e-05, + "loss": 2.5351, + "step": 29797 + }, + { + "epoch": 0.8836105921774456, + "grad_norm": 0.08338458836078644, + "learning_rate": 3.373589622212408e-05, + "loss": 2.5261, + "step": 29798 + }, + { + "epoch": 0.8836402455297572, + "grad_norm": 0.08246014267206192, + "learning_rate": 3.371890831934565e-05, + "loss": 2.5443, + "step": 29799 + }, + { + "epoch": 0.8836698988820686, + "grad_norm": 0.08133445680141449, + "learning_rate": 3.370192454556631e-05, + "loss": 2.5489, + "step": 29800 + }, + { + "epoch": 0.8836995522343801, + "grad_norm": 0.07581093162298203, + "learning_rate": 3.3684944900936485e-05, + "loss": 2.5264, + "step": 29801 + }, + { + "epoch": 0.8837292055866915, + "grad_norm": 0.08125551044940948, + "learning_rate": 3.366796938560651e-05, + "loss": 2.5503, + "step": 29802 + }, + { + "epoch": 0.8837588589390031, + "grad_norm": 0.08422956615686417, + "learning_rate": 3.365099799972671e-05, + "loss": 2.5722, + "step": 29803 + }, + { + "epoch": 0.8837885122913145, + "grad_norm": 0.08015646785497665, + "learning_rate": 3.3634030743447505e-05, + "loss": 2.5383, + "step": 29804 + }, + { + "epoch": 0.883818165643626, + "grad_norm": 0.08095839619636536, + "learning_rate": 3.361706761691891e-05, + "loss": 2.56, + "step": 29805 + }, + { + "epoch": 0.8838478189959375, + "grad_norm": 0.08368609845638275, + "learning_rate": 3.360010862029117e-05, + "loss": 2.5646, + "step": 29806 + }, + { + "epoch": 0.883877472348249, + "grad_norm": 0.07976588606834412, + "learning_rate": 3.358315375371457e-05, + "loss": 2.5305, + "step": 29807 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 0.0755419135093689, + "learning_rate": 3.3566203017339204e-05, + "loss": 2.556, + "step": 29808 + }, + { + "epoch": 0.8839367790528719, + "grad_norm": 0.07847197353839874, + "learning_rate": 3.3549256411315175e-05, + "loss": 2.5481, + "step": 29809 + }, + { + "epoch": 0.8839664324051834, + "grad_norm": 0.07802505046129227, + "learning_rate": 3.3532313935792594e-05, + "loss": 2.5417, + "step": 29810 + }, + { + "epoch": 0.8839960857574949, + "grad_norm": 0.07634105533361435, + "learning_rate": 3.351537559092138e-05, + "loss": 2.5381, + "step": 29811 + }, + { + "epoch": 0.8840257391098064, + "grad_norm": 0.0730946883559227, + "learning_rate": 3.3498441376851595e-05, + "loss": 2.5341, + "step": 29812 + }, + { + "epoch": 0.8840553924621178, + "grad_norm": 0.08117059618234634, + "learning_rate": 3.3481511293733114e-05, + "loss": 2.5567, + "step": 29813 + }, + { + "epoch": 0.8840850458144294, + "grad_norm": 0.0733356922864914, + "learning_rate": 3.346458534171598e-05, + "loss": 2.5514, + "step": 29814 + }, + { + "epoch": 0.8841146991667408, + "grad_norm": 0.07163894176483154, + "learning_rate": 3.344766352095013e-05, + "loss": 2.5289, + "step": 29815 + }, + { + "epoch": 0.8841443525190523, + "grad_norm": 0.06891734153032303, + "learning_rate": 3.343074583158523e-05, + "loss": 2.5428, + "step": 29816 + }, + { + "epoch": 0.8841740058713637, + "grad_norm": 0.07283437997102737, + "learning_rate": 3.341383227377115e-05, + "loss": 2.5215, + "step": 29817 + }, + { + "epoch": 0.8842036592236753, + "grad_norm": 0.07797400653362274, + "learning_rate": 3.3396922847657663e-05, + "loss": 2.5272, + "step": 29818 + }, + { + "epoch": 0.8842333125759867, + "grad_norm": 0.07708560675382614, + "learning_rate": 3.338001755339454e-05, + "loss": 2.5558, + "step": 29819 + }, + { + "epoch": 0.8842629659282982, + "grad_norm": 0.07585227489471436, + "learning_rate": 3.336311639113143e-05, + "loss": 2.5653, + "step": 29820 + }, + { + "epoch": 0.8842926192806096, + "grad_norm": 0.07210274785757065, + "learning_rate": 3.334621936101801e-05, + "loss": 2.5053, + "step": 29821 + }, + { + "epoch": 0.8843222726329212, + "grad_norm": 0.079864501953125, + "learning_rate": 3.332932646320397e-05, + "loss": 2.5613, + "step": 29822 + }, + { + "epoch": 0.8843519259852326, + "grad_norm": 0.07360705733299255, + "learning_rate": 3.331243769783876e-05, + "loss": 2.5418, + "step": 29823 + }, + { + "epoch": 0.8843815793375441, + "grad_norm": 0.0741233080625534, + "learning_rate": 3.329555306507209e-05, + "loss": 2.5456, + "step": 29824 + }, + { + "epoch": 0.8844112326898556, + "grad_norm": 0.07387089729309082, + "learning_rate": 3.32786725650534e-05, + "loss": 2.534, + "step": 29825 + }, + { + "epoch": 0.8844408860421671, + "grad_norm": 0.08147002011537552, + "learning_rate": 3.326179619793218e-05, + "loss": 2.5549, + "step": 29826 + }, + { + "epoch": 0.8844705393944785, + "grad_norm": 0.07994739711284637, + "learning_rate": 3.3244923963857866e-05, + "loss": 2.5422, + "step": 29827 + }, + { + "epoch": 0.88450019274679, + "grad_norm": 0.07145671546459198, + "learning_rate": 3.322805586297983e-05, + "loss": 2.5282, + "step": 29828 + }, + { + "epoch": 0.8845298460991015, + "grad_norm": 0.08117110282182693, + "learning_rate": 3.321119189544752e-05, + "loss": 2.5204, + "step": 29829 + }, + { + "epoch": 0.884559499451413, + "grad_norm": 0.07982847839593887, + "learning_rate": 3.31943320614102e-05, + "loss": 2.5733, + "step": 29830 + }, + { + "epoch": 0.8845891528037244, + "grad_norm": 0.07370501756668091, + "learning_rate": 3.317747636101725e-05, + "loss": 2.5388, + "step": 29831 + }, + { + "epoch": 0.8846188061560359, + "grad_norm": 0.07705861330032349, + "learning_rate": 3.316062479441784e-05, + "loss": 2.5397, + "step": 29832 + }, + { + "epoch": 0.8846484595083475, + "grad_norm": 0.07593192905187607, + "learning_rate": 3.3143777361761216e-05, + "loss": 2.5634, + "step": 29833 + }, + { + "epoch": 0.8846781128606589, + "grad_norm": 0.0711033046245575, + "learning_rate": 3.312693406319661e-05, + "loss": 2.5364, + "step": 29834 + }, + { + "epoch": 0.8847077662129704, + "grad_norm": 0.07621745765209198, + "learning_rate": 3.311009489887312e-05, + "loss": 2.5563, + "step": 29835 + }, + { + "epoch": 0.8847374195652818, + "grad_norm": 0.07444953173398972, + "learning_rate": 3.3093259868939853e-05, + "loss": 2.5376, + "step": 29836 + }, + { + "epoch": 0.8847670729175934, + "grad_norm": 0.07529996335506439, + "learning_rate": 3.3076428973545955e-05, + "loss": 2.5287, + "step": 29837 + }, + { + "epoch": 0.8847967262699048, + "grad_norm": 0.0771757960319519, + "learning_rate": 3.3059602212840436e-05, + "loss": 2.5019, + "step": 29838 + }, + { + "epoch": 0.8848263796222163, + "grad_norm": 0.06811996549367905, + "learning_rate": 3.3042779586972274e-05, + "loss": 2.5569, + "step": 29839 + }, + { + "epoch": 0.8848560329745278, + "grad_norm": 0.07529840618371964, + "learning_rate": 3.3025961096090404e-05, + "loss": 2.5324, + "step": 29840 + }, + { + "epoch": 0.8848856863268393, + "grad_norm": 0.08218250423669815, + "learning_rate": 3.300914674034383e-05, + "loss": 2.5209, + "step": 29841 + }, + { + "epoch": 0.8849153396791507, + "grad_norm": 0.07979082316160202, + "learning_rate": 3.2992336519881424e-05, + "loss": 2.5557, + "step": 29842 + }, + { + "epoch": 0.8849449930314622, + "grad_norm": 0.07290876656770706, + "learning_rate": 3.297553043485208e-05, + "loss": 2.5679, + "step": 29843 + }, + { + "epoch": 0.8849746463837737, + "grad_norm": 0.07525047659873962, + "learning_rate": 3.2958728485404546e-05, + "loss": 2.5232, + "step": 29844 + }, + { + "epoch": 0.8850042997360852, + "grad_norm": 0.07604562491178513, + "learning_rate": 3.2941930671687606e-05, + "loss": 2.5528, + "step": 29845 + }, + { + "epoch": 0.8850339530883966, + "grad_norm": 0.07247110456228256, + "learning_rate": 3.292513699385008e-05, + "loss": 2.5457, + "step": 29846 + }, + { + "epoch": 0.8850636064407081, + "grad_norm": 0.07772664725780487, + "learning_rate": 3.290834745204063e-05, + "loss": 2.5382, + "step": 29847 + }, + { + "epoch": 0.8850932597930196, + "grad_norm": 0.07595846056938171, + "learning_rate": 3.289156204640798e-05, + "loss": 2.541, + "step": 29848 + }, + { + "epoch": 0.8851229131453311, + "grad_norm": 0.07675571739673615, + "learning_rate": 3.287478077710071e-05, + "loss": 2.5155, + "step": 29849 + }, + { + "epoch": 0.8851525664976425, + "grad_norm": 0.07524674385786057, + "learning_rate": 3.285800364426744e-05, + "loss": 2.5116, + "step": 29850 + }, + { + "epoch": 0.885182219849954, + "grad_norm": 0.07816361635923386, + "learning_rate": 3.284123064805666e-05, + "loss": 2.5401, + "step": 29851 + }, + { + "epoch": 0.8852118732022655, + "grad_norm": 0.08029699325561523, + "learning_rate": 3.282446178861698e-05, + "loss": 2.5268, + "step": 29852 + }, + { + "epoch": 0.885241526554577, + "grad_norm": 0.07614213228225708, + "learning_rate": 3.2807697066096874e-05, + "loss": 2.5521, + "step": 29853 + }, + { + "epoch": 0.8852711799068885, + "grad_norm": 0.07423238456249237, + "learning_rate": 3.279093648064485e-05, + "loss": 2.5573, + "step": 29854 + }, + { + "epoch": 0.8853008332592, + "grad_norm": 0.08486340939998627, + "learning_rate": 3.2774180032409284e-05, + "loss": 2.5319, + "step": 29855 + }, + { + "epoch": 0.8853304866115115, + "grad_norm": 0.07956302165985107, + "learning_rate": 3.2757427721538504e-05, + "loss": 2.5546, + "step": 29856 + }, + { + "epoch": 0.8853601399638229, + "grad_norm": 0.06934809684753418, + "learning_rate": 3.274067954818094e-05, + "loss": 2.529, + "step": 29857 + }, + { + "epoch": 0.8853897933161344, + "grad_norm": 0.0760742798447609, + "learning_rate": 3.272393551248487e-05, + "loss": 2.5213, + "step": 29858 + }, + { + "epoch": 0.8854194466684459, + "grad_norm": 0.079737588763237, + "learning_rate": 3.270719561459856e-05, + "loss": 2.5879, + "step": 29859 + }, + { + "epoch": 0.8854491000207574, + "grad_norm": 0.07794230431318283, + "learning_rate": 3.269045985467029e-05, + "loss": 2.5084, + "step": 29860 + }, + { + "epoch": 0.8854787533730688, + "grad_norm": 0.07625526189804077, + "learning_rate": 3.2673728232848146e-05, + "loss": 2.5449, + "step": 29861 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 0.07760083675384521, + "learning_rate": 3.2657000749280354e-05, + "loss": 2.5231, + "step": 29862 + }, + { + "epoch": 0.8855380600776918, + "grad_norm": 0.08394113928079605, + "learning_rate": 3.264027740411507e-05, + "loss": 2.5351, + "step": 29863 + }, + { + "epoch": 0.8855677134300033, + "grad_norm": 0.0717843770980835, + "learning_rate": 3.262355819750029e-05, + "loss": 2.535, + "step": 29864 + }, + { + "epoch": 0.8855973667823147, + "grad_norm": 0.07166707515716553, + "learning_rate": 3.260684312958412e-05, + "loss": 2.5247, + "step": 29865 + }, + { + "epoch": 0.8856270201346262, + "grad_norm": 0.07546473294496536, + "learning_rate": 3.25901322005146e-05, + "loss": 2.5383, + "step": 29866 + }, + { + "epoch": 0.8856566734869377, + "grad_norm": 0.075818732380867, + "learning_rate": 3.2573425410439725e-05, + "loss": 2.5316, + "step": 29867 + }, + { + "epoch": 0.8856863268392492, + "grad_norm": 0.07155122607946396, + "learning_rate": 3.2556722759507386e-05, + "loss": 2.5206, + "step": 29868 + }, + { + "epoch": 0.8857159801915606, + "grad_norm": 0.0734361782670021, + "learning_rate": 3.2540024247865506e-05, + "loss": 2.5248, + "step": 29869 + }, + { + "epoch": 0.8857456335438721, + "grad_norm": 0.07562585175037384, + "learning_rate": 3.252332987566203e-05, + "loss": 2.526, + "step": 29870 + }, + { + "epoch": 0.8857752868961836, + "grad_norm": 0.07519492506980896, + "learning_rate": 3.250663964304462e-05, + "loss": 2.5685, + "step": 29871 + }, + { + "epoch": 0.8858049402484951, + "grad_norm": 0.07548754662275314, + "learning_rate": 3.2489953550161154e-05, + "loss": 2.5879, + "step": 29872 + }, + { + "epoch": 0.8858345936008065, + "grad_norm": 0.06716988980770111, + "learning_rate": 3.247327159715941e-05, + "loss": 2.5288, + "step": 29873 + }, + { + "epoch": 0.885864246953118, + "grad_norm": 0.07415622472763062, + "learning_rate": 3.2456593784187085e-05, + "loss": 2.547, + "step": 29874 + }, + { + "epoch": 0.8858939003054296, + "grad_norm": 0.0763721764087677, + "learning_rate": 3.243992011139191e-05, + "loss": 2.553, + "step": 29875 + }, + { + "epoch": 0.885923553657741, + "grad_norm": 0.0758935809135437, + "learning_rate": 3.242325057892143e-05, + "loss": 2.5643, + "step": 29876 + }, + { + "epoch": 0.8859532070100525, + "grad_norm": 0.07624954730272293, + "learning_rate": 3.240658518692341e-05, + "loss": 2.51, + "step": 29877 + }, + { + "epoch": 0.885982860362364, + "grad_norm": 0.07868903130292892, + "learning_rate": 3.238992393554518e-05, + "loss": 2.5512, + "step": 29878 + }, + { + "epoch": 0.8860125137146755, + "grad_norm": 0.07505679130554199, + "learning_rate": 3.237326682493458e-05, + "loss": 2.5362, + "step": 29879 + }, + { + "epoch": 0.8860421670669869, + "grad_norm": 0.07329884171485901, + "learning_rate": 3.2356613855239026e-05, + "loss": 2.5433, + "step": 29880 + }, + { + "epoch": 0.8860718204192984, + "grad_norm": 0.07226955145597458, + "learning_rate": 3.23399650266058e-05, + "loss": 2.5458, + "step": 29881 + }, + { + "epoch": 0.8861014737716099, + "grad_norm": 0.07032526284456253, + "learning_rate": 3.23233203391825e-05, + "loss": 2.543, + "step": 29882 + }, + { + "epoch": 0.8861311271239214, + "grad_norm": 0.07585740089416504, + "learning_rate": 3.2306679793116464e-05, + "loss": 2.5557, + "step": 29883 + }, + { + "epoch": 0.8861607804762328, + "grad_norm": 0.07186335325241089, + "learning_rate": 3.229004338855512e-05, + "loss": 2.5375, + "step": 29884 + }, + { + "epoch": 0.8861904338285443, + "grad_norm": 0.07358315587043762, + "learning_rate": 3.2273411125645634e-05, + "loss": 2.5248, + "step": 29885 + }, + { + "epoch": 0.8862200871808558, + "grad_norm": 0.07360102981328964, + "learning_rate": 3.225678300453544e-05, + "loss": 2.5507, + "step": 29886 + }, + { + "epoch": 0.8862497405331673, + "grad_norm": 0.07154004275798798, + "learning_rate": 3.22401590253717e-05, + "loss": 2.5779, + "step": 29887 + }, + { + "epoch": 0.8862793938854787, + "grad_norm": 0.07256636023521423, + "learning_rate": 3.222353918830162e-05, + "loss": 2.5489, + "step": 29888 + }, + { + "epoch": 0.8863090472377902, + "grad_norm": 0.0773405209183693, + "learning_rate": 3.220692349347237e-05, + "loss": 2.5733, + "step": 29889 + }, + { + "epoch": 0.8863387005901017, + "grad_norm": 0.08045832067728043, + "learning_rate": 3.219031194103117e-05, + "loss": 2.5532, + "step": 29890 + }, + { + "epoch": 0.8863683539424132, + "grad_norm": 0.072295181453228, + "learning_rate": 3.217370453112506e-05, + "loss": 2.5051, + "step": 29891 + }, + { + "epoch": 0.8863980072947246, + "grad_norm": 0.07359392940998077, + "learning_rate": 3.2157101263901036e-05, + "loss": 2.5778, + "step": 29892 + }, + { + "epoch": 0.8864276606470362, + "grad_norm": 0.07506163418292999, + "learning_rate": 3.21405021395062e-05, + "loss": 2.5603, + "step": 29893 + }, + { + "epoch": 0.8864573139993476, + "grad_norm": 0.07523424178361893, + "learning_rate": 3.212390715808755e-05, + "loss": 2.568, + "step": 29894 + }, + { + "epoch": 0.8864869673516591, + "grad_norm": 0.07160770148038864, + "learning_rate": 3.2107316319792025e-05, + "loss": 2.4963, + "step": 29895 + }, + { + "epoch": 0.8865166207039706, + "grad_norm": 0.08329028636217117, + "learning_rate": 3.20907296247665e-05, + "loss": 2.5502, + "step": 29896 + }, + { + "epoch": 0.8865462740562821, + "grad_norm": 0.07990900427103043, + "learning_rate": 3.207414707315786e-05, + "loss": 2.5428, + "step": 29897 + }, + { + "epoch": 0.8865759274085936, + "grad_norm": 0.07689553499221802, + "learning_rate": 3.2057568665113e-05, + "loss": 2.5435, + "step": 29898 + }, + { + "epoch": 0.886605580760905, + "grad_norm": 0.08045069873332977, + "learning_rate": 3.204099440077868e-05, + "loss": 2.5506, + "step": 29899 + }, + { + "epoch": 0.8866352341132165, + "grad_norm": 0.07042115181684494, + "learning_rate": 3.2024424280301725e-05, + "loss": 2.5469, + "step": 29900 + }, + { + "epoch": 0.886664887465528, + "grad_norm": 0.0782242864370346, + "learning_rate": 3.200785830382874e-05, + "loss": 2.5469, + "step": 29901 + }, + { + "epoch": 0.8866945408178395, + "grad_norm": 0.07961173355579376, + "learning_rate": 3.199129647150656e-05, + "loss": 2.5584, + "step": 29902 + }, + { + "epoch": 0.8867241941701509, + "grad_norm": 0.08373779058456421, + "learning_rate": 3.197473878348173e-05, + "loss": 2.5457, + "step": 29903 + }, + { + "epoch": 0.8867538475224624, + "grad_norm": 0.07331901043653488, + "learning_rate": 3.195818523990096e-05, + "loss": 2.5306, + "step": 29904 + }, + { + "epoch": 0.8867835008747739, + "grad_norm": 0.07449205219745636, + "learning_rate": 3.194163584091081e-05, + "loss": 2.515, + "step": 29905 + }, + { + "epoch": 0.8868131542270854, + "grad_norm": 0.07602277398109436, + "learning_rate": 3.192509058665777e-05, + "loss": 2.565, + "step": 29906 + }, + { + "epoch": 0.8868428075793968, + "grad_norm": 0.07551495730876923, + "learning_rate": 3.190854947728844e-05, + "loss": 2.5487, + "step": 29907 + }, + { + "epoch": 0.8868724609317084, + "grad_norm": 0.07353895902633667, + "learning_rate": 3.1892012512949206e-05, + "loss": 2.5213, + "step": 29908 + }, + { + "epoch": 0.8869021142840198, + "grad_norm": 0.07424809038639069, + "learning_rate": 3.187547969378663e-05, + "loss": 2.5564, + "step": 29909 + }, + { + "epoch": 0.8869317676363313, + "grad_norm": 0.07428613305091858, + "learning_rate": 3.1858951019946955e-05, + "loss": 2.5544, + "step": 29910 + }, + { + "epoch": 0.8869614209886427, + "grad_norm": 0.07341155409812927, + "learning_rate": 3.184242649157665e-05, + "loss": 2.5335, + "step": 29911 + }, + { + "epoch": 0.8869910743409543, + "grad_norm": 0.07670372724533081, + "learning_rate": 3.182590610882202e-05, + "loss": 2.5472, + "step": 29912 + }, + { + "epoch": 0.8870207276932657, + "grad_norm": 0.07776878029108047, + "learning_rate": 3.180938987182935e-05, + "loss": 2.5317, + "step": 29913 + }, + { + "epoch": 0.8870503810455772, + "grad_norm": 0.07092034816741943, + "learning_rate": 3.179287778074491e-05, + "loss": 2.536, + "step": 29914 + }, + { + "epoch": 0.8870800343978886, + "grad_norm": 0.07463931292295456, + "learning_rate": 3.1776369835714966e-05, + "loss": 2.5764, + "step": 29915 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 0.08157220482826233, + "learning_rate": 3.175986603688552e-05, + "loss": 2.5412, + "step": 29916 + }, + { + "epoch": 0.8871393411025117, + "grad_norm": 0.07673025876283646, + "learning_rate": 3.1743366384402835e-05, + "loss": 2.5523, + "step": 29917 + }, + { + "epoch": 0.8871689944548231, + "grad_norm": 0.07863159477710724, + "learning_rate": 3.1726870878413025e-05, + "loss": 2.5437, + "step": 29918 + }, + { + "epoch": 0.8871986478071346, + "grad_norm": 0.07332436740398407, + "learning_rate": 3.171037951906219e-05, + "loss": 2.5057, + "step": 29919 + }, + { + "epoch": 0.8872283011594461, + "grad_norm": 0.07796666026115417, + "learning_rate": 3.169389230649633e-05, + "loss": 2.5398, + "step": 29920 + }, + { + "epoch": 0.8872579545117576, + "grad_norm": 0.07876910269260406, + "learning_rate": 3.167740924086143e-05, + "loss": 2.5384, + "step": 29921 + }, + { + "epoch": 0.887287607864069, + "grad_norm": 0.0734684020280838, + "learning_rate": 3.166093032230344e-05, + "loss": 2.5483, + "step": 29922 + }, + { + "epoch": 0.8873172612163805, + "grad_norm": 0.07585551589727402, + "learning_rate": 3.164445555096829e-05, + "loss": 2.5265, + "step": 29923 + }, + { + "epoch": 0.887346914568692, + "grad_norm": 0.08049189299345016, + "learning_rate": 3.1627984927001916e-05, + "loss": 2.5305, + "step": 29924 + }, + { + "epoch": 0.8873765679210035, + "grad_norm": 0.07758137583732605, + "learning_rate": 3.161151845055021e-05, + "loss": 2.5685, + "step": 29925 + }, + { + "epoch": 0.8874062212733149, + "grad_norm": 0.07617904990911484, + "learning_rate": 3.159505612175878e-05, + "loss": 2.5696, + "step": 29926 + }, + { + "epoch": 0.8874358746256265, + "grad_norm": 0.07405315339565277, + "learning_rate": 3.1578597940773556e-05, + "loss": 2.5466, + "step": 29927 + }, + { + "epoch": 0.8874655279779379, + "grad_norm": 0.07418661564588547, + "learning_rate": 3.156214390774026e-05, + "loss": 2.5626, + "step": 29928 + }, + { + "epoch": 0.8874951813302494, + "grad_norm": 0.07049725204706192, + "learning_rate": 3.15456940228046e-05, + "loss": 2.5351, + "step": 29929 + }, + { + "epoch": 0.8875248346825608, + "grad_norm": 0.0783311277627945, + "learning_rate": 3.1529248286112145e-05, + "loss": 2.5268, + "step": 29930 + }, + { + "epoch": 0.8875544880348724, + "grad_norm": 0.07431358098983765, + "learning_rate": 3.151280669780865e-05, + "loss": 2.5162, + "step": 29931 + }, + { + "epoch": 0.8875841413871838, + "grad_norm": 0.07457684725522995, + "learning_rate": 3.1496369258039725e-05, + "loss": 2.5262, + "step": 29932 + }, + { + "epoch": 0.8876137947394953, + "grad_norm": 0.07590623199939728, + "learning_rate": 3.147993596695081e-05, + "loss": 2.5315, + "step": 29933 + }, + { + "epoch": 0.8876434480918067, + "grad_norm": 0.07381689548492432, + "learning_rate": 3.146350682468752e-05, + "loss": 2.5314, + "step": 29934 + }, + { + "epoch": 0.8876731014441183, + "grad_norm": 0.07426314800977707, + "learning_rate": 3.1447081831395276e-05, + "loss": 2.5342, + "step": 29935 + }, + { + "epoch": 0.8877027547964297, + "grad_norm": 0.07317723333835602, + "learning_rate": 3.143066098721964e-05, + "loss": 2.5113, + "step": 29936 + }, + { + "epoch": 0.8877324081487412, + "grad_norm": 0.07915408909320831, + "learning_rate": 3.141424429230583e-05, + "loss": 2.5736, + "step": 29937 + }, + { + "epoch": 0.8877620615010527, + "grad_norm": 0.07637259364128113, + "learning_rate": 3.1397831746799335e-05, + "loss": 2.555, + "step": 29938 + }, + { + "epoch": 0.8877917148533642, + "grad_norm": 0.07485143840312958, + "learning_rate": 3.1381423350845486e-05, + "loss": 2.5462, + "step": 29939 + }, + { + "epoch": 0.8878213682056757, + "grad_norm": 0.07249019294977188, + "learning_rate": 3.1365019104589555e-05, + "loss": 2.5387, + "step": 29940 + }, + { + "epoch": 0.8878510215579871, + "grad_norm": 0.0761001780629158, + "learning_rate": 3.1348619008176814e-05, + "loss": 2.548, + "step": 29941 + }, + { + "epoch": 0.8878806749102987, + "grad_norm": 0.07164055854082108, + "learning_rate": 3.133222306175254e-05, + "loss": 2.5328, + "step": 29942 + }, + { + "epoch": 0.8879103282626101, + "grad_norm": 0.07634235918521881, + "learning_rate": 3.1315831265461726e-05, + "loss": 2.5094, + "step": 29943 + }, + { + "epoch": 0.8879399816149216, + "grad_norm": 0.07131744176149368, + "learning_rate": 3.129944361944981e-05, + "loss": 2.548, + "step": 29944 + }, + { + "epoch": 0.887969634967233, + "grad_norm": 0.07704748958349228, + "learning_rate": 3.1283060123861725e-05, + "loss": 2.5457, + "step": 29945 + }, + { + "epoch": 0.8879992883195446, + "grad_norm": 0.07609628885984421, + "learning_rate": 3.1266680778842704e-05, + "loss": 2.5269, + "step": 29946 + }, + { + "epoch": 0.888028941671856, + "grad_norm": 0.07865168154239655, + "learning_rate": 3.125030558453762e-05, + "loss": 2.5535, + "step": 29947 + }, + { + "epoch": 0.8880585950241675, + "grad_norm": 0.07493120431900024, + "learning_rate": 3.1233934541091524e-05, + "loss": 2.5412, + "step": 29948 + }, + { + "epoch": 0.8880882483764789, + "grad_norm": 0.07258066534996033, + "learning_rate": 3.1217567648649415e-05, + "loss": 2.5772, + "step": 29949 + }, + { + "epoch": 0.8881179017287905, + "grad_norm": 0.0754782184958458, + "learning_rate": 3.1201204907356174e-05, + "loss": 2.5514, + "step": 29950 + }, + { + "epoch": 0.8881475550811019, + "grad_norm": 0.07354715466499329, + "learning_rate": 3.118484631735674e-05, + "loss": 2.553, + "step": 29951 + }, + { + "epoch": 0.8881772084334134, + "grad_norm": 0.07629288733005524, + "learning_rate": 3.1168491878796e-05, + "loss": 2.5243, + "step": 29952 + }, + { + "epoch": 0.8882068617857248, + "grad_norm": 0.08300777524709702, + "learning_rate": 3.115214159181873e-05, + "loss": 2.4977, + "step": 29953 + }, + { + "epoch": 0.8882365151380364, + "grad_norm": 0.0743475928902626, + "learning_rate": 3.113579545656969e-05, + "loss": 2.5504, + "step": 29954 + }, + { + "epoch": 0.8882661684903478, + "grad_norm": 0.07738173753023148, + "learning_rate": 3.111945347319356e-05, + "loss": 2.5635, + "step": 29955 + }, + { + "epoch": 0.8882958218426593, + "grad_norm": 0.08182118088006973, + "learning_rate": 3.1103115641835324e-05, + "loss": 2.5609, + "step": 29956 + }, + { + "epoch": 0.8883254751949707, + "grad_norm": 0.08351031690835953, + "learning_rate": 3.108678196263948e-05, + "loss": 2.5652, + "step": 29957 + }, + { + "epoch": 0.8883551285472823, + "grad_norm": 0.07314006984233856, + "learning_rate": 3.107045243575063e-05, + "loss": 2.5281, + "step": 29958 + }, + { + "epoch": 0.8883847818995938, + "grad_norm": 0.08166661858558655, + "learning_rate": 3.1054127061313386e-05, + "loss": 2.5328, + "step": 29959 + }, + { + "epoch": 0.8884144352519052, + "grad_norm": 0.08023153990507126, + "learning_rate": 3.1037805839472355e-05, + "loss": 2.5554, + "step": 29960 + }, + { + "epoch": 0.8884440886042168, + "grad_norm": 0.07654785364866257, + "learning_rate": 3.102148877037203e-05, + "loss": 2.4911, + "step": 29961 + }, + { + "epoch": 0.8884737419565282, + "grad_norm": 0.0744025781750679, + "learning_rate": 3.1005175854156966e-05, + "loss": 2.5582, + "step": 29962 + }, + { + "epoch": 0.8885033953088397, + "grad_norm": 0.08254028111696243, + "learning_rate": 3.098886709097154e-05, + "loss": 2.5322, + "step": 29963 + }, + { + "epoch": 0.8885330486611511, + "grad_norm": 0.07425357401371002, + "learning_rate": 3.09725624809602e-05, + "loss": 2.5572, + "step": 29964 + }, + { + "epoch": 0.8885627020134627, + "grad_norm": 0.0749998688697815, + "learning_rate": 3.095626202426732e-05, + "loss": 2.5616, + "step": 29965 + }, + { + "epoch": 0.8885923553657741, + "grad_norm": 0.07448632270097733, + "learning_rate": 3.09399657210373e-05, + "loss": 2.5494, + "step": 29966 + }, + { + "epoch": 0.8886220087180856, + "grad_norm": 0.07563915103673935, + "learning_rate": 3.0923673571414345e-05, + "loss": 2.572, + "step": 29967 + }, + { + "epoch": 0.888651662070397, + "grad_norm": 0.07771947979927063, + "learning_rate": 3.090738557554279e-05, + "loss": 2.5437, + "step": 29968 + }, + { + "epoch": 0.8886813154227086, + "grad_norm": 0.07280001044273376, + "learning_rate": 3.089110173356685e-05, + "loss": 2.5159, + "step": 29969 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 0.07872933894395828, + "learning_rate": 3.087482204563075e-05, + "loss": 2.5164, + "step": 29970 + }, + { + "epoch": 0.8887406221273315, + "grad_norm": 0.07855436950922012, + "learning_rate": 3.085854651187864e-05, + "loss": 2.5436, + "step": 29971 + }, + { + "epoch": 0.8887702754796429, + "grad_norm": 0.0746268630027771, + "learning_rate": 3.084227513245458e-05, + "loss": 2.5261, + "step": 29972 + }, + { + "epoch": 0.8887999288319545, + "grad_norm": 0.07458452135324478, + "learning_rate": 3.082600790750273e-05, + "loss": 2.53, + "step": 29973 + }, + { + "epoch": 0.8888295821842659, + "grad_norm": 0.08431476354598999, + "learning_rate": 3.0809744837167085e-05, + "loss": 2.5481, + "step": 29974 + }, + { + "epoch": 0.8888592355365774, + "grad_norm": 0.08159905672073364, + "learning_rate": 3.07934859215917e-05, + "loss": 2.5311, + "step": 29975 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.0718584805727005, + "learning_rate": 3.077723116092052e-05, + "loss": 2.5371, + "step": 29976 + }, + { + "epoch": 0.8889185422412004, + "grad_norm": 0.07742363214492798, + "learning_rate": 3.0760980555297525e-05, + "loss": 2.4993, + "step": 29977 + }, + { + "epoch": 0.8889481955935118, + "grad_norm": 0.0831616148352623, + "learning_rate": 3.074473410486661e-05, + "loss": 2.5566, + "step": 29978 + }, + { + "epoch": 0.8889778489458233, + "grad_norm": 0.07771851867437363, + "learning_rate": 3.072849180977155e-05, + "loss": 2.5148, + "step": 29979 + }, + { + "epoch": 0.8890075022981349, + "grad_norm": 0.07459141314029694, + "learning_rate": 3.07122536701564e-05, + "loss": 2.527, + "step": 29980 + }, + { + "epoch": 0.8890371556504463, + "grad_norm": 0.08042342215776443, + "learning_rate": 3.069601968616459e-05, + "loss": 2.5329, + "step": 29981 + }, + { + "epoch": 0.8890668090027578, + "grad_norm": 0.07698129862546921, + "learning_rate": 3.067978985794018e-05, + "loss": 2.5238, + "step": 29982 + }, + { + "epoch": 0.8890964623550692, + "grad_norm": 0.07487064599990845, + "learning_rate": 3.0663564185626766e-05, + "loss": 2.4965, + "step": 29983 + }, + { + "epoch": 0.8891261157073808, + "grad_norm": 0.07864558696746826, + "learning_rate": 3.064734266936809e-05, + "loss": 2.5261, + "step": 29984 + }, + { + "epoch": 0.8891557690596922, + "grad_norm": 0.08617159724235535, + "learning_rate": 3.063112530930773e-05, + "loss": 2.5646, + "step": 29985 + }, + { + "epoch": 0.8891854224120037, + "grad_norm": 0.07348611950874329, + "learning_rate": 3.061491210558936e-05, + "loss": 2.5305, + "step": 29986 + }, + { + "epoch": 0.8892150757643151, + "grad_norm": 0.07529541850090027, + "learning_rate": 3.0598703058356434e-05, + "loss": 2.5491, + "step": 29987 + }, + { + "epoch": 0.8892447291166267, + "grad_norm": 0.08055482804775238, + "learning_rate": 3.058249816775266e-05, + "loss": 2.5453, + "step": 29988 + }, + { + "epoch": 0.8892743824689381, + "grad_norm": 0.0768430307507515, + "learning_rate": 3.056629743392136e-05, + "loss": 2.5119, + "step": 29989 + }, + { + "epoch": 0.8893040358212496, + "grad_norm": 0.072752945125103, + "learning_rate": 3.05501008570061e-05, + "loss": 2.5572, + "step": 29990 + }, + { + "epoch": 0.889333689173561, + "grad_norm": 0.07145773619413376, + "learning_rate": 3.053390843715037e-05, + "loss": 2.5321, + "step": 29991 + }, + { + "epoch": 0.8893633425258726, + "grad_norm": 0.08029096573591232, + "learning_rate": 3.051772017449739e-05, + "loss": 2.5255, + "step": 29992 + }, + { + "epoch": 0.889392995878184, + "grad_norm": 0.07689838111400604, + "learning_rate": 3.0501536069190538e-05, + "loss": 2.5447, + "step": 29993 + }, + { + "epoch": 0.8894226492304955, + "grad_norm": 0.06992927193641663, + "learning_rate": 3.0485356121373154e-05, + "loss": 2.5368, + "step": 29994 + }, + { + "epoch": 0.8894523025828069, + "grad_norm": 0.07543345540761948, + "learning_rate": 3.0469180331188563e-05, + "loss": 2.5375, + "step": 29995 + }, + { + "epoch": 0.8894819559351185, + "grad_norm": 0.07798441499471664, + "learning_rate": 3.0453008698779982e-05, + "loss": 2.5821, + "step": 29996 + }, + { + "epoch": 0.8895116092874299, + "grad_norm": 0.07450217753648758, + "learning_rate": 3.0436841224290633e-05, + "loss": 2.6132, + "step": 29997 + }, + { + "epoch": 0.8895412626397414, + "grad_norm": 0.07421547919511795, + "learning_rate": 3.0420677907863627e-05, + "loss": 2.5801, + "step": 29998 + }, + { + "epoch": 0.8895709159920528, + "grad_norm": 0.07317785918712616, + "learning_rate": 3.0404518749642118e-05, + "loss": 2.5603, + "step": 29999 + }, + { + "epoch": 0.8896005693443644, + "grad_norm": 0.0741848424077034, + "learning_rate": 3.0388363749769222e-05, + "loss": 2.5726, + "step": 30000 + }, + { + "epoch": 0.8896302226966759, + "grad_norm": 0.07268820703029633, + "learning_rate": 3.0372212908388043e-05, + "loss": 2.5569, + "step": 30001 + }, + { + "epoch": 0.8896598760489873, + "grad_norm": 0.07437986880540848, + "learning_rate": 3.0356066225641465e-05, + "loss": 2.5266, + "step": 30002 + }, + { + "epoch": 0.8896895294012989, + "grad_norm": 0.07907562702894211, + "learning_rate": 3.033992370167249e-05, + "loss": 2.5329, + "step": 30003 + }, + { + "epoch": 0.8897191827536103, + "grad_norm": 0.07114556431770325, + "learning_rate": 3.0323785336624166e-05, + "loss": 2.538, + "step": 30004 + }, + { + "epoch": 0.8897488361059218, + "grad_norm": 0.07230131328105927, + "learning_rate": 3.0307651130639325e-05, + "loss": 2.5646, + "step": 30005 + }, + { + "epoch": 0.8897784894582332, + "grad_norm": 0.07468551397323608, + "learning_rate": 3.0291521083860795e-05, + "loss": 2.5423, + "step": 30006 + }, + { + "epoch": 0.8898081428105448, + "grad_norm": 0.076235331594944, + "learning_rate": 3.0275395196431465e-05, + "loss": 2.549, + "step": 30007 + }, + { + "epoch": 0.8898377961628562, + "grad_norm": 0.08181142061948776, + "learning_rate": 3.0259273468494163e-05, + "loss": 2.5331, + "step": 30008 + }, + { + "epoch": 0.8898674495151677, + "grad_norm": 0.07761064171791077, + "learning_rate": 3.0243155900191667e-05, + "loss": 2.5409, + "step": 30009 + }, + { + "epoch": 0.8898971028674791, + "grad_norm": 0.07307949662208557, + "learning_rate": 3.0227042491666636e-05, + "loss": 2.5426, + "step": 30010 + }, + { + "epoch": 0.8899267562197907, + "grad_norm": 0.07183390855789185, + "learning_rate": 3.021093324306179e-05, + "loss": 2.5305, + "step": 30011 + }, + { + "epoch": 0.8899564095721021, + "grad_norm": 0.07575976848602295, + "learning_rate": 3.0194828154519847e-05, + "loss": 2.55, + "step": 30012 + }, + { + "epoch": 0.8899860629244136, + "grad_norm": 0.07482670247554779, + "learning_rate": 3.0178727226183255e-05, + "loss": 2.5247, + "step": 30013 + }, + { + "epoch": 0.890015716276725, + "grad_norm": 0.06914971768856049, + "learning_rate": 3.0162630458194674e-05, + "loss": 2.5268, + "step": 30014 + }, + { + "epoch": 0.8900453696290366, + "grad_norm": 0.073147714138031, + "learning_rate": 3.0146537850696654e-05, + "loss": 2.5589, + "step": 30015 + }, + { + "epoch": 0.890075022981348, + "grad_norm": 0.0706692710518837, + "learning_rate": 3.0130449403831695e-05, + "loss": 2.5372, + "step": 30016 + }, + { + "epoch": 0.8901046763336595, + "grad_norm": 0.07221128046512604, + "learning_rate": 3.0114365117742237e-05, + "loss": 2.5322, + "step": 30017 + }, + { + "epoch": 0.890134329685971, + "grad_norm": 0.07492519170045853, + "learning_rate": 3.0098284992570777e-05, + "loss": 2.5296, + "step": 30018 + }, + { + "epoch": 0.8901639830382825, + "grad_norm": 0.07587583363056183, + "learning_rate": 3.0082209028459595e-05, + "loss": 2.5413, + "step": 30019 + }, + { + "epoch": 0.890193636390594, + "grad_norm": 0.06819082796573639, + "learning_rate": 3.0066137225551126e-05, + "loss": 2.5541, + "step": 30020 + }, + { + "epoch": 0.8902232897429054, + "grad_norm": 0.07706998288631439, + "learning_rate": 3.005006958398765e-05, + "loss": 2.5387, + "step": 30021 + }, + { + "epoch": 0.890252943095217, + "grad_norm": 0.07596299797296524, + "learning_rate": 3.0034006103911604e-05, + "loss": 2.5291, + "step": 30022 + }, + { + "epoch": 0.8902825964475284, + "grad_norm": 0.07700023800134659, + "learning_rate": 3.0017946785465045e-05, + "loss": 2.552, + "step": 30023 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 0.0705590695142746, + "learning_rate": 3.000189162879019e-05, + "loss": 2.5271, + "step": 30024 + }, + { + "epoch": 0.8903419031521513, + "grad_norm": 0.08076223731040955, + "learning_rate": 2.9985840634029314e-05, + "loss": 2.5179, + "step": 30025 + }, + { + "epoch": 0.8903715565044629, + "grad_norm": 0.07604973763227463, + "learning_rate": 2.996979380132442e-05, + "loss": 2.548, + "step": 30026 + }, + { + "epoch": 0.8904012098567743, + "grad_norm": 0.07328485697507858, + "learning_rate": 2.9953751130817775e-05, + "loss": 2.5646, + "step": 30027 + }, + { + "epoch": 0.8904308632090858, + "grad_norm": 0.07814208418130875, + "learning_rate": 2.993771262265127e-05, + "loss": 2.5287, + "step": 30028 + }, + { + "epoch": 0.8904605165613972, + "grad_norm": 0.07538733631372452, + "learning_rate": 2.9921678276967012e-05, + "loss": 2.5503, + "step": 30029 + }, + { + "epoch": 0.8904901699137088, + "grad_norm": 0.07397247105836868, + "learning_rate": 2.9905648093907e-05, + "loss": 2.577, + "step": 30030 + }, + { + "epoch": 0.8905198232660202, + "grad_norm": 0.07512808591127396, + "learning_rate": 2.9889622073613176e-05, + "loss": 2.5572, + "step": 30031 + }, + { + "epoch": 0.8905494766183317, + "grad_norm": 0.07381610572338104, + "learning_rate": 2.9873600216227425e-05, + "loss": 2.5383, + "step": 30032 + }, + { + "epoch": 0.8905791299706431, + "grad_norm": 0.07978153973817825, + "learning_rate": 2.9857582521891637e-05, + "loss": 2.5528, + "step": 30033 + }, + { + "epoch": 0.8906087833229547, + "grad_norm": 0.0712490826845169, + "learning_rate": 2.984156899074769e-05, + "loss": 2.5476, + "step": 30034 + }, + { + "epoch": 0.8906384366752661, + "grad_norm": 0.07901205122470856, + "learning_rate": 2.9825559622937315e-05, + "loss": 2.5385, + "step": 30035 + }, + { + "epoch": 0.8906680900275776, + "grad_norm": 0.07828942686319351, + "learning_rate": 2.9809554418602335e-05, + "loss": 2.54, + "step": 30036 + }, + { + "epoch": 0.890697743379889, + "grad_norm": 0.07419325411319733, + "learning_rate": 2.979355337788442e-05, + "loss": 2.5186, + "step": 30037 + }, + { + "epoch": 0.8907273967322006, + "grad_norm": 0.07703789323568344, + "learning_rate": 2.977755650092534e-05, + "loss": 2.5339, + "step": 30038 + }, + { + "epoch": 0.890757050084512, + "grad_norm": 0.07985766232013702, + "learning_rate": 2.976156378786671e-05, + "loss": 2.5472, + "step": 30039 + }, + { + "epoch": 0.8907867034368235, + "grad_norm": 0.07295113056898117, + "learning_rate": 2.9745575238850132e-05, + "loss": 2.5309, + "step": 30040 + }, + { + "epoch": 0.8908163567891351, + "grad_norm": 0.07700008153915405, + "learning_rate": 2.972959085401722e-05, + "loss": 2.5231, + "step": 30041 + }, + { + "epoch": 0.8908460101414465, + "grad_norm": 0.07751020044088364, + "learning_rate": 2.971361063350947e-05, + "loss": 2.5749, + "step": 30042 + }, + { + "epoch": 0.890875663493758, + "grad_norm": 0.07516395300626755, + "learning_rate": 2.9697634577468437e-05, + "loss": 2.5507, + "step": 30043 + }, + { + "epoch": 0.8909053168460694, + "grad_norm": 0.07379396259784698, + "learning_rate": 2.968166268603556e-05, + "loss": 2.4728, + "step": 30044 + }, + { + "epoch": 0.890934970198381, + "grad_norm": 0.07290662825107574, + "learning_rate": 2.966569495935234e-05, + "loss": 2.5368, + "step": 30045 + }, + { + "epoch": 0.8909646235506924, + "grad_norm": 0.0753018781542778, + "learning_rate": 2.9649731397560108e-05, + "loss": 2.5028, + "step": 30046 + }, + { + "epoch": 0.8909942769030039, + "grad_norm": 0.07125040888786316, + "learning_rate": 2.9633772000800196e-05, + "loss": 2.5774, + "step": 30047 + }, + { + "epoch": 0.8910239302553153, + "grad_norm": 0.07169543206691742, + "learning_rate": 2.961781676921399e-05, + "loss": 2.4951, + "step": 30048 + }, + { + "epoch": 0.8910535836076269, + "grad_norm": 0.07579679787158966, + "learning_rate": 2.9601865702942766e-05, + "loss": 2.548, + "step": 30049 + }, + { + "epoch": 0.8910832369599383, + "grad_norm": 0.07527218759059906, + "learning_rate": 2.9585918802127743e-05, + "loss": 2.508, + "step": 30050 + }, + { + "epoch": 0.8911128903122498, + "grad_norm": 0.07554083317518234, + "learning_rate": 2.95699760669102e-05, + "loss": 2.5371, + "step": 30051 + }, + { + "epoch": 0.8911425436645612, + "grad_norm": 0.07889959961175919, + "learning_rate": 2.955403749743124e-05, + "loss": 2.5552, + "step": 30052 + }, + { + "epoch": 0.8911721970168728, + "grad_norm": 0.07804732024669647, + "learning_rate": 2.9538103093832036e-05, + "loss": 2.5642, + "step": 30053 + }, + { + "epoch": 0.8912018503691842, + "grad_norm": 0.07204075902700424, + "learning_rate": 2.9522172856253636e-05, + "loss": 2.5521, + "step": 30054 + }, + { + "epoch": 0.8912315037214957, + "grad_norm": 0.0794050320982933, + "learning_rate": 2.950624678483721e-05, + "loss": 2.5153, + "step": 30055 + }, + { + "epoch": 0.8912611570738072, + "grad_norm": 0.07597007602453232, + "learning_rate": 2.9490324879723808e-05, + "loss": 2.5036, + "step": 30056 + }, + { + "epoch": 0.8912908104261187, + "grad_norm": 0.07582367211580276, + "learning_rate": 2.9474407141054204e-05, + "loss": 2.5369, + "step": 30057 + }, + { + "epoch": 0.8913204637784301, + "grad_norm": 0.07758879661560059, + "learning_rate": 2.9458493568969514e-05, + "loss": 2.5165, + "step": 30058 + }, + { + "epoch": 0.8913501171307416, + "grad_norm": 0.07875680923461914, + "learning_rate": 2.944258416361062e-05, + "loss": 2.5341, + "step": 30059 + }, + { + "epoch": 0.8913797704830531, + "grad_norm": 0.07423724979162216, + "learning_rate": 2.942667892511841e-05, + "loss": 2.5336, + "step": 30060 + }, + { + "epoch": 0.8914094238353646, + "grad_norm": 0.07784005254507065, + "learning_rate": 2.9410777853633773e-05, + "loss": 2.5183, + "step": 30061 + }, + { + "epoch": 0.8914390771876761, + "grad_norm": 0.08065057545900345, + "learning_rate": 2.939488094929743e-05, + "loss": 2.5326, + "step": 30062 + }, + { + "epoch": 0.8914687305399875, + "grad_norm": 0.07480233907699585, + "learning_rate": 2.9378988212250212e-05, + "loss": 2.569, + "step": 30063 + }, + { + "epoch": 0.8914983838922991, + "grad_norm": 0.07284210622310638, + "learning_rate": 2.9363099642632897e-05, + "loss": 2.546, + "step": 30064 + }, + { + "epoch": 0.8915280372446105, + "grad_norm": 0.07246958464384079, + "learning_rate": 2.9347215240586034e-05, + "loss": 2.53, + "step": 30065 + }, + { + "epoch": 0.891557690596922, + "grad_norm": 0.0832914486527443, + "learning_rate": 2.9331335006250402e-05, + "loss": 2.5488, + "step": 30066 + }, + { + "epoch": 0.8915873439492334, + "grad_norm": 0.07516583055257797, + "learning_rate": 2.931545893976667e-05, + "loss": 2.5222, + "step": 30067 + }, + { + "epoch": 0.891616997301545, + "grad_norm": 0.07146590203046799, + "learning_rate": 2.9299587041275276e-05, + "loss": 2.5555, + "step": 30068 + }, + { + "epoch": 0.8916466506538564, + "grad_norm": 0.07734881341457367, + "learning_rate": 2.928371931091678e-05, + "loss": 2.5465, + "step": 30069 + }, + { + "epoch": 0.8916763040061679, + "grad_norm": 0.08109933137893677, + "learning_rate": 2.9267855748831784e-05, + "loss": 2.5001, + "step": 30070 + }, + { + "epoch": 0.8917059573584794, + "grad_norm": 0.07433250546455383, + "learning_rate": 2.9251996355160738e-05, + "loss": 2.5254, + "step": 30071 + }, + { + "epoch": 0.8917356107107909, + "grad_norm": 0.07418026775121689, + "learning_rate": 2.923614113004397e-05, + "loss": 2.5648, + "step": 30072 + }, + { + "epoch": 0.8917652640631023, + "grad_norm": 0.0824226588010788, + "learning_rate": 2.9220290073622035e-05, + "loss": 2.5268, + "step": 30073 + }, + { + "epoch": 0.8917949174154138, + "grad_norm": 0.07786493003368378, + "learning_rate": 2.9204443186035268e-05, + "loss": 2.5557, + "step": 30074 + }, + { + "epoch": 0.8918245707677253, + "grad_norm": 0.07358124852180481, + "learning_rate": 2.9188600467424e-05, + "loss": 2.5673, + "step": 30075 + }, + { + "epoch": 0.8918542241200368, + "grad_norm": 0.07526638358831406, + "learning_rate": 2.917276191792845e-05, + "loss": 2.5576, + "step": 30076 + }, + { + "epoch": 0.8918838774723482, + "grad_norm": 0.07225547730922699, + "learning_rate": 2.915692753768895e-05, + "loss": 2.5599, + "step": 30077 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 0.07419267296791077, + "learning_rate": 2.9141097326845667e-05, + "loss": 2.5305, + "step": 30078 + }, + { + "epoch": 0.8919431841769712, + "grad_norm": 0.0767284408211708, + "learning_rate": 2.912527128553877e-05, + "loss": 2.5271, + "step": 30079 + }, + { + "epoch": 0.8919728375292827, + "grad_norm": 0.0730796828866005, + "learning_rate": 2.9109449413908416e-05, + "loss": 2.5356, + "step": 30080 + }, + { + "epoch": 0.8920024908815941, + "grad_norm": 0.07166402786970139, + "learning_rate": 2.909363171209467e-05, + "loss": 2.5588, + "step": 30081 + }, + { + "epoch": 0.8920321442339056, + "grad_norm": 0.07248982787132263, + "learning_rate": 2.9077818180237692e-05, + "loss": 2.5527, + "step": 30082 + }, + { + "epoch": 0.8920617975862172, + "grad_norm": 0.07565101236104965, + "learning_rate": 2.906200881847748e-05, + "loss": 2.4955, + "step": 30083 + }, + { + "epoch": 0.8920914509385286, + "grad_norm": 0.07703426480293274, + "learning_rate": 2.9046203626953982e-05, + "loss": 2.5335, + "step": 30084 + }, + { + "epoch": 0.8921211042908401, + "grad_norm": 0.07954263687133789, + "learning_rate": 2.9030402605807137e-05, + "loss": 2.5594, + "step": 30085 + }, + { + "epoch": 0.8921507576431515, + "grad_norm": 0.07229618728160858, + "learning_rate": 2.901460575517695e-05, + "loss": 2.5705, + "step": 30086 + }, + { + "epoch": 0.8921804109954631, + "grad_norm": 0.0711289495229721, + "learning_rate": 2.89988130752033e-05, + "loss": 2.5227, + "step": 30087 + }, + { + "epoch": 0.8922100643477745, + "grad_norm": 0.07216954231262207, + "learning_rate": 2.8983024566026083e-05, + "loss": 2.5098, + "step": 30088 + }, + { + "epoch": 0.892239717700086, + "grad_norm": 0.0732196718454361, + "learning_rate": 2.8967240227784963e-05, + "loss": 2.5129, + "step": 30089 + }, + { + "epoch": 0.8922693710523975, + "grad_norm": 0.07852528989315033, + "learning_rate": 2.8951460060619827e-05, + "loss": 2.5757, + "step": 30090 + }, + { + "epoch": 0.892299024404709, + "grad_norm": 0.078302763402462, + "learning_rate": 2.8935684064670286e-05, + "loss": 2.5474, + "step": 30091 + }, + { + "epoch": 0.8923286777570204, + "grad_norm": 0.07433725893497467, + "learning_rate": 2.8919912240076175e-05, + "loss": 2.5705, + "step": 30092 + }, + { + "epoch": 0.8923583311093319, + "grad_norm": 0.07372644543647766, + "learning_rate": 2.8904144586977045e-05, + "loss": 2.4986, + "step": 30093 + }, + { + "epoch": 0.8923879844616434, + "grad_norm": 0.069255530834198, + "learning_rate": 2.8888381105512618e-05, + "loss": 2.5503, + "step": 30094 + }, + { + "epoch": 0.8924176378139549, + "grad_norm": 0.07808942347764969, + "learning_rate": 2.8872621795822453e-05, + "loss": 2.5456, + "step": 30095 + }, + { + "epoch": 0.8924472911662663, + "grad_norm": 0.0736376941204071, + "learning_rate": 2.88568666580461e-05, + "loss": 2.5508, + "step": 30096 + }, + { + "epoch": 0.8924769445185778, + "grad_norm": 0.06694342941045761, + "learning_rate": 2.8841115692323005e-05, + "loss": 2.5387, + "step": 30097 + }, + { + "epoch": 0.8925065978708893, + "grad_norm": 0.07153527438640594, + "learning_rate": 2.8825368898792724e-05, + "loss": 2.5427, + "step": 30098 + }, + { + "epoch": 0.8925362512232008, + "grad_norm": 0.07464627921581268, + "learning_rate": 2.8809626277594703e-05, + "loss": 2.4956, + "step": 30099 + }, + { + "epoch": 0.8925659045755122, + "grad_norm": 0.07160472869873047, + "learning_rate": 2.8793887828868326e-05, + "loss": 2.5568, + "step": 30100 + }, + { + "epoch": 0.8925955579278237, + "grad_norm": 0.0712042972445488, + "learning_rate": 2.877815355275293e-05, + "loss": 2.5551, + "step": 30101 + }, + { + "epoch": 0.8926252112801352, + "grad_norm": 0.07548777759075165, + "learning_rate": 2.8762423449387842e-05, + "loss": 2.5553, + "step": 30102 + }, + { + "epoch": 0.8926548646324467, + "grad_norm": 0.07267913967370987, + "learning_rate": 2.8746697518912403e-05, + "loss": 2.5527, + "step": 30103 + }, + { + "epoch": 0.8926845179847582, + "grad_norm": 0.0722365528345108, + "learning_rate": 2.8730975761465884e-05, + "loss": 2.5083, + "step": 30104 + }, + { + "epoch": 0.8927141713370697, + "grad_norm": 0.07539147883653641, + "learning_rate": 2.8715258177187452e-05, + "loss": 2.5445, + "step": 30105 + }, + { + "epoch": 0.8927438246893812, + "grad_norm": 0.07534211874008179, + "learning_rate": 2.869954476621628e-05, + "loss": 2.5197, + "step": 30106 + }, + { + "epoch": 0.8927734780416926, + "grad_norm": 0.07148130238056183, + "learning_rate": 2.8683835528691525e-05, + "loss": 2.5569, + "step": 30107 + }, + { + "epoch": 0.8928031313940041, + "grad_norm": 0.07031605392694473, + "learning_rate": 2.8668130464752307e-05, + "loss": 2.5656, + "step": 30108 + }, + { + "epoch": 0.8928327847463156, + "grad_norm": 0.0713878870010376, + "learning_rate": 2.865242957453773e-05, + "loss": 2.5681, + "step": 30109 + }, + { + "epoch": 0.8928624380986271, + "grad_norm": 0.07362000644207001, + "learning_rate": 2.86367328581868e-05, + "loss": 2.5682, + "step": 30110 + }, + { + "epoch": 0.8928920914509385, + "grad_norm": 0.06808815151453018, + "learning_rate": 2.862104031583851e-05, + "loss": 2.496, + "step": 30111 + }, + { + "epoch": 0.89292174480325, + "grad_norm": 0.0715111717581749, + "learning_rate": 2.8605351947631752e-05, + "loss": 2.5317, + "step": 30112 + }, + { + "epoch": 0.8929513981555615, + "grad_norm": 0.07033425569534302, + "learning_rate": 2.8589667753705585e-05, + "loss": 2.518, + "step": 30113 + }, + { + "epoch": 0.892981051507873, + "grad_norm": 0.07507331669330597, + "learning_rate": 2.8573987734198837e-05, + "loss": 2.5472, + "step": 30114 + }, + { + "epoch": 0.8930107048601844, + "grad_norm": 0.07233088463544846, + "learning_rate": 2.8558311889250286e-05, + "loss": 2.5398, + "step": 30115 + }, + { + "epoch": 0.8930403582124959, + "grad_norm": 0.07329407334327698, + "learning_rate": 2.8542640218998826e-05, + "loss": 2.592, + "step": 30116 + }, + { + "epoch": 0.8930700115648074, + "grad_norm": 0.07426594942808151, + "learning_rate": 2.8526972723583235e-05, + "loss": 2.5789, + "step": 30117 + }, + { + "epoch": 0.8930996649171189, + "grad_norm": 0.0771438404917717, + "learning_rate": 2.8511309403142228e-05, + "loss": 2.531, + "step": 30118 + }, + { + "epoch": 0.8931293182694303, + "grad_norm": 0.074802465736866, + "learning_rate": 2.8495650257814476e-05, + "loss": 2.5584, + "step": 30119 + }, + { + "epoch": 0.8931589716217418, + "grad_norm": 0.07323870062828064, + "learning_rate": 2.8479995287738703e-05, + "loss": 2.55, + "step": 30120 + }, + { + "epoch": 0.8931886249740533, + "grad_norm": 0.07362119853496552, + "learning_rate": 2.8464344493053462e-05, + "loss": 2.5425, + "step": 30121 + }, + { + "epoch": 0.8932182783263648, + "grad_norm": 0.07812879979610443, + "learning_rate": 2.844869787389753e-05, + "loss": 2.5333, + "step": 30122 + }, + { + "epoch": 0.8932479316786762, + "grad_norm": 0.07706394791603088, + "learning_rate": 2.8433055430409195e-05, + "loss": 2.4964, + "step": 30123 + }, + { + "epoch": 0.8932775850309878, + "grad_norm": 0.07281674444675446, + "learning_rate": 2.8417417162727054e-05, + "loss": 2.5583, + "step": 30124 + }, + { + "epoch": 0.8933072383832993, + "grad_norm": 0.07388933002948761, + "learning_rate": 2.8401783070989672e-05, + "loss": 2.5149, + "step": 30125 + }, + { + "epoch": 0.8933368917356107, + "grad_norm": 0.06942712515592575, + "learning_rate": 2.8386153155335493e-05, + "loss": 2.54, + "step": 30126 + }, + { + "epoch": 0.8933665450879222, + "grad_norm": 0.07617859542369843, + "learning_rate": 2.8370527415902846e-05, + "loss": 2.5235, + "step": 30127 + }, + { + "epoch": 0.8933961984402337, + "grad_norm": 0.07453867048025131, + "learning_rate": 2.8354905852830128e-05, + "loss": 2.527, + "step": 30128 + }, + { + "epoch": 0.8934258517925452, + "grad_norm": 0.07631126046180725, + "learning_rate": 2.8339288466255664e-05, + "loss": 2.5417, + "step": 30129 + }, + { + "epoch": 0.8934555051448566, + "grad_norm": 0.07361653447151184, + "learning_rate": 2.8323675256317794e-05, + "loss": 2.5754, + "step": 30130 + }, + { + "epoch": 0.8934851584971681, + "grad_norm": 0.07294978946447372, + "learning_rate": 2.8308066223154738e-05, + "loss": 2.5573, + "step": 30131 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 0.07533308863639832, + "learning_rate": 2.8292461366904776e-05, + "loss": 2.5293, + "step": 30132 + }, + { + "epoch": 0.8935444652017911, + "grad_norm": 0.07016132771968842, + "learning_rate": 2.8276860687705963e-05, + "loss": 2.5255, + "step": 30133 + }, + { + "epoch": 0.8935741185541025, + "grad_norm": 0.07626283913850784, + "learning_rate": 2.8261264185696522e-05, + "loss": 2.5592, + "step": 30134 + }, + { + "epoch": 0.893603771906414, + "grad_norm": 0.07923639565706253, + "learning_rate": 2.8245671861014565e-05, + "loss": 2.5427, + "step": 30135 + }, + { + "epoch": 0.8936334252587255, + "grad_norm": 0.0718386247754097, + "learning_rate": 2.8230083713798093e-05, + "loss": 2.5013, + "step": 30136 + }, + { + "epoch": 0.893663078611037, + "grad_norm": 0.0746425911784172, + "learning_rate": 2.8214499744185275e-05, + "loss": 2.5411, + "step": 30137 + }, + { + "epoch": 0.8936927319633484, + "grad_norm": 0.07563794404268265, + "learning_rate": 2.819891995231405e-05, + "loss": 2.5067, + "step": 30138 + }, + { + "epoch": 0.89372238531566, + "grad_norm": 0.07677079737186432, + "learning_rate": 2.818334433832237e-05, + "loss": 2.5285, + "step": 30139 + }, + { + "epoch": 0.8937520386679714, + "grad_norm": 0.07359777390956879, + "learning_rate": 2.8167772902348178e-05, + "loss": 2.4982, + "step": 30140 + }, + { + "epoch": 0.8937816920202829, + "grad_norm": 0.0683080330491066, + "learning_rate": 2.8152205644529306e-05, + "loss": 2.4955, + "step": 30141 + }, + { + "epoch": 0.8938113453725943, + "grad_norm": 0.07371225208044052, + "learning_rate": 2.8136642565003647e-05, + "loss": 2.4839, + "step": 30142 + }, + { + "epoch": 0.8938409987249059, + "grad_norm": 0.07467349618673325, + "learning_rate": 2.812108366390914e-05, + "loss": 2.5527, + "step": 30143 + }, + { + "epoch": 0.8938706520772173, + "grad_norm": 0.07204500585794449, + "learning_rate": 2.8105528941383297e-05, + "loss": 2.5655, + "step": 30144 + }, + { + "epoch": 0.8939003054295288, + "grad_norm": 0.07803722470998764, + "learning_rate": 2.8089978397564052e-05, + "loss": 2.4855, + "step": 30145 + }, + { + "epoch": 0.8939299587818403, + "grad_norm": 0.07830265909433365, + "learning_rate": 2.8074432032589024e-05, + "loss": 2.508, + "step": 30146 + }, + { + "epoch": 0.8939596121341518, + "grad_norm": 0.0731978565454483, + "learning_rate": 2.805888984659588e-05, + "loss": 2.5299, + "step": 30147 + }, + { + "epoch": 0.8939892654864633, + "grad_norm": 0.07353968918323517, + "learning_rate": 2.8043351839722286e-05, + "loss": 2.57, + "step": 30148 + }, + { + "epoch": 0.8940189188387747, + "grad_norm": 0.07152676582336426, + "learning_rate": 2.8027818012105743e-05, + "loss": 2.5787, + "step": 30149 + }, + { + "epoch": 0.8940485721910862, + "grad_norm": 0.07303866744041443, + "learning_rate": 2.8012288363883975e-05, + "loss": 2.5248, + "step": 30150 + }, + { + "epoch": 0.8940782255433977, + "grad_norm": 0.07504158467054367, + "learning_rate": 2.7996762895194426e-05, + "loss": 2.5397, + "step": 30151 + }, + { + "epoch": 0.8941078788957092, + "grad_norm": 0.06940162181854248, + "learning_rate": 2.7981241606174546e-05, + "loss": 2.5677, + "step": 30152 + }, + { + "epoch": 0.8941375322480206, + "grad_norm": 0.07152517884969711, + "learning_rate": 2.7965724496961885e-05, + "loss": 2.518, + "step": 30153 + }, + { + "epoch": 0.8941671856003321, + "grad_norm": 0.07281927764415741, + "learning_rate": 2.7950211567693616e-05, + "loss": 2.5163, + "step": 30154 + }, + { + "epoch": 0.8941968389526436, + "grad_norm": 0.07217645645141602, + "learning_rate": 2.7934702818507298e-05, + "loss": 2.561, + "step": 30155 + }, + { + "epoch": 0.8942264923049551, + "grad_norm": 0.07362744212150574, + "learning_rate": 2.7919198249540202e-05, + "loss": 2.5418, + "step": 30156 + }, + { + "epoch": 0.8942561456572665, + "grad_norm": 0.07218046486377716, + "learning_rate": 2.7903697860929665e-05, + "loss": 2.5912, + "step": 30157 + }, + { + "epoch": 0.894285799009578, + "grad_norm": 0.07896211743354797, + "learning_rate": 2.788820165281292e-05, + "loss": 2.5235, + "step": 30158 + }, + { + "epoch": 0.8943154523618895, + "grad_norm": 0.07518983632326126, + "learning_rate": 2.7872709625327175e-05, + "loss": 2.5365, + "step": 30159 + }, + { + "epoch": 0.894345105714201, + "grad_norm": 0.07862520962953568, + "learning_rate": 2.7857221778609608e-05, + "loss": 2.549, + "step": 30160 + }, + { + "epoch": 0.8943747590665124, + "grad_norm": 0.07898823171854019, + "learning_rate": 2.7841738112797387e-05, + "loss": 2.5378, + "step": 30161 + }, + { + "epoch": 0.894404412418824, + "grad_norm": 0.07653264701366425, + "learning_rate": 2.7826258628027513e-05, + "loss": 2.5327, + "step": 30162 + }, + { + "epoch": 0.8944340657711354, + "grad_norm": 0.06759928911924362, + "learning_rate": 2.7810783324437317e-05, + "loss": 2.5452, + "step": 30163 + }, + { + "epoch": 0.8944637191234469, + "grad_norm": 0.07811828702688217, + "learning_rate": 2.7795312202163692e-05, + "loss": 2.546, + "step": 30164 + }, + { + "epoch": 0.8944933724757583, + "grad_norm": 0.0771758109331131, + "learning_rate": 2.7779845261343584e-05, + "loss": 2.5328, + "step": 30165 + }, + { + "epoch": 0.8945230258280699, + "grad_norm": 0.07188354432582855, + "learning_rate": 2.7764382502113994e-05, + "loss": 2.5579, + "step": 30166 + }, + { + "epoch": 0.8945526791803814, + "grad_norm": 0.06993920356035233, + "learning_rate": 2.774892392461187e-05, + "loss": 2.5731, + "step": 30167 + }, + { + "epoch": 0.8945823325326928, + "grad_norm": 0.08652786165475845, + "learning_rate": 2.7733469528974098e-05, + "loss": 2.5779, + "step": 30168 + }, + { + "epoch": 0.8946119858850043, + "grad_norm": 0.07191470265388489, + "learning_rate": 2.771801931533752e-05, + "loss": 2.5554, + "step": 30169 + }, + { + "epoch": 0.8946416392373158, + "grad_norm": 0.07463255524635315, + "learning_rate": 2.7702573283838905e-05, + "loss": 2.513, + "step": 30170 + }, + { + "epoch": 0.8946712925896273, + "grad_norm": 0.07735727727413177, + "learning_rate": 2.7687131434615098e-05, + "loss": 2.586, + "step": 30171 + }, + { + "epoch": 0.8947009459419387, + "grad_norm": 0.07545322179794312, + "learning_rate": 2.767169376780282e-05, + "loss": 2.5382, + "step": 30172 + }, + { + "epoch": 0.8947305992942503, + "grad_norm": 0.07129573822021484, + "learning_rate": 2.7656260283538738e-05, + "loss": 2.5605, + "step": 30173 + }, + { + "epoch": 0.8947602526465617, + "grad_norm": 0.0743725374341011, + "learning_rate": 2.7640830981959573e-05, + "loss": 2.5121, + "step": 30174 + }, + { + "epoch": 0.8947899059988732, + "grad_norm": 0.07297176122665405, + "learning_rate": 2.762540586320189e-05, + "loss": 2.5395, + "step": 30175 + }, + { + "epoch": 0.8948195593511846, + "grad_norm": 0.07386283576488495, + "learning_rate": 2.7609984927402355e-05, + "loss": 2.5356, + "step": 30176 + }, + { + "epoch": 0.8948492127034962, + "grad_norm": 0.07371095567941666, + "learning_rate": 2.759456817469752e-05, + "loss": 2.556, + "step": 30177 + }, + { + "epoch": 0.8948788660558076, + "grad_norm": 0.07090918719768524, + "learning_rate": 2.7579155605223837e-05, + "loss": 2.5445, + "step": 30178 + }, + { + "epoch": 0.8949085194081191, + "grad_norm": 0.07926133275032043, + "learning_rate": 2.7563747219117808e-05, + "loss": 2.5486, + "step": 30179 + }, + { + "epoch": 0.8949381727604305, + "grad_norm": 0.07271425426006317, + "learning_rate": 2.7548343016515932e-05, + "loss": 2.5556, + "step": 30180 + }, + { + "epoch": 0.8949678261127421, + "grad_norm": 0.0724552720785141, + "learning_rate": 2.753294299755449e-05, + "loss": 2.5541, + "step": 30181 + }, + { + "epoch": 0.8949974794650535, + "grad_norm": 0.07894579321146011, + "learning_rate": 2.7517547162370037e-05, + "loss": 2.5599, + "step": 30182 + }, + { + "epoch": 0.895027132817365, + "grad_norm": 0.06934860348701477, + "learning_rate": 2.7502155511098748e-05, + "loss": 2.5308, + "step": 30183 + }, + { + "epoch": 0.8950567861696764, + "grad_norm": 0.07344450801610947, + "learning_rate": 2.7486768043876952e-05, + "loss": 2.5108, + "step": 30184 + }, + { + "epoch": 0.895086439521988, + "grad_norm": 0.07078173756599426, + "learning_rate": 2.747138476084099e-05, + "loss": 2.5194, + "step": 30185 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 0.06938808411359787, + "learning_rate": 2.745600566212697e-05, + "loss": 2.544, + "step": 30186 + }, + { + "epoch": 0.8951457462266109, + "grad_norm": 0.07413962483406067, + "learning_rate": 2.7440630747871174e-05, + "loss": 2.4788, + "step": 30187 + }, + { + "epoch": 0.8951753995789224, + "grad_norm": 0.07049228996038437, + "learning_rate": 2.7425260018209718e-05, + "loss": 2.5387, + "step": 30188 + }, + { + "epoch": 0.8952050529312339, + "grad_norm": 0.07429495453834534, + "learning_rate": 2.7409893473278657e-05, + "loss": 2.5495, + "step": 30189 + }, + { + "epoch": 0.8952347062835454, + "grad_norm": 0.07319371402263641, + "learning_rate": 2.7394531113214103e-05, + "loss": 2.557, + "step": 30190 + }, + { + "epoch": 0.8952643596358568, + "grad_norm": 0.07608701288700104, + "learning_rate": 2.737917293815212e-05, + "loss": 2.5387, + "step": 30191 + }, + { + "epoch": 0.8952940129881684, + "grad_norm": 0.07153859734535217, + "learning_rate": 2.73638189482287e-05, + "loss": 2.5754, + "step": 30192 + }, + { + "epoch": 0.8953236663404798, + "grad_norm": 0.07296441495418549, + "learning_rate": 2.7348469143579802e-05, + "loss": 2.5699, + "step": 30193 + }, + { + "epoch": 0.8953533196927913, + "grad_norm": 0.07102972269058228, + "learning_rate": 2.7333123524341306e-05, + "loss": 2.571, + "step": 30194 + }, + { + "epoch": 0.8953829730451027, + "grad_norm": 0.06967300921678543, + "learning_rate": 2.7317782090649112e-05, + "loss": 2.5058, + "step": 30195 + }, + { + "epoch": 0.8954126263974143, + "grad_norm": 0.07025858014822006, + "learning_rate": 2.7302444842639162e-05, + "loss": 2.5366, + "step": 30196 + }, + { + "epoch": 0.8954422797497257, + "grad_norm": 0.07219535112380981, + "learning_rate": 2.7287111780447127e-05, + "loss": 2.5675, + "step": 30197 + }, + { + "epoch": 0.8954719331020372, + "grad_norm": 0.07299042493104935, + "learning_rate": 2.727178290420895e-05, + "loss": 2.5375, + "step": 30198 + }, + { + "epoch": 0.8955015864543486, + "grad_norm": 0.07094898074865341, + "learning_rate": 2.7256458214060253e-05, + "loss": 2.5551, + "step": 30199 + }, + { + "epoch": 0.8955312398066602, + "grad_norm": 0.06944924592971802, + "learning_rate": 2.72411377101367e-05, + "loss": 2.5481, + "step": 30200 + }, + { + "epoch": 0.8955608931589716, + "grad_norm": 0.07009099423885345, + "learning_rate": 2.722582139257401e-05, + "loss": 2.4973, + "step": 30201 + }, + { + "epoch": 0.8955905465112831, + "grad_norm": 0.0708351582288742, + "learning_rate": 2.7210509261507864e-05, + "loss": 2.566, + "step": 30202 + }, + { + "epoch": 0.8956201998635945, + "grad_norm": 0.07067257910966873, + "learning_rate": 2.719520131707376e-05, + "loss": 2.5355, + "step": 30203 + }, + { + "epoch": 0.8956498532159061, + "grad_norm": 0.07232405990362167, + "learning_rate": 2.7179897559407364e-05, + "loss": 2.5131, + "step": 30204 + }, + { + "epoch": 0.8956795065682175, + "grad_norm": 0.07664341479539871, + "learning_rate": 2.7164597988644123e-05, + "loss": 2.5775, + "step": 30205 + }, + { + "epoch": 0.895709159920529, + "grad_norm": 0.06860984116792679, + "learning_rate": 2.7149302604919547e-05, + "loss": 2.54, + "step": 30206 + }, + { + "epoch": 0.8957388132728404, + "grad_norm": 0.07228116691112518, + "learning_rate": 2.713401140836902e-05, + "loss": 2.5553, + "step": 30207 + }, + { + "epoch": 0.895768466625152, + "grad_norm": 0.07827487587928772, + "learning_rate": 2.7118724399128102e-05, + "loss": 2.557, + "step": 30208 + }, + { + "epoch": 0.8957981199774635, + "grad_norm": 0.07227180898189545, + "learning_rate": 2.7103441577331966e-05, + "loss": 2.5458, + "step": 30209 + }, + { + "epoch": 0.8958277733297749, + "grad_norm": 0.06840616464614868, + "learning_rate": 2.7088162943116003e-05, + "loss": 2.5339, + "step": 30210 + }, + { + "epoch": 0.8958574266820865, + "grad_norm": 0.07403693348169327, + "learning_rate": 2.7072888496615546e-05, + "loss": 2.5219, + "step": 30211 + }, + { + "epoch": 0.8958870800343979, + "grad_norm": 0.07330397516489029, + "learning_rate": 2.7057618237965818e-05, + "loss": 2.5045, + "step": 30212 + }, + { + "epoch": 0.8959167333867094, + "grad_norm": 0.07069236785173416, + "learning_rate": 2.7042352167302108e-05, + "loss": 2.5638, + "step": 30213 + }, + { + "epoch": 0.8959463867390208, + "grad_norm": 0.0756545290350914, + "learning_rate": 2.7027090284759416e-05, + "loss": 2.5582, + "step": 30214 + }, + { + "epoch": 0.8959760400913324, + "grad_norm": 0.07688368111848831, + "learning_rate": 2.7011832590473127e-05, + "loss": 2.5443, + "step": 30215 + }, + { + "epoch": 0.8960056934436438, + "grad_norm": 0.07309505343437195, + "learning_rate": 2.6996579084578253e-05, + "loss": 2.5459, + "step": 30216 + }, + { + "epoch": 0.8960353467959553, + "grad_norm": 0.07271523028612137, + "learning_rate": 2.6981329767209905e-05, + "loss": 2.551, + "step": 30217 + }, + { + "epoch": 0.8960650001482667, + "grad_norm": 0.07649430632591248, + "learning_rate": 2.696608463850303e-05, + "loss": 2.5581, + "step": 30218 + }, + { + "epoch": 0.8960946535005783, + "grad_norm": 0.0713217705488205, + "learning_rate": 2.695084369859274e-05, + "loss": 2.5796, + "step": 30219 + }, + { + "epoch": 0.8961243068528897, + "grad_norm": 0.0724983736872673, + "learning_rate": 2.6935606947613824e-05, + "loss": 2.5023, + "step": 30220 + }, + { + "epoch": 0.8961539602052012, + "grad_norm": 0.07814544439315796, + "learning_rate": 2.692037438570133e-05, + "loss": 2.5175, + "step": 30221 + }, + { + "epoch": 0.8961836135575126, + "grad_norm": 0.07175140082836151, + "learning_rate": 2.6905146012990155e-05, + "loss": 2.5059, + "step": 30222 + }, + { + "epoch": 0.8962132669098242, + "grad_norm": 0.07719767093658447, + "learning_rate": 2.6889921829615082e-05, + "loss": 2.5494, + "step": 30223 + }, + { + "epoch": 0.8962429202621356, + "grad_norm": 0.07278616726398468, + "learning_rate": 2.6874701835711e-05, + "loss": 2.5384, + "step": 30224 + }, + { + "epoch": 0.8962725736144471, + "grad_norm": 0.07568282634019852, + "learning_rate": 2.6859486031412638e-05, + "loss": 2.5039, + "step": 30225 + }, + { + "epoch": 0.8963022269667585, + "grad_norm": 0.07588443905115128, + "learning_rate": 2.684427441685472e-05, + "loss": 2.4918, + "step": 30226 + }, + { + "epoch": 0.8963318803190701, + "grad_norm": 0.07329721748828888, + "learning_rate": 2.6829066992171857e-05, + "loss": 2.5306, + "step": 30227 + }, + { + "epoch": 0.8963615336713816, + "grad_norm": 0.07415378093719482, + "learning_rate": 2.6813863757498945e-05, + "loss": 2.5152, + "step": 30228 + }, + { + "epoch": 0.896391187023693, + "grad_norm": 0.07967648655176163, + "learning_rate": 2.6798664712970545e-05, + "loss": 2.5603, + "step": 30229 + }, + { + "epoch": 0.8964208403760046, + "grad_norm": 0.07439403235912323, + "learning_rate": 2.6783469858721155e-05, + "loss": 2.5308, + "step": 30230 + }, + { + "epoch": 0.896450493728316, + "grad_norm": 0.07633844763040543, + "learning_rate": 2.6768279194885335e-05, + "loss": 2.5348, + "step": 30231 + }, + { + "epoch": 0.8964801470806275, + "grad_norm": 0.0733444020152092, + "learning_rate": 2.675309272159765e-05, + "loss": 2.5437, + "step": 30232 + }, + { + "epoch": 0.8965098004329389, + "grad_norm": 0.0715842992067337, + "learning_rate": 2.6737910438992542e-05, + "loss": 2.5872, + "step": 30233 + }, + { + "epoch": 0.8965394537852505, + "grad_norm": 0.06781536340713501, + "learning_rate": 2.6722732347204516e-05, + "loss": 2.5179, + "step": 30234 + }, + { + "epoch": 0.8965691071375619, + "grad_norm": 0.07585577666759491, + "learning_rate": 2.6707558446367854e-05, + "loss": 2.5364, + "step": 30235 + }, + { + "epoch": 0.8965987604898734, + "grad_norm": 0.0729549303650856, + "learning_rate": 2.669238873661706e-05, + "loss": 2.5719, + "step": 30236 + }, + { + "epoch": 0.8966284138421848, + "grad_norm": 0.07507319748401642, + "learning_rate": 2.6677223218086412e-05, + "loss": 2.579, + "step": 30237 + }, + { + "epoch": 0.8966580671944964, + "grad_norm": 0.07483135163784027, + "learning_rate": 2.6662061890910138e-05, + "loss": 2.5315, + "step": 30238 + }, + { + "epoch": 0.8966877205468078, + "grad_norm": 0.0734778493642807, + "learning_rate": 2.664690475522258e-05, + "loss": 2.5156, + "step": 30239 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 0.07729408144950867, + "learning_rate": 2.6631751811157955e-05, + "loss": 2.523, + "step": 30240 + }, + { + "epoch": 0.8967470272514307, + "grad_norm": 0.07641837000846863, + "learning_rate": 2.661660305885044e-05, + "loss": 2.5292, + "step": 30241 + }, + { + "epoch": 0.8967766806037423, + "grad_norm": 0.07982844114303589, + "learning_rate": 2.6601458498434096e-05, + "loss": 2.5408, + "step": 30242 + }, + { + "epoch": 0.8968063339560537, + "grad_norm": 0.0790436863899231, + "learning_rate": 2.658631813004314e-05, + "loss": 2.5518, + "step": 30243 + }, + { + "epoch": 0.8968359873083652, + "grad_norm": 0.07124307751655579, + "learning_rate": 2.6571181953811586e-05, + "loss": 2.5816, + "step": 30244 + }, + { + "epoch": 0.8968656406606766, + "grad_norm": 0.07333773374557495, + "learning_rate": 2.6556049969873486e-05, + "loss": 2.55, + "step": 30245 + }, + { + "epoch": 0.8968952940129882, + "grad_norm": 0.07712151110172272, + "learning_rate": 2.6540922178362793e-05, + "loss": 2.5654, + "step": 30246 + }, + { + "epoch": 0.8969249473652996, + "grad_norm": 0.078435979783535, + "learning_rate": 2.6525798579413508e-05, + "loss": 2.5552, + "step": 30247 + }, + { + "epoch": 0.8969546007176111, + "grad_norm": 0.06831381469964981, + "learning_rate": 2.651067917315958e-05, + "loss": 2.5153, + "step": 30248 + }, + { + "epoch": 0.8969842540699227, + "grad_norm": 0.08071494102478027, + "learning_rate": 2.6495563959734848e-05, + "loss": 2.5391, + "step": 30249 + }, + { + "epoch": 0.8970139074222341, + "grad_norm": 0.0782136619091034, + "learning_rate": 2.6480452939273202e-05, + "loss": 2.5724, + "step": 30250 + }, + { + "epoch": 0.8970435607745456, + "grad_norm": 0.07031475752592087, + "learning_rate": 2.646534611190837e-05, + "loss": 2.5285, + "step": 30251 + }, + { + "epoch": 0.897073214126857, + "grad_norm": 0.07477276772260666, + "learning_rate": 2.6450243477774193e-05, + "loss": 2.5208, + "step": 30252 + }, + { + "epoch": 0.8971028674791686, + "grad_norm": 0.07358860224485397, + "learning_rate": 2.643514503700445e-05, + "loss": 2.5505, + "step": 30253 + }, + { + "epoch": 0.89713252083148, + "grad_norm": 0.07804364711046219, + "learning_rate": 2.6420050789732698e-05, + "loss": 2.5603, + "step": 30254 + }, + { + "epoch": 0.8971621741837915, + "grad_norm": 0.07269523292779922, + "learning_rate": 2.640496073609272e-05, + "loss": 2.5518, + "step": 30255 + }, + { + "epoch": 0.8971918275361029, + "grad_norm": 0.07281746715307236, + "learning_rate": 2.6389874876218133e-05, + "loss": 2.5182, + "step": 30256 + }, + { + "epoch": 0.8972214808884145, + "grad_norm": 0.07513220608234406, + "learning_rate": 2.63747932102425e-05, + "loss": 2.5552, + "step": 30257 + }, + { + "epoch": 0.8972511342407259, + "grad_norm": 0.0711483582854271, + "learning_rate": 2.6359715738299316e-05, + "loss": 2.5262, + "step": 30258 + }, + { + "epoch": 0.8972807875930374, + "grad_norm": 0.07297104597091675, + "learning_rate": 2.6344642460522205e-05, + "loss": 2.5232, + "step": 30259 + }, + { + "epoch": 0.8973104409453488, + "grad_norm": 0.0701875239610672, + "learning_rate": 2.632957337704456e-05, + "loss": 2.5299, + "step": 30260 + }, + { + "epoch": 0.8973400942976604, + "grad_norm": 0.07023170590400696, + "learning_rate": 2.6314508487999823e-05, + "loss": 2.5641, + "step": 30261 + }, + { + "epoch": 0.8973697476499718, + "grad_norm": 0.07056371122598648, + "learning_rate": 2.6299447793521447e-05, + "loss": 2.4942, + "step": 30262 + }, + { + "epoch": 0.8973994010022833, + "grad_norm": 0.07330095767974854, + "learning_rate": 2.628439129374277e-05, + "loss": 2.5542, + "step": 30263 + }, + { + "epoch": 0.8974290543545947, + "grad_norm": 0.06992535293102264, + "learning_rate": 2.6269338988797186e-05, + "loss": 2.521, + "step": 30264 + }, + { + "epoch": 0.8974587077069063, + "grad_norm": 0.06887434422969818, + "learning_rate": 2.6254290878817865e-05, + "loss": 2.5531, + "step": 30265 + }, + { + "epoch": 0.8974883610592177, + "grad_norm": 0.06954752653837204, + "learning_rate": 2.6239246963938036e-05, + "loss": 2.522, + "step": 30266 + }, + { + "epoch": 0.8975180144115292, + "grad_norm": 0.07064805924892426, + "learning_rate": 2.6224207244291086e-05, + "loss": 2.5513, + "step": 30267 + }, + { + "epoch": 0.8975476677638407, + "grad_norm": 0.07404012233018875, + "learning_rate": 2.6209171720010084e-05, + "loss": 2.513, + "step": 30268 + }, + { + "epoch": 0.8975773211161522, + "grad_norm": 0.06998351216316223, + "learning_rate": 2.6194140391228194e-05, + "loss": 2.5409, + "step": 30269 + }, + { + "epoch": 0.8976069744684637, + "grad_norm": 0.06845813989639282, + "learning_rate": 2.617911325807848e-05, + "loss": 2.5149, + "step": 30270 + }, + { + "epoch": 0.8976366278207751, + "grad_norm": 0.07185747474431992, + "learning_rate": 2.6164090320694113e-05, + "loss": 2.5595, + "step": 30271 + }, + { + "epoch": 0.8976662811730867, + "grad_norm": 0.0713847354054451, + "learning_rate": 2.6149071579207984e-05, + "loss": 2.5249, + "step": 30272 + }, + { + "epoch": 0.8976959345253981, + "grad_norm": 0.07794556021690369, + "learning_rate": 2.6134057033753213e-05, + "loss": 2.5249, + "step": 30273 + }, + { + "epoch": 0.8977255878777096, + "grad_norm": 0.08369181305170059, + "learning_rate": 2.61190466844628e-05, + "loss": 2.5538, + "step": 30274 + }, + { + "epoch": 0.897755241230021, + "grad_norm": 0.07617940753698349, + "learning_rate": 2.6104040531469477e-05, + "loss": 2.5521, + "step": 30275 + }, + { + "epoch": 0.8977848945823326, + "grad_norm": 0.07302028685808182, + "learning_rate": 2.608903857490619e-05, + "loss": 2.544, + "step": 30276 + }, + { + "epoch": 0.897814547934644, + "grad_norm": 0.08269962668418884, + "learning_rate": 2.6074040814905832e-05, + "loss": 2.497, + "step": 30277 + }, + { + "epoch": 0.8978442012869555, + "grad_norm": 0.07968952506780624, + "learning_rate": 2.6059047251601187e-05, + "loss": 2.565, + "step": 30278 + }, + { + "epoch": 0.8978738546392669, + "grad_norm": 0.0749472826719284, + "learning_rate": 2.6044057885124926e-05, + "loss": 2.5674, + "step": 30279 + }, + { + "epoch": 0.8979035079915785, + "grad_norm": 0.07294733077287674, + "learning_rate": 2.6029072715610003e-05, + "loss": 2.5629, + "step": 30280 + }, + { + "epoch": 0.8979331613438899, + "grad_norm": 0.0766502395272255, + "learning_rate": 2.6014091743189026e-05, + "loss": 2.5317, + "step": 30281 + }, + { + "epoch": 0.8979628146962014, + "grad_norm": 0.07695747166872025, + "learning_rate": 2.5999114967994563e-05, + "loss": 2.5496, + "step": 30282 + }, + { + "epoch": 0.8979924680485128, + "grad_norm": 0.07629898190498352, + "learning_rate": 2.5984142390159336e-05, + "loss": 2.5371, + "step": 30283 + }, + { + "epoch": 0.8980221214008244, + "grad_norm": 0.07104033976793289, + "learning_rate": 2.596917400981591e-05, + "loss": 2.5085, + "step": 30284 + }, + { + "epoch": 0.8980517747531358, + "grad_norm": 0.0731915533542633, + "learning_rate": 2.5954209827096788e-05, + "loss": 2.5665, + "step": 30285 + }, + { + "epoch": 0.8980814281054473, + "grad_norm": 0.0769970715045929, + "learning_rate": 2.5939249842134527e-05, + "loss": 2.5193, + "step": 30286 + }, + { + "epoch": 0.8981110814577588, + "grad_norm": 0.07385318726301193, + "learning_rate": 2.5924294055061525e-05, + "loss": 2.5508, + "step": 30287 + }, + { + "epoch": 0.8981407348100703, + "grad_norm": 0.0708928257226944, + "learning_rate": 2.5909342466010288e-05, + "loss": 2.5572, + "step": 30288 + }, + { + "epoch": 0.8981703881623817, + "grad_norm": 0.07679330557584763, + "learning_rate": 2.5894395075113263e-05, + "loss": 2.5748, + "step": 30289 + }, + { + "epoch": 0.8982000415146932, + "grad_norm": 0.07065479457378387, + "learning_rate": 2.587945188250268e-05, + "loss": 2.5653, + "step": 30290 + }, + { + "epoch": 0.8982296948670048, + "grad_norm": 0.07044505327939987, + "learning_rate": 2.5864512888310932e-05, + "loss": 2.517, + "step": 30291 + }, + { + "epoch": 0.8982593482193162, + "grad_norm": 0.07044330984354019, + "learning_rate": 2.5849578092670247e-05, + "loss": 2.5419, + "step": 30292 + }, + { + "epoch": 0.8982890015716277, + "grad_norm": 0.07054244726896286, + "learning_rate": 2.5834647495713015e-05, + "loss": 2.5617, + "step": 30293 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 0.06879634410142899, + "learning_rate": 2.581972109757136e-05, + "loss": 2.523, + "step": 30294 + }, + { + "epoch": 0.8983483082762507, + "grad_norm": 0.0739428922533989, + "learning_rate": 2.580479889837756e-05, + "loss": 2.5339, + "step": 30295 + }, + { + "epoch": 0.8983779616285621, + "grad_norm": 0.0715918093919754, + "learning_rate": 2.5789880898263564e-05, + "loss": 2.536, + "step": 30296 + }, + { + "epoch": 0.8984076149808736, + "grad_norm": 0.07317446172237396, + "learning_rate": 2.5774967097361602e-05, + "loss": 2.5383, + "step": 30297 + }, + { + "epoch": 0.898437268333185, + "grad_norm": 0.07369130849838257, + "learning_rate": 2.5760057495803678e-05, + "loss": 2.535, + "step": 30298 + }, + { + "epoch": 0.8984669216854966, + "grad_norm": 0.07055133581161499, + "learning_rate": 2.574515209372186e-05, + "loss": 2.5668, + "step": 30299 + }, + { + "epoch": 0.898496575037808, + "grad_norm": 0.07531337440013885, + "learning_rate": 2.573025089124814e-05, + "loss": 2.5521, + "step": 30300 + }, + { + "epoch": 0.8985262283901195, + "grad_norm": 0.07239086180925369, + "learning_rate": 2.5715353888514427e-05, + "loss": 2.5728, + "step": 30301 + }, + { + "epoch": 0.898555881742431, + "grad_norm": 0.07543680816888809, + "learning_rate": 2.570046108565266e-05, + "loss": 2.5318, + "step": 30302 + }, + { + "epoch": 0.8985855350947425, + "grad_norm": 0.06843673437833786, + "learning_rate": 2.568557248279474e-05, + "loss": 2.5178, + "step": 30303 + }, + { + "epoch": 0.8986151884470539, + "grad_norm": 0.07281914353370667, + "learning_rate": 2.5670688080072503e-05, + "loss": 2.536, + "step": 30304 + }, + { + "epoch": 0.8986448417993654, + "grad_norm": 0.07109806686639786, + "learning_rate": 2.5655807877617676e-05, + "loss": 2.5314, + "step": 30305 + }, + { + "epoch": 0.8986744951516769, + "grad_norm": 0.07147252559661865, + "learning_rate": 2.5640931875562157e-05, + "loss": 2.5494, + "step": 30306 + }, + { + "epoch": 0.8987041485039884, + "grad_norm": 0.08018806576728821, + "learning_rate": 2.562606007403756e-05, + "loss": 2.4984, + "step": 30307 + }, + { + "epoch": 0.8987338018562998, + "grad_norm": 0.06751790642738342, + "learning_rate": 2.5611192473175672e-05, + "loss": 2.5511, + "step": 30308 + }, + { + "epoch": 0.8987634552086113, + "grad_norm": 0.07195291668176651, + "learning_rate": 2.5596329073108105e-05, + "loss": 2.5013, + "step": 30309 + }, + { + "epoch": 0.8987931085609228, + "grad_norm": 0.07325860857963562, + "learning_rate": 2.5581469873966424e-05, + "loss": 2.5663, + "step": 30310 + }, + { + "epoch": 0.8988227619132343, + "grad_norm": 0.07009439915418625, + "learning_rate": 2.55666148758823e-05, + "loss": 2.5103, + "step": 30311 + }, + { + "epoch": 0.8988524152655458, + "grad_norm": 0.07278928905725479, + "learning_rate": 2.5551764078987238e-05, + "loss": 2.5556, + "step": 30312 + }, + { + "epoch": 0.8988820686178572, + "grad_norm": 0.07131752371788025, + "learning_rate": 2.5536917483412748e-05, + "loss": 2.5344, + "step": 30313 + }, + { + "epoch": 0.8989117219701688, + "grad_norm": 0.07195200771093369, + "learning_rate": 2.5522075089290275e-05, + "loss": 2.5425, + "step": 30314 + }, + { + "epoch": 0.8989413753224802, + "grad_norm": 0.07461118698120117, + "learning_rate": 2.5507236896751275e-05, + "loss": 2.5451, + "step": 30315 + }, + { + "epoch": 0.8989710286747917, + "grad_norm": 0.07307138293981552, + "learning_rate": 2.5492402905927137e-05, + "loss": 2.5724, + "step": 30316 + }, + { + "epoch": 0.8990006820271031, + "grad_norm": 0.07377529889345169, + "learning_rate": 2.5477573116949203e-05, + "loss": 2.5335, + "step": 30317 + }, + { + "epoch": 0.8990303353794147, + "grad_norm": 0.07252396643161774, + "learning_rate": 2.5462747529948814e-05, + "loss": 2.5412, + "step": 30318 + }, + { + "epoch": 0.8990599887317261, + "grad_norm": 0.074220210313797, + "learning_rate": 2.544792614505731e-05, + "loss": 2.5418, + "step": 30319 + }, + { + "epoch": 0.8990896420840376, + "grad_norm": 0.07226211577653885, + "learning_rate": 2.5433108962405805e-05, + "loss": 2.5432, + "step": 30320 + }, + { + "epoch": 0.899119295436349, + "grad_norm": 0.07031845301389694, + "learning_rate": 2.5418295982125585e-05, + "loss": 2.5482, + "step": 30321 + }, + { + "epoch": 0.8991489487886606, + "grad_norm": 0.07342606782913208, + "learning_rate": 2.540348720434782e-05, + "loss": 2.5348, + "step": 30322 + }, + { + "epoch": 0.899178602140972, + "grad_norm": 0.07864873111248016, + "learning_rate": 2.5388682629203687e-05, + "loss": 2.5399, + "step": 30323 + }, + { + "epoch": 0.8992082554932835, + "grad_norm": 0.0756470188498497, + "learning_rate": 2.5373882256824186e-05, + "loss": 2.5396, + "step": 30324 + }, + { + "epoch": 0.899237908845595, + "grad_norm": 0.07150156795978546, + "learning_rate": 2.5359086087340445e-05, + "loss": 2.571, + "step": 30325 + }, + { + "epoch": 0.8992675621979065, + "grad_norm": 0.07131732255220413, + "learning_rate": 2.534429412088346e-05, + "loss": 2.572, + "step": 30326 + }, + { + "epoch": 0.8992972155502179, + "grad_norm": 0.07166305184364319, + "learning_rate": 2.532950635758424e-05, + "loss": 2.5526, + "step": 30327 + }, + { + "epoch": 0.8993268689025294, + "grad_norm": 0.07360902428627014, + "learning_rate": 2.5314722797573687e-05, + "loss": 2.547, + "step": 30328 + }, + { + "epoch": 0.8993565222548409, + "grad_norm": 0.07409483939409256, + "learning_rate": 2.5299943440982797e-05, + "loss": 2.544, + "step": 30329 + }, + { + "epoch": 0.8993861756071524, + "grad_norm": 0.07174845784902573, + "learning_rate": 2.5285168287942307e-05, + "loss": 2.5401, + "step": 30330 + }, + { + "epoch": 0.8994158289594638, + "grad_norm": 0.07508932054042816, + "learning_rate": 2.527039733858316e-05, + "loss": 2.5469, + "step": 30331 + }, + { + "epoch": 0.8994454823117753, + "grad_norm": 0.07534920424222946, + "learning_rate": 2.525563059303615e-05, + "loss": 2.5436, + "step": 30332 + }, + { + "epoch": 0.8994751356640869, + "grad_norm": 0.07271837443113327, + "learning_rate": 2.5240868051432054e-05, + "loss": 2.5486, + "step": 30333 + }, + { + "epoch": 0.8995047890163983, + "grad_norm": 0.06902968883514404, + "learning_rate": 2.522610971390149e-05, + "loss": 2.5532, + "step": 30334 + }, + { + "epoch": 0.8995344423687098, + "grad_norm": 0.06610704213380814, + "learning_rate": 2.5211355580575302e-05, + "loss": 2.516, + "step": 30335 + }, + { + "epoch": 0.8995640957210213, + "grad_norm": 0.0714598298072815, + "learning_rate": 2.5196605651583993e-05, + "loss": 2.5513, + "step": 30336 + }, + { + "epoch": 0.8995937490733328, + "grad_norm": 0.07097620517015457, + "learning_rate": 2.5181859927058236e-05, + "loss": 2.5541, + "step": 30337 + }, + { + "epoch": 0.8996234024256442, + "grad_norm": 0.07045233249664307, + "learning_rate": 2.5167118407128654e-05, + "loss": 2.5669, + "step": 30338 + }, + { + "epoch": 0.8996530557779557, + "grad_norm": 0.06790844351053238, + "learning_rate": 2.5152381091925692e-05, + "loss": 2.5209, + "step": 30339 + }, + { + "epoch": 0.8996827091302672, + "grad_norm": 0.06941056251525879, + "learning_rate": 2.5137647981580024e-05, + "loss": 2.5154, + "step": 30340 + }, + { + "epoch": 0.8997123624825787, + "grad_norm": 0.07029924541711807, + "learning_rate": 2.5122919076221884e-05, + "loss": 2.5035, + "step": 30341 + }, + { + "epoch": 0.8997420158348901, + "grad_norm": 0.06992661952972412, + "learning_rate": 2.510819437598183e-05, + "loss": 2.5497, + "step": 30342 + }, + { + "epoch": 0.8997716691872016, + "grad_norm": 0.07162298262119293, + "learning_rate": 2.5093473880990148e-05, + "loss": 2.5255, + "step": 30343 + }, + { + "epoch": 0.8998013225395131, + "grad_norm": 0.07470371574163437, + "learning_rate": 2.5078757591377343e-05, + "loss": 2.5225, + "step": 30344 + }, + { + "epoch": 0.8998309758918246, + "grad_norm": 0.06869851052761078, + "learning_rate": 2.5064045507273703e-05, + "loss": 2.525, + "step": 30345 + }, + { + "epoch": 0.899860629244136, + "grad_norm": 0.07178541272878647, + "learning_rate": 2.5049337628809398e-05, + "loss": 2.5563, + "step": 30346 + }, + { + "epoch": 0.8998902825964475, + "grad_norm": 0.0724855288863182, + "learning_rate": 2.503463395611477e-05, + "loss": 2.5553, + "step": 30347 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 0.07329864054918289, + "learning_rate": 2.501993448931994e-05, + "loss": 2.5819, + "step": 30348 + }, + { + "epoch": 0.8999495893010705, + "grad_norm": 0.07625139504671097, + "learning_rate": 2.5005239228555133e-05, + "loss": 2.5691, + "step": 30349 + }, + { + "epoch": 0.8999792426533819, + "grad_norm": 0.06872353702783585, + "learning_rate": 2.4990548173950578e-05, + "loss": 2.5251, + "step": 30350 + }, + { + "epoch": 0.9000088960056934, + "grad_norm": 0.074470154941082, + "learning_rate": 2.4975861325636174e-05, + "loss": 2.5474, + "step": 30351 + }, + { + "epoch": 0.9000385493580049, + "grad_norm": 0.07314691692590714, + "learning_rate": 2.4961178683742035e-05, + "loss": 2.5132, + "step": 30352 + }, + { + "epoch": 0.9000682027103164, + "grad_norm": 0.07584060728549957, + "learning_rate": 2.4946500248398174e-05, + "loss": 2.5272, + "step": 30353 + }, + { + "epoch": 0.9000978560626279, + "grad_norm": 0.0718255564570427, + "learning_rate": 2.493182601973465e-05, + "loss": 2.5418, + "step": 30354 + }, + { + "epoch": 0.9001275094149394, + "grad_norm": 0.07247288525104523, + "learning_rate": 2.4917155997881302e-05, + "loss": 2.5492, + "step": 30355 + }, + { + "epoch": 0.9001571627672509, + "grad_norm": 0.07290025800466537, + "learning_rate": 2.490249018296803e-05, + "loss": 2.5093, + "step": 30356 + }, + { + "epoch": 0.9001868161195623, + "grad_norm": 0.07108882069587708, + "learning_rate": 2.4887828575124837e-05, + "loss": 2.5327, + "step": 30357 + }, + { + "epoch": 0.9002164694718738, + "grad_norm": 0.07092508673667908, + "learning_rate": 2.4873171174481457e-05, + "loss": 2.5435, + "step": 30358 + }, + { + "epoch": 0.9002461228241853, + "grad_norm": 0.07091408222913742, + "learning_rate": 2.485851798116773e-05, + "loss": 2.57, + "step": 30359 + }, + { + "epoch": 0.9002757761764968, + "grad_norm": 0.07368069887161255, + "learning_rate": 2.4843868995313322e-05, + "loss": 2.5617, + "step": 30360 + }, + { + "epoch": 0.9003054295288082, + "grad_norm": 0.0735437348484993, + "learning_rate": 2.4829224217048142e-05, + "loss": 2.5294, + "step": 30361 + }, + { + "epoch": 0.9003350828811197, + "grad_norm": 0.07463111728429794, + "learning_rate": 2.4814583646501686e-05, + "loss": 2.5246, + "step": 30362 + }, + { + "epoch": 0.9003647362334312, + "grad_norm": 0.07230811566114426, + "learning_rate": 2.4799947283803635e-05, + "loss": 2.5391, + "step": 30363 + }, + { + "epoch": 0.9003943895857427, + "grad_norm": 0.07860212028026581, + "learning_rate": 2.478531512908361e-05, + "loss": 2.5281, + "step": 30364 + }, + { + "epoch": 0.9004240429380541, + "grad_norm": 0.07757065445184708, + "learning_rate": 2.4770687182471162e-05, + "loss": 2.5473, + "step": 30365 + }, + { + "epoch": 0.9004536962903656, + "grad_norm": 0.07105068862438202, + "learning_rate": 2.4756063444095868e-05, + "loss": 2.5464, + "step": 30366 + }, + { + "epoch": 0.9004833496426771, + "grad_norm": 0.07281873375177383, + "learning_rate": 2.4741443914087224e-05, + "loss": 2.5627, + "step": 30367 + }, + { + "epoch": 0.9005130029949886, + "grad_norm": 0.07548226416110992, + "learning_rate": 2.472682859257469e-05, + "loss": 2.5435, + "step": 30368 + }, + { + "epoch": 0.9005426563473, + "grad_norm": 0.07403651624917984, + "learning_rate": 2.471221747968755e-05, + "loss": 2.5593, + "step": 30369 + }, + { + "epoch": 0.9005723096996116, + "grad_norm": 0.07985173165798187, + "learning_rate": 2.4697610575555418e-05, + "loss": 2.5556, + "step": 30370 + }, + { + "epoch": 0.900601963051923, + "grad_norm": 0.07388770580291748, + "learning_rate": 2.4683007880307583e-05, + "loss": 2.5386, + "step": 30371 + }, + { + "epoch": 0.9006316164042345, + "grad_norm": 0.0691262036561966, + "learning_rate": 2.4668409394073223e-05, + "loss": 2.5384, + "step": 30372 + }, + { + "epoch": 0.9006612697565459, + "grad_norm": 0.0726529061794281, + "learning_rate": 2.465381511698167e-05, + "loss": 2.5289, + "step": 30373 + }, + { + "epoch": 0.9006909231088575, + "grad_norm": 0.07634871453046799, + "learning_rate": 2.4639225049162217e-05, + "loss": 2.5743, + "step": 30374 + }, + { + "epoch": 0.900720576461169, + "grad_norm": 0.07433456927537918, + "learning_rate": 2.462463919074398e-05, + "loss": 2.5603, + "step": 30375 + }, + { + "epoch": 0.9007502298134804, + "grad_norm": 0.06938063353300095, + "learning_rate": 2.461005754185619e-05, + "loss": 2.5355, + "step": 30376 + }, + { + "epoch": 0.9007798831657919, + "grad_norm": 0.07297153025865555, + "learning_rate": 2.4595480102627966e-05, + "loss": 2.5329, + "step": 30377 + }, + { + "epoch": 0.9008095365181034, + "grad_norm": 0.07384132593870163, + "learning_rate": 2.4580906873188312e-05, + "loss": 2.5591, + "step": 30378 + }, + { + "epoch": 0.9008391898704149, + "grad_norm": 0.07131106406450272, + "learning_rate": 2.4566337853666354e-05, + "loss": 2.5439, + "step": 30379 + }, + { + "epoch": 0.9008688432227263, + "grad_norm": 0.07070864737033844, + "learning_rate": 2.4551773044191095e-05, + "loss": 2.5358, + "step": 30380 + }, + { + "epoch": 0.9008984965750378, + "grad_norm": 0.07439693063497543, + "learning_rate": 2.4537212444891488e-05, + "loss": 2.5017, + "step": 30381 + }, + { + "epoch": 0.9009281499273493, + "grad_norm": 0.06654234230518341, + "learning_rate": 2.452265605589643e-05, + "loss": 2.5299, + "step": 30382 + }, + { + "epoch": 0.9009578032796608, + "grad_norm": 0.07388395071029663, + "learning_rate": 2.450810387733493e-05, + "loss": 2.5244, + "step": 30383 + }, + { + "epoch": 0.9009874566319722, + "grad_norm": 0.06876102834939957, + "learning_rate": 2.4493555909335774e-05, + "loss": 2.571, + "step": 30384 + }, + { + "epoch": 0.9010171099842837, + "grad_norm": 0.07432913780212402, + "learning_rate": 2.44790121520278e-05, + "loss": 2.5468, + "step": 30385 + }, + { + "epoch": 0.9010467633365952, + "grad_norm": 0.06989492475986481, + "learning_rate": 2.4464472605539744e-05, + "loss": 2.5443, + "step": 30386 + }, + { + "epoch": 0.9010764166889067, + "grad_norm": 0.07367058843374252, + "learning_rate": 2.44499372700005e-05, + "loss": 2.5325, + "step": 30387 + }, + { + "epoch": 0.9011060700412181, + "grad_norm": 0.07471055537462234, + "learning_rate": 2.443540614553863e-05, + "loss": 2.5311, + "step": 30388 + }, + { + "epoch": 0.9011357233935297, + "grad_norm": 0.07033395767211914, + "learning_rate": 2.4420879232282866e-05, + "loss": 2.5336, + "step": 30389 + }, + { + "epoch": 0.9011653767458411, + "grad_norm": 0.07298717647790909, + "learning_rate": 2.4406356530361884e-05, + "loss": 2.5517, + "step": 30390 + }, + { + "epoch": 0.9011950300981526, + "grad_norm": 0.06807870417833328, + "learning_rate": 2.439183803990419e-05, + "loss": 2.5565, + "step": 30391 + }, + { + "epoch": 0.901224683450464, + "grad_norm": 0.06972498446702957, + "learning_rate": 2.437732376103846e-05, + "loss": 2.573, + "step": 30392 + }, + { + "epoch": 0.9012543368027756, + "grad_norm": 0.07144537568092346, + "learning_rate": 2.436281369389315e-05, + "loss": 2.5493, + "step": 30393 + }, + { + "epoch": 0.901283990155087, + "grad_norm": 0.07078555971384048, + "learning_rate": 2.434830783859676e-05, + "loss": 2.5284, + "step": 30394 + }, + { + "epoch": 0.9013136435073985, + "grad_norm": 0.07004377990961075, + "learning_rate": 2.433380619527775e-05, + "loss": 2.5326, + "step": 30395 + }, + { + "epoch": 0.90134329685971, + "grad_norm": 0.07252776622772217, + "learning_rate": 2.4319308764064574e-05, + "loss": 2.5035, + "step": 30396 + }, + { + "epoch": 0.9013729502120215, + "grad_norm": 0.07230503857135773, + "learning_rate": 2.430481554508551e-05, + "loss": 2.5453, + "step": 30397 + }, + { + "epoch": 0.901402603564333, + "grad_norm": 0.06941227614879608, + "learning_rate": 2.4290326538468966e-05, + "loss": 2.5504, + "step": 30398 + }, + { + "epoch": 0.9014322569166444, + "grad_norm": 0.07231148332357407, + "learning_rate": 2.4275841744343275e-05, + "loss": 2.5297, + "step": 30399 + }, + { + "epoch": 0.901461910268956, + "grad_norm": 0.07497400045394897, + "learning_rate": 2.4261361162836613e-05, + "loss": 2.5603, + "step": 30400 + }, + { + "epoch": 0.9014915636212674, + "grad_norm": 0.06939495354890823, + "learning_rate": 2.4246884794077274e-05, + "loss": 2.5336, + "step": 30401 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 0.06969868391752243, + "learning_rate": 2.423241263819348e-05, + "loss": 2.5514, + "step": 30402 + }, + { + "epoch": 0.9015508703258903, + "grad_norm": 0.07131478190422058, + "learning_rate": 2.4217944695313244e-05, + "loss": 2.5208, + "step": 30403 + }, + { + "epoch": 0.9015805236782019, + "grad_norm": 0.07111984491348267, + "learning_rate": 2.420348096556485e-05, + "loss": 2.5726, + "step": 30404 + }, + { + "epoch": 0.9016101770305133, + "grad_norm": 0.06929498165845871, + "learning_rate": 2.4189021449076366e-05, + "loss": 2.5487, + "step": 30405 + }, + { + "epoch": 0.9016398303828248, + "grad_norm": 0.07280942797660828, + "learning_rate": 2.4174566145975684e-05, + "loss": 2.5336, + "step": 30406 + }, + { + "epoch": 0.9016694837351362, + "grad_norm": 0.07046771794557571, + "learning_rate": 2.4160115056390873e-05, + "loss": 2.4879, + "step": 30407 + }, + { + "epoch": 0.9016991370874478, + "grad_norm": 0.07002832740545273, + "learning_rate": 2.4145668180449887e-05, + "loss": 2.5007, + "step": 30408 + }, + { + "epoch": 0.9017287904397592, + "grad_norm": 0.07009940594434738, + "learning_rate": 2.413122551828073e-05, + "loss": 2.5386, + "step": 30409 + }, + { + "epoch": 0.9017584437920707, + "grad_norm": 0.06845297664403915, + "learning_rate": 2.4116787070011248e-05, + "loss": 2.5601, + "step": 30410 + }, + { + "epoch": 0.9017880971443821, + "grad_norm": 0.07115386426448822, + "learning_rate": 2.4102352835769337e-05, + "loss": 2.5009, + "step": 30411 + }, + { + "epoch": 0.9018177504966937, + "grad_norm": 0.07203423231840134, + "learning_rate": 2.4087922815682727e-05, + "loss": 2.5313, + "step": 30412 + }, + { + "epoch": 0.9018474038490051, + "grad_norm": 0.06958221644163132, + "learning_rate": 2.4073497009879265e-05, + "loss": 2.5511, + "step": 30413 + }, + { + "epoch": 0.9018770572013166, + "grad_norm": 0.07203266769647598, + "learning_rate": 2.405907541848673e-05, + "loss": 2.5515, + "step": 30414 + }, + { + "epoch": 0.901906710553628, + "grad_norm": 0.07500854134559631, + "learning_rate": 2.4044658041632695e-05, + "loss": 2.538, + "step": 30415 + }, + { + "epoch": 0.9019363639059396, + "grad_norm": 0.06966526806354523, + "learning_rate": 2.4030244879445052e-05, + "loss": 2.5301, + "step": 30416 + }, + { + "epoch": 0.9019660172582511, + "grad_norm": 0.06828875839710236, + "learning_rate": 2.40158359320512e-05, + "loss": 2.5551, + "step": 30417 + }, + { + "epoch": 0.9019956706105625, + "grad_norm": 0.07217510789632797, + "learning_rate": 2.4001431199578816e-05, + "loss": 2.5525, + "step": 30418 + }, + { + "epoch": 0.902025323962874, + "grad_norm": 0.07487523555755615, + "learning_rate": 2.3987030682155465e-05, + "loss": 2.5129, + "step": 30419 + }, + { + "epoch": 0.9020549773151855, + "grad_norm": 0.06969935446977615, + "learning_rate": 2.3972634379908654e-05, + "loss": 2.5317, + "step": 30420 + }, + { + "epoch": 0.902084630667497, + "grad_norm": 0.07296489924192429, + "learning_rate": 2.395824229296578e-05, + "loss": 2.5083, + "step": 30421 + }, + { + "epoch": 0.9021142840198084, + "grad_norm": 0.07177230715751648, + "learning_rate": 2.394385442145447e-05, + "loss": 2.57, + "step": 30422 + }, + { + "epoch": 0.90214393737212, + "grad_norm": 0.07673627883195877, + "learning_rate": 2.392947076550206e-05, + "loss": 2.5077, + "step": 30423 + }, + { + "epoch": 0.9021735907244314, + "grad_norm": 0.08134778589010239, + "learning_rate": 2.3915091325235894e-05, + "loss": 2.5189, + "step": 30424 + }, + { + "epoch": 0.9022032440767429, + "grad_norm": 0.06860335916280746, + "learning_rate": 2.3900716100783314e-05, + "loss": 2.4824, + "step": 30425 + }, + { + "epoch": 0.9022328974290543, + "grad_norm": 0.07423099130392075, + "learning_rate": 2.3886345092271722e-05, + "loss": 2.538, + "step": 30426 + }, + { + "epoch": 0.9022625507813659, + "grad_norm": 0.0792674571275711, + "learning_rate": 2.3871978299828123e-05, + "loss": 2.5338, + "step": 30427 + }, + { + "epoch": 0.9022922041336773, + "grad_norm": 0.07035792618989944, + "learning_rate": 2.385761572357992e-05, + "loss": 2.5427, + "step": 30428 + }, + { + "epoch": 0.9023218574859888, + "grad_norm": 0.07729472219944, + "learning_rate": 2.3843257363654282e-05, + "loss": 2.5482, + "step": 30429 + }, + { + "epoch": 0.9023515108383002, + "grad_norm": 0.07449943572282791, + "learning_rate": 2.3828903220178334e-05, + "loss": 2.5475, + "step": 30430 + }, + { + "epoch": 0.9023811641906118, + "grad_norm": 0.07615824788808823, + "learning_rate": 2.381455329327914e-05, + "loss": 2.5501, + "step": 30431 + }, + { + "epoch": 0.9024108175429232, + "grad_norm": 0.07222788780927658, + "learning_rate": 2.3800207583083822e-05, + "loss": 2.5642, + "step": 30432 + }, + { + "epoch": 0.9024404708952347, + "grad_norm": 0.0742766410112381, + "learning_rate": 2.3785866089719443e-05, + "loss": 2.5369, + "step": 30433 + }, + { + "epoch": 0.9024701242475461, + "grad_norm": 0.07375553995370865, + "learning_rate": 2.3771528813312848e-05, + "loss": 2.5604, + "step": 30434 + }, + { + "epoch": 0.9024997775998577, + "grad_norm": 0.07460254430770874, + "learning_rate": 2.3757195753991155e-05, + "loss": 2.5036, + "step": 30435 + }, + { + "epoch": 0.9025294309521692, + "grad_norm": 0.07308777421712875, + "learning_rate": 2.3742866911881322e-05, + "loss": 2.5489, + "step": 30436 + }, + { + "epoch": 0.9025590843044806, + "grad_norm": 0.07369673252105713, + "learning_rate": 2.3728542287110132e-05, + "loss": 2.5634, + "step": 30437 + }, + { + "epoch": 0.9025887376567922, + "grad_norm": 0.07495435327291489, + "learning_rate": 2.3714221879804433e-05, + "loss": 2.549, + "step": 30438 + }, + { + "epoch": 0.9026183910091036, + "grad_norm": 0.06989441066980362, + "learning_rate": 2.369990569009106e-05, + "loss": 2.577, + "step": 30439 + }, + { + "epoch": 0.9026480443614151, + "grad_norm": 0.07241267710924149, + "learning_rate": 2.3685593718096755e-05, + "loss": 2.5714, + "step": 30440 + }, + { + "epoch": 0.9026776977137265, + "grad_norm": 0.07867901772260666, + "learning_rate": 2.3671285963948296e-05, + "loss": 2.537, + "step": 30441 + }, + { + "epoch": 0.902707351066038, + "grad_norm": 0.07440683990716934, + "learning_rate": 2.3656982427772365e-05, + "loss": 2.5392, + "step": 30442 + }, + { + "epoch": 0.9027370044183495, + "grad_norm": 0.07131978124380112, + "learning_rate": 2.3642683109695585e-05, + "loss": 2.5499, + "step": 30443 + }, + { + "epoch": 0.902766657770661, + "grad_norm": 0.07248999923467636, + "learning_rate": 2.3628388009844626e-05, + "loss": 2.5195, + "step": 30444 + }, + { + "epoch": 0.9027963111229724, + "grad_norm": 0.07293874770402908, + "learning_rate": 2.3614097128346057e-05, + "loss": 2.5701, + "step": 30445 + }, + { + "epoch": 0.902825964475284, + "grad_norm": 0.07265245169401169, + "learning_rate": 2.3599810465326445e-05, + "loss": 2.5529, + "step": 30446 + }, + { + "epoch": 0.9028556178275954, + "grad_norm": 0.07295715808868408, + "learning_rate": 2.358552802091224e-05, + "loss": 2.5348, + "step": 30447 + }, + { + "epoch": 0.9028852711799069, + "grad_norm": 0.0763528123497963, + "learning_rate": 2.357124979523001e-05, + "loss": 2.5685, + "step": 30448 + }, + { + "epoch": 0.9029149245322183, + "grad_norm": 0.07120638340711594, + "learning_rate": 2.3556975788406097e-05, + "loss": 2.5007, + "step": 30449 + }, + { + "epoch": 0.9029445778845299, + "grad_norm": 0.06949950009584427, + "learning_rate": 2.354270600056696e-05, + "loss": 2.5735, + "step": 30450 + }, + { + "epoch": 0.9029742312368413, + "grad_norm": 0.07746933400630951, + "learning_rate": 2.3528440431838992e-05, + "loss": 2.5385, + "step": 30451 + }, + { + "epoch": 0.9030038845891528, + "grad_norm": 0.07780299335718155, + "learning_rate": 2.351417908234843e-05, + "loss": 2.5217, + "step": 30452 + }, + { + "epoch": 0.9030335379414642, + "grad_norm": 0.07529911398887634, + "learning_rate": 2.3499921952221614e-05, + "loss": 2.5409, + "step": 30453 + }, + { + "epoch": 0.9030631912937758, + "grad_norm": 0.07843364775180817, + "learning_rate": 2.3485669041584723e-05, + "loss": 2.5558, + "step": 30454 + }, + { + "epoch": 0.9030928446460872, + "grad_norm": 0.07487163692712784, + "learning_rate": 2.34714203505641e-05, + "loss": 2.5158, + "step": 30455 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 0.07667757570743561, + "learning_rate": 2.345717587928581e-05, + "loss": 2.5576, + "step": 30456 + }, + { + "epoch": 0.9031521513507103, + "grad_norm": 0.07414888590574265, + "learning_rate": 2.3442935627876028e-05, + "loss": 2.5419, + "step": 30457 + }, + { + "epoch": 0.9031818047030217, + "grad_norm": 0.07105324417352676, + "learning_rate": 2.3428699596460824e-05, + "loss": 2.5847, + "step": 30458 + }, + { + "epoch": 0.9032114580553332, + "grad_norm": 0.0756056010723114, + "learning_rate": 2.3414467785166315e-05, + "loss": 2.5481, + "step": 30459 + }, + { + "epoch": 0.9032411114076446, + "grad_norm": 0.07769156247377396, + "learning_rate": 2.3400240194118518e-05, + "loss": 2.5787, + "step": 30460 + }, + { + "epoch": 0.9032707647599562, + "grad_norm": 0.0737358033657074, + "learning_rate": 2.3386016823443378e-05, + "loss": 2.5261, + "step": 30461 + }, + { + "epoch": 0.9033004181122676, + "grad_norm": 0.07052356004714966, + "learning_rate": 2.337179767326686e-05, + "loss": 2.5434, + "step": 30462 + }, + { + "epoch": 0.9033300714645791, + "grad_norm": 0.0723409354686737, + "learning_rate": 2.335758274371491e-05, + "loss": 2.5665, + "step": 30463 + }, + { + "epoch": 0.9033597248168905, + "grad_norm": 0.07950109988451004, + "learning_rate": 2.334337203491338e-05, + "loss": 2.5473, + "step": 30464 + }, + { + "epoch": 0.9033893781692021, + "grad_norm": 0.07316484302282333, + "learning_rate": 2.332916554698805e-05, + "loss": 2.5354, + "step": 30465 + }, + { + "epoch": 0.9034190315215135, + "grad_norm": 0.06889867782592773, + "learning_rate": 2.331496328006483e-05, + "loss": 2.5335, + "step": 30466 + }, + { + "epoch": 0.903448684873825, + "grad_norm": 0.07071726024150848, + "learning_rate": 2.3300765234269438e-05, + "loss": 2.5323, + "step": 30467 + }, + { + "epoch": 0.9034783382261364, + "grad_norm": 0.07325056940317154, + "learning_rate": 2.3286571409727562e-05, + "loss": 2.5194, + "step": 30468 + }, + { + "epoch": 0.903507991578448, + "grad_norm": 0.07679513096809387, + "learning_rate": 2.3272381806564992e-05, + "loss": 2.5648, + "step": 30469 + }, + { + "epoch": 0.9035376449307594, + "grad_norm": 0.07167429476976395, + "learning_rate": 2.325819642490723e-05, + "loss": 2.5399, + "step": 30470 + }, + { + "epoch": 0.9035672982830709, + "grad_norm": 0.07690584659576416, + "learning_rate": 2.3244015264880068e-05, + "loss": 2.5526, + "step": 30471 + }, + { + "epoch": 0.9035969516353823, + "grad_norm": 0.07344111800193787, + "learning_rate": 2.322983832660891e-05, + "loss": 2.5473, + "step": 30472 + }, + { + "epoch": 0.9036266049876939, + "grad_norm": 0.07343530654907227, + "learning_rate": 2.3215665610219316e-05, + "loss": 2.5453, + "step": 30473 + }, + { + "epoch": 0.9036562583400053, + "grad_norm": 0.07455740123987198, + "learning_rate": 2.3201497115836912e-05, + "loss": 2.5155, + "step": 30474 + }, + { + "epoch": 0.9036859116923168, + "grad_norm": 0.0735318586230278, + "learning_rate": 2.3187332843587093e-05, + "loss": 2.5491, + "step": 30475 + }, + { + "epoch": 0.9037155650446282, + "grad_norm": 0.06933912634849548, + "learning_rate": 2.3173172793595265e-05, + "loss": 2.5556, + "step": 30476 + }, + { + "epoch": 0.9037452183969398, + "grad_norm": 0.07009342312812805, + "learning_rate": 2.315901696598688e-05, + "loss": 2.5463, + "step": 30477 + }, + { + "epoch": 0.9037748717492513, + "grad_norm": 0.06962715834379196, + "learning_rate": 2.3144865360887224e-05, + "loss": 2.5587, + "step": 30478 + }, + { + "epoch": 0.9038045251015627, + "grad_norm": 0.07383795827627182, + "learning_rate": 2.313071797842159e-05, + "loss": 2.5264, + "step": 30479 + }, + { + "epoch": 0.9038341784538743, + "grad_norm": 0.06803318113088608, + "learning_rate": 2.3116574818715376e-05, + "loss": 2.5146, + "step": 30480 + }, + { + "epoch": 0.9038638318061857, + "grad_norm": 0.07556861639022827, + "learning_rate": 2.3102435881893757e-05, + "loss": 2.5494, + "step": 30481 + }, + { + "epoch": 0.9038934851584972, + "grad_norm": 0.0702543631196022, + "learning_rate": 2.3088301168081915e-05, + "loss": 2.5601, + "step": 30482 + }, + { + "epoch": 0.9039231385108086, + "grad_norm": 0.07412619888782501, + "learning_rate": 2.3074170677404972e-05, + "loss": 2.5236, + "step": 30483 + }, + { + "epoch": 0.9039527918631202, + "grad_norm": 0.07091262936592102, + "learning_rate": 2.3060044409988158e-05, + "loss": 2.5308, + "step": 30484 + }, + { + "epoch": 0.9039824452154316, + "grad_norm": 0.0697828009724617, + "learning_rate": 2.3045922365956484e-05, + "loss": 2.5755, + "step": 30485 + }, + { + "epoch": 0.9040120985677431, + "grad_norm": 0.07519879937171936, + "learning_rate": 2.3031804545434963e-05, + "loss": 2.526, + "step": 30486 + }, + { + "epoch": 0.9040417519200545, + "grad_norm": 0.07937713712453842, + "learning_rate": 2.3017690948548776e-05, + "loss": 2.5395, + "step": 30487 + }, + { + "epoch": 0.9040714052723661, + "grad_norm": 0.06799442321062088, + "learning_rate": 2.3003581575422816e-05, + "loss": 2.5337, + "step": 30488 + }, + { + "epoch": 0.9041010586246775, + "grad_norm": 0.06996004283428192, + "learning_rate": 2.2989476426181986e-05, + "loss": 2.5258, + "step": 30489 + }, + { + "epoch": 0.904130711976989, + "grad_norm": 0.0777757465839386, + "learning_rate": 2.2975375500951245e-05, + "loss": 2.5599, + "step": 30490 + }, + { + "epoch": 0.9041603653293004, + "grad_norm": 0.07195107638835907, + "learning_rate": 2.296127879985538e-05, + "loss": 2.5058, + "step": 30491 + }, + { + "epoch": 0.904190018681612, + "grad_norm": 0.0759035050868988, + "learning_rate": 2.2947186323019397e-05, + "loss": 2.5542, + "step": 30492 + }, + { + "epoch": 0.9042196720339234, + "grad_norm": 0.07418027520179749, + "learning_rate": 2.293309807056787e-05, + "loss": 2.5559, + "step": 30493 + }, + { + "epoch": 0.9042493253862349, + "grad_norm": 0.06727010756731033, + "learning_rate": 2.2919014042625697e-05, + "loss": 2.5341, + "step": 30494 + }, + { + "epoch": 0.9042789787385463, + "grad_norm": 0.07518773525953293, + "learning_rate": 2.29049342393175e-05, + "loss": 2.5351, + "step": 30495 + }, + { + "epoch": 0.9043086320908579, + "grad_norm": 0.07334888726472855, + "learning_rate": 2.289085866076801e-05, + "loss": 2.4995, + "step": 30496 + }, + { + "epoch": 0.9043382854431693, + "grad_norm": 0.07310021668672562, + "learning_rate": 2.2876787307101853e-05, + "loss": 2.5169, + "step": 30497 + }, + { + "epoch": 0.9043679387954808, + "grad_norm": 0.07112371176481247, + "learning_rate": 2.2862720178443596e-05, + "loss": 2.5321, + "step": 30498 + }, + { + "epoch": 0.9043975921477924, + "grad_norm": 0.07008339464664459, + "learning_rate": 2.2848657274917915e-05, + "loss": 2.565, + "step": 30499 + }, + { + "epoch": 0.9044272455001038, + "grad_norm": 0.07572079449892044, + "learning_rate": 2.283459859664927e-05, + "loss": 2.5444, + "step": 30500 + }, + { + "epoch": 0.9044568988524153, + "grad_norm": 0.0730598047375679, + "learning_rate": 2.282054414376211e-05, + "loss": 2.5397, + "step": 30501 + }, + { + "epoch": 0.9044865522047267, + "grad_norm": 0.07951328903436661, + "learning_rate": 2.2806493916381066e-05, + "loss": 2.542, + "step": 30502 + }, + { + "epoch": 0.9045162055570383, + "grad_norm": 0.0750526413321495, + "learning_rate": 2.2792447914630365e-05, + "loss": 2.525, + "step": 30503 + }, + { + "epoch": 0.9045458589093497, + "grad_norm": 0.07063345611095428, + "learning_rate": 2.277840613863441e-05, + "loss": 2.5424, + "step": 30504 + }, + { + "epoch": 0.9045755122616612, + "grad_norm": 0.07362976670265198, + "learning_rate": 2.27643685885176e-05, + "loss": 2.5736, + "step": 30505 + }, + { + "epoch": 0.9046051656139726, + "grad_norm": 0.07473232597112656, + "learning_rate": 2.2750335264404232e-05, + "loss": 2.5622, + "step": 30506 + }, + { + "epoch": 0.9046348189662842, + "grad_norm": 0.07200257480144501, + "learning_rate": 2.273630616641853e-05, + "loss": 2.5572, + "step": 30507 + }, + { + "epoch": 0.9046644723185956, + "grad_norm": 0.07197940349578857, + "learning_rate": 2.2722281294684787e-05, + "loss": 2.5764, + "step": 30508 + }, + { + "epoch": 0.9046941256709071, + "grad_norm": 0.07210152596235275, + "learning_rate": 2.2708260649327185e-05, + "loss": 2.5031, + "step": 30509 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 0.07203485816717148, + "learning_rate": 2.269424423046984e-05, + "loss": 2.5303, + "step": 30510 + }, + { + "epoch": 0.9047534323755301, + "grad_norm": 0.07681617140769958, + "learning_rate": 2.2680232038236827e-05, + "loss": 2.5293, + "step": 30511 + }, + { + "epoch": 0.9047830857278415, + "grad_norm": 0.07171513885259628, + "learning_rate": 2.2666224072752374e-05, + "loss": 2.5282, + "step": 30512 + }, + { + "epoch": 0.904812739080153, + "grad_norm": 0.07365749031305313, + "learning_rate": 2.2652220334140496e-05, + "loss": 2.5352, + "step": 30513 + }, + { + "epoch": 0.9048423924324644, + "grad_norm": 0.07364068925380707, + "learning_rate": 2.2638220822525036e-05, + "loss": 2.5934, + "step": 30514 + }, + { + "epoch": 0.904872045784776, + "grad_norm": 0.0731937438249588, + "learning_rate": 2.2624225538030118e-05, + "loss": 2.5422, + "step": 30515 + }, + { + "epoch": 0.9049016991370874, + "grad_norm": 0.07563620805740356, + "learning_rate": 2.261023448077959e-05, + "loss": 2.5331, + "step": 30516 + }, + { + "epoch": 0.9049313524893989, + "grad_norm": 0.07104462385177612, + "learning_rate": 2.2596247650897407e-05, + "loss": 2.557, + "step": 30517 + }, + { + "epoch": 0.9049610058417104, + "grad_norm": 0.07378410547971725, + "learning_rate": 2.2582265048507354e-05, + "loss": 2.5203, + "step": 30518 + }, + { + "epoch": 0.9049906591940219, + "grad_norm": 0.07245410233736038, + "learning_rate": 2.256828667373334e-05, + "loss": 2.5496, + "step": 30519 + }, + { + "epoch": 0.9050203125463334, + "grad_norm": 0.07222065329551697, + "learning_rate": 2.2554312526699095e-05, + "loss": 2.5649, + "step": 30520 + }, + { + "epoch": 0.9050499658986448, + "grad_norm": 0.06888732314109802, + "learning_rate": 2.2540342607528354e-05, + "loss": 2.5148, + "step": 30521 + }, + { + "epoch": 0.9050796192509564, + "grad_norm": 0.06905996054410934, + "learning_rate": 2.2526376916344792e-05, + "loss": 2.5138, + "step": 30522 + }, + { + "epoch": 0.9051092726032678, + "grad_norm": 0.07295827567577362, + "learning_rate": 2.251241545327215e-05, + "loss": 2.5335, + "step": 30523 + }, + { + "epoch": 0.9051389259555793, + "grad_norm": 0.07055038213729858, + "learning_rate": 2.249845821843405e-05, + "loss": 2.5161, + "step": 30524 + }, + { + "epoch": 0.9051685793078907, + "grad_norm": 0.07126455754041672, + "learning_rate": 2.2484505211954052e-05, + "loss": 2.4973, + "step": 30525 + }, + { + "epoch": 0.9051982326602023, + "grad_norm": 0.06899305433034897, + "learning_rate": 2.2470556433955736e-05, + "loss": 2.5193, + "step": 30526 + }, + { + "epoch": 0.9052278860125137, + "grad_norm": 0.07822982966899872, + "learning_rate": 2.2456611884562607e-05, + "loss": 2.5068, + "step": 30527 + }, + { + "epoch": 0.9052575393648252, + "grad_norm": 0.07041259855031967, + "learning_rate": 2.244267156389812e-05, + "loss": 2.5435, + "step": 30528 + }, + { + "epoch": 0.9052871927171366, + "grad_norm": 0.06894032657146454, + "learning_rate": 2.2428735472085793e-05, + "loss": 2.5162, + "step": 30529 + }, + { + "epoch": 0.9053168460694482, + "grad_norm": 0.07202240824699402, + "learning_rate": 2.2414803609248914e-05, + "loss": 2.5261, + "step": 30530 + }, + { + "epoch": 0.9053464994217596, + "grad_norm": 0.07260935008525848, + "learning_rate": 2.2400875975510992e-05, + "loss": 2.5278, + "step": 30531 + }, + { + "epoch": 0.9053761527740711, + "grad_norm": 0.07455345243215561, + "learning_rate": 2.2386952570995267e-05, + "loss": 2.5452, + "step": 30532 + }, + { + "epoch": 0.9054058061263826, + "grad_norm": 0.07509350776672363, + "learning_rate": 2.237303339582508e-05, + "loss": 2.502, + "step": 30533 + }, + { + "epoch": 0.9054354594786941, + "grad_norm": 0.07562541961669922, + "learning_rate": 2.2359118450123618e-05, + "loss": 2.5212, + "step": 30534 + }, + { + "epoch": 0.9054651128310055, + "grad_norm": 0.07362861186265945, + "learning_rate": 2.2345207734014215e-05, + "loss": 2.5512, + "step": 30535 + }, + { + "epoch": 0.905494766183317, + "grad_norm": 0.07366396486759186, + "learning_rate": 2.2331301247620006e-05, + "loss": 2.5469, + "step": 30536 + }, + { + "epoch": 0.9055244195356285, + "grad_norm": 0.07306330651044846, + "learning_rate": 2.2317398991063997e-05, + "loss": 2.5361, + "step": 30537 + }, + { + "epoch": 0.90555407288794, + "grad_norm": 0.07186418771743774, + "learning_rate": 2.2303500964469482e-05, + "loss": 2.5234, + "step": 30538 + }, + { + "epoch": 0.9055837262402514, + "grad_norm": 0.07060686498880386, + "learning_rate": 2.2289607167959413e-05, + "loss": 2.5546, + "step": 30539 + }, + { + "epoch": 0.9056133795925629, + "grad_norm": 0.07338088005781174, + "learning_rate": 2.227571760165692e-05, + "loss": 2.5431, + "step": 30540 + }, + { + "epoch": 0.9056430329448745, + "grad_norm": 0.07675226777791977, + "learning_rate": 2.2261832265684957e-05, + "loss": 2.5529, + "step": 30541 + }, + { + "epoch": 0.9056726862971859, + "grad_norm": 0.0738479420542717, + "learning_rate": 2.224795116016648e-05, + "loss": 2.5518, + "step": 30542 + }, + { + "epoch": 0.9057023396494974, + "grad_norm": 0.0737280547618866, + "learning_rate": 2.223407428522434e-05, + "loss": 2.5706, + "step": 30543 + }, + { + "epoch": 0.9057319930018088, + "grad_norm": 0.07311834394931793, + "learning_rate": 2.222020164098154e-05, + "loss": 2.5381, + "step": 30544 + }, + { + "epoch": 0.9057616463541204, + "grad_norm": 0.07241933792829514, + "learning_rate": 2.2206333227560827e-05, + "loss": 2.5401, + "step": 30545 + }, + { + "epoch": 0.9057912997064318, + "grad_norm": 0.07457154244184494, + "learning_rate": 2.219246904508504e-05, + "loss": 2.5264, + "step": 30546 + }, + { + "epoch": 0.9058209530587433, + "grad_norm": 0.07304416596889496, + "learning_rate": 2.2178609093677083e-05, + "loss": 2.5471, + "step": 30547 + }, + { + "epoch": 0.9058506064110547, + "grad_norm": 0.07573392987251282, + "learning_rate": 2.216475337345941e-05, + "loss": 2.5575, + "step": 30548 + }, + { + "epoch": 0.9058802597633663, + "grad_norm": 0.07039164006710052, + "learning_rate": 2.215090188455493e-05, + "loss": 2.5368, + "step": 30549 + }, + { + "epoch": 0.9059099131156777, + "grad_norm": 0.07009761035442352, + "learning_rate": 2.213705462708615e-05, + "loss": 2.5352, + "step": 30550 + }, + { + "epoch": 0.9059395664679892, + "grad_norm": 0.0750717744231224, + "learning_rate": 2.212321160117581e-05, + "loss": 2.5073, + "step": 30551 + }, + { + "epoch": 0.9059692198203007, + "grad_norm": 0.07031863927841187, + "learning_rate": 2.210937280694647e-05, + "loss": 2.53, + "step": 30552 + }, + { + "epoch": 0.9059988731726122, + "grad_norm": 0.07081885635852814, + "learning_rate": 2.2095538244520708e-05, + "loss": 2.5366, + "step": 30553 + }, + { + "epoch": 0.9060285265249236, + "grad_norm": 0.07080700248479843, + "learning_rate": 2.2081707914020922e-05, + "loss": 2.5498, + "step": 30554 + }, + { + "epoch": 0.9060581798772351, + "grad_norm": 0.0683366060256958, + "learning_rate": 2.206788181556968e-05, + "loss": 2.5329, + "step": 30555 + }, + { + "epoch": 0.9060878332295466, + "grad_norm": 0.06951543688774109, + "learning_rate": 2.2054059949289384e-05, + "loss": 2.5413, + "step": 30556 + }, + { + "epoch": 0.9061174865818581, + "grad_norm": 0.0729009285569191, + "learning_rate": 2.2040242315302493e-05, + "loss": 2.5538, + "step": 30557 + }, + { + "epoch": 0.9061471399341695, + "grad_norm": 0.0690961554646492, + "learning_rate": 2.2026428913731245e-05, + "loss": 2.5583, + "step": 30558 + }, + { + "epoch": 0.906176793286481, + "grad_norm": 0.07256641983985901, + "learning_rate": 2.2012619744697983e-05, + "loss": 2.5767, + "step": 30559 + }, + { + "epoch": 0.9062064466387925, + "grad_norm": 0.07227730005979538, + "learning_rate": 2.1998814808324997e-05, + "loss": 2.5382, + "step": 30560 + }, + { + "epoch": 0.906236099991104, + "grad_norm": 0.07025915384292603, + "learning_rate": 2.1985014104734637e-05, + "loss": 2.5679, + "step": 30561 + }, + { + "epoch": 0.9062657533434155, + "grad_norm": 0.06648716330528259, + "learning_rate": 2.1971217634048966e-05, + "loss": 2.5576, + "step": 30562 + }, + { + "epoch": 0.906295406695727, + "grad_norm": 0.06857550889253616, + "learning_rate": 2.1957425396390117e-05, + "loss": 2.526, + "step": 30563 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 0.07326055318117142, + "learning_rate": 2.1943637391880432e-05, + "loss": 2.5184, + "step": 30564 + }, + { + "epoch": 0.9063547134003499, + "grad_norm": 0.07192228734493256, + "learning_rate": 2.1929853620641928e-05, + "loss": 2.5366, + "step": 30565 + }, + { + "epoch": 0.9063843667526614, + "grad_norm": 0.07249704003334045, + "learning_rate": 2.1916074082796556e-05, + "loss": 2.5139, + "step": 30566 + }, + { + "epoch": 0.9064140201049729, + "grad_norm": 0.07183341681957245, + "learning_rate": 2.1902298778466447e-05, + "loss": 2.5602, + "step": 30567 + }, + { + "epoch": 0.9064436734572844, + "grad_norm": 0.07146216928958893, + "learning_rate": 2.1888527707773608e-05, + "loss": 2.5642, + "step": 30568 + }, + { + "epoch": 0.9064733268095958, + "grad_norm": 0.07043810188770294, + "learning_rate": 2.1874760870839892e-05, + "loss": 2.5629, + "step": 30569 + }, + { + "epoch": 0.9065029801619073, + "grad_norm": 0.07267162203788757, + "learning_rate": 2.1860998267787204e-05, + "loss": 2.5447, + "step": 30570 + }, + { + "epoch": 0.9065326335142188, + "grad_norm": 0.07386000454425812, + "learning_rate": 2.1847239898737437e-05, + "loss": 2.542, + "step": 30571 + }, + { + "epoch": 0.9065622868665303, + "grad_norm": 0.071992427110672, + "learning_rate": 2.1833485763812444e-05, + "loss": 2.5191, + "step": 30572 + }, + { + "epoch": 0.9065919402188417, + "grad_norm": 0.06969037652015686, + "learning_rate": 2.1819735863134017e-05, + "loss": 2.5162, + "step": 30573 + }, + { + "epoch": 0.9066215935711532, + "grad_norm": 0.07003998011350632, + "learning_rate": 2.1805990196823887e-05, + "loss": 2.5068, + "step": 30574 + }, + { + "epoch": 0.9066512469234647, + "grad_norm": 0.0716085135936737, + "learning_rate": 2.1792248765003853e-05, + "loss": 2.543, + "step": 30575 + }, + { + "epoch": 0.9066809002757762, + "grad_norm": 0.06918121874332428, + "learning_rate": 2.1778511567795422e-05, + "loss": 2.5431, + "step": 30576 + }, + { + "epoch": 0.9067105536280876, + "grad_norm": 0.07187911123037338, + "learning_rate": 2.1764778605320446e-05, + "loss": 2.532, + "step": 30577 + }, + { + "epoch": 0.9067402069803991, + "grad_norm": 0.07133518904447556, + "learning_rate": 2.175104987770049e-05, + "loss": 2.5712, + "step": 30578 + }, + { + "epoch": 0.9067698603327106, + "grad_norm": 0.07114488631486893, + "learning_rate": 2.1737325385057015e-05, + "loss": 2.5187, + "step": 30579 + }, + { + "epoch": 0.9067995136850221, + "grad_norm": 0.07034117728471756, + "learning_rate": 2.172360512751159e-05, + "loss": 2.5856, + "step": 30580 + }, + { + "epoch": 0.9068291670373335, + "grad_norm": 0.07167933881282806, + "learning_rate": 2.1709889105185788e-05, + "loss": 2.5793, + "step": 30581 + }, + { + "epoch": 0.906858820389645, + "grad_norm": 0.07339339703321457, + "learning_rate": 2.1696177318201006e-05, + "loss": 2.517, + "step": 30582 + }, + { + "epoch": 0.9068884737419566, + "grad_norm": 0.07926613092422485, + "learning_rate": 2.1682469766678648e-05, + "loss": 2.5429, + "step": 30583 + }, + { + "epoch": 0.906918127094268, + "grad_norm": 0.07253771275281906, + "learning_rate": 2.1668766450740118e-05, + "loss": 2.5439, + "step": 30584 + }, + { + "epoch": 0.9069477804465795, + "grad_norm": 0.06608948111534119, + "learning_rate": 2.165506737050682e-05, + "loss": 2.4802, + "step": 30585 + }, + { + "epoch": 0.906977433798891, + "grad_norm": 0.07665394991636276, + "learning_rate": 2.1641372526099935e-05, + "loss": 2.529, + "step": 30586 + }, + { + "epoch": 0.9070070871512025, + "grad_norm": 0.07037099450826645, + "learning_rate": 2.1627681917640863e-05, + "loss": 2.4958, + "step": 30587 + }, + { + "epoch": 0.9070367405035139, + "grad_norm": 0.07078050076961517, + "learning_rate": 2.1613995545250785e-05, + "loss": 2.5663, + "step": 30588 + }, + { + "epoch": 0.9070663938558254, + "grad_norm": 0.07237055897712708, + "learning_rate": 2.160031340905083e-05, + "loss": 2.5176, + "step": 30589 + }, + { + "epoch": 0.9070960472081369, + "grad_norm": 0.07494164258241653, + "learning_rate": 2.1586635509162235e-05, + "loss": 2.5409, + "step": 30590 + }, + { + "epoch": 0.9071257005604484, + "grad_norm": 0.07566098868846893, + "learning_rate": 2.157296184570612e-05, + "loss": 2.5681, + "step": 30591 + }, + { + "epoch": 0.9071553539127598, + "grad_norm": 0.06936266273260117, + "learning_rate": 2.1559292418803555e-05, + "loss": 2.5565, + "step": 30592 + }, + { + "epoch": 0.9071850072650713, + "grad_norm": 0.07368436455726624, + "learning_rate": 2.154562722857556e-05, + "loss": 2.5253, + "step": 30593 + }, + { + "epoch": 0.9072146606173828, + "grad_norm": 0.0731947124004364, + "learning_rate": 2.15319662751432e-05, + "loss": 2.5497, + "step": 30594 + }, + { + "epoch": 0.9072443139696943, + "grad_norm": 0.0735001340508461, + "learning_rate": 2.151830955862738e-05, + "loss": 2.5332, + "step": 30595 + }, + { + "epoch": 0.9072739673220057, + "grad_norm": 0.07634107768535614, + "learning_rate": 2.150465707914906e-05, + "loss": 2.534, + "step": 30596 + }, + { + "epoch": 0.9073036206743172, + "grad_norm": 0.0729110836982727, + "learning_rate": 2.1491008836829083e-05, + "loss": 2.5592, + "step": 30597 + }, + { + "epoch": 0.9073332740266287, + "grad_norm": 0.07777144014835358, + "learning_rate": 2.1477364831788414e-05, + "loss": 2.5408, + "step": 30598 + }, + { + "epoch": 0.9073629273789402, + "grad_norm": 0.06980838626623154, + "learning_rate": 2.1463725064147842e-05, + "loss": 2.5363, + "step": 30599 + }, + { + "epoch": 0.9073925807312516, + "grad_norm": 0.0692497193813324, + "learning_rate": 2.145008953402805e-05, + "loss": 2.5475, + "step": 30600 + }, + { + "epoch": 0.9074222340835632, + "grad_norm": 0.07244032621383667, + "learning_rate": 2.143645824154994e-05, + "loss": 2.5179, + "step": 30601 + }, + { + "epoch": 0.9074518874358746, + "grad_norm": 0.06877310574054718, + "learning_rate": 2.142283118683408e-05, + "loss": 2.505, + "step": 30602 + }, + { + "epoch": 0.9074815407881861, + "grad_norm": 0.07408744096755981, + "learning_rate": 2.140920837000121e-05, + "loss": 2.4951, + "step": 30603 + }, + { + "epoch": 0.9075111941404976, + "grad_norm": 0.07220840454101562, + "learning_rate": 2.1395589791171953e-05, + "loss": 2.5146, + "step": 30604 + }, + { + "epoch": 0.907540847492809, + "grad_norm": 0.06704500317573547, + "learning_rate": 2.1381975450466885e-05, + "loss": 2.5664, + "step": 30605 + }, + { + "epoch": 0.9075705008451206, + "grad_norm": 0.07693617790937424, + "learning_rate": 2.1368365348006567e-05, + "loss": 2.5456, + "step": 30606 + }, + { + "epoch": 0.907600154197432, + "grad_norm": 0.07245190441608429, + "learning_rate": 2.1354759483911578e-05, + "loss": 2.492, + "step": 30607 + }, + { + "epoch": 0.9076298075497435, + "grad_norm": 0.06945715844631195, + "learning_rate": 2.1341157858302318e-05, + "loss": 2.5412, + "step": 30608 + }, + { + "epoch": 0.907659460902055, + "grad_norm": 0.07060080766677856, + "learning_rate": 2.1327560471299303e-05, + "loss": 2.538, + "step": 30609 + }, + { + "epoch": 0.9076891142543665, + "grad_norm": 0.0712033063173294, + "learning_rate": 2.1313967323022875e-05, + "loss": 2.5543, + "step": 30610 + }, + { + "epoch": 0.9077187676066779, + "grad_norm": 0.07364564388990402, + "learning_rate": 2.130037841359339e-05, + "loss": 2.5312, + "step": 30611 + }, + { + "epoch": 0.9077484209589894, + "grad_norm": 0.07339475303888321, + "learning_rate": 2.128679374313136e-05, + "loss": 2.5399, + "step": 30612 + }, + { + "epoch": 0.9077780743113009, + "grad_norm": 0.07004087418317795, + "learning_rate": 2.12732133117568e-05, + "loss": 2.5514, + "step": 30613 + }, + { + "epoch": 0.9078077276636124, + "grad_norm": 0.07326057553291321, + "learning_rate": 2.125963711959017e-05, + "loss": 2.5989, + "step": 30614 + }, + { + "epoch": 0.9078373810159238, + "grad_norm": 0.07460074871778488, + "learning_rate": 2.1246065166751538e-05, + "loss": 2.5517, + "step": 30615 + }, + { + "epoch": 0.9078670343682353, + "grad_norm": 0.07429900020360947, + "learning_rate": 2.1232497453361255e-05, + "loss": 2.5515, + "step": 30616 + }, + { + "epoch": 0.9078966877205468, + "grad_norm": 0.06908009201288223, + "learning_rate": 2.121893397953939e-05, + "loss": 2.575, + "step": 30617 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 0.07215619087219238, + "learning_rate": 2.1205374745406013e-05, + "loss": 2.5236, + "step": 30618 + }, + { + "epoch": 0.9079559944251697, + "grad_norm": 0.07224301248788834, + "learning_rate": 2.1191819751081253e-05, + "loss": 2.5283, + "step": 30619 + }, + { + "epoch": 0.9079856477774813, + "grad_norm": 0.06963542103767395, + "learning_rate": 2.1178268996685125e-05, + "loss": 2.5267, + "step": 30620 + }, + { + "epoch": 0.9080153011297927, + "grad_norm": 0.06847040355205536, + "learning_rate": 2.1164722482337583e-05, + "loss": 2.523, + "step": 30621 + }, + { + "epoch": 0.9080449544821042, + "grad_norm": 0.0661683976650238, + "learning_rate": 2.115118020815865e-05, + "loss": 2.5546, + "step": 30622 + }, + { + "epoch": 0.9080746078344156, + "grad_norm": 0.07182042300701141, + "learning_rate": 2.1137642174268278e-05, + "loss": 2.5369, + "step": 30623 + }, + { + "epoch": 0.9081042611867272, + "grad_norm": 0.06945965439081192, + "learning_rate": 2.1124108380786157e-05, + "loss": 2.5768, + "step": 30624 + }, + { + "epoch": 0.9081339145390387, + "grad_norm": 0.0708247721195221, + "learning_rate": 2.1110578827832293e-05, + "loss": 2.5391, + "step": 30625 + }, + { + "epoch": 0.9081635678913501, + "grad_norm": 0.07188716530799866, + "learning_rate": 2.1097053515526488e-05, + "loss": 2.5052, + "step": 30626 + }, + { + "epoch": 0.9081932212436616, + "grad_norm": 0.07271520793437958, + "learning_rate": 2.108353244398842e-05, + "loss": 2.5661, + "step": 30627 + }, + { + "epoch": 0.9082228745959731, + "grad_norm": 0.07432616502046585, + "learning_rate": 2.1070015613337822e-05, + "loss": 2.5339, + "step": 30628 + }, + { + "epoch": 0.9082525279482846, + "grad_norm": 0.0715041235089302, + "learning_rate": 2.105650302369455e-05, + "loss": 2.5373, + "step": 30629 + }, + { + "epoch": 0.908282181300596, + "grad_norm": 0.07896789163351059, + "learning_rate": 2.1042994675178116e-05, + "loss": 2.5656, + "step": 30630 + }, + { + "epoch": 0.9083118346529075, + "grad_norm": 0.07426615059375763, + "learning_rate": 2.102949056790815e-05, + "loss": 2.5683, + "step": 30631 + }, + { + "epoch": 0.908341488005219, + "grad_norm": 0.07447809725999832, + "learning_rate": 2.1015990702004328e-05, + "loss": 2.5298, + "step": 30632 + }, + { + "epoch": 0.9083711413575305, + "grad_norm": 0.07480885088443756, + "learning_rate": 2.1002495077586115e-05, + "loss": 2.554, + "step": 30633 + }, + { + "epoch": 0.9084007947098419, + "grad_norm": 0.06897566467523575, + "learning_rate": 2.0989003694773022e-05, + "loss": 2.5498, + "step": 30634 + }, + { + "epoch": 0.9084304480621535, + "grad_norm": 0.07059203088283539, + "learning_rate": 2.0975516553684516e-05, + "loss": 2.5536, + "step": 30635 + }, + { + "epoch": 0.9084601014144649, + "grad_norm": 0.07526686042547226, + "learning_rate": 2.0962033654439993e-05, + "loss": 2.5419, + "step": 30636 + }, + { + "epoch": 0.9084897547667764, + "grad_norm": 0.07453944534063339, + "learning_rate": 2.094855499715892e-05, + "loss": 2.5071, + "step": 30637 + }, + { + "epoch": 0.9085194081190878, + "grad_norm": 0.07251309603452682, + "learning_rate": 2.093508058196064e-05, + "loss": 2.6147, + "step": 30638 + }, + { + "epoch": 0.9085490614713994, + "grad_norm": 0.07239049673080444, + "learning_rate": 2.0921610408964397e-05, + "loss": 2.5343, + "step": 30639 + }, + { + "epoch": 0.9085787148237108, + "grad_norm": 0.06825444847345352, + "learning_rate": 2.0908144478289592e-05, + "loss": 2.5168, + "step": 30640 + }, + { + "epoch": 0.9086083681760223, + "grad_norm": 0.07625926285982132, + "learning_rate": 2.0894682790055297e-05, + "loss": 2.5247, + "step": 30641 + }, + { + "epoch": 0.9086380215283337, + "grad_norm": 0.06942208856344223, + "learning_rate": 2.088122534438086e-05, + "loss": 2.4994, + "step": 30642 + }, + { + "epoch": 0.9086676748806453, + "grad_norm": 0.07221420854330063, + "learning_rate": 2.0867772141385466e-05, + "loss": 2.5517, + "step": 30643 + }, + { + "epoch": 0.9086973282329567, + "grad_norm": 0.06807471811771393, + "learning_rate": 2.085432318118824e-05, + "loss": 2.5383, + "step": 30644 + }, + { + "epoch": 0.9087269815852682, + "grad_norm": 0.0778496116399765, + "learning_rate": 2.0840878463908143e-05, + "loss": 2.5519, + "step": 30645 + }, + { + "epoch": 0.9087566349375797, + "grad_norm": 0.06898821145296097, + "learning_rate": 2.0827437989664355e-05, + "loss": 2.5866, + "step": 30646 + }, + { + "epoch": 0.9087862882898912, + "grad_norm": 0.06925693899393082, + "learning_rate": 2.0814001758575786e-05, + "loss": 2.5422, + "step": 30647 + }, + { + "epoch": 0.9088159416422027, + "grad_norm": 0.06884434819221497, + "learning_rate": 2.0800569770761558e-05, + "loss": 2.5228, + "step": 30648 + }, + { + "epoch": 0.9088455949945141, + "grad_norm": 0.06960753351449966, + "learning_rate": 2.0787142026340466e-05, + "loss": 2.5414, + "step": 30649 + }, + { + "epoch": 0.9088752483468256, + "grad_norm": 0.07041602581739426, + "learning_rate": 2.077371852543153e-05, + "loss": 2.5541, + "step": 30650 + }, + { + "epoch": 0.9089049016991371, + "grad_norm": 0.07220985740423203, + "learning_rate": 2.0760299268153595e-05, + "loss": 2.52, + "step": 30651 + }, + { + "epoch": 0.9089345550514486, + "grad_norm": 0.07068894803524017, + "learning_rate": 2.0746884254625452e-05, + "loss": 2.5849, + "step": 30652 + }, + { + "epoch": 0.90896420840376, + "grad_norm": 0.06794018298387527, + "learning_rate": 2.0733473484965902e-05, + "loss": 2.5335, + "step": 30653 + }, + { + "epoch": 0.9089938617560716, + "grad_norm": 0.07318952679634094, + "learning_rate": 2.0720066959293682e-05, + "loss": 2.5628, + "step": 30654 + }, + { + "epoch": 0.909023515108383, + "grad_norm": 0.0740288719534874, + "learning_rate": 2.0706664677727583e-05, + "loss": 2.5078, + "step": 30655 + }, + { + "epoch": 0.9090531684606945, + "grad_norm": 0.06752078980207443, + "learning_rate": 2.0693266640386233e-05, + "loss": 2.5531, + "step": 30656 + }, + { + "epoch": 0.9090828218130059, + "grad_norm": 0.07202863693237305, + "learning_rate": 2.067987284738826e-05, + "loss": 2.5185, + "step": 30657 + }, + { + "epoch": 0.9091124751653175, + "grad_norm": 0.07441537827253342, + "learning_rate": 2.066648329885229e-05, + "loss": 2.547, + "step": 30658 + }, + { + "epoch": 0.9091421285176289, + "grad_norm": 0.07342331856489182, + "learning_rate": 2.0653097994896896e-05, + "loss": 2.5159, + "step": 30659 + }, + { + "epoch": 0.9091717818699404, + "grad_norm": 0.07469398528337479, + "learning_rate": 2.0639716935640595e-05, + "loss": 2.5293, + "step": 30660 + }, + { + "epoch": 0.9092014352222518, + "grad_norm": 0.07311929762363434, + "learning_rate": 2.062634012120185e-05, + "loss": 2.5339, + "step": 30661 + }, + { + "epoch": 0.9092310885745634, + "grad_norm": 0.06940256059169769, + "learning_rate": 2.061296755169917e-05, + "loss": 2.5643, + "step": 30662 + }, + { + "epoch": 0.9092607419268748, + "grad_norm": 0.07313846051692963, + "learning_rate": 2.0599599227250963e-05, + "loss": 2.4976, + "step": 30663 + }, + { + "epoch": 0.9092903952791863, + "grad_norm": 0.0724322721362114, + "learning_rate": 2.0586235147975584e-05, + "loss": 2.5594, + "step": 30664 + }, + { + "epoch": 0.9093200486314978, + "grad_norm": 0.07348881661891937, + "learning_rate": 2.057287531399138e-05, + "loss": 2.5372, + "step": 30665 + }, + { + "epoch": 0.9093497019838093, + "grad_norm": 0.0679355189204216, + "learning_rate": 2.05595197254167e-05, + "loss": 2.5442, + "step": 30666 + }, + { + "epoch": 0.9093793553361208, + "grad_norm": 0.07258901745080948, + "learning_rate": 2.0546168382369723e-05, + "loss": 2.5294, + "step": 30667 + }, + { + "epoch": 0.9094090086884322, + "grad_norm": 0.07618077844381332, + "learning_rate": 2.0532821284968696e-05, + "loss": 2.5471, + "step": 30668 + }, + { + "epoch": 0.9094386620407438, + "grad_norm": 0.07174134254455566, + "learning_rate": 2.051947843333185e-05, + "loss": 2.5315, + "step": 30669 + }, + { + "epoch": 0.9094683153930552, + "grad_norm": 0.07119834423065186, + "learning_rate": 2.0506139827577376e-05, + "loss": 2.532, + "step": 30670 + }, + { + "epoch": 0.9094979687453667, + "grad_norm": 0.07126536220312119, + "learning_rate": 2.0492805467823285e-05, + "loss": 2.5475, + "step": 30671 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 0.07550162076950073, + "learning_rate": 2.047947535418776e-05, + "loss": 2.5042, + "step": 30672 + }, + { + "epoch": 0.9095572754499897, + "grad_norm": 0.07929238677024841, + "learning_rate": 2.046614948678871e-05, + "loss": 2.555, + "step": 30673 + }, + { + "epoch": 0.9095869288023011, + "grad_norm": 0.06961118429899216, + "learning_rate": 2.045282786574426e-05, + "loss": 2.5599, + "step": 30674 + }, + { + "epoch": 0.9096165821546126, + "grad_norm": 0.07465048879384995, + "learning_rate": 2.0439510491172374e-05, + "loss": 2.5515, + "step": 30675 + }, + { + "epoch": 0.909646235506924, + "grad_norm": 0.07498092949390411, + "learning_rate": 2.0426197363190902e-05, + "loss": 2.5529, + "step": 30676 + }, + { + "epoch": 0.9096758888592356, + "grad_norm": 0.07419013977050781, + "learning_rate": 2.0412888481917745e-05, + "loss": 2.5539, + "step": 30677 + }, + { + "epoch": 0.909705542211547, + "grad_norm": 0.07669747620820999, + "learning_rate": 2.039958384747087e-05, + "loss": 2.5576, + "step": 30678 + }, + { + "epoch": 0.9097351955638585, + "grad_norm": 0.07481792569160461, + "learning_rate": 2.038628345996796e-05, + "loss": 2.523, + "step": 30679 + }, + { + "epoch": 0.9097648489161699, + "grad_norm": 0.07812948524951935, + "learning_rate": 2.0372987319526748e-05, + "loss": 2.5222, + "step": 30680 + }, + { + "epoch": 0.9097945022684815, + "grad_norm": 0.06940989941358566, + "learning_rate": 2.035969542626509e-05, + "loss": 2.5142, + "step": 30681 + }, + { + "epoch": 0.9098241556207929, + "grad_norm": 0.07161779701709747, + "learning_rate": 2.0346407780300667e-05, + "loss": 2.5174, + "step": 30682 + }, + { + "epoch": 0.9098538089731044, + "grad_norm": 0.07173674553632736, + "learning_rate": 2.0333124381751165e-05, + "loss": 2.5161, + "step": 30683 + }, + { + "epoch": 0.9098834623254158, + "grad_norm": 0.0682896077632904, + "learning_rate": 2.0319845230734212e-05, + "loss": 2.5095, + "step": 30684 + }, + { + "epoch": 0.9099131156777274, + "grad_norm": 0.07176899164915085, + "learning_rate": 2.0306570327367325e-05, + "loss": 2.5635, + "step": 30685 + }, + { + "epoch": 0.9099427690300389, + "grad_norm": 0.06911520659923553, + "learning_rate": 2.0293299671768073e-05, + "loss": 2.5372, + "step": 30686 + }, + { + "epoch": 0.9099724223823503, + "grad_norm": 0.07084836065769196, + "learning_rate": 2.0280033264054033e-05, + "loss": 2.5217, + "step": 30687 + }, + { + "epoch": 0.9100020757346619, + "grad_norm": 0.07388830184936523, + "learning_rate": 2.0266771104342663e-05, + "loss": 2.537, + "step": 30688 + }, + { + "epoch": 0.9100317290869733, + "grad_norm": 0.07131674140691757, + "learning_rate": 2.025351319275137e-05, + "loss": 2.5532, + "step": 30689 + }, + { + "epoch": 0.9100613824392848, + "grad_norm": 0.06860685348510742, + "learning_rate": 2.0240259529397508e-05, + "loss": 2.5513, + "step": 30690 + }, + { + "epoch": 0.9100910357915962, + "grad_norm": 0.07253722101449966, + "learning_rate": 2.0227010114398537e-05, + "loss": 2.5512, + "step": 30691 + }, + { + "epoch": 0.9101206891439078, + "grad_norm": 0.07592159509658813, + "learning_rate": 2.0213764947871692e-05, + "loss": 2.5153, + "step": 30692 + }, + { + "epoch": 0.9101503424962192, + "grad_norm": 0.0765700563788414, + "learning_rate": 2.020052402993433e-05, + "loss": 2.5553, + "step": 30693 + }, + { + "epoch": 0.9101799958485307, + "grad_norm": 0.07101644575595856, + "learning_rate": 2.0187287360703743e-05, + "loss": 2.5507, + "step": 30694 + }, + { + "epoch": 0.9102096492008421, + "grad_norm": 0.07014728337526321, + "learning_rate": 2.017405494029706e-05, + "loss": 2.5354, + "step": 30695 + }, + { + "epoch": 0.9102393025531537, + "grad_norm": 0.07346795499324799, + "learning_rate": 2.0160826768831463e-05, + "loss": 2.5305, + "step": 30696 + }, + { + "epoch": 0.9102689559054651, + "grad_norm": 0.0744047686457634, + "learning_rate": 2.0147602846424085e-05, + "loss": 2.51, + "step": 30697 + }, + { + "epoch": 0.9102986092577766, + "grad_norm": 0.0745457336306572, + "learning_rate": 2.0134383173192107e-05, + "loss": 2.535, + "step": 30698 + }, + { + "epoch": 0.910328262610088, + "grad_norm": 0.07225676625967026, + "learning_rate": 2.0121167749252544e-05, + "loss": 2.5078, + "step": 30699 + }, + { + "epoch": 0.9103579159623996, + "grad_norm": 0.07696022838354111, + "learning_rate": 2.010795657472242e-05, + "loss": 2.5331, + "step": 30700 + }, + { + "epoch": 0.910387569314711, + "grad_norm": 0.07632666826248169, + "learning_rate": 2.0094749649718635e-05, + "loss": 2.5367, + "step": 30701 + }, + { + "epoch": 0.9104172226670225, + "grad_norm": 0.07106192409992218, + "learning_rate": 2.0081546974358266e-05, + "loss": 2.5417, + "step": 30702 + }, + { + "epoch": 0.9104468760193339, + "grad_norm": 0.07732464373111725, + "learning_rate": 2.0068348548758162e-05, + "loss": 2.5243, + "step": 30703 + }, + { + "epoch": 0.9104765293716455, + "grad_norm": 0.07524503022432327, + "learning_rate": 2.005515437303518e-05, + "loss": 2.5319, + "step": 30704 + }, + { + "epoch": 0.9105061827239569, + "grad_norm": 0.07270877063274384, + "learning_rate": 2.004196444730616e-05, + "loss": 2.5679, + "step": 30705 + }, + { + "epoch": 0.9105358360762684, + "grad_norm": 0.07102781534194946, + "learning_rate": 2.0028778771688015e-05, + "loss": 2.5127, + "step": 30706 + }, + { + "epoch": 0.91056548942858, + "grad_norm": 0.07579440623521805, + "learning_rate": 2.0015597346297376e-05, + "loss": 2.5694, + "step": 30707 + }, + { + "epoch": 0.9105951427808914, + "grad_norm": 0.07472584396600723, + "learning_rate": 2.0002420171251036e-05, + "loss": 2.5724, + "step": 30708 + }, + { + "epoch": 0.9106247961332029, + "grad_norm": 0.07438923418521881, + "learning_rate": 1.9989247246665678e-05, + "loss": 2.5076, + "step": 30709 + }, + { + "epoch": 0.9106544494855143, + "grad_norm": 0.0705961212515831, + "learning_rate": 1.997607857265793e-05, + "loss": 2.5724, + "step": 30710 + }, + { + "epoch": 0.9106841028378259, + "grad_norm": 0.07525182515382767, + "learning_rate": 1.996291414934437e-05, + "loss": 2.5357, + "step": 30711 + }, + { + "epoch": 0.9107137561901373, + "grad_norm": 0.08070534467697144, + "learning_rate": 1.9949753976841568e-05, + "loss": 2.5268, + "step": 30712 + }, + { + "epoch": 0.9107434095424488, + "grad_norm": 0.07369643449783325, + "learning_rate": 1.993659805526615e-05, + "loss": 2.5371, + "step": 30713 + }, + { + "epoch": 0.9107730628947602, + "grad_norm": 0.07025221735239029, + "learning_rate": 1.9923446384734534e-05, + "loss": 2.5422, + "step": 30714 + }, + { + "epoch": 0.9108027162470718, + "grad_norm": 0.07166467607021332, + "learning_rate": 1.991029896536317e-05, + "loss": 2.5637, + "step": 30715 + }, + { + "epoch": 0.9108323695993832, + "grad_norm": 0.07115460187196732, + "learning_rate": 1.9897155797268586e-05, + "loss": 2.5571, + "step": 30716 + }, + { + "epoch": 0.9108620229516947, + "grad_norm": 0.07627957314252853, + "learning_rate": 1.9884016880567014e-05, + "loss": 2.5281, + "step": 30717 + }, + { + "epoch": 0.9108916763040061, + "grad_norm": 0.07164483517408371, + "learning_rate": 1.9870882215374865e-05, + "loss": 2.5498, + "step": 30718 + }, + { + "epoch": 0.9109213296563177, + "grad_norm": 0.06964239478111267, + "learning_rate": 1.9857751801808544e-05, + "loss": 2.5343, + "step": 30719 + }, + { + "epoch": 0.9109509830086291, + "grad_norm": 0.07348848134279251, + "learning_rate": 1.9844625639984293e-05, + "loss": 2.5649, + "step": 30720 + }, + { + "epoch": 0.9109806363609406, + "grad_norm": 0.07006248831748962, + "learning_rate": 1.9831503730018242e-05, + "loss": 2.524, + "step": 30721 + }, + { + "epoch": 0.911010289713252, + "grad_norm": 0.0710027813911438, + "learning_rate": 1.981838607202663e-05, + "loss": 2.5307, + "step": 30722 + }, + { + "epoch": 0.9110399430655636, + "grad_norm": 0.07067766040563583, + "learning_rate": 1.980527266612564e-05, + "loss": 2.5296, + "step": 30723 + }, + { + "epoch": 0.911069596417875, + "grad_norm": 0.07403451949357986, + "learning_rate": 1.9792163512431405e-05, + "loss": 2.5674, + "step": 30724 + }, + { + "epoch": 0.9110992497701865, + "grad_norm": 0.07162357866764069, + "learning_rate": 1.9779058611059943e-05, + "loss": 2.5291, + "step": 30725 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 0.06908221542835236, + "learning_rate": 1.976595796212738e-05, + "loss": 2.5319, + "step": 30726 + }, + { + "epoch": 0.9111585564748095, + "grad_norm": 0.06841139495372772, + "learning_rate": 1.975286156574968e-05, + "loss": 2.5101, + "step": 30727 + }, + { + "epoch": 0.911188209827121, + "grad_norm": 0.0719345360994339, + "learning_rate": 1.9739769422042862e-05, + "loss": 2.5394, + "step": 30728 + }, + { + "epoch": 0.9112178631794324, + "grad_norm": 0.07642330974340439, + "learning_rate": 1.972668153112278e-05, + "loss": 2.5549, + "step": 30729 + }, + { + "epoch": 0.911247516531744, + "grad_norm": 0.07087280601263046, + "learning_rate": 1.9713597893105396e-05, + "loss": 2.5512, + "step": 30730 + }, + { + "epoch": 0.9112771698840554, + "grad_norm": 0.07229246199131012, + "learning_rate": 1.97005185081065e-05, + "loss": 2.5226, + "step": 30731 + }, + { + "epoch": 0.9113068232363669, + "grad_norm": 0.07169600576162338, + "learning_rate": 1.9687443376242008e-05, + "loss": 2.5666, + "step": 30732 + }, + { + "epoch": 0.9113364765886783, + "grad_norm": 0.07373072952032089, + "learning_rate": 1.96743724976276e-05, + "loss": 2.5234, + "step": 30733 + }, + { + "epoch": 0.9113661299409899, + "grad_norm": 0.06915321201086044, + "learning_rate": 1.9661305872379075e-05, + "loss": 2.5013, + "step": 30734 + }, + { + "epoch": 0.9113957832933013, + "grad_norm": 0.07315173000097275, + "learning_rate": 1.9648243500612173e-05, + "loss": 2.5345, + "step": 30735 + }, + { + "epoch": 0.9114254366456128, + "grad_norm": 0.07414328306913376, + "learning_rate": 1.963518538244252e-05, + "loss": 2.535, + "step": 30736 + }, + { + "epoch": 0.9114550899979242, + "grad_norm": 0.07389529049396515, + "learning_rate": 1.9622131517985697e-05, + "loss": 2.6045, + "step": 30737 + }, + { + "epoch": 0.9114847433502358, + "grad_norm": 0.07347606122493744, + "learning_rate": 1.960908190735744e-05, + "loss": 2.5245, + "step": 30738 + }, + { + "epoch": 0.9115143967025472, + "grad_norm": 0.07153008878231049, + "learning_rate": 1.9596036550673156e-05, + "loss": 2.5393, + "step": 30739 + }, + { + "epoch": 0.9115440500548587, + "grad_norm": 0.07199408113956451, + "learning_rate": 1.958299544804848e-05, + "loss": 2.523, + "step": 30740 + }, + { + "epoch": 0.9115737034071701, + "grad_norm": 0.06915460526943207, + "learning_rate": 1.9569958599598813e-05, + "loss": 2.5386, + "step": 30741 + }, + { + "epoch": 0.9116033567594817, + "grad_norm": 0.07274720072746277, + "learning_rate": 1.9556926005439622e-05, + "loss": 2.5341, + "step": 30742 + }, + { + "epoch": 0.9116330101117931, + "grad_norm": 0.07549776881933212, + "learning_rate": 1.9543897665686317e-05, + "loss": 2.5346, + "step": 30743 + }, + { + "epoch": 0.9116626634641046, + "grad_norm": 0.07582176476716995, + "learning_rate": 1.9530873580454246e-05, + "loss": 2.5418, + "step": 30744 + }, + { + "epoch": 0.911692316816416, + "grad_norm": 0.07069188356399536, + "learning_rate": 1.9517853749858817e-05, + "loss": 2.5541, + "step": 30745 + }, + { + "epoch": 0.9117219701687276, + "grad_norm": 0.07356593012809753, + "learning_rate": 1.9504838174015215e-05, + "loss": 2.5314, + "step": 30746 + }, + { + "epoch": 0.911751623521039, + "grad_norm": 0.07335594296455383, + "learning_rate": 1.9491826853038797e-05, + "loss": 2.5107, + "step": 30747 + }, + { + "epoch": 0.9117812768733505, + "grad_norm": 0.07794757932424545, + "learning_rate": 1.9478819787044687e-05, + "loss": 2.5438, + "step": 30748 + }, + { + "epoch": 0.9118109302256621, + "grad_norm": 0.07147763669490814, + "learning_rate": 1.946581697614813e-05, + "loss": 2.5602, + "step": 30749 + }, + { + "epoch": 0.9118405835779735, + "grad_norm": 0.07137133181095123, + "learning_rate": 1.945281842046426e-05, + "loss": 2.5259, + "step": 30750 + }, + { + "epoch": 0.911870236930285, + "grad_norm": 0.07576969265937805, + "learning_rate": 1.943982412010814e-05, + "loss": 2.5516, + "step": 30751 + }, + { + "epoch": 0.9118998902825964, + "grad_norm": 0.07687268406152725, + "learning_rate": 1.9426834075194853e-05, + "loss": 2.5418, + "step": 30752 + }, + { + "epoch": 0.911929543634908, + "grad_norm": 0.07546243071556091, + "learning_rate": 1.9413848285839476e-05, + "loss": 2.5415, + "step": 30753 + }, + { + "epoch": 0.9119591969872194, + "grad_norm": 0.07339789718389511, + "learning_rate": 1.940086675215702e-05, + "loss": 2.5666, + "step": 30754 + }, + { + "epoch": 0.9119888503395309, + "grad_norm": 0.07807888090610504, + "learning_rate": 1.9387889474262286e-05, + "loss": 2.5321, + "step": 30755 + }, + { + "epoch": 0.9120185036918423, + "grad_norm": 0.07513977587223053, + "learning_rate": 1.9374916452270352e-05, + "loss": 2.504, + "step": 30756 + }, + { + "epoch": 0.9120481570441539, + "grad_norm": 0.07078169286251068, + "learning_rate": 1.93619476862959e-05, + "loss": 2.5573, + "step": 30757 + }, + { + "epoch": 0.9120778103964653, + "grad_norm": 0.07275621592998505, + "learning_rate": 1.9348983176454006e-05, + "loss": 2.56, + "step": 30758 + }, + { + "epoch": 0.9121074637487768, + "grad_norm": 0.07553595304489136, + "learning_rate": 1.9336022922859353e-05, + "loss": 2.5186, + "step": 30759 + }, + { + "epoch": 0.9121371171010882, + "grad_norm": 0.07449416816234589, + "learning_rate": 1.932306692562674e-05, + "loss": 2.5047, + "step": 30760 + }, + { + "epoch": 0.9121667704533998, + "grad_norm": 0.07197996973991394, + "learning_rate": 1.9310115184870857e-05, + "loss": 2.4999, + "step": 30761 + }, + { + "epoch": 0.9121964238057112, + "grad_norm": 0.07393622398376465, + "learning_rate": 1.9297167700706385e-05, + "loss": 2.5424, + "step": 30762 + }, + { + "epoch": 0.9122260771580227, + "grad_norm": 0.07192428410053253, + "learning_rate": 1.9284224473248068e-05, + "loss": 2.5119, + "step": 30763 + }, + { + "epoch": 0.9122557305103342, + "grad_norm": 0.07607350498437881, + "learning_rate": 1.9271285502610423e-05, + "loss": 2.5746, + "step": 30764 + }, + { + "epoch": 0.9122853838626457, + "grad_norm": 0.07103213667869568, + "learning_rate": 1.9258350788908142e-05, + "loss": 2.5184, + "step": 30765 + }, + { + "epoch": 0.9123150372149571, + "grad_norm": 0.07369989901781082, + "learning_rate": 1.924542033225557e-05, + "loss": 2.5336, + "step": 30766 + }, + { + "epoch": 0.9123446905672686, + "grad_norm": 0.0728115662932396, + "learning_rate": 1.9232494132767342e-05, + "loss": 2.5852, + "step": 30767 + }, + { + "epoch": 0.91237434391958, + "grad_norm": 0.07183561474084854, + "learning_rate": 1.9219572190557922e-05, + "loss": 2.5629, + "step": 30768 + }, + { + "epoch": 0.9124039972718916, + "grad_norm": 0.0697878822684288, + "learning_rate": 1.9206654505741717e-05, + "loss": 2.5028, + "step": 30769 + }, + { + "epoch": 0.9124336506242031, + "grad_norm": 0.07387420535087585, + "learning_rate": 1.9193741078433026e-05, + "loss": 2.529, + "step": 30770 + }, + { + "epoch": 0.9124633039765145, + "grad_norm": 0.07375866174697876, + "learning_rate": 1.9180831908746364e-05, + "loss": 2.5426, + "step": 30771 + }, + { + "epoch": 0.9124929573288261, + "grad_norm": 0.07326392084360123, + "learning_rate": 1.916792699679598e-05, + "loss": 2.5445, + "step": 30772 + }, + { + "epoch": 0.9125226106811375, + "grad_norm": 0.07015576213598251, + "learning_rate": 1.9155026342696113e-05, + "loss": 2.5203, + "step": 30773 + }, + { + "epoch": 0.912552264033449, + "grad_norm": 0.07307841628789902, + "learning_rate": 1.914212994656106e-05, + "loss": 2.5473, + "step": 30774 + }, + { + "epoch": 0.9125819173857604, + "grad_norm": 0.07165664434432983, + "learning_rate": 1.9129237808505007e-05, + "loss": 2.5352, + "step": 30775 + }, + { + "epoch": 0.912611570738072, + "grad_norm": 0.07312136888504028, + "learning_rate": 1.9116349928642084e-05, + "loss": 2.5133, + "step": 30776 + }, + { + "epoch": 0.9126412240903834, + "grad_norm": 0.07605616003274918, + "learning_rate": 1.910346630708637e-05, + "loss": 2.5135, + "step": 30777 + }, + { + "epoch": 0.9126708774426949, + "grad_norm": 0.07171201705932617, + "learning_rate": 1.9090586943952048e-05, + "loss": 2.5571, + "step": 30778 + }, + { + "epoch": 0.9127005307950063, + "grad_norm": 0.07341980934143066, + "learning_rate": 1.9077711839353085e-05, + "loss": 2.5674, + "step": 30779 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 0.07315237820148468, + "learning_rate": 1.9064840993403554e-05, + "loss": 2.5709, + "step": 30780 + }, + { + "epoch": 0.9127598374996293, + "grad_norm": 0.07889654487371445, + "learning_rate": 1.9051974406217366e-05, + "loss": 2.5322, + "step": 30781 + }, + { + "epoch": 0.9127894908519408, + "grad_norm": 0.0697561725974083, + "learning_rate": 1.9039112077908537e-05, + "loss": 2.528, + "step": 30782 + }, + { + "epoch": 0.9128191442042523, + "grad_norm": 0.0672924742102623, + "learning_rate": 1.902625400859087e-05, + "loss": 2.5225, + "step": 30783 + }, + { + "epoch": 0.9128487975565638, + "grad_norm": 0.06859756261110306, + "learning_rate": 1.9013400198378382e-05, + "loss": 2.5478, + "step": 30784 + }, + { + "epoch": 0.9128784509088752, + "grad_norm": 0.07275183498859406, + "learning_rate": 1.9000550647384763e-05, + "loss": 2.5214, + "step": 30785 + }, + { + "epoch": 0.9129081042611867, + "grad_norm": 0.07124355435371399, + "learning_rate": 1.898770535572386e-05, + "loss": 2.5463, + "step": 30786 + }, + { + "epoch": 0.9129377576134982, + "grad_norm": 0.06842033565044403, + "learning_rate": 1.897486432350931e-05, + "loss": 2.5409, + "step": 30787 + }, + { + "epoch": 0.9129674109658097, + "grad_norm": 0.07274429500102997, + "learning_rate": 1.8962027550854965e-05, + "loss": 2.52, + "step": 30788 + }, + { + "epoch": 0.9129970643181211, + "grad_norm": 0.07098930329084396, + "learning_rate": 1.8949195037874402e-05, + "loss": 2.5315, + "step": 30789 + }, + { + "epoch": 0.9130267176704326, + "grad_norm": 0.07078385353088379, + "learning_rate": 1.89363667846813e-05, + "loss": 2.5547, + "step": 30790 + }, + { + "epoch": 0.9130563710227442, + "grad_norm": 0.06991399824619293, + "learning_rate": 1.8923542791389246e-05, + "loss": 2.5205, + "step": 30791 + }, + { + "epoch": 0.9130860243750556, + "grad_norm": 0.07614101469516754, + "learning_rate": 1.891072305811181e-05, + "loss": 2.5251, + "step": 30792 + }, + { + "epoch": 0.9131156777273671, + "grad_norm": 0.07063714414834976, + "learning_rate": 1.889790758496246e-05, + "loss": 2.5234, + "step": 30793 + }, + { + "epoch": 0.9131453310796785, + "grad_norm": 0.06913138926029205, + "learning_rate": 1.8885096372054766e-05, + "loss": 2.5419, + "step": 30794 + }, + { + "epoch": 0.9131749844319901, + "grad_norm": 0.06864037364721298, + "learning_rate": 1.8872289419502085e-05, + "loss": 2.5242, + "step": 30795 + }, + { + "epoch": 0.9132046377843015, + "grad_norm": 0.0700104609131813, + "learning_rate": 1.8859486727417885e-05, + "loss": 2.5351, + "step": 30796 + }, + { + "epoch": 0.913234291136613, + "grad_norm": 0.07496554404497147, + "learning_rate": 1.8846688295915515e-05, + "loss": 2.5771, + "step": 30797 + }, + { + "epoch": 0.9132639444889245, + "grad_norm": 0.0711732730269432, + "learning_rate": 1.8833894125108274e-05, + "loss": 2.5532, + "step": 30798 + }, + { + "epoch": 0.913293597841236, + "grad_norm": 0.0690891295671463, + "learning_rate": 1.882110421510952e-05, + "loss": 2.5524, + "step": 30799 + }, + { + "epoch": 0.9133232511935474, + "grad_norm": 0.07312310487031937, + "learning_rate": 1.880831856603249e-05, + "loss": 2.5666, + "step": 30800 + }, + { + "epoch": 0.9133529045458589, + "grad_norm": 0.07556319981813431, + "learning_rate": 1.879553717799043e-05, + "loss": 2.5458, + "step": 30801 + }, + { + "epoch": 0.9133825578981704, + "grad_norm": 0.07146301120519638, + "learning_rate": 1.8782760051096415e-05, + "loss": 2.5329, + "step": 30802 + }, + { + "epoch": 0.9134122112504819, + "grad_norm": 0.0700620636343956, + "learning_rate": 1.8769987185463687e-05, + "loss": 2.5191, + "step": 30803 + }, + { + "epoch": 0.9134418646027933, + "grad_norm": 0.0732811689376831, + "learning_rate": 1.8757218581205328e-05, + "loss": 2.5545, + "step": 30804 + }, + { + "epoch": 0.9134715179551048, + "grad_norm": 0.07501471787691116, + "learning_rate": 1.8744454238434405e-05, + "loss": 2.5415, + "step": 30805 + }, + { + "epoch": 0.9135011713074163, + "grad_norm": 0.0704377144575119, + "learning_rate": 1.8731694157263944e-05, + "loss": 2.5402, + "step": 30806 + }, + { + "epoch": 0.9135308246597278, + "grad_norm": 0.06951357424259186, + "learning_rate": 1.8718938337806967e-05, + "loss": 2.5424, + "step": 30807 + }, + { + "epoch": 0.9135604780120392, + "grad_norm": 0.07375404983758926, + "learning_rate": 1.870618678017638e-05, + "loss": 2.5266, + "step": 30808 + }, + { + "epoch": 0.9135901313643507, + "grad_norm": 0.07454246282577515, + "learning_rate": 1.86934394844851e-05, + "loss": 2.575, + "step": 30809 + }, + { + "epoch": 0.9136197847166622, + "grad_norm": 0.07135440409183502, + "learning_rate": 1.8680696450846023e-05, + "loss": 2.5524, + "step": 30810 + }, + { + "epoch": 0.9136494380689737, + "grad_norm": 0.07233457267284393, + "learning_rate": 1.8667957679372015e-05, + "loss": 2.548, + "step": 30811 + }, + { + "epoch": 0.9136790914212852, + "grad_norm": 0.07222431898117065, + "learning_rate": 1.865522317017587e-05, + "loss": 2.578, + "step": 30812 + }, + { + "epoch": 0.9137087447735966, + "grad_norm": 0.07510115951299667, + "learning_rate": 1.8642492923370336e-05, + "loss": 2.5428, + "step": 30813 + }, + { + "epoch": 0.9137383981259082, + "grad_norm": 0.07225196808576584, + "learning_rate": 1.8629766939068206e-05, + "loss": 2.5577, + "step": 30814 + }, + { + "epoch": 0.9137680514782196, + "grad_norm": 0.06906470656394958, + "learning_rate": 1.8617045217382056e-05, + "loss": 2.5379, + "step": 30815 + }, + { + "epoch": 0.9137977048305311, + "grad_norm": 0.06903228163719177, + "learning_rate": 1.8604327758424578e-05, + "loss": 2.4957, + "step": 30816 + }, + { + "epoch": 0.9138273581828426, + "grad_norm": 0.07286979258060455, + "learning_rate": 1.8591614562308458e-05, + "loss": 2.5335, + "step": 30817 + }, + { + "epoch": 0.9138570115351541, + "grad_norm": 0.07167960703372955, + "learning_rate": 1.8578905629146213e-05, + "loss": 2.5221, + "step": 30818 + }, + { + "epoch": 0.9138866648874655, + "grad_norm": 0.06931174546480179, + "learning_rate": 1.8566200959050373e-05, + "loss": 2.5276, + "step": 30819 + }, + { + "epoch": 0.913916318239777, + "grad_norm": 0.07354281097650528, + "learning_rate": 1.8553500552133505e-05, + "loss": 2.5237, + "step": 30820 + }, + { + "epoch": 0.9139459715920885, + "grad_norm": 0.07085214555263519, + "learning_rate": 1.8540804408508027e-05, + "loss": 2.5293, + "step": 30821 + }, + { + "epoch": 0.9139756249444, + "grad_norm": 0.06885857880115509, + "learning_rate": 1.852811252828629e-05, + "loss": 2.5344, + "step": 30822 + }, + { + "epoch": 0.9140052782967114, + "grad_norm": 0.06904637068510056, + "learning_rate": 1.8515424911580813e-05, + "loss": 2.5516, + "step": 30823 + }, + { + "epoch": 0.9140349316490229, + "grad_norm": 0.06777487695217133, + "learning_rate": 1.8502741558503842e-05, + "loss": 2.5181, + "step": 30824 + }, + { + "epoch": 0.9140645850013344, + "grad_norm": 0.0744064524769783, + "learning_rate": 1.849006246916779e-05, + "loss": 2.5595, + "step": 30825 + }, + { + "epoch": 0.9140942383536459, + "grad_norm": 0.07095658034086227, + "learning_rate": 1.8477387643684895e-05, + "loss": 2.5519, + "step": 30826 + }, + { + "epoch": 0.9141238917059573, + "grad_norm": 0.07191590219736099, + "learning_rate": 1.8464717082167403e-05, + "loss": 2.528, + "step": 30827 + }, + { + "epoch": 0.9141535450582688, + "grad_norm": 0.06851107627153397, + "learning_rate": 1.845205078472745e-05, + "loss": 2.5244, + "step": 30828 + }, + { + "epoch": 0.9141831984105803, + "grad_norm": 0.06958553194999695, + "learning_rate": 1.8439388751477272e-05, + "loss": 2.5418, + "step": 30829 + }, + { + "epoch": 0.9142128517628918, + "grad_norm": 0.06814755499362946, + "learning_rate": 1.8426730982529006e-05, + "loss": 2.5679, + "step": 30830 + }, + { + "epoch": 0.9142425051152032, + "grad_norm": 0.06829645484685898, + "learning_rate": 1.8414077477994618e-05, + "loss": 2.5117, + "step": 30831 + }, + { + "epoch": 0.9142721584675148, + "grad_norm": 0.06848806142807007, + "learning_rate": 1.8401428237986297e-05, + "loss": 2.5294, + "step": 30832 + }, + { + "epoch": 0.9143018118198263, + "grad_norm": 0.07036362588405609, + "learning_rate": 1.8388783262615948e-05, + "loss": 2.5355, + "step": 30833 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 0.06851041316986084, + "learning_rate": 1.83761425519956e-05, + "loss": 2.523, + "step": 30834 + }, + { + "epoch": 0.9143611185244492, + "grad_norm": 0.06802595406770706, + "learning_rate": 1.8363506106237106e-05, + "loss": 2.5526, + "step": 30835 + }, + { + "epoch": 0.9143907718767607, + "grad_norm": 0.06560967117547989, + "learning_rate": 1.835087392545254e-05, + "loss": 2.5403, + "step": 30836 + }, + { + "epoch": 0.9144204252290722, + "grad_norm": 0.06857752799987793, + "learning_rate": 1.8338246009753645e-05, + "loss": 2.5434, + "step": 30837 + }, + { + "epoch": 0.9144500785813836, + "grad_norm": 0.06824667006731033, + "learning_rate": 1.8325622359252226e-05, + "loss": 2.4939, + "step": 30838 + }, + { + "epoch": 0.9144797319336951, + "grad_norm": 0.07370281964540482, + "learning_rate": 1.8313002974060135e-05, + "loss": 2.5761, + "step": 30839 + }, + { + "epoch": 0.9145093852860066, + "grad_norm": 0.07092424482107162, + "learning_rate": 1.8300387854289058e-05, + "loss": 2.5659, + "step": 30840 + }, + { + "epoch": 0.9145390386383181, + "grad_norm": 0.0709802433848381, + "learning_rate": 1.8287777000050797e-05, + "loss": 2.5392, + "step": 30841 + }, + { + "epoch": 0.9145686919906295, + "grad_norm": 0.07444542646408081, + "learning_rate": 1.8275170411456875e-05, + "loss": 2.5629, + "step": 30842 + }, + { + "epoch": 0.914598345342941, + "grad_norm": 0.07360217720270157, + "learning_rate": 1.8262568088619036e-05, + "loss": 2.5692, + "step": 30843 + }, + { + "epoch": 0.9146279986952525, + "grad_norm": 0.07000017911195755, + "learning_rate": 1.8249970031648855e-05, + "loss": 2.5144, + "step": 30844 + }, + { + "epoch": 0.914657652047564, + "grad_norm": 0.06766754388809204, + "learning_rate": 1.8237376240657856e-05, + "loss": 2.5262, + "step": 30845 + }, + { + "epoch": 0.9146873053998754, + "grad_norm": 0.0721382275223732, + "learning_rate": 1.8224786715757613e-05, + "loss": 2.5204, + "step": 30846 + }, + { + "epoch": 0.914716958752187, + "grad_norm": 0.06700266152620316, + "learning_rate": 1.8212201457059542e-05, + "loss": 2.5371, + "step": 30847 + }, + { + "epoch": 0.9147466121044984, + "grad_norm": 0.07147979736328125, + "learning_rate": 1.8199620464675105e-05, + "loss": 2.5374, + "step": 30848 + }, + { + "epoch": 0.9147762654568099, + "grad_norm": 0.06898094713687897, + "learning_rate": 1.8187043738715768e-05, + "loss": 2.5863, + "step": 30849 + }, + { + "epoch": 0.9148059188091213, + "grad_norm": 0.07006272673606873, + "learning_rate": 1.8174471279292835e-05, + "loss": 2.5383, + "step": 30850 + }, + { + "epoch": 0.9148355721614329, + "grad_norm": 0.06661702692508698, + "learning_rate": 1.8161903086517773e-05, + "loss": 2.5569, + "step": 30851 + }, + { + "epoch": 0.9148652255137443, + "grad_norm": 0.069163478910923, + "learning_rate": 1.8149339160501653e-05, + "loss": 2.5349, + "step": 30852 + }, + { + "epoch": 0.9148948788660558, + "grad_norm": 0.06749448925256729, + "learning_rate": 1.8136779501355893e-05, + "loss": 2.5522, + "step": 30853 + }, + { + "epoch": 0.9149245322183673, + "grad_norm": 0.06883443146944046, + "learning_rate": 1.812422410919162e-05, + "loss": 2.5254, + "step": 30854 + }, + { + "epoch": 0.9149541855706788, + "grad_norm": 0.06909133493900299, + "learning_rate": 1.8111672984120088e-05, + "loss": 2.5823, + "step": 30855 + }, + { + "epoch": 0.9149838389229903, + "grad_norm": 0.07131238281726837, + "learning_rate": 1.8099126126252363e-05, + "loss": 2.5546, + "step": 30856 + }, + { + "epoch": 0.9150134922753017, + "grad_norm": 0.0682004764676094, + "learning_rate": 1.8086583535699642e-05, + "loss": 2.5406, + "step": 30857 + }, + { + "epoch": 0.9150431456276132, + "grad_norm": 0.06711006164550781, + "learning_rate": 1.8074045212572943e-05, + "loss": 2.5296, + "step": 30858 + }, + { + "epoch": 0.9150727989799247, + "grad_norm": 0.0707574337720871, + "learning_rate": 1.8061511156983235e-05, + "loss": 2.4907, + "step": 30859 + }, + { + "epoch": 0.9151024523322362, + "grad_norm": 0.06844860315322876, + "learning_rate": 1.8048981369041594e-05, + "loss": 2.5325, + "step": 30860 + }, + { + "epoch": 0.9151321056845476, + "grad_norm": 0.07153622061014175, + "learning_rate": 1.803645584885899e-05, + "loss": 2.5238, + "step": 30861 + }, + { + "epoch": 0.9151617590368591, + "grad_norm": 0.06976814568042755, + "learning_rate": 1.8023934596546275e-05, + "loss": 2.5271, + "step": 30862 + }, + { + "epoch": 0.9151914123891706, + "grad_norm": 0.06854662299156189, + "learning_rate": 1.8011417612214365e-05, + "loss": 2.5584, + "step": 30863 + }, + { + "epoch": 0.9152210657414821, + "grad_norm": 0.06875565648078918, + "learning_rate": 1.7998904895974056e-05, + "loss": 2.5724, + "step": 30864 + }, + { + "epoch": 0.9152507190937935, + "grad_norm": 0.06612236052751541, + "learning_rate": 1.7986396447936203e-05, + "loss": 2.5303, + "step": 30865 + }, + { + "epoch": 0.915280372446105, + "grad_norm": 0.06646565347909927, + "learning_rate": 1.79738922682115e-05, + "loss": 2.5239, + "step": 30866 + }, + { + "epoch": 0.9153100257984165, + "grad_norm": 0.07366222143173218, + "learning_rate": 1.7961392356910745e-05, + "loss": 2.5563, + "step": 30867 + }, + { + "epoch": 0.915339679150728, + "grad_norm": 0.07074074447154999, + "learning_rate": 1.7948896714144624e-05, + "loss": 2.542, + "step": 30868 + }, + { + "epoch": 0.9153693325030394, + "grad_norm": 0.06846627593040466, + "learning_rate": 1.793640534002372e-05, + "loss": 2.5271, + "step": 30869 + }, + { + "epoch": 0.915398985855351, + "grad_norm": 0.0689772292971611, + "learning_rate": 1.792391823465872e-05, + "loss": 2.5309, + "step": 30870 + }, + { + "epoch": 0.9154286392076624, + "grad_norm": 0.06871778517961502, + "learning_rate": 1.7911435398160202e-05, + "loss": 2.5396, + "step": 30871 + }, + { + "epoch": 0.9154582925599739, + "grad_norm": 0.07181858271360397, + "learning_rate": 1.7898956830638634e-05, + "loss": 2.5331, + "step": 30872 + }, + { + "epoch": 0.9154879459122854, + "grad_norm": 0.06747930496931076, + "learning_rate": 1.7886482532204597e-05, + "loss": 2.5473, + "step": 30873 + }, + { + "epoch": 0.9155175992645969, + "grad_norm": 0.07275892049074173, + "learning_rate": 1.787401250296844e-05, + "loss": 2.5579, + "step": 30874 + }, + { + "epoch": 0.9155472526169084, + "grad_norm": 0.0683135837316513, + "learning_rate": 1.786154674304069e-05, + "loss": 2.5343, + "step": 30875 + }, + { + "epoch": 0.9155769059692198, + "grad_norm": 0.07384578883647919, + "learning_rate": 1.7849085252531707e-05, + "loss": 2.5139, + "step": 30876 + }, + { + "epoch": 0.9156065593215313, + "grad_norm": 0.06979671865701675, + "learning_rate": 1.783662803155184e-05, + "loss": 2.5423, + "step": 30877 + }, + { + "epoch": 0.9156362126738428, + "grad_norm": 0.0706067606806755, + "learning_rate": 1.782417508021139e-05, + "loss": 2.551, + "step": 30878 + }, + { + "epoch": 0.9156658660261543, + "grad_norm": 0.07058718055486679, + "learning_rate": 1.7811726398620666e-05, + "loss": 2.5233, + "step": 30879 + }, + { + "epoch": 0.9156955193784657, + "grad_norm": 0.07316768914461136, + "learning_rate": 1.779928198688979e-05, + "loss": 2.529, + "step": 30880 + }, + { + "epoch": 0.9157251727307772, + "grad_norm": 0.07141587883234024, + "learning_rate": 1.778684184512913e-05, + "loss": 2.5411, + "step": 30881 + }, + { + "epoch": 0.9157548260830887, + "grad_norm": 0.07388673722743988, + "learning_rate": 1.7774405973448706e-05, + "loss": 2.544, + "step": 30882 + }, + { + "epoch": 0.9157844794354002, + "grad_norm": 0.06986219435930252, + "learning_rate": 1.77619743719587e-05, + "loss": 2.5432, + "step": 30883 + }, + { + "epoch": 0.9158141327877116, + "grad_norm": 0.07422257214784622, + "learning_rate": 1.77495470407692e-05, + "loss": 2.5658, + "step": 30884 + }, + { + "epoch": 0.9158437861400232, + "grad_norm": 0.06984017044305801, + "learning_rate": 1.773712397999028e-05, + "loss": 2.5553, + "step": 30885 + }, + { + "epoch": 0.9158734394923346, + "grad_norm": 0.07342629879713058, + "learning_rate": 1.7724705189731792e-05, + "loss": 2.5482, + "step": 30886 + }, + { + "epoch": 0.9159030928446461, + "grad_norm": 0.07014137506484985, + "learning_rate": 1.771229067010388e-05, + "loss": 2.5136, + "step": 30887 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 0.06906460225582123, + "learning_rate": 1.7699880421216398e-05, + "loss": 2.5222, + "step": 30888 + }, + { + "epoch": 0.9159623995492691, + "grad_norm": 0.06910820305347443, + "learning_rate": 1.768747444317925e-05, + "loss": 2.5665, + "step": 30889 + }, + { + "epoch": 0.9159920529015805, + "grad_norm": 0.0707445964217186, + "learning_rate": 1.767507273610236e-05, + "loss": 2.579, + "step": 30890 + }, + { + "epoch": 0.916021706253892, + "grad_norm": 0.06983725726604462, + "learning_rate": 1.7662675300095467e-05, + "loss": 2.5503, + "step": 30891 + }, + { + "epoch": 0.9160513596062034, + "grad_norm": 0.06859144568443298, + "learning_rate": 1.765028213526837e-05, + "loss": 2.5288, + "step": 30892 + }, + { + "epoch": 0.916081012958515, + "grad_norm": 0.0700957328081131, + "learning_rate": 1.763789324173082e-05, + "loss": 2.5475, + "step": 30893 + }, + { + "epoch": 0.9161106663108265, + "grad_norm": 0.07058210670948029, + "learning_rate": 1.76255086195925e-05, + "loss": 2.5782, + "step": 30894 + }, + { + "epoch": 0.9161403196631379, + "grad_norm": 0.07127375155687332, + "learning_rate": 1.7613128268963165e-05, + "loss": 2.5453, + "step": 30895 + }, + { + "epoch": 0.9161699730154494, + "grad_norm": 0.07074063271284103, + "learning_rate": 1.7600752189952385e-05, + "loss": 2.5451, + "step": 30896 + }, + { + "epoch": 0.9161996263677609, + "grad_norm": 0.07132522016763687, + "learning_rate": 1.758838038266969e-05, + "loss": 2.5077, + "step": 30897 + }, + { + "epoch": 0.9162292797200724, + "grad_norm": 0.06991387158632278, + "learning_rate": 1.757601284722471e-05, + "loss": 2.5044, + "step": 30898 + }, + { + "epoch": 0.9162589330723838, + "grad_norm": 0.07025396823883057, + "learning_rate": 1.756364958372686e-05, + "loss": 2.553, + "step": 30899 + }, + { + "epoch": 0.9162885864246954, + "grad_norm": 0.0706106573343277, + "learning_rate": 1.7551290592285774e-05, + "loss": 2.491, + "step": 30900 + }, + { + "epoch": 0.9163182397770068, + "grad_norm": 0.06971389055252075, + "learning_rate": 1.7538935873010863e-05, + "loss": 2.5259, + "step": 30901 + }, + { + "epoch": 0.9163478931293183, + "grad_norm": 0.06967493891716003, + "learning_rate": 1.752658542601143e-05, + "loss": 2.545, + "step": 30902 + }, + { + "epoch": 0.9163775464816297, + "grad_norm": 0.06914648413658142, + "learning_rate": 1.751423925139689e-05, + "loss": 2.5469, + "step": 30903 + }, + { + "epoch": 0.9164071998339413, + "grad_norm": 0.07313213497400284, + "learning_rate": 1.7501897349276653e-05, + "loss": 2.5216, + "step": 30904 + }, + { + "epoch": 0.9164368531862527, + "grad_norm": 0.0700543001294136, + "learning_rate": 1.748955971975985e-05, + "loss": 2.5422, + "step": 30905 + }, + { + "epoch": 0.9164665065385642, + "grad_norm": 0.06978549063205719, + "learning_rate": 1.7477226362955956e-05, + "loss": 2.5073, + "step": 30906 + }, + { + "epoch": 0.9164961598908756, + "grad_norm": 0.07445864379405975, + "learning_rate": 1.7464897278973935e-05, + "loss": 2.5571, + "step": 30907 + }, + { + "epoch": 0.9165258132431872, + "grad_norm": 0.06885696947574615, + "learning_rate": 1.745257246792309e-05, + "loss": 2.5281, + "step": 30908 + }, + { + "epoch": 0.9165554665954986, + "grad_norm": 0.07386104017496109, + "learning_rate": 1.7440251929912498e-05, + "loss": 2.5513, + "step": 30909 + }, + { + "epoch": 0.9165851199478101, + "grad_norm": 0.0695338100194931, + "learning_rate": 1.7427935665051353e-05, + "loss": 2.5477, + "step": 30910 + }, + { + "epoch": 0.9166147733001215, + "grad_norm": 0.07021225988864899, + "learning_rate": 1.741562367344868e-05, + "loss": 2.5639, + "step": 30911 + }, + { + "epoch": 0.9166444266524331, + "grad_norm": 0.0693255215883255, + "learning_rate": 1.7403315955213438e-05, + "loss": 2.5481, + "step": 30912 + }, + { + "epoch": 0.9166740800047445, + "grad_norm": 0.07064919918775558, + "learning_rate": 1.739101251045472e-05, + "loss": 2.5409, + "step": 30913 + }, + { + "epoch": 0.916703733357056, + "grad_norm": 0.06913211941719055, + "learning_rate": 1.737871333928137e-05, + "loss": 2.5191, + "step": 30914 + }, + { + "epoch": 0.9167333867093675, + "grad_norm": 0.06737308949232101, + "learning_rate": 1.7366418441802424e-05, + "loss": 2.5419, + "step": 30915 + }, + { + "epoch": 0.916763040061679, + "grad_norm": 0.07037802040576935, + "learning_rate": 1.7354127818126675e-05, + "loss": 2.5577, + "step": 30916 + }, + { + "epoch": 0.9167926934139905, + "grad_norm": 0.07221218198537827, + "learning_rate": 1.7341841468362985e-05, + "loss": 2.5388, + "step": 30917 + }, + { + "epoch": 0.9168223467663019, + "grad_norm": 0.06810465455055237, + "learning_rate": 1.7329559392620153e-05, + "loss": 2.542, + "step": 30918 + }, + { + "epoch": 0.9168520001186135, + "grad_norm": 0.07108315825462341, + "learning_rate": 1.7317281591006874e-05, + "loss": 2.5344, + "step": 30919 + }, + { + "epoch": 0.9168816534709249, + "grad_norm": 0.07248065620660782, + "learning_rate": 1.730500806363189e-05, + "loss": 2.507, + "step": 30920 + }, + { + "epoch": 0.9169113068232364, + "grad_norm": 0.07036254554986954, + "learning_rate": 1.7292738810603946e-05, + "loss": 2.5425, + "step": 30921 + }, + { + "epoch": 0.9169409601755478, + "grad_norm": 0.06924433261156082, + "learning_rate": 1.728047383203163e-05, + "loss": 2.5354, + "step": 30922 + }, + { + "epoch": 0.9169706135278594, + "grad_norm": 0.06878967583179474, + "learning_rate": 1.7268213128023623e-05, + "loss": 2.515, + "step": 30923 + }, + { + "epoch": 0.9170002668801708, + "grad_norm": 0.06732863187789917, + "learning_rate": 1.7255956698688403e-05, + "loss": 2.5237, + "step": 30924 + }, + { + "epoch": 0.9170299202324823, + "grad_norm": 0.06873748451471329, + "learning_rate": 1.724370454413454e-05, + "loss": 2.5572, + "step": 30925 + }, + { + "epoch": 0.9170595735847937, + "grad_norm": 0.07182913273572922, + "learning_rate": 1.723145666447057e-05, + "loss": 2.5504, + "step": 30926 + }, + { + "epoch": 0.9170892269371053, + "grad_norm": 0.0723450556397438, + "learning_rate": 1.721921305980495e-05, + "loss": 2.5368, + "step": 30927 + }, + { + "epoch": 0.9171188802894167, + "grad_norm": 0.07144723832607269, + "learning_rate": 1.7206973730246046e-05, + "loss": 2.5451, + "step": 30928 + }, + { + "epoch": 0.9171485336417282, + "grad_norm": 0.06846561282873154, + "learning_rate": 1.7194738675902267e-05, + "loss": 2.5456, + "step": 30929 + }, + { + "epoch": 0.9171781869940396, + "grad_norm": 0.07153771817684174, + "learning_rate": 1.7182507896881916e-05, + "loss": 2.5412, + "step": 30930 + }, + { + "epoch": 0.9172078403463512, + "grad_norm": 0.07043109834194183, + "learning_rate": 1.717028139329335e-05, + "loss": 2.5211, + "step": 30931 + }, + { + "epoch": 0.9172374936986626, + "grad_norm": 0.06762877106666565, + "learning_rate": 1.7158059165244766e-05, + "loss": 2.5324, + "step": 30932 + }, + { + "epoch": 0.9172671470509741, + "grad_norm": 0.06929538398981094, + "learning_rate": 1.7145841212844516e-05, + "loss": 2.505, + "step": 30933 + }, + { + "epoch": 0.9172968004032855, + "grad_norm": 0.07119549065828323, + "learning_rate": 1.713362753620068e-05, + "loss": 2.5381, + "step": 30934 + }, + { + "epoch": 0.9173264537555971, + "grad_norm": 0.07009348273277283, + "learning_rate": 1.7121418135421508e-05, + "loss": 2.5426, + "step": 30935 + }, + { + "epoch": 0.9173561071079086, + "grad_norm": 0.07163120061159134, + "learning_rate": 1.710921301061502e-05, + "loss": 2.5591, + "step": 30936 + }, + { + "epoch": 0.91738576046022, + "grad_norm": 0.07156562805175781, + "learning_rate": 1.7097012161889357e-05, + "loss": 2.5282, + "step": 30937 + }, + { + "epoch": 0.9174154138125316, + "grad_norm": 0.0699770450592041, + "learning_rate": 1.7084815589352542e-05, + "loss": 2.5586, + "step": 30938 + }, + { + "epoch": 0.917445067164843, + "grad_norm": 0.06785478442907333, + "learning_rate": 1.7072623293112542e-05, + "loss": 2.5264, + "step": 30939 + }, + { + "epoch": 0.9174747205171545, + "grad_norm": 0.07084100693464279, + "learning_rate": 1.7060435273277385e-05, + "loss": 2.5531, + "step": 30940 + }, + { + "epoch": 0.9175043738694659, + "grad_norm": 0.07145515829324722, + "learning_rate": 1.7048251529954983e-05, + "loss": 2.5476, + "step": 30941 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 0.06817203015089035, + "learning_rate": 1.7036072063253193e-05, + "loss": 2.5541, + "step": 30942 + }, + { + "epoch": 0.9175636805740889, + "grad_norm": 0.06825044006109238, + "learning_rate": 1.7023896873279876e-05, + "loss": 2.5683, + "step": 30943 + }, + { + "epoch": 0.9175933339264004, + "grad_norm": 0.06930162012577057, + "learning_rate": 1.701172596014283e-05, + "loss": 2.5419, + "step": 30944 + }, + { + "epoch": 0.9176229872787118, + "grad_norm": 0.07180636376142502, + "learning_rate": 1.699955932394992e-05, + "loss": 2.5452, + "step": 30945 + }, + { + "epoch": 0.9176526406310234, + "grad_norm": 0.06877576559782028, + "learning_rate": 1.6987396964808777e-05, + "loss": 2.5363, + "step": 30946 + }, + { + "epoch": 0.9176822939833348, + "grad_norm": 0.07049459218978882, + "learning_rate": 1.6975238882827147e-05, + "loss": 2.5264, + "step": 30947 + }, + { + "epoch": 0.9177119473356463, + "grad_norm": 0.07075668126344681, + "learning_rate": 1.6963085078112673e-05, + "loss": 2.5445, + "step": 30948 + }, + { + "epoch": 0.9177416006879577, + "grad_norm": 0.06778521090745926, + "learning_rate": 1.695093555077304e-05, + "loss": 2.5229, + "step": 30949 + }, + { + "epoch": 0.9177712540402693, + "grad_norm": 0.06668952852487564, + "learning_rate": 1.693879030091572e-05, + "loss": 2.5593, + "step": 30950 + }, + { + "epoch": 0.9178009073925807, + "grad_norm": 0.07121802121400833, + "learning_rate": 1.6926649328648403e-05, + "loss": 2.5262, + "step": 30951 + }, + { + "epoch": 0.9178305607448922, + "grad_norm": 0.07147390395402908, + "learning_rate": 1.691451263407845e-05, + "loss": 2.5393, + "step": 30952 + }, + { + "epoch": 0.9178602140972036, + "grad_norm": 0.0727323368191719, + "learning_rate": 1.690238021731344e-05, + "loss": 2.5589, + "step": 30953 + }, + { + "epoch": 0.9178898674495152, + "grad_norm": 0.07134751230478287, + "learning_rate": 1.689025207846079e-05, + "loss": 2.5495, + "step": 30954 + }, + { + "epoch": 0.9179195208018266, + "grad_norm": 0.07857701182365417, + "learning_rate": 1.6878128217627908e-05, + "loss": 2.5098, + "step": 30955 + }, + { + "epoch": 0.9179491741541381, + "grad_norm": 0.07146821916103363, + "learning_rate": 1.686600863492205e-05, + "loss": 2.5482, + "step": 30956 + }, + { + "epoch": 0.9179788275064497, + "grad_norm": 0.06772790849208832, + "learning_rate": 1.6853893330450676e-05, + "loss": 2.5524, + "step": 30957 + }, + { + "epoch": 0.9180084808587611, + "grad_norm": 0.06960561126470566, + "learning_rate": 1.6841782304320984e-05, + "loss": 2.5288, + "step": 30958 + }, + { + "epoch": 0.9180381342110726, + "grad_norm": 0.0699576735496521, + "learning_rate": 1.682967555664022e-05, + "loss": 2.5516, + "step": 30959 + }, + { + "epoch": 0.918067787563384, + "grad_norm": 0.0782436728477478, + "learning_rate": 1.681757308751569e-05, + "loss": 2.5233, + "step": 30960 + }, + { + "epoch": 0.9180974409156956, + "grad_norm": 0.06956284493207932, + "learning_rate": 1.6805474897054474e-05, + "loss": 2.5226, + "step": 30961 + }, + { + "epoch": 0.918127094268007, + "grad_norm": 0.07283127307891846, + "learning_rate": 1.6793380985363703e-05, + "loss": 2.5231, + "step": 30962 + }, + { + "epoch": 0.9181567476203185, + "grad_norm": 0.06872901320457458, + "learning_rate": 1.6781291352550464e-05, + "loss": 2.554, + "step": 30963 + }, + { + "epoch": 0.9181864009726299, + "grad_norm": 0.06895913928747177, + "learning_rate": 1.6769205998721727e-05, + "loss": 2.5342, + "step": 30964 + }, + { + "epoch": 0.9182160543249415, + "grad_norm": 0.07276017218828201, + "learning_rate": 1.6757124923984733e-05, + "loss": 2.515, + "step": 30965 + }, + { + "epoch": 0.9182457076772529, + "grad_norm": 0.06933417171239853, + "learning_rate": 1.674504812844635e-05, + "loss": 2.5517, + "step": 30966 + }, + { + "epoch": 0.9182753610295644, + "grad_norm": 0.07097109407186508, + "learning_rate": 1.6732975612213485e-05, + "loss": 2.5496, + "step": 30967 + }, + { + "epoch": 0.9183050143818758, + "grad_norm": 0.07014844566583633, + "learning_rate": 1.6720907375393114e-05, + "loss": 2.5636, + "step": 30968 + }, + { + "epoch": 0.9183346677341874, + "grad_norm": 0.06982909142971039, + "learning_rate": 1.6708843418092033e-05, + "loss": 2.5116, + "step": 30969 + }, + { + "epoch": 0.9183643210864988, + "grad_norm": 0.07091616094112396, + "learning_rate": 1.669678374041711e-05, + "loss": 2.5617, + "step": 30970 + }, + { + "epoch": 0.9183939744388103, + "grad_norm": 0.07323504984378815, + "learning_rate": 1.6684728342475085e-05, + "loss": 2.5624, + "step": 30971 + }, + { + "epoch": 0.9184236277911217, + "grad_norm": 0.07039784640073776, + "learning_rate": 1.667267722437288e-05, + "loss": 2.503, + "step": 30972 + }, + { + "epoch": 0.9184532811434333, + "grad_norm": 0.06855546683073044, + "learning_rate": 1.6660630386216957e-05, + "loss": 2.5824, + "step": 30973 + }, + { + "epoch": 0.9184829344957447, + "grad_norm": 0.07209955900907516, + "learning_rate": 1.6648587828114127e-05, + "loss": 2.5012, + "step": 30974 + }, + { + "epoch": 0.9185125878480562, + "grad_norm": 0.07275982946157455, + "learning_rate": 1.663654955017102e-05, + "loss": 2.5904, + "step": 30975 + }, + { + "epoch": 0.9185422412003676, + "grad_norm": 0.0681469738483429, + "learning_rate": 1.662451555249428e-05, + "loss": 2.5308, + "step": 30976 + }, + { + "epoch": 0.9185718945526792, + "grad_norm": 0.07993993163108826, + "learning_rate": 1.6612485835190315e-05, + "loss": 2.5162, + "step": 30977 + }, + { + "epoch": 0.9186015479049907, + "grad_norm": 0.07402855157852173, + "learning_rate": 1.6600460398365824e-05, + "loss": 2.5312, + "step": 30978 + }, + { + "epoch": 0.9186312012573021, + "grad_norm": 0.06754450500011444, + "learning_rate": 1.6588439242127274e-05, + "loss": 2.5219, + "step": 30979 + }, + { + "epoch": 0.9186608546096137, + "grad_norm": 0.07017569988965988, + "learning_rate": 1.6576422366581023e-05, + "loss": 2.5121, + "step": 30980 + }, + { + "epoch": 0.9186905079619251, + "grad_norm": 0.069417804479599, + "learning_rate": 1.6564409771833543e-05, + "loss": 2.5359, + "step": 30981 + }, + { + "epoch": 0.9187201613142366, + "grad_norm": 0.07297000288963318, + "learning_rate": 1.6552401457991308e-05, + "loss": 2.5338, + "step": 30982 + }, + { + "epoch": 0.918749814666548, + "grad_norm": 0.06792234629392624, + "learning_rate": 1.6540397425160392e-05, + "loss": 2.5372, + "step": 30983 + }, + { + "epoch": 0.9187794680188596, + "grad_norm": 0.0702163577079773, + "learning_rate": 1.652839767344727e-05, + "loss": 2.5751, + "step": 30984 + }, + { + "epoch": 0.918809121371171, + "grad_norm": 0.06885851174592972, + "learning_rate": 1.6516402202958193e-05, + "loss": 2.5323, + "step": 30985 + }, + { + "epoch": 0.9188387747234825, + "grad_norm": 0.07175762206315994, + "learning_rate": 1.6504411013799404e-05, + "loss": 2.5282, + "step": 30986 + }, + { + "epoch": 0.9188684280757939, + "grad_norm": 0.06993205100297928, + "learning_rate": 1.6492424106076986e-05, + "loss": 2.5422, + "step": 30987 + }, + { + "epoch": 0.9188980814281055, + "grad_norm": 0.0708833634853363, + "learning_rate": 1.6480441479897136e-05, + "loss": 2.5335, + "step": 30988 + }, + { + "epoch": 0.9189277347804169, + "grad_norm": 0.07139531522989273, + "learning_rate": 1.6468463135365984e-05, + "loss": 2.5309, + "step": 30989 + }, + { + "epoch": 0.9189573881327284, + "grad_norm": 0.069205142557621, + "learning_rate": 1.6456489072589565e-05, + "loss": 2.5309, + "step": 30990 + }, + { + "epoch": 0.9189870414850398, + "grad_norm": 0.06891467422246933, + "learning_rate": 1.6444519291673952e-05, + "loss": 2.5668, + "step": 30991 + }, + { + "epoch": 0.9190166948373514, + "grad_norm": 0.06813772767782211, + "learning_rate": 1.643255379272518e-05, + "loss": 2.5482, + "step": 30992 + }, + { + "epoch": 0.9190463481896628, + "grad_norm": 0.07092853635549545, + "learning_rate": 1.6420592575849157e-05, + "loss": 2.5631, + "step": 30993 + }, + { + "epoch": 0.9190760015419743, + "grad_norm": 0.06843914091587067, + "learning_rate": 1.6408635641151747e-05, + "loss": 2.5289, + "step": 30994 + }, + { + "epoch": 0.9191056548942858, + "grad_norm": 0.0705251693725586, + "learning_rate": 1.6396682988738865e-05, + "loss": 2.579, + "step": 30995 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 0.0673694908618927, + "learning_rate": 1.6384734618716367e-05, + "loss": 2.5399, + "step": 30996 + }, + { + "epoch": 0.9191649615989087, + "grad_norm": 0.06775801628828049, + "learning_rate": 1.637279053119006e-05, + "loss": 2.5175, + "step": 30997 + }, + { + "epoch": 0.9191946149512202, + "grad_norm": 0.07126373797655106, + "learning_rate": 1.6360850726265698e-05, + "loss": 2.5239, + "step": 30998 + }, + { + "epoch": 0.9192242683035318, + "grad_norm": 0.07495120167732239, + "learning_rate": 1.634891520404902e-05, + "loss": 2.5461, + "step": 30999 + }, + { + "epoch": 0.9192539216558432, + "grad_norm": 0.07051306962966919, + "learning_rate": 1.6336983964645725e-05, + "loss": 2.4971, + "step": 31000 + }, + { + "epoch": 0.9192835750081547, + "grad_norm": 0.07302532345056534, + "learning_rate": 1.6325057008161447e-05, + "loss": 2.5087, + "step": 31001 + }, + { + "epoch": 0.9193132283604661, + "grad_norm": 0.06976944208145142, + "learning_rate": 1.6313134334701828e-05, + "loss": 2.5497, + "step": 31002 + }, + { + "epoch": 0.9193428817127777, + "grad_norm": 0.06967699527740479, + "learning_rate": 1.6301215944372395e-05, + "loss": 2.5601, + "step": 31003 + }, + { + "epoch": 0.9193725350650891, + "grad_norm": 0.07349750399589539, + "learning_rate": 1.6289301837278725e-05, + "loss": 2.5469, + "step": 31004 + }, + { + "epoch": 0.9194021884174006, + "grad_norm": 0.0720072016119957, + "learning_rate": 1.627739201352635e-05, + "loss": 2.5184, + "step": 31005 + }, + { + "epoch": 0.919431841769712, + "grad_norm": 0.07429744303226471, + "learning_rate": 1.6265486473220682e-05, + "loss": 2.5303, + "step": 31006 + }, + { + "epoch": 0.9194614951220236, + "grad_norm": 0.06928564608097076, + "learning_rate": 1.6253585216467136e-05, + "loss": 2.5502, + "step": 31007 + }, + { + "epoch": 0.919491148474335, + "grad_norm": 0.07309185713529587, + "learning_rate": 1.6241688243371188e-05, + "loss": 2.5415, + "step": 31008 + }, + { + "epoch": 0.9195208018266465, + "grad_norm": 0.077146977186203, + "learning_rate": 1.6229795554038086e-05, + "loss": 2.5356, + "step": 31009 + }, + { + "epoch": 0.919550455178958, + "grad_norm": 0.07219480723142624, + "learning_rate": 1.6217907148573186e-05, + "loss": 2.542, + "step": 31010 + }, + { + "epoch": 0.9195801085312695, + "grad_norm": 0.07236847281455994, + "learning_rate": 1.620602302708174e-05, + "loss": 2.5551, + "step": 31011 + }, + { + "epoch": 0.9196097618835809, + "grad_norm": 0.06757113337516785, + "learning_rate": 1.6194143189669053e-05, + "loss": 2.502, + "step": 31012 + }, + { + "epoch": 0.9196394152358924, + "grad_norm": 0.07371383160352707, + "learning_rate": 1.6182267636440206e-05, + "loss": 2.5231, + "step": 31013 + }, + { + "epoch": 0.9196690685882039, + "grad_norm": 0.06845162063837051, + "learning_rate": 1.617039636750045e-05, + "loss": 2.5464, + "step": 31014 + }, + { + "epoch": 0.9196987219405154, + "grad_norm": 0.06652350723743439, + "learning_rate": 1.6158529382954923e-05, + "loss": 2.5188, + "step": 31015 + }, + { + "epoch": 0.9197283752928268, + "grad_norm": 0.06686375290155411, + "learning_rate": 1.614666668290865e-05, + "loss": 2.5395, + "step": 31016 + }, + { + "epoch": 0.9197580286451383, + "grad_norm": 0.07109011709690094, + "learning_rate": 1.613480826746666e-05, + "loss": 2.5102, + "step": 31017 + }, + { + "epoch": 0.9197876819974498, + "grad_norm": 0.07597696781158447, + "learning_rate": 1.6122954136734037e-05, + "loss": 2.5877, + "step": 31018 + }, + { + "epoch": 0.9198173353497613, + "grad_norm": 0.06574901193380356, + "learning_rate": 1.611110429081569e-05, + "loss": 2.5378, + "step": 31019 + }, + { + "epoch": 0.9198469887020728, + "grad_norm": 0.07169148325920105, + "learning_rate": 1.6099258729816603e-05, + "loss": 2.5481, + "step": 31020 + }, + { + "epoch": 0.9198766420543842, + "grad_norm": 0.07615102827548981, + "learning_rate": 1.608741745384157e-05, + "loss": 2.5221, + "step": 31021 + }, + { + "epoch": 0.9199062954066958, + "grad_norm": 0.06972133368253708, + "learning_rate": 1.6075580462995566e-05, + "loss": 2.5459, + "step": 31022 + }, + { + "epoch": 0.9199359487590072, + "grad_norm": 0.06911653280258179, + "learning_rate": 1.6063747757383395e-05, + "loss": 2.5093, + "step": 31023 + }, + { + "epoch": 0.9199656021113187, + "grad_norm": 0.06984054297208786, + "learning_rate": 1.6051919337109755e-05, + "loss": 2.565, + "step": 31024 + }, + { + "epoch": 0.9199952554636301, + "grad_norm": 0.07237264513969421, + "learning_rate": 1.604009520227945e-05, + "loss": 2.5612, + "step": 31025 + }, + { + "epoch": 0.9200249088159417, + "grad_norm": 0.07211700081825256, + "learning_rate": 1.6028275352997168e-05, + "loss": 2.5373, + "step": 31026 + }, + { + "epoch": 0.9200545621682531, + "grad_norm": 0.06862668693065643, + "learning_rate": 1.6016459789367665e-05, + "loss": 2.5589, + "step": 31027 + }, + { + "epoch": 0.9200842155205646, + "grad_norm": 0.06728749722242355, + "learning_rate": 1.600464851149541e-05, + "loss": 2.5218, + "step": 31028 + }, + { + "epoch": 0.920113868872876, + "grad_norm": 0.0711321234703064, + "learning_rate": 1.599284151948499e-05, + "loss": 2.5436, + "step": 31029 + }, + { + "epoch": 0.9201435222251876, + "grad_norm": 0.07482434064149857, + "learning_rate": 1.5981038813441097e-05, + "loss": 2.5576, + "step": 31030 + }, + { + "epoch": 0.920173175577499, + "grad_norm": 0.06951282918453217, + "learning_rate": 1.59692403934682e-05, + "loss": 2.5281, + "step": 31031 + }, + { + "epoch": 0.9202028289298105, + "grad_norm": 0.06535179167985916, + "learning_rate": 1.595744625967077e-05, + "loss": 2.5299, + "step": 31032 + }, + { + "epoch": 0.920232482282122, + "grad_norm": 0.0674208328127861, + "learning_rate": 1.594565641215323e-05, + "loss": 2.5333, + "step": 31033 + }, + { + "epoch": 0.9202621356344335, + "grad_norm": 0.07176019996404648, + "learning_rate": 1.5933870851019994e-05, + "loss": 2.5028, + "step": 31034 + }, + { + "epoch": 0.9202917889867449, + "grad_norm": 0.07202588766813278, + "learning_rate": 1.5922089576375422e-05, + "loss": 2.526, + "step": 31035 + }, + { + "epoch": 0.9203214423390564, + "grad_norm": 0.06758155673742294, + "learning_rate": 1.5910312588323873e-05, + "loss": 2.5452, + "step": 31036 + }, + { + "epoch": 0.9203510956913679, + "grad_norm": 0.07117140293121338, + "learning_rate": 1.58985398869696e-05, + "loss": 2.5419, + "step": 31037 + }, + { + "epoch": 0.9203807490436794, + "grad_norm": 0.0721866637468338, + "learning_rate": 1.5886771472416796e-05, + "loss": 2.5456, + "step": 31038 + }, + { + "epoch": 0.9204104023959908, + "grad_norm": 0.06971890479326248, + "learning_rate": 1.5875007344769764e-05, + "loss": 2.5635, + "step": 31039 + }, + { + "epoch": 0.9204400557483023, + "grad_norm": 0.06947139650583267, + "learning_rate": 1.5863247504132593e-05, + "loss": 2.531, + "step": 31040 + }, + { + "epoch": 0.9204697091006139, + "grad_norm": 0.07431608438491821, + "learning_rate": 1.585149195060953e-05, + "loss": 2.5403, + "step": 31041 + }, + { + "epoch": 0.9204993624529253, + "grad_norm": 0.07093080878257751, + "learning_rate": 1.5839740684304494e-05, + "loss": 2.5372, + "step": 31042 + }, + { + "epoch": 0.9205290158052368, + "grad_norm": 0.0694388747215271, + "learning_rate": 1.582799370532173e-05, + "loss": 2.5329, + "step": 31043 + }, + { + "epoch": 0.9205586691575482, + "grad_norm": 0.07008769363164902, + "learning_rate": 1.5816251013765216e-05, + "loss": 2.5274, + "step": 31044 + }, + { + "epoch": 0.9205883225098598, + "grad_norm": 0.06930593401193619, + "learning_rate": 1.5804512609738863e-05, + "loss": 2.5265, + "step": 31045 + }, + { + "epoch": 0.9206179758621712, + "grad_norm": 0.07060089707374573, + "learning_rate": 1.5792778493346705e-05, + "loss": 2.5225, + "step": 31046 + }, + { + "epoch": 0.9206476292144827, + "grad_norm": 0.06999354064464569, + "learning_rate": 1.57810486646926e-05, + "loss": 2.557, + "step": 31047 + }, + { + "epoch": 0.9206772825667942, + "grad_norm": 0.0712246522307396, + "learning_rate": 1.5769323123880464e-05, + "loss": 2.5269, + "step": 31048 + }, + { + "epoch": 0.9207069359191057, + "grad_norm": 0.0727514699101448, + "learning_rate": 1.5757601871014048e-05, + "loss": 2.5392, + "step": 31049 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 0.06994732469320297, + "learning_rate": 1.5745884906197163e-05, + "loss": 2.5395, + "step": 31050 + }, + { + "epoch": 0.9207662426237286, + "grad_norm": 0.06845255196094513, + "learning_rate": 1.5734172229533605e-05, + "loss": 2.5426, + "step": 31051 + }, + { + "epoch": 0.9207958959760401, + "grad_norm": 0.07041257619857788, + "learning_rate": 1.5722463841127077e-05, + "loss": 2.5057, + "step": 31052 + }, + { + "epoch": 0.9208255493283516, + "grad_norm": 0.07237157970666885, + "learning_rate": 1.5710759741081214e-05, + "loss": 2.5015, + "step": 31053 + }, + { + "epoch": 0.920855202680663, + "grad_norm": 0.07083922624588013, + "learning_rate": 1.5699059929499714e-05, + "loss": 2.547, + "step": 31054 + }, + { + "epoch": 0.9208848560329745, + "grad_norm": 0.07214518636465073, + "learning_rate": 1.568736440648616e-05, + "loss": 2.5154, + "step": 31055 + }, + { + "epoch": 0.920914509385286, + "grad_norm": 0.06902177631855011, + "learning_rate": 1.567567317214419e-05, + "loss": 2.5215, + "step": 31056 + }, + { + "epoch": 0.9209441627375975, + "grad_norm": 0.07404481619596481, + "learning_rate": 1.566398622657722e-05, + "loss": 2.5737, + "step": 31057 + }, + { + "epoch": 0.9209738160899089, + "grad_norm": 0.06958125531673431, + "learning_rate": 1.5652303569888836e-05, + "loss": 2.5275, + "step": 31058 + }, + { + "epoch": 0.9210034694422204, + "grad_norm": 0.07099239528179169, + "learning_rate": 1.5640625202182457e-05, + "loss": 2.5467, + "step": 31059 + }, + { + "epoch": 0.9210331227945319, + "grad_norm": 0.07389064133167267, + "learning_rate": 1.5628951123561387e-05, + "loss": 2.5424, + "step": 31060 + }, + { + "epoch": 0.9210627761468434, + "grad_norm": 0.06957915425300598, + "learning_rate": 1.5617281334129153e-05, + "loss": 2.5525, + "step": 31061 + }, + { + "epoch": 0.9210924294991549, + "grad_norm": 0.06887435168027878, + "learning_rate": 1.5605615833989005e-05, + "loss": 2.5372, + "step": 31062 + }, + { + "epoch": 0.9211220828514664, + "grad_norm": 0.07236122339963913, + "learning_rate": 1.559395462324431e-05, + "loss": 2.4924, + "step": 31063 + }, + { + "epoch": 0.9211517362037779, + "grad_norm": 0.07130702584981918, + "learning_rate": 1.558229770199826e-05, + "loss": 2.5239, + "step": 31064 + }, + { + "epoch": 0.9211813895560893, + "grad_norm": 0.06979156285524368, + "learning_rate": 1.5570645070354163e-05, + "loss": 2.4981, + "step": 31065 + }, + { + "epoch": 0.9212110429084008, + "grad_norm": 0.07665602117776871, + "learning_rate": 1.5558996728415097e-05, + "loss": 2.5106, + "step": 31066 + }, + { + "epoch": 0.9212406962607123, + "grad_norm": 0.06720118224620819, + "learning_rate": 1.5547352676284266e-05, + "loss": 2.5204, + "step": 31067 + }, + { + "epoch": 0.9212703496130238, + "grad_norm": 0.07510768622159958, + "learning_rate": 1.5535712914064804e-05, + "loss": 2.577, + "step": 31068 + }, + { + "epoch": 0.9213000029653352, + "grad_norm": 0.07338441908359528, + "learning_rate": 1.5524077441859795e-05, + "loss": 2.5017, + "step": 31069 + }, + { + "epoch": 0.9213296563176467, + "grad_norm": 0.07032574713230133, + "learning_rate": 1.5512446259772218e-05, + "loss": 2.5424, + "step": 31070 + }, + { + "epoch": 0.9213593096699582, + "grad_norm": 0.0746803730726242, + "learning_rate": 1.5500819367905096e-05, + "loss": 2.5205, + "step": 31071 + }, + { + "epoch": 0.9213889630222697, + "grad_norm": 0.0726238265633583, + "learning_rate": 1.5489196766361346e-05, + "loss": 2.5513, + "step": 31072 + }, + { + "epoch": 0.9214186163745811, + "grad_norm": 0.07092628628015518, + "learning_rate": 1.5477578455243945e-05, + "loss": 2.4956, + "step": 31073 + }, + { + "epoch": 0.9214482697268926, + "grad_norm": 0.07288613170385361, + "learning_rate": 1.5465964434655755e-05, + "loss": 2.5276, + "step": 31074 + }, + { + "epoch": 0.9214779230792041, + "grad_norm": 0.07119005918502808, + "learning_rate": 1.5454354704699635e-05, + "loss": 2.5182, + "step": 31075 + }, + { + "epoch": 0.9215075764315156, + "grad_norm": 0.06990650296211243, + "learning_rate": 1.544274926547834e-05, + "loss": 2.5688, + "step": 31076 + }, + { + "epoch": 0.921537229783827, + "grad_norm": 0.06867742538452148, + "learning_rate": 1.543114811709473e-05, + "loss": 2.5381, + "step": 31077 + }, + { + "epoch": 0.9215668831361385, + "grad_norm": 0.06706153601408005, + "learning_rate": 1.5419551259651445e-05, + "loss": 2.5331, + "step": 31078 + }, + { + "epoch": 0.92159653648845, + "grad_norm": 0.07260637730360031, + "learning_rate": 1.540795869325118e-05, + "loss": 2.5449, + "step": 31079 + }, + { + "epoch": 0.9216261898407615, + "grad_norm": 0.07146149128675461, + "learning_rate": 1.5396370417996687e-05, + "loss": 2.5396, + "step": 31080 + }, + { + "epoch": 0.921655843193073, + "grad_norm": 0.06831956654787064, + "learning_rate": 1.538478643399044e-05, + "loss": 2.5653, + "step": 31081 + }, + { + "epoch": 0.9216854965453845, + "grad_norm": 0.06914904713630676, + "learning_rate": 1.537320674133513e-05, + "loss": 2.543, + "step": 31082 + }, + { + "epoch": 0.921715149897696, + "grad_norm": 0.06631380319595337, + "learning_rate": 1.5361631340133243e-05, + "loss": 2.524, + "step": 31083 + }, + { + "epoch": 0.9217448032500074, + "grad_norm": 0.06913008540868759, + "learning_rate": 1.535006023048735e-05, + "loss": 2.5589, + "step": 31084 + }, + { + "epoch": 0.9217744566023189, + "grad_norm": 0.0692862868309021, + "learning_rate": 1.533849341249982e-05, + "loss": 2.5209, + "step": 31085 + }, + { + "epoch": 0.9218041099546304, + "grad_norm": 0.06808378547430038, + "learning_rate": 1.5326930886273127e-05, + "loss": 2.5472, + "step": 31086 + }, + { + "epoch": 0.9218337633069419, + "grad_norm": 0.06693288683891296, + "learning_rate": 1.531537265190963e-05, + "loss": 2.5637, + "step": 31087 + }, + { + "epoch": 0.9218634166592533, + "grad_norm": 0.07313138991594315, + "learning_rate": 1.530381870951175e-05, + "loss": 2.5347, + "step": 31088 + }, + { + "epoch": 0.9218930700115648, + "grad_norm": 0.07097155600786209, + "learning_rate": 1.529226905918174e-05, + "loss": 2.5609, + "step": 31089 + }, + { + "epoch": 0.9219227233638763, + "grad_norm": 0.06919195502996445, + "learning_rate": 1.528072370102185e-05, + "loss": 2.5315, + "step": 31090 + }, + { + "epoch": 0.9219523767161878, + "grad_norm": 0.06865178048610687, + "learning_rate": 1.526918263513438e-05, + "loss": 2.5305, + "step": 31091 + }, + { + "epoch": 0.9219820300684992, + "grad_norm": 0.0716949924826622, + "learning_rate": 1.5257645861621539e-05, + "loss": 2.5549, + "step": 31092 + }, + { + "epoch": 0.9220116834208107, + "grad_norm": 0.07416950166225433, + "learning_rate": 1.5246113380585347e-05, + "loss": 2.5237, + "step": 31093 + }, + { + "epoch": 0.9220413367731222, + "grad_norm": 0.07116088271141052, + "learning_rate": 1.5234585192128115e-05, + "loss": 2.5226, + "step": 31094 + }, + { + "epoch": 0.9220709901254337, + "grad_norm": 0.06408723443746567, + "learning_rate": 1.5223061296351814e-05, + "loss": 2.5233, + "step": 31095 + }, + { + "epoch": 0.9221006434777451, + "grad_norm": 0.07001734524965286, + "learning_rate": 1.5211541693358533e-05, + "loss": 2.5556, + "step": 31096 + }, + { + "epoch": 0.9221302968300567, + "grad_norm": 0.0694114938378334, + "learning_rate": 1.5200026383250243e-05, + "loss": 2.5213, + "step": 31097 + }, + { + "epoch": 0.9221599501823681, + "grad_norm": 0.06933999061584473, + "learning_rate": 1.5188515366128919e-05, + "loss": 2.542, + "step": 31098 + }, + { + "epoch": 0.9221896035346796, + "grad_norm": 0.0730292797088623, + "learning_rate": 1.5177008642096535e-05, + "loss": 2.5605, + "step": 31099 + }, + { + "epoch": 0.922219256886991, + "grad_norm": 0.07052172720432281, + "learning_rate": 1.5165506211254954e-05, + "loss": 2.5126, + "step": 31100 + }, + { + "epoch": 0.9222489102393026, + "grad_norm": 0.06884497404098511, + "learning_rate": 1.5154008073706038e-05, + "loss": 2.5455, + "step": 31101 + }, + { + "epoch": 0.9222785635916141, + "grad_norm": 0.06982484459877014, + "learning_rate": 1.5142514229551596e-05, + "loss": 2.5481, + "step": 31102 + }, + { + "epoch": 0.9223082169439255, + "grad_norm": 0.07092702388763428, + "learning_rate": 1.513102467889349e-05, + "loss": 2.5503, + "step": 31103 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 0.06785018742084503, + "learning_rate": 1.5119539421833306e-05, + "loss": 2.5557, + "step": 31104 + }, + { + "epoch": 0.9223675236485485, + "grad_norm": 0.06693709641695023, + "learning_rate": 1.5108058458472795e-05, + "loss": 2.5327, + "step": 31105 + }, + { + "epoch": 0.92239717700086, + "grad_norm": 0.06881293654441833, + "learning_rate": 1.5096581788913655e-05, + "loss": 2.536, + "step": 31106 + }, + { + "epoch": 0.9224268303531714, + "grad_norm": 0.07010632008314133, + "learning_rate": 1.508510941325758e-05, + "loss": 2.5348, + "step": 31107 + }, + { + "epoch": 0.9224564837054829, + "grad_norm": 0.07009034603834152, + "learning_rate": 1.5073641331606048e-05, + "loss": 2.5311, + "step": 31108 + }, + { + "epoch": 0.9224861370577944, + "grad_norm": 0.07149158418178558, + "learning_rate": 1.5062177544060695e-05, + "loss": 2.4951, + "step": 31109 + }, + { + "epoch": 0.9225157904101059, + "grad_norm": 0.06932138651609421, + "learning_rate": 1.5050718050723e-05, + "loss": 2.5485, + "step": 31110 + }, + { + "epoch": 0.9225454437624173, + "grad_norm": 0.07089439034461975, + "learning_rate": 1.5039262851694435e-05, + "loss": 2.4763, + "step": 31111 + }, + { + "epoch": 0.9225750971147288, + "grad_norm": 0.0722208172082901, + "learning_rate": 1.5027811947076419e-05, + "loss": 2.5287, + "step": 31112 + }, + { + "epoch": 0.9226047504670403, + "grad_norm": 0.0674399882555008, + "learning_rate": 1.5016365336970428e-05, + "loss": 2.5289, + "step": 31113 + }, + { + "epoch": 0.9226344038193518, + "grad_norm": 0.07006422430276871, + "learning_rate": 1.5004923021477768e-05, + "loss": 2.5256, + "step": 31114 + }, + { + "epoch": 0.9226640571716632, + "grad_norm": 0.0691630095243454, + "learning_rate": 1.4993485000699692e-05, + "loss": 2.5234, + "step": 31115 + }, + { + "epoch": 0.9226937105239748, + "grad_norm": 0.07106900215148926, + "learning_rate": 1.4982051274737618e-05, + "loss": 2.5518, + "step": 31116 + }, + { + "epoch": 0.9227233638762862, + "grad_norm": 0.07037094980478287, + "learning_rate": 1.497062184369269e-05, + "loss": 2.5437, + "step": 31117 + }, + { + "epoch": 0.9227530172285977, + "grad_norm": 0.07276909798383713, + "learning_rate": 1.4959196707666156e-05, + "loss": 2.52, + "step": 31118 + }, + { + "epoch": 0.9227826705809091, + "grad_norm": 0.07238280028104782, + "learning_rate": 1.4947775866759162e-05, + "loss": 2.5086, + "step": 31119 + }, + { + "epoch": 0.9228123239332207, + "grad_norm": 0.07366722822189331, + "learning_rate": 1.4936359321072902e-05, + "loss": 2.5517, + "step": 31120 + }, + { + "epoch": 0.9228419772855321, + "grad_norm": 0.07101432234048843, + "learning_rate": 1.492494707070846e-05, + "loss": 2.5667, + "step": 31121 + }, + { + "epoch": 0.9228716306378436, + "grad_norm": 0.06817354261875153, + "learning_rate": 1.4913539115766872e-05, + "loss": 2.5412, + "step": 31122 + }, + { + "epoch": 0.9229012839901551, + "grad_norm": 0.0714423730969429, + "learning_rate": 1.4902135456349165e-05, + "loss": 2.6022, + "step": 31123 + }, + { + "epoch": 0.9229309373424666, + "grad_norm": 0.06976062059402466, + "learning_rate": 1.4890736092556311e-05, + "loss": 2.5326, + "step": 31124 + }, + { + "epoch": 0.9229605906947781, + "grad_norm": 0.07118160277605057, + "learning_rate": 1.487934102448929e-05, + "loss": 2.5346, + "step": 31125 + }, + { + "epoch": 0.9229902440470895, + "grad_norm": 0.06928640604019165, + "learning_rate": 1.4867950252248908e-05, + "loss": 2.55, + "step": 31126 + }, + { + "epoch": 0.923019897399401, + "grad_norm": 0.0713818296790123, + "learning_rate": 1.4856563775936139e-05, + "loss": 2.5536, + "step": 31127 + }, + { + "epoch": 0.9230495507517125, + "grad_norm": 0.06877368688583374, + "learning_rate": 1.4845181595651735e-05, + "loss": 2.5504, + "step": 31128 + }, + { + "epoch": 0.923079204104024, + "grad_norm": 0.07030198723077774, + "learning_rate": 1.4833803711496563e-05, + "loss": 2.5648, + "step": 31129 + }, + { + "epoch": 0.9231088574563354, + "grad_norm": 0.0674186572432518, + "learning_rate": 1.4822430123571318e-05, + "loss": 2.4967, + "step": 31130 + }, + { + "epoch": 0.923138510808647, + "grad_norm": 0.06949172168970108, + "learning_rate": 1.4811060831976698e-05, + "loss": 2.5411, + "step": 31131 + }, + { + "epoch": 0.9231681641609584, + "grad_norm": 0.07580377906560898, + "learning_rate": 1.4799695836813398e-05, + "loss": 2.5608, + "step": 31132 + }, + { + "epoch": 0.9231978175132699, + "grad_norm": 0.07210850715637207, + "learning_rate": 1.4788335138182174e-05, + "loss": 2.5523, + "step": 31133 + }, + { + "epoch": 0.9232274708655813, + "grad_norm": 0.07429587095975876, + "learning_rate": 1.47769787361835e-05, + "loss": 2.5296, + "step": 31134 + }, + { + "epoch": 0.9232571242178929, + "grad_norm": 0.06642289459705353, + "learning_rate": 1.4765626630917962e-05, + "loss": 2.4889, + "step": 31135 + }, + { + "epoch": 0.9232867775702043, + "grad_norm": 0.07100097090005875, + "learning_rate": 1.4754278822486088e-05, + "loss": 2.5339, + "step": 31136 + }, + { + "epoch": 0.9233164309225158, + "grad_norm": 0.07062536478042603, + "learning_rate": 1.474293531098836e-05, + "loss": 2.5564, + "step": 31137 + }, + { + "epoch": 0.9233460842748272, + "grad_norm": 0.07202690094709396, + "learning_rate": 1.473159609652519e-05, + "loss": 2.5368, + "step": 31138 + }, + { + "epoch": 0.9233757376271388, + "grad_norm": 0.0740041732788086, + "learning_rate": 1.4720261179197115e-05, + "loss": 2.5238, + "step": 31139 + }, + { + "epoch": 0.9234053909794502, + "grad_norm": 0.07163971662521362, + "learning_rate": 1.4708930559104383e-05, + "loss": 2.5384, + "step": 31140 + }, + { + "epoch": 0.9234350443317617, + "grad_norm": 0.07161295413970947, + "learning_rate": 1.4697604236347362e-05, + "loss": 2.5042, + "step": 31141 + }, + { + "epoch": 0.9234646976840731, + "grad_norm": 0.07169266045093536, + "learning_rate": 1.4686282211026359e-05, + "loss": 2.5481, + "step": 31142 + }, + { + "epoch": 0.9234943510363847, + "grad_norm": 0.07719683647155762, + "learning_rate": 1.4674964483241626e-05, + "loss": 2.5381, + "step": 31143 + }, + { + "epoch": 0.9235240043886962, + "grad_norm": 0.0705726221203804, + "learning_rate": 1.4663651053093363e-05, + "loss": 2.5547, + "step": 31144 + }, + { + "epoch": 0.9235536577410076, + "grad_norm": 0.07003454118967056, + "learning_rate": 1.4652341920681822e-05, + "loss": 2.5026, + "step": 31145 + }, + { + "epoch": 0.9235833110933191, + "grad_norm": 0.07280733436346054, + "learning_rate": 1.464103708610709e-05, + "loss": 2.5569, + "step": 31146 + }, + { + "epoch": 0.9236129644456306, + "grad_norm": 0.06903860718011856, + "learning_rate": 1.4629736549469307e-05, + "loss": 2.5547, + "step": 31147 + }, + { + "epoch": 0.9236426177979421, + "grad_norm": 0.07146236300468445, + "learning_rate": 1.4618440310868452e-05, + "loss": 2.5401, + "step": 31148 + }, + { + "epoch": 0.9236722711502535, + "grad_norm": 0.06797630339860916, + "learning_rate": 1.4607148370404666e-05, + "loss": 2.5435, + "step": 31149 + }, + { + "epoch": 0.923701924502565, + "grad_norm": 0.06659505516290665, + "learning_rate": 1.4595860728177924e-05, + "loss": 2.5138, + "step": 31150 + }, + { + "epoch": 0.9237315778548765, + "grad_norm": 0.06845434755086899, + "learning_rate": 1.4584577384288145e-05, + "loss": 2.5358, + "step": 31151 + }, + { + "epoch": 0.923761231207188, + "grad_norm": 0.06868330389261246, + "learning_rate": 1.4573298338835194e-05, + "loss": 2.5112, + "step": 31152 + }, + { + "epoch": 0.9237908845594994, + "grad_norm": 0.06712904572486877, + "learning_rate": 1.456202359191905e-05, + "loss": 2.5509, + "step": 31153 + }, + { + "epoch": 0.923820537911811, + "grad_norm": 0.07217980176210403, + "learning_rate": 1.4550753143639516e-05, + "loss": 2.5406, + "step": 31154 + }, + { + "epoch": 0.9238501912641224, + "grad_norm": 0.07060239464044571, + "learning_rate": 1.4539486994096407e-05, + "loss": 2.535, + "step": 31155 + }, + { + "epoch": 0.9238798446164339, + "grad_norm": 0.07114991545677185, + "learning_rate": 1.4528225143389418e-05, + "loss": 2.5065, + "step": 31156 + }, + { + "epoch": 0.9239094979687453, + "grad_norm": 0.07028044015169144, + "learning_rate": 1.4516967591618358e-05, + "loss": 2.5586, + "step": 31157 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 0.07192255556583405, + "learning_rate": 1.4505714338882924e-05, + "loss": 2.5386, + "step": 31158 + }, + { + "epoch": 0.9239688046733683, + "grad_norm": 0.07206336408853531, + "learning_rate": 1.449446538528265e-05, + "loss": 2.544, + "step": 31159 + }, + { + "epoch": 0.9239984580256798, + "grad_norm": 0.07196951657533646, + "learning_rate": 1.4483220730917234e-05, + "loss": 2.5275, + "step": 31160 + }, + { + "epoch": 0.9240281113779912, + "grad_norm": 0.06800477206707001, + "learning_rate": 1.4471980375886263e-05, + "loss": 2.5458, + "step": 31161 + }, + { + "epoch": 0.9240577647303028, + "grad_norm": 0.06419491767883301, + "learning_rate": 1.4460744320289265e-05, + "loss": 2.5318, + "step": 31162 + }, + { + "epoch": 0.9240874180826142, + "grad_norm": 0.07247427105903625, + "learning_rate": 1.4449512564225664e-05, + "loss": 2.5401, + "step": 31163 + }, + { + "epoch": 0.9241170714349257, + "grad_norm": 0.06764727085828781, + "learning_rate": 1.4438285107794991e-05, + "loss": 2.5329, + "step": 31164 + }, + { + "epoch": 0.9241467247872373, + "grad_norm": 0.06902225315570831, + "learning_rate": 1.4427061951096665e-05, + "loss": 2.547, + "step": 31165 + }, + { + "epoch": 0.9241763781395487, + "grad_norm": 0.07045000046491623, + "learning_rate": 1.4415843094230052e-05, + "loss": 2.5721, + "step": 31166 + }, + { + "epoch": 0.9242060314918602, + "grad_norm": 0.0696508139371872, + "learning_rate": 1.4404628537294461e-05, + "loss": 2.5416, + "step": 31167 + }, + { + "epoch": 0.9242356848441716, + "grad_norm": 0.07108210772275925, + "learning_rate": 1.4393418280389314e-05, + "loss": 2.5331, + "step": 31168 + }, + { + "epoch": 0.9242653381964832, + "grad_norm": 0.06767421215772629, + "learning_rate": 1.4382212323613752e-05, + "loss": 2.5038, + "step": 31169 + }, + { + "epoch": 0.9242949915487946, + "grad_norm": 0.0646439716219902, + "learning_rate": 1.4371010667067087e-05, + "loss": 2.5239, + "step": 31170 + }, + { + "epoch": 0.9243246449011061, + "grad_norm": 0.06920161843299866, + "learning_rate": 1.4359813310848347e-05, + "loss": 2.5686, + "step": 31171 + }, + { + "epoch": 0.9243542982534175, + "grad_norm": 0.06992198526859283, + "learning_rate": 1.4348620255056955e-05, + "loss": 2.5369, + "step": 31172 + }, + { + "epoch": 0.9243839516057291, + "grad_norm": 0.06879065185785294, + "learning_rate": 1.433743149979183e-05, + "loss": 2.5234, + "step": 31173 + }, + { + "epoch": 0.9244136049580405, + "grad_norm": 0.06742454320192337, + "learning_rate": 1.4326247045152174e-05, + "loss": 2.5536, + "step": 31174 + }, + { + "epoch": 0.924443258310352, + "grad_norm": 0.06953448802232742, + "learning_rate": 1.4315066891236905e-05, + "loss": 2.5259, + "step": 31175 + }, + { + "epoch": 0.9244729116626634, + "grad_norm": 0.06792747229337692, + "learning_rate": 1.4303891038145111e-05, + "loss": 2.5551, + "step": 31176 + }, + { + "epoch": 0.924502565014975, + "grad_norm": 0.0680452361702919, + "learning_rate": 1.4292719485975714e-05, + "loss": 2.5319, + "step": 31177 + }, + { + "epoch": 0.9245322183672864, + "grad_norm": 0.0680026113986969, + "learning_rate": 1.4281552234827688e-05, + "loss": 2.5081, + "step": 31178 + }, + { + "epoch": 0.9245618717195979, + "grad_norm": 0.06891533732414246, + "learning_rate": 1.42703892847999e-05, + "loss": 2.5532, + "step": 31179 + }, + { + "epoch": 0.9245915250719093, + "grad_norm": 0.06715995818376541, + "learning_rate": 1.425923063599116e-05, + "loss": 2.5235, + "step": 31180 + }, + { + "epoch": 0.9246211784242209, + "grad_norm": 0.06921036541461945, + "learning_rate": 1.4248076288500334e-05, + "loss": 2.5438, + "step": 31181 + }, + { + "epoch": 0.9246508317765323, + "grad_norm": 0.07067709416151047, + "learning_rate": 1.4236926242426119e-05, + "loss": 2.5065, + "step": 31182 + }, + { + "epoch": 0.9246804851288438, + "grad_norm": 0.06915181875228882, + "learning_rate": 1.4225780497867324e-05, + "loss": 2.5519, + "step": 31183 + }, + { + "epoch": 0.9247101384811552, + "grad_norm": 0.07118247449398041, + "learning_rate": 1.4214639054922595e-05, + "loss": 2.531, + "step": 31184 + }, + { + "epoch": 0.9247397918334668, + "grad_norm": 0.06844306737184525, + "learning_rate": 1.4203501913690686e-05, + "loss": 2.5155, + "step": 31185 + }, + { + "epoch": 0.9247694451857783, + "grad_norm": 0.06756864488124847, + "learning_rate": 1.4192369074270129e-05, + "loss": 2.5428, + "step": 31186 + }, + { + "epoch": 0.9247990985380897, + "grad_norm": 0.07157499343156815, + "learning_rate": 1.418124053675951e-05, + "loss": 2.5123, + "step": 31187 + }, + { + "epoch": 0.9248287518904013, + "grad_norm": 0.0759783461689949, + "learning_rate": 1.4170116301257419e-05, + "loss": 2.5655, + "step": 31188 + }, + { + "epoch": 0.9248584052427127, + "grad_norm": 0.06561736017465591, + "learning_rate": 1.4158996367862387e-05, + "loss": 2.5409, + "step": 31189 + }, + { + "epoch": 0.9248880585950242, + "grad_norm": 0.07003692537546158, + "learning_rate": 1.414788073667278e-05, + "loss": 2.5509, + "step": 31190 + }, + { + "epoch": 0.9249177119473356, + "grad_norm": 0.06900297105312347, + "learning_rate": 1.4136769407787075e-05, + "loss": 2.5081, + "step": 31191 + }, + { + "epoch": 0.9249473652996472, + "grad_norm": 0.07066751271486282, + "learning_rate": 1.4125662381303694e-05, + "loss": 2.5286, + "step": 31192 + }, + { + "epoch": 0.9249770186519586, + "grad_norm": 0.07250870764255524, + "learning_rate": 1.4114559657320947e-05, + "loss": 2.5073, + "step": 31193 + }, + { + "epoch": 0.9250066720042701, + "grad_norm": 0.0682215467095375, + "learning_rate": 1.4103461235937199e-05, + "loss": 2.5412, + "step": 31194 + }, + { + "epoch": 0.9250363253565815, + "grad_norm": 0.06945238262414932, + "learning_rate": 1.4092367117250704e-05, + "loss": 2.5399, + "step": 31195 + }, + { + "epoch": 0.9250659787088931, + "grad_norm": 0.0703812688589096, + "learning_rate": 1.4081277301359663e-05, + "loss": 2.5392, + "step": 31196 + }, + { + "epoch": 0.9250956320612045, + "grad_norm": 0.07271143049001694, + "learning_rate": 1.407019178836233e-05, + "loss": 2.5545, + "step": 31197 + }, + { + "epoch": 0.925125285413516, + "grad_norm": 0.07203146070241928, + "learning_rate": 1.4059110578356849e-05, + "loss": 2.5362, + "step": 31198 + }, + { + "epoch": 0.9251549387658274, + "grad_norm": 0.0716322585940361, + "learning_rate": 1.4048033671441418e-05, + "loss": 2.5139, + "step": 31199 + }, + { + "epoch": 0.925184592118139, + "grad_norm": 0.06904558837413788, + "learning_rate": 1.4036961067714072e-05, + "loss": 2.4898, + "step": 31200 + }, + { + "epoch": 0.9252142454704504, + "grad_norm": 0.07116176187992096, + "learning_rate": 1.4025892767272785e-05, + "loss": 2.559, + "step": 31201 + }, + { + "epoch": 0.9252438988227619, + "grad_norm": 0.06857238709926605, + "learning_rate": 1.4014828770215704e-05, + "loss": 2.5268, + "step": 31202 + }, + { + "epoch": 0.9252735521750733, + "grad_norm": 0.07117311656475067, + "learning_rate": 1.4003769076640637e-05, + "loss": 2.5545, + "step": 31203 + }, + { + "epoch": 0.9253032055273849, + "grad_norm": 0.06931857764720917, + "learning_rate": 1.3992713686645674e-05, + "loss": 2.5404, + "step": 31204 + }, + { + "epoch": 0.9253328588796963, + "grad_norm": 0.07097350060939789, + "learning_rate": 1.3981662600328682e-05, + "loss": 2.5259, + "step": 31205 + }, + { + "epoch": 0.9253625122320078, + "grad_norm": 0.0685943216085434, + "learning_rate": 1.3970615817787413e-05, + "loss": 2.5432, + "step": 31206 + }, + { + "epoch": 0.9253921655843194, + "grad_norm": 0.06739795207977295, + "learning_rate": 1.3959573339119792e-05, + "loss": 2.4847, + "step": 31207 + }, + { + "epoch": 0.9254218189366308, + "grad_norm": 0.07285348325967789, + "learning_rate": 1.3948535164423626e-05, + "loss": 2.5431, + "step": 31208 + }, + { + "epoch": 0.9254514722889423, + "grad_norm": 0.06912361830472946, + "learning_rate": 1.3937501293796562e-05, + "loss": 2.5598, + "step": 31209 + }, + { + "epoch": 0.9254811256412537, + "grad_norm": 0.06935829669237137, + "learning_rate": 1.3926471727336353e-05, + "loss": 2.5276, + "step": 31210 + }, + { + "epoch": 0.9255107789935653, + "grad_norm": 0.06999124586582184, + "learning_rate": 1.3915446465140702e-05, + "loss": 2.5434, + "step": 31211 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 0.07002127915620804, + "learning_rate": 1.3904425507307194e-05, + "loss": 2.5666, + "step": 31212 + }, + { + "epoch": 0.9255700856981882, + "grad_norm": 0.06940782070159912, + "learning_rate": 1.389340885393342e-05, + "loss": 2.554, + "step": 31213 + }, + { + "epoch": 0.9255997390504996, + "grad_norm": 0.06940359622240067, + "learning_rate": 1.3882396505116968e-05, + "loss": 2.5332, + "step": 31214 + }, + { + "epoch": 0.9256293924028112, + "grad_norm": 0.06814739108085632, + "learning_rate": 1.3871388460955314e-05, + "loss": 2.5459, + "step": 31215 + }, + { + "epoch": 0.9256590457551226, + "grad_norm": 0.06790042668581009, + "learning_rate": 1.3860384721545993e-05, + "loss": 2.5141, + "step": 31216 + }, + { + "epoch": 0.9256886991074341, + "grad_norm": 0.07418175786733627, + "learning_rate": 1.384938528698637e-05, + "loss": 2.5176, + "step": 31217 + }, + { + "epoch": 0.9257183524597455, + "grad_norm": 0.0700574517250061, + "learning_rate": 1.3838390157373926e-05, + "loss": 2.534, + "step": 31218 + }, + { + "epoch": 0.9257480058120571, + "grad_norm": 0.06894142180681229, + "learning_rate": 1.3827399332805968e-05, + "loss": 2.5365, + "step": 31219 + }, + { + "epoch": 0.9257776591643685, + "grad_norm": 0.07153613865375519, + "learning_rate": 1.3816412813379864e-05, + "loss": 2.5437, + "step": 31220 + }, + { + "epoch": 0.92580731251668, + "grad_norm": 0.06818603724241257, + "learning_rate": 1.3805430599192815e-05, + "loss": 2.5328, + "step": 31221 + }, + { + "epoch": 0.9258369658689914, + "grad_norm": 0.0708366334438324, + "learning_rate": 1.3794452690342186e-05, + "loss": 2.5369, + "step": 31222 + }, + { + "epoch": 0.925866619221303, + "grad_norm": 0.0697421059012413, + "learning_rate": 1.3783479086925122e-05, + "loss": 2.5364, + "step": 31223 + }, + { + "epoch": 0.9258962725736144, + "grad_norm": 0.06822344660758972, + "learning_rate": 1.3772509789038823e-05, + "loss": 2.5599, + "step": 31224 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.07061220705509186, + "learning_rate": 1.3761544796780379e-05, + "loss": 2.4932, + "step": 31225 + }, + { + "epoch": 0.9259555792782374, + "grad_norm": 0.06930644810199738, + "learning_rate": 1.3750584110246933e-05, + "loss": 2.5759, + "step": 31226 + }, + { + "epoch": 0.9259852326305489, + "grad_norm": 0.07129010558128357, + "learning_rate": 1.373962772953552e-05, + "loss": 2.5096, + "step": 31227 + }, + { + "epoch": 0.9260148859828604, + "grad_norm": 0.06928324699401855, + "learning_rate": 1.3728675654743173e-05, + "loss": 2.5577, + "step": 31228 + }, + { + "epoch": 0.9260445393351718, + "grad_norm": 0.0667516440153122, + "learning_rate": 1.3717727885966869e-05, + "loss": 2.55, + "step": 31229 + }, + { + "epoch": 0.9260741926874834, + "grad_norm": 0.07192626595497131, + "learning_rate": 1.3706784423303587e-05, + "loss": 2.5564, + "step": 31230 + }, + { + "epoch": 0.9261038460397948, + "grad_norm": 0.07189304381608963, + "learning_rate": 1.3695845266850137e-05, + "loss": 2.5512, + "step": 31231 + }, + { + "epoch": 0.9261334993921063, + "grad_norm": 0.07085414230823517, + "learning_rate": 1.36849104167035e-05, + "loss": 2.5479, + "step": 31232 + }, + { + "epoch": 0.9261631527444177, + "grad_norm": 0.06618566811084747, + "learning_rate": 1.367397987296043e-05, + "loss": 2.4979, + "step": 31233 + }, + { + "epoch": 0.9261928060967293, + "grad_norm": 0.07365208864212036, + "learning_rate": 1.3663053635717793e-05, + "loss": 2.5565, + "step": 31234 + }, + { + "epoch": 0.9262224594490407, + "grad_norm": 0.07248222827911377, + "learning_rate": 1.3652131705072235e-05, + "loss": 2.5329, + "step": 31235 + }, + { + "epoch": 0.9262521128013522, + "grad_norm": 0.06842928379774094, + "learning_rate": 1.3641214081120512e-05, + "loss": 2.5258, + "step": 31236 + }, + { + "epoch": 0.9262817661536636, + "grad_norm": 0.06674874573945999, + "learning_rate": 1.363030076395938e-05, + "loss": 2.5373, + "step": 31237 + }, + { + "epoch": 0.9263114195059752, + "grad_norm": 0.07269828021526337, + "learning_rate": 1.3619391753685428e-05, + "loss": 2.5642, + "step": 31238 + }, + { + "epoch": 0.9263410728582866, + "grad_norm": 0.07354871183633804, + "learning_rate": 1.360848705039519e-05, + "loss": 2.5586, + "step": 31239 + }, + { + "epoch": 0.9263707262105981, + "grad_norm": 0.06564220786094666, + "learning_rate": 1.3597586654185312e-05, + "loss": 2.5228, + "step": 31240 + }, + { + "epoch": 0.9264003795629095, + "grad_norm": 0.06897047907114029, + "learning_rate": 1.3586690565152326e-05, + "loss": 2.5198, + "step": 31241 + }, + { + "epoch": 0.9264300329152211, + "grad_norm": 0.06932424753904343, + "learning_rate": 1.3575798783392657e-05, + "loss": 2.5605, + "step": 31242 + }, + { + "epoch": 0.9264596862675325, + "grad_norm": 0.06929946690797806, + "learning_rate": 1.356491130900278e-05, + "loss": 2.5448, + "step": 31243 + }, + { + "epoch": 0.926489339619844, + "grad_norm": 0.07129020988941193, + "learning_rate": 1.3554028142079122e-05, + "loss": 2.5592, + "step": 31244 + }, + { + "epoch": 0.9265189929721555, + "grad_norm": 0.07154785841703415, + "learning_rate": 1.3543149282718047e-05, + "loss": 2.5296, + "step": 31245 + }, + { + "epoch": 0.926548646324467, + "grad_norm": 0.06741422414779663, + "learning_rate": 1.3532274731015925e-05, + "loss": 2.5623, + "step": 31246 + }, + { + "epoch": 0.9265782996767784, + "grad_norm": 0.068540558218956, + "learning_rate": 1.3521404487068956e-05, + "loss": 2.5741, + "step": 31247 + }, + { + "epoch": 0.9266079530290899, + "grad_norm": 0.06747428327798843, + "learning_rate": 1.3510538550973395e-05, + "loss": 2.5212, + "step": 31248 + }, + { + "epoch": 0.9266376063814015, + "grad_norm": 0.07331761717796326, + "learning_rate": 1.3499676922825555e-05, + "loss": 2.5575, + "step": 31249 + }, + { + "epoch": 0.9266672597337129, + "grad_norm": 0.07215538620948792, + "learning_rate": 1.3488819602721636e-05, + "loss": 2.547, + "step": 31250 + }, + { + "epoch": 0.9266969130860244, + "grad_norm": 0.06856712698936462, + "learning_rate": 1.3477966590757673e-05, + "loss": 2.5582, + "step": 31251 + }, + { + "epoch": 0.9267265664383358, + "grad_norm": 0.07133277505636215, + "learning_rate": 1.3467117887029867e-05, + "loss": 2.5761, + "step": 31252 + }, + { + "epoch": 0.9267562197906474, + "grad_norm": 0.07162017375230789, + "learning_rate": 1.3456273491634251e-05, + "loss": 2.5831, + "step": 31253 + }, + { + "epoch": 0.9267858731429588, + "grad_norm": 0.06951234489679337, + "learning_rate": 1.3445433404666808e-05, + "loss": 2.5366, + "step": 31254 + }, + { + "epoch": 0.9268155264952703, + "grad_norm": 0.06800162047147751, + "learning_rate": 1.3434597626223622e-05, + "loss": 2.5247, + "step": 31255 + }, + { + "epoch": 0.9268451798475817, + "grad_norm": 0.06775116920471191, + "learning_rate": 1.3423766156400563e-05, + "loss": 2.4923, + "step": 31256 + }, + { + "epoch": 0.9268748331998933, + "grad_norm": 0.06970956176519394, + "learning_rate": 1.341293899529361e-05, + "loss": 2.5426, + "step": 31257 + }, + { + "epoch": 0.9269044865522047, + "grad_norm": 0.06798429787158966, + "learning_rate": 1.3402116142998522e-05, + "loss": 2.5077, + "step": 31258 + }, + { + "epoch": 0.9269341399045162, + "grad_norm": 0.06847050040960312, + "learning_rate": 1.3391297599611274e-05, + "loss": 2.5428, + "step": 31259 + }, + { + "epoch": 0.9269637932568277, + "grad_norm": 0.0673544630408287, + "learning_rate": 1.3380483365227625e-05, + "loss": 2.5288, + "step": 31260 + }, + { + "epoch": 0.9269934466091392, + "grad_norm": 0.06710716336965561, + "learning_rate": 1.336967343994322e-05, + "loss": 2.5684, + "step": 31261 + }, + { + "epoch": 0.9270230999614506, + "grad_norm": 0.07039292901754379, + "learning_rate": 1.3358867823853982e-05, + "loss": 2.5283, + "step": 31262 + }, + { + "epoch": 0.9270527533137621, + "grad_norm": 0.07024101167917252, + "learning_rate": 1.3348066517055446e-05, + "loss": 2.5193, + "step": 31263 + }, + { + "epoch": 0.9270824066660736, + "grad_norm": 0.06711353361606598, + "learning_rate": 1.3337269519643368e-05, + "loss": 2.5307, + "step": 31264 + }, + { + "epoch": 0.9271120600183851, + "grad_norm": 0.06966317445039749, + "learning_rate": 1.3326476831713341e-05, + "loss": 2.5263, + "step": 31265 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 0.06923684477806091, + "learning_rate": 1.3315688453360842e-05, + "loss": 2.5393, + "step": 31266 + }, + { + "epoch": 0.927171366723008, + "grad_norm": 0.068048395216465, + "learning_rate": 1.330490438468146e-05, + "loss": 2.4981, + "step": 31267 + }, + { + "epoch": 0.9272010200753195, + "grad_norm": 0.06772613525390625, + "learning_rate": 1.3294124625770677e-05, + "loss": 2.5481, + "step": 31268 + }, + { + "epoch": 0.927230673427631, + "grad_norm": 0.0687420666217804, + "learning_rate": 1.328334917672397e-05, + "loss": 2.5556, + "step": 31269 + }, + { + "epoch": 0.9272603267799425, + "grad_norm": 0.07215239107608795, + "learning_rate": 1.3272578037636763e-05, + "loss": 2.5528, + "step": 31270 + }, + { + "epoch": 0.9272899801322539, + "grad_norm": 0.07004864513874054, + "learning_rate": 1.3261811208604368e-05, + "loss": 2.5244, + "step": 31271 + }, + { + "epoch": 0.9273196334845655, + "grad_norm": 0.07026942074298859, + "learning_rate": 1.3251048689722266e-05, + "loss": 2.528, + "step": 31272 + }, + { + "epoch": 0.9273492868368769, + "grad_norm": 0.06846613436937332, + "learning_rate": 1.3240290481085603e-05, + "loss": 2.5475, + "step": 31273 + }, + { + "epoch": 0.9273789401891884, + "grad_norm": 0.07363912463188171, + "learning_rate": 1.3229536582789748e-05, + "loss": 2.5602, + "step": 31274 + }, + { + "epoch": 0.9274085935414998, + "grad_norm": 0.07159467041492462, + "learning_rate": 1.3218786994929899e-05, + "loss": 2.5669, + "step": 31275 + }, + { + "epoch": 0.9274382468938114, + "grad_norm": 0.07030463963747025, + "learning_rate": 1.320804171760126e-05, + "loss": 2.5464, + "step": 31276 + }, + { + "epoch": 0.9274679002461228, + "grad_norm": 0.07668036222457886, + "learning_rate": 1.3197300750898977e-05, + "loss": 2.5247, + "step": 31277 + }, + { + "epoch": 0.9274975535984343, + "grad_norm": 0.0691290944814682, + "learning_rate": 1.3186564094918141e-05, + "loss": 2.5252, + "step": 31278 + }, + { + "epoch": 0.9275272069507458, + "grad_norm": 0.06817051023244858, + "learning_rate": 1.3175831749753842e-05, + "loss": 2.582, + "step": 31279 + }, + { + "epoch": 0.9275568603030573, + "grad_norm": 0.06854818016290665, + "learning_rate": 1.3165103715501114e-05, + "loss": 2.5281, + "step": 31280 + }, + { + "epoch": 0.9275865136553687, + "grad_norm": 0.06735709309577942, + "learning_rate": 1.3154379992254938e-05, + "loss": 2.5717, + "step": 31281 + }, + { + "epoch": 0.9276161670076802, + "grad_norm": 0.06973959505558014, + "learning_rate": 1.314366058011035e-05, + "loss": 2.5223, + "step": 31282 + }, + { + "epoch": 0.9276458203599917, + "grad_norm": 0.07078328728675842, + "learning_rate": 1.3132945479162161e-05, + "loss": 2.5278, + "step": 31283 + }, + { + "epoch": 0.9276754737123032, + "grad_norm": 0.06540832668542862, + "learning_rate": 1.3122234689505296e-05, + "loss": 2.537, + "step": 31284 + }, + { + "epoch": 0.9277051270646146, + "grad_norm": 0.06507191807031631, + "learning_rate": 1.311152821123468e-05, + "loss": 2.5327, + "step": 31285 + }, + { + "epoch": 0.9277347804169261, + "grad_norm": 0.0686507299542427, + "learning_rate": 1.3100826044445014e-05, + "loss": 2.5174, + "step": 31286 + }, + { + "epoch": 0.9277644337692376, + "grad_norm": 0.06965940445661545, + "learning_rate": 1.309012818923111e-05, + "loss": 2.537, + "step": 31287 + }, + { + "epoch": 0.9277940871215491, + "grad_norm": 0.07042162120342255, + "learning_rate": 1.3079434645687671e-05, + "loss": 2.5403, + "step": 31288 + }, + { + "epoch": 0.9278237404738606, + "grad_norm": 0.06818129122257233, + "learning_rate": 1.3068745413909455e-05, + "loss": 2.5565, + "step": 31289 + }, + { + "epoch": 0.927853393826172, + "grad_norm": 0.07066689431667328, + "learning_rate": 1.3058060493991053e-05, + "loss": 2.5292, + "step": 31290 + }, + { + "epoch": 0.9278830471784836, + "grad_norm": 0.06588241457939148, + "learning_rate": 1.3047379886027111e-05, + "loss": 2.5068, + "step": 31291 + }, + { + "epoch": 0.927912700530795, + "grad_norm": 0.07178125530481339, + "learning_rate": 1.303670359011222e-05, + "loss": 2.5375, + "step": 31292 + }, + { + "epoch": 0.9279423538831065, + "grad_norm": 0.06804177910089493, + "learning_rate": 1.3026031606340915e-05, + "loss": 2.5345, + "step": 31293 + }, + { + "epoch": 0.927972007235418, + "grad_norm": 0.07026311010122299, + "learning_rate": 1.3015363934807678e-05, + "loss": 2.5943, + "step": 31294 + }, + { + "epoch": 0.9280016605877295, + "grad_norm": 0.06951095908880234, + "learning_rate": 1.3004700575606987e-05, + "loss": 2.529, + "step": 31295 + }, + { + "epoch": 0.9280313139400409, + "grad_norm": 0.07319255918264389, + "learning_rate": 1.2994041528833267e-05, + "loss": 2.5519, + "step": 31296 + }, + { + "epoch": 0.9280609672923524, + "grad_norm": 0.07054246962070465, + "learning_rate": 1.2983386794580888e-05, + "loss": 2.5097, + "step": 31297 + }, + { + "epoch": 0.9280906206446639, + "grad_norm": 0.06809824705123901, + "learning_rate": 1.2972736372944216e-05, + "loss": 2.5226, + "step": 31298 + }, + { + "epoch": 0.9281202739969754, + "grad_norm": 0.07251190394163132, + "learning_rate": 1.2962090264017568e-05, + "loss": 2.5315, + "step": 31299 + }, + { + "epoch": 0.9281499273492868, + "grad_norm": 0.06959850341081619, + "learning_rate": 1.2951448467895199e-05, + "loss": 2.5411, + "step": 31300 + }, + { + "epoch": 0.9281795807015983, + "grad_norm": 0.06851184368133545, + "learning_rate": 1.294081098467137e-05, + "loss": 2.5427, + "step": 31301 + }, + { + "epoch": 0.9282092340539098, + "grad_norm": 0.06747006624937057, + "learning_rate": 1.2930177814440225e-05, + "loss": 2.4849, + "step": 31302 + }, + { + "epoch": 0.9282388874062213, + "grad_norm": 0.07161643356084824, + "learning_rate": 1.2919548957296024e-05, + "loss": 2.5295, + "step": 31303 + }, + { + "epoch": 0.9282685407585327, + "grad_norm": 0.06820129603147507, + "learning_rate": 1.2908924413332746e-05, + "loss": 2.549, + "step": 31304 + }, + { + "epoch": 0.9282981941108442, + "grad_norm": 0.06873609870672226, + "learning_rate": 1.2898304182644594e-05, + "loss": 2.5358, + "step": 31305 + }, + { + "epoch": 0.9283278474631557, + "grad_norm": 0.06900583952665329, + "learning_rate": 1.2887688265325604e-05, + "loss": 2.5781, + "step": 31306 + }, + { + "epoch": 0.9283575008154672, + "grad_norm": 0.07192658632993698, + "learning_rate": 1.2877076661469699e-05, + "loss": 2.5157, + "step": 31307 + }, + { + "epoch": 0.9283871541677786, + "grad_norm": 0.06880888342857361, + "learning_rate": 1.2866469371170864e-05, + "loss": 2.5373, + "step": 31308 + }, + { + "epoch": 0.9284168075200901, + "grad_norm": 0.0722198337316513, + "learning_rate": 1.285586639452313e-05, + "loss": 2.5404, + "step": 31309 + }, + { + "epoch": 0.9284464608724017, + "grad_norm": 0.0676935613155365, + "learning_rate": 1.2845267731620314e-05, + "loss": 2.5533, + "step": 31310 + }, + { + "epoch": 0.9284761142247131, + "grad_norm": 0.07064984738826752, + "learning_rate": 1.2834673382556227e-05, + "loss": 2.5236, + "step": 31311 + }, + { + "epoch": 0.9285057675770246, + "grad_norm": 0.0697249099612236, + "learning_rate": 1.2824083347424743e-05, + "loss": 2.5583, + "step": 31312 + }, + { + "epoch": 0.928535420929336, + "grad_norm": 0.06717725098133087, + "learning_rate": 1.2813497626319614e-05, + "loss": 2.5215, + "step": 31313 + }, + { + "epoch": 0.9285650742816476, + "grad_norm": 0.06755644828081131, + "learning_rate": 1.2802916219334604e-05, + "loss": 2.5472, + "step": 31314 + }, + { + "epoch": 0.928594727633959, + "grad_norm": 0.06806100904941559, + "learning_rate": 1.2792339126563358e-05, + "loss": 2.5412, + "step": 31315 + }, + { + "epoch": 0.9286243809862705, + "grad_norm": 0.06936919689178467, + "learning_rate": 1.2781766348099632e-05, + "loss": 2.5409, + "step": 31316 + }, + { + "epoch": 0.928654034338582, + "grad_norm": 0.06926616281270981, + "learning_rate": 1.2771197884036966e-05, + "loss": 2.5365, + "step": 31317 + }, + { + "epoch": 0.9286836876908935, + "grad_norm": 0.06864358484745026, + "learning_rate": 1.2760633734468951e-05, + "loss": 2.566, + "step": 31318 + }, + { + "epoch": 0.9287133410432049, + "grad_norm": 0.06925315409898758, + "learning_rate": 1.2750073899489179e-05, + "loss": 2.5547, + "step": 31319 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 0.06909492611885071, + "learning_rate": 1.2739518379191183e-05, + "loss": 2.5671, + "step": 31320 + }, + { + "epoch": 0.9287726477478279, + "grad_norm": 0.07322709262371063, + "learning_rate": 1.2728967173668393e-05, + "loss": 2.4934, + "step": 31321 + }, + { + "epoch": 0.9288023011001394, + "grad_norm": 0.06812586635351181, + "learning_rate": 1.2718420283014175e-05, + "loss": 2.544, + "step": 31322 + }, + { + "epoch": 0.9288319544524508, + "grad_norm": 0.06433103233575821, + "learning_rate": 1.2707877707322013e-05, + "loss": 2.5481, + "step": 31323 + }, + { + "epoch": 0.9288616078047623, + "grad_norm": 0.06659523397684097, + "learning_rate": 1.2697339446685218e-05, + "loss": 2.551, + "step": 31324 + }, + { + "epoch": 0.9288912611570738, + "grad_norm": 0.06863699108362198, + "learning_rate": 1.2686805501197163e-05, + "loss": 2.5316, + "step": 31325 + }, + { + "epoch": 0.9289209145093853, + "grad_norm": 0.0713774710893631, + "learning_rate": 1.2676275870950994e-05, + "loss": 2.529, + "step": 31326 + }, + { + "epoch": 0.9289505678616967, + "grad_norm": 0.07066932320594788, + "learning_rate": 1.2665750556040135e-05, + "loss": 2.5713, + "step": 31327 + }, + { + "epoch": 0.9289802212140083, + "grad_norm": 0.06912265717983246, + "learning_rate": 1.2655229556557734e-05, + "loss": 2.5526, + "step": 31328 + }, + { + "epoch": 0.9290098745663197, + "grad_norm": 0.0656943991780281, + "learning_rate": 1.2644712872596887e-05, + "loss": 2.5712, + "step": 31329 + }, + { + "epoch": 0.9290395279186312, + "grad_norm": 0.06875965744256973, + "learning_rate": 1.2634200504250736e-05, + "loss": 2.5342, + "step": 31330 + }, + { + "epoch": 0.9290691812709427, + "grad_norm": 0.06804787367582321, + "learning_rate": 1.262369245161249e-05, + "loss": 2.5336, + "step": 31331 + }, + { + "epoch": 0.9290988346232542, + "grad_norm": 0.06997310370206833, + "learning_rate": 1.2613188714775014e-05, + "loss": 2.5067, + "step": 31332 + }, + { + "epoch": 0.9291284879755657, + "grad_norm": 0.06612509489059448, + "learning_rate": 1.2602689293831405e-05, + "loss": 2.4898, + "step": 31333 + }, + { + "epoch": 0.9291581413278771, + "grad_norm": 0.06545094400644302, + "learning_rate": 1.2592194188874694e-05, + "loss": 2.5176, + "step": 31334 + }, + { + "epoch": 0.9291877946801886, + "grad_norm": 0.06573357433080673, + "learning_rate": 1.2581703399997757e-05, + "loss": 2.5899, + "step": 31335 + }, + { + "epoch": 0.9292174480325001, + "grad_norm": 0.06968872249126434, + "learning_rate": 1.257121692729346e-05, + "loss": 2.5453, + "step": 31336 + }, + { + "epoch": 0.9292471013848116, + "grad_norm": 0.07121870666742325, + "learning_rate": 1.2560734770854732e-05, + "loss": 2.5376, + "step": 31337 + }, + { + "epoch": 0.929276754737123, + "grad_norm": 0.06692470610141754, + "learning_rate": 1.2550256930774384e-05, + "loss": 2.5332, + "step": 31338 + }, + { + "epoch": 0.9293064080894345, + "grad_norm": 0.07055623829364777, + "learning_rate": 1.2539783407145067e-05, + "loss": 2.5259, + "step": 31339 + }, + { + "epoch": 0.929336061441746, + "grad_norm": 0.06540405005216599, + "learning_rate": 1.252931420005976e-05, + "loss": 2.5928, + "step": 31340 + }, + { + "epoch": 0.9293657147940575, + "grad_norm": 0.06725702434778214, + "learning_rate": 1.2518849309611058e-05, + "loss": 2.5502, + "step": 31341 + }, + { + "epoch": 0.9293953681463689, + "grad_norm": 0.0692981630563736, + "learning_rate": 1.2508388735891607e-05, + "loss": 2.5535, + "step": 31342 + }, + { + "epoch": 0.9294250214986804, + "grad_norm": 0.06928760558366776, + "learning_rate": 1.2497932478994001e-05, + "loss": 2.5013, + "step": 31343 + }, + { + "epoch": 0.9294546748509919, + "grad_norm": 0.06761587411165237, + "learning_rate": 1.2487480539010887e-05, + "loss": 2.542, + "step": 31344 + }, + { + "epoch": 0.9294843282033034, + "grad_norm": 0.07009384036064148, + "learning_rate": 1.2477032916034858e-05, + "loss": 2.5282, + "step": 31345 + }, + { + "epoch": 0.9295139815556148, + "grad_norm": 0.06709464639425278, + "learning_rate": 1.246658961015834e-05, + "loss": 2.5152, + "step": 31346 + }, + { + "epoch": 0.9295436349079264, + "grad_norm": 0.06920673698186874, + "learning_rate": 1.2456150621473872e-05, + "loss": 2.5197, + "step": 31347 + }, + { + "epoch": 0.9295732882602378, + "grad_norm": 0.06820739060640335, + "learning_rate": 1.2445715950073876e-05, + "loss": 2.5452, + "step": 31348 + }, + { + "epoch": 0.9296029416125493, + "grad_norm": 0.06854090094566345, + "learning_rate": 1.2435285596050783e-05, + "loss": 2.5302, + "step": 31349 + }, + { + "epoch": 0.9296325949648607, + "grad_norm": 0.07288612425327301, + "learning_rate": 1.2424859559496903e-05, + "loss": 2.5337, + "step": 31350 + }, + { + "epoch": 0.9296622483171723, + "grad_norm": 0.07012078911066055, + "learning_rate": 1.2414437840504555e-05, + "loss": 2.5559, + "step": 31351 + }, + { + "epoch": 0.9296919016694838, + "grad_norm": 0.06824079155921936, + "learning_rate": 1.2404020439166053e-05, + "loss": 2.5499, + "step": 31352 + }, + { + "epoch": 0.9297215550217952, + "grad_norm": 0.0702405497431755, + "learning_rate": 1.239360735557371e-05, + "loss": 2.5303, + "step": 31353 + }, + { + "epoch": 0.9297512083741067, + "grad_norm": 0.06968330591917038, + "learning_rate": 1.2383198589819622e-05, + "loss": 2.5813, + "step": 31354 + }, + { + "epoch": 0.9297808617264182, + "grad_norm": 0.07019055634737015, + "learning_rate": 1.2372794141995991e-05, + "loss": 2.5194, + "step": 31355 + }, + { + "epoch": 0.9298105150787297, + "grad_norm": 0.06959272176027298, + "learning_rate": 1.2362394012195022e-05, + "loss": 2.5494, + "step": 31356 + }, + { + "epoch": 0.9298401684310411, + "grad_norm": 0.06851932406425476, + "learning_rate": 1.2351998200508697e-05, + "loss": 2.5242, + "step": 31357 + }, + { + "epoch": 0.9298698217833526, + "grad_norm": 0.0659974068403244, + "learning_rate": 1.2341606707029162e-05, + "loss": 2.5642, + "step": 31358 + }, + { + "epoch": 0.9298994751356641, + "grad_norm": 0.06839341670274734, + "learning_rate": 1.2331219531848403e-05, + "loss": 2.5345, + "step": 31359 + }, + { + "epoch": 0.9299291284879756, + "grad_norm": 0.0668870061635971, + "learning_rate": 1.23208366750584e-05, + "loss": 2.5392, + "step": 31360 + }, + { + "epoch": 0.929958781840287, + "grad_norm": 0.0729714184999466, + "learning_rate": 1.231045813675108e-05, + "loss": 2.536, + "step": 31361 + }, + { + "epoch": 0.9299884351925986, + "grad_norm": 0.0643211081624031, + "learning_rate": 1.2300083917018422e-05, + "loss": 2.521, + "step": 31362 + }, + { + "epoch": 0.93001808854491, + "grad_norm": 0.0692116916179657, + "learning_rate": 1.2289714015952192e-05, + "loss": 2.547, + "step": 31363 + }, + { + "epoch": 0.9300477418972215, + "grad_norm": 0.0708979070186615, + "learning_rate": 1.2279348433644256e-05, + "loss": 2.5477, + "step": 31364 + }, + { + "epoch": 0.9300773952495329, + "grad_norm": 0.06493222713470459, + "learning_rate": 1.2268987170186375e-05, + "loss": 2.5047, + "step": 31365 + }, + { + "epoch": 0.9301070486018445, + "grad_norm": 0.06909909099340439, + "learning_rate": 1.2258630225670364e-05, + "loss": 2.5243, + "step": 31366 + }, + { + "epoch": 0.9301367019541559, + "grad_norm": 0.07039815932512283, + "learning_rate": 1.2248277600187929e-05, + "loss": 2.5445, + "step": 31367 + }, + { + "epoch": 0.9301663553064674, + "grad_norm": 0.06955072283744812, + "learning_rate": 1.2237929293830718e-05, + "loss": 2.5439, + "step": 31368 + }, + { + "epoch": 0.9301960086587788, + "grad_norm": 0.0692143365740776, + "learning_rate": 1.2227585306690325e-05, + "loss": 2.5186, + "step": 31369 + }, + { + "epoch": 0.9302256620110904, + "grad_norm": 0.06853162497282028, + "learning_rate": 1.2217245638858399e-05, + "loss": 2.536, + "step": 31370 + }, + { + "epoch": 0.9302553153634018, + "grad_norm": 0.07050022482872009, + "learning_rate": 1.2206910290426477e-05, + "loss": 2.5585, + "step": 31371 + }, + { + "epoch": 0.9302849687157133, + "grad_norm": 0.06809788197278976, + "learning_rate": 1.2196579261486152e-05, + "loss": 2.5405, + "step": 31372 + }, + { + "epoch": 0.9303146220680248, + "grad_norm": 0.06743781268596649, + "learning_rate": 1.2186252552128795e-05, + "loss": 2.5428, + "step": 31373 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 0.07150936871767044, + "learning_rate": 1.2175930162445891e-05, + "loss": 2.5188, + "step": 31374 + }, + { + "epoch": 0.9303739287726478, + "grad_norm": 0.07484383136034012, + "learning_rate": 1.2165612092528922e-05, + "loss": 2.5386, + "step": 31375 + }, + { + "epoch": 0.9304035821249592, + "grad_norm": 0.0696410983800888, + "learning_rate": 1.2155298342469202e-05, + "loss": 2.5485, + "step": 31376 + }, + { + "epoch": 0.9304332354772707, + "grad_norm": 0.06888733059167862, + "learning_rate": 1.214498891235799e-05, + "loss": 2.5233, + "step": 31377 + }, + { + "epoch": 0.9304628888295822, + "grad_norm": 0.07110844552516937, + "learning_rate": 1.2134683802286605e-05, + "loss": 2.5229, + "step": 31378 + }, + { + "epoch": 0.9304925421818937, + "grad_norm": 0.06920646876096725, + "learning_rate": 1.2124383012346419e-05, + "loss": 2.5471, + "step": 31379 + }, + { + "epoch": 0.9305221955342051, + "grad_norm": 0.07099522650241852, + "learning_rate": 1.2114086542628522e-05, + "loss": 2.5273, + "step": 31380 + }, + { + "epoch": 0.9305518488865167, + "grad_norm": 0.07061246782541275, + "learning_rate": 1.2103794393224122e-05, + "loss": 2.5491, + "step": 31381 + }, + { + "epoch": 0.9305815022388281, + "grad_norm": 0.06917882710695267, + "learning_rate": 1.2093506564224421e-05, + "loss": 2.5513, + "step": 31382 + }, + { + "epoch": 0.9306111555911396, + "grad_norm": 0.06946389377117157, + "learning_rate": 1.2083223055720405e-05, + "loss": 2.5299, + "step": 31383 + }, + { + "epoch": 0.930640808943451, + "grad_norm": 0.06915104389190674, + "learning_rate": 1.2072943867803222e-05, + "loss": 2.5219, + "step": 31384 + }, + { + "epoch": 0.9306704622957626, + "grad_norm": 0.06968989223241806, + "learning_rate": 1.2062669000563908e-05, + "loss": 2.5568, + "step": 31385 + }, + { + "epoch": 0.930700115648074, + "grad_norm": 0.07567206770181656, + "learning_rate": 1.205239845409345e-05, + "loss": 2.5413, + "step": 31386 + }, + { + "epoch": 0.9307297690003855, + "grad_norm": 0.07281864434480667, + "learning_rate": 1.204213222848266e-05, + "loss": 2.5456, + "step": 31387 + }, + { + "epoch": 0.9307594223526969, + "grad_norm": 0.06857378780841827, + "learning_rate": 1.203187032382258e-05, + "loss": 2.5085, + "step": 31388 + }, + { + "epoch": 0.9307890757050085, + "grad_norm": 0.06599945574998856, + "learning_rate": 1.202161274020408e-05, + "loss": 2.5698, + "step": 31389 + }, + { + "epoch": 0.9308187290573199, + "grad_norm": 0.06872137635946274, + "learning_rate": 1.201135947771792e-05, + "loss": 2.55, + "step": 31390 + }, + { + "epoch": 0.9308483824096314, + "grad_norm": 0.07102268189191818, + "learning_rate": 1.2001110536454917e-05, + "loss": 2.5351, + "step": 31391 + }, + { + "epoch": 0.9308780357619428, + "grad_norm": 0.0714440867304802, + "learning_rate": 1.1990865916505834e-05, + "loss": 2.54, + "step": 31392 + }, + { + "epoch": 0.9309076891142544, + "grad_norm": 0.07183375209569931, + "learning_rate": 1.1980625617961427e-05, + "loss": 2.5476, + "step": 31393 + }, + { + "epoch": 0.9309373424665659, + "grad_norm": 0.07063006609678268, + "learning_rate": 1.1970389640912404e-05, + "loss": 2.5766, + "step": 31394 + }, + { + "epoch": 0.9309669958188773, + "grad_norm": 0.06777821481227875, + "learning_rate": 1.1960157985449305e-05, + "loss": 2.5695, + "step": 31395 + }, + { + "epoch": 0.9309966491711889, + "grad_norm": 0.06771157681941986, + "learning_rate": 1.1949930651662776e-05, + "loss": 2.5084, + "step": 31396 + }, + { + "epoch": 0.9310263025235003, + "grad_norm": 0.06847525388002396, + "learning_rate": 1.1939707639643416e-05, + "loss": 2.5477, + "step": 31397 + }, + { + "epoch": 0.9310559558758118, + "grad_norm": 0.07369914650917053, + "learning_rate": 1.1929488949481649e-05, + "loss": 2.5675, + "step": 31398 + }, + { + "epoch": 0.9310856092281232, + "grad_norm": 0.07177239656448364, + "learning_rate": 1.191927458126807e-05, + "loss": 2.5191, + "step": 31399 + }, + { + "epoch": 0.9311152625804348, + "grad_norm": 0.06948309391736984, + "learning_rate": 1.1909064535093106e-05, + "loss": 2.5656, + "step": 31400 + }, + { + "epoch": 0.9311449159327462, + "grad_norm": 0.0691593810915947, + "learning_rate": 1.1898858811047131e-05, + "loss": 2.5391, + "step": 31401 + }, + { + "epoch": 0.9311745692850577, + "grad_norm": 0.07098385691642761, + "learning_rate": 1.1888657409220571e-05, + "loss": 2.5541, + "step": 31402 + }, + { + "epoch": 0.9312042226373691, + "grad_norm": 0.06891147792339325, + "learning_rate": 1.1878460329703745e-05, + "loss": 2.5357, + "step": 31403 + }, + { + "epoch": 0.9312338759896807, + "grad_norm": 0.06848546862602234, + "learning_rate": 1.1868267572586855e-05, + "loss": 2.5298, + "step": 31404 + }, + { + "epoch": 0.9312635293419921, + "grad_norm": 0.06847109645605087, + "learning_rate": 1.1858079137960276e-05, + "loss": 2.5288, + "step": 31405 + }, + { + "epoch": 0.9312931826943036, + "grad_norm": 0.06750032305717468, + "learning_rate": 1.1847895025914213e-05, + "loss": 2.5373, + "step": 31406 + }, + { + "epoch": 0.931322836046615, + "grad_norm": 0.06881364434957504, + "learning_rate": 1.1837715236538871e-05, + "loss": 2.5379, + "step": 31407 + }, + { + "epoch": 0.9313524893989266, + "grad_norm": 0.0665774941444397, + "learning_rate": 1.1827539769924345e-05, + "loss": 2.553, + "step": 31408 + }, + { + "epoch": 0.931382142751238, + "grad_norm": 0.06800002604722977, + "learning_rate": 1.1817368626160674e-05, + "loss": 2.5291, + "step": 31409 + }, + { + "epoch": 0.9314117961035495, + "grad_norm": 0.066681407392025, + "learning_rate": 1.1807201805338064e-05, + "loss": 2.5589, + "step": 31410 + }, + { + "epoch": 0.9314414494558609, + "grad_norm": 0.07165597379207611, + "learning_rate": 1.1797039307546442e-05, + "loss": 2.5358, + "step": 31411 + }, + { + "epoch": 0.9314711028081725, + "grad_norm": 0.06963273137807846, + "learning_rate": 1.178688113287585e-05, + "loss": 2.5386, + "step": 31412 + }, + { + "epoch": 0.9315007561604839, + "grad_norm": 0.06868654489517212, + "learning_rate": 1.1776727281416265e-05, + "loss": 2.533, + "step": 31413 + }, + { + "epoch": 0.9315304095127954, + "grad_norm": 0.06619223207235336, + "learning_rate": 1.1766577753257512e-05, + "loss": 2.5475, + "step": 31414 + }, + { + "epoch": 0.931560062865107, + "grad_norm": 0.06970074772834778, + "learning_rate": 1.1756432548489514e-05, + "loss": 2.5713, + "step": 31415 + }, + { + "epoch": 0.9315897162174184, + "grad_norm": 0.06555488705635071, + "learning_rate": 1.1746291667202147e-05, + "loss": 2.5355, + "step": 31416 + }, + { + "epoch": 0.9316193695697299, + "grad_norm": 0.06547281891107559, + "learning_rate": 1.1736155109485114e-05, + "loss": 2.5204, + "step": 31417 + }, + { + "epoch": 0.9316490229220413, + "grad_norm": 0.06820153445005417, + "learning_rate": 1.1726022875428288e-05, + "loss": 2.5541, + "step": 31418 + }, + { + "epoch": 0.9316786762743529, + "grad_norm": 0.06494702398777008, + "learning_rate": 1.1715894965121376e-05, + "loss": 2.5066, + "step": 31419 + }, + { + "epoch": 0.9317083296266643, + "grad_norm": 0.06750718504190445, + "learning_rate": 1.1705771378653973e-05, + "loss": 2.5394, + "step": 31420 + }, + { + "epoch": 0.9317379829789758, + "grad_norm": 0.07205002009868622, + "learning_rate": 1.1695652116115785e-05, + "loss": 2.5472, + "step": 31421 + }, + { + "epoch": 0.9317676363312872, + "grad_norm": 0.06773854047060013, + "learning_rate": 1.1685537177596405e-05, + "loss": 2.5071, + "step": 31422 + }, + { + "epoch": 0.9317972896835988, + "grad_norm": 0.06618481874465942, + "learning_rate": 1.1675426563185432e-05, + "loss": 2.4923, + "step": 31423 + }, + { + "epoch": 0.9318269430359102, + "grad_norm": 0.06621690094470978, + "learning_rate": 1.1665320272972347e-05, + "loss": 2.53, + "step": 31424 + }, + { + "epoch": 0.9318565963882217, + "grad_norm": 0.06842199712991714, + "learning_rate": 1.165521830704669e-05, + "loss": 2.5512, + "step": 31425 + }, + { + "epoch": 0.9318862497405331, + "grad_norm": 0.06849891692399979, + "learning_rate": 1.1645120665497887e-05, + "loss": 2.5353, + "step": 31426 + }, + { + "epoch": 0.9319159030928447, + "grad_norm": 0.07060959190130234, + "learning_rate": 1.1635027348415427e-05, + "loss": 2.5551, + "step": 31427 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 0.06969477236270905, + "learning_rate": 1.162493835588857e-05, + "loss": 2.5644, + "step": 31428 + }, + { + "epoch": 0.9319752097974676, + "grad_norm": 0.06852799654006958, + "learning_rate": 1.1614853688006688e-05, + "loss": 2.5546, + "step": 31429 + }, + { + "epoch": 0.932004863149779, + "grad_norm": 0.0648142546415329, + "learning_rate": 1.1604773344859155e-05, + "loss": 2.5202, + "step": 31430 + }, + { + "epoch": 0.9320345165020906, + "grad_norm": 0.0671733021736145, + "learning_rate": 1.1594697326535175e-05, + "loss": 2.5359, + "step": 31431 + }, + { + "epoch": 0.932064169854402, + "grad_norm": 0.0702439621090889, + "learning_rate": 1.1584625633123957e-05, + "loss": 2.5164, + "step": 31432 + }, + { + "epoch": 0.9320938232067135, + "grad_norm": 0.0718584880232811, + "learning_rate": 1.1574558264714763e-05, + "loss": 2.492, + "step": 31433 + }, + { + "epoch": 0.9321234765590249, + "grad_norm": 0.06742911040782928, + "learning_rate": 1.1564495221396686e-05, + "loss": 2.5463, + "step": 31434 + }, + { + "epoch": 0.9321531299113365, + "grad_norm": 0.06742127984762192, + "learning_rate": 1.1554436503258824e-05, + "loss": 2.497, + "step": 31435 + }, + { + "epoch": 0.932182783263648, + "grad_norm": 0.06940388679504395, + "learning_rate": 1.1544382110390272e-05, + "loss": 2.5539, + "step": 31436 + }, + { + "epoch": 0.9322124366159594, + "grad_norm": 0.06810078769922256, + "learning_rate": 1.1534332042880013e-05, + "loss": 2.5544, + "step": 31437 + }, + { + "epoch": 0.932242089968271, + "grad_norm": 0.07026134431362152, + "learning_rate": 1.1524286300817143e-05, + "loss": 2.553, + "step": 31438 + }, + { + "epoch": 0.9322717433205824, + "grad_norm": 0.07058365643024445, + "learning_rate": 1.1514244884290536e-05, + "loss": 2.5263, + "step": 31439 + }, + { + "epoch": 0.9323013966728939, + "grad_norm": 0.06992394477128983, + "learning_rate": 1.1504207793389177e-05, + "loss": 2.5476, + "step": 31440 + }, + { + "epoch": 0.9323310500252053, + "grad_norm": 0.06762979924678802, + "learning_rate": 1.1494175028201936e-05, + "loss": 2.5651, + "step": 31441 + }, + { + "epoch": 0.9323607033775169, + "grad_norm": 0.06919417530298233, + "learning_rate": 1.1484146588817523e-05, + "loss": 2.533, + "step": 31442 + }, + { + "epoch": 0.9323903567298283, + "grad_norm": 0.07053504139184952, + "learning_rate": 1.1474122475324867e-05, + "loss": 2.5531, + "step": 31443 + }, + { + "epoch": 0.9324200100821398, + "grad_norm": 0.07104754447937012, + "learning_rate": 1.1464102687812728e-05, + "loss": 2.5698, + "step": 31444 + }, + { + "epoch": 0.9324496634344512, + "grad_norm": 0.06696370989084244, + "learning_rate": 1.1454087226369869e-05, + "loss": 2.5415, + "step": 31445 + }, + { + "epoch": 0.9324793167867628, + "grad_norm": 0.0693051666021347, + "learning_rate": 1.1444076091084887e-05, + "loss": 2.5132, + "step": 31446 + }, + { + "epoch": 0.9325089701390742, + "grad_norm": 0.07046646624803543, + "learning_rate": 1.1434069282046433e-05, + "loss": 2.5369, + "step": 31447 + }, + { + "epoch": 0.9325386234913857, + "grad_norm": 0.06957392394542694, + "learning_rate": 1.1424066799343213e-05, + "loss": 2.535, + "step": 31448 + }, + { + "epoch": 0.9325682768436971, + "grad_norm": 0.0661843791604042, + "learning_rate": 1.1414068643063713e-05, + "loss": 2.5172, + "step": 31449 + }, + { + "epoch": 0.9325979301960087, + "grad_norm": 0.06648441404104233, + "learning_rate": 1.1404074813296472e-05, + "loss": 2.5506, + "step": 31450 + }, + { + "epoch": 0.9326275835483201, + "grad_norm": 0.06963825225830078, + "learning_rate": 1.1394085310130087e-05, + "loss": 2.5397, + "step": 31451 + }, + { + "epoch": 0.9326572369006316, + "grad_norm": 0.0676574781537056, + "learning_rate": 1.138410013365293e-05, + "loss": 2.5369, + "step": 31452 + }, + { + "epoch": 0.932686890252943, + "grad_norm": 0.06527530401945114, + "learning_rate": 1.1374119283953432e-05, + "loss": 2.5304, + "step": 31453 + }, + { + "epoch": 0.9327165436052546, + "grad_norm": 0.06924773752689362, + "learning_rate": 1.1364142761119966e-05, + "loss": 2.5127, + "step": 31454 + }, + { + "epoch": 0.932746196957566, + "grad_norm": 0.07201680541038513, + "learning_rate": 1.135417056524085e-05, + "loss": 2.5665, + "step": 31455 + }, + { + "epoch": 0.9327758503098775, + "grad_norm": 0.06524763256311417, + "learning_rate": 1.134420269640446e-05, + "loss": 2.5473, + "step": 31456 + }, + { + "epoch": 0.9328055036621891, + "grad_norm": 0.06935378164052963, + "learning_rate": 1.1334239154699e-05, + "loss": 2.5554, + "step": 31457 + }, + { + "epoch": 0.9328351570145005, + "grad_norm": 0.06571734696626663, + "learning_rate": 1.1324279940212789e-05, + "loss": 2.5505, + "step": 31458 + }, + { + "epoch": 0.932864810366812, + "grad_norm": 0.0670829638838768, + "learning_rate": 1.1314325053033925e-05, + "loss": 2.5384, + "step": 31459 + }, + { + "epoch": 0.9328944637191234, + "grad_norm": 0.0682765394449234, + "learning_rate": 1.1304374493250613e-05, + "loss": 2.545, + "step": 31460 + }, + { + "epoch": 0.932924117071435, + "grad_norm": 0.06657490879297256, + "learning_rate": 1.129442826095095e-05, + "loss": 2.5162, + "step": 31461 + }, + { + "epoch": 0.9329537704237464, + "grad_norm": 0.06585682183504105, + "learning_rate": 1.1284486356223033e-05, + "loss": 2.5306, + "step": 31462 + }, + { + "epoch": 0.9329834237760579, + "grad_norm": 0.06855481117963791, + "learning_rate": 1.1274548779154847e-05, + "loss": 2.5731, + "step": 31463 + }, + { + "epoch": 0.9330130771283693, + "grad_norm": 0.06911418586969376, + "learning_rate": 1.1264615529834433e-05, + "loss": 2.5643, + "step": 31464 + }, + { + "epoch": 0.9330427304806809, + "grad_norm": 0.07166947424411774, + "learning_rate": 1.1254686608349718e-05, + "loss": 2.5516, + "step": 31465 + }, + { + "epoch": 0.9330723838329923, + "grad_norm": 0.06725282222032547, + "learning_rate": 1.1244762014788635e-05, + "loss": 2.5407, + "step": 31466 + }, + { + "epoch": 0.9331020371853038, + "grad_norm": 0.06843528896570206, + "learning_rate": 1.123484174923911e-05, + "loss": 2.5285, + "step": 31467 + }, + { + "epoch": 0.9331316905376152, + "grad_norm": 0.0682360827922821, + "learning_rate": 1.1224925811788855e-05, + "loss": 2.5443, + "step": 31468 + }, + { + "epoch": 0.9331613438899268, + "grad_norm": 0.06890129297971725, + "learning_rate": 1.1215014202525908e-05, + "loss": 2.5304, + "step": 31469 + }, + { + "epoch": 0.9331909972422382, + "grad_norm": 0.06733233481645584, + "learning_rate": 1.1205106921537867e-05, + "loss": 2.5371, + "step": 31470 + }, + { + "epoch": 0.9332206505945497, + "grad_norm": 0.0677209123969078, + "learning_rate": 1.1195203968912493e-05, + "loss": 2.5568, + "step": 31471 + }, + { + "epoch": 0.9332503039468611, + "grad_norm": 0.06502682715654373, + "learning_rate": 1.1185305344737495e-05, + "loss": 2.5371, + "step": 31472 + }, + { + "epoch": 0.9332799572991727, + "grad_norm": 0.06842850893735886, + "learning_rate": 1.117541104910058e-05, + "loss": 2.5969, + "step": 31473 + }, + { + "epoch": 0.9333096106514841, + "grad_norm": 0.06334280967712402, + "learning_rate": 1.1165521082089237e-05, + "loss": 2.5015, + "step": 31474 + }, + { + "epoch": 0.9333392640037956, + "grad_norm": 0.06786451488733292, + "learning_rate": 1.1155635443791113e-05, + "loss": 2.5556, + "step": 31475 + }, + { + "epoch": 0.933368917356107, + "grad_norm": 0.06723220646381378, + "learning_rate": 1.1145754134293806e-05, + "loss": 2.5112, + "step": 31476 + }, + { + "epoch": 0.9333985707084186, + "grad_norm": 0.06841634958982468, + "learning_rate": 1.1135877153684692e-05, + "loss": 2.5403, + "step": 31477 + }, + { + "epoch": 0.9334282240607301, + "grad_norm": 0.06829942017793655, + "learning_rate": 1.1126004502051313e-05, + "loss": 2.5119, + "step": 31478 + }, + { + "epoch": 0.9334578774130415, + "grad_norm": 0.06495068222284317, + "learning_rate": 1.1116136179481096e-05, + "loss": 2.522, + "step": 31479 + }, + { + "epoch": 0.9334875307653531, + "grad_norm": 0.07031496614217758, + "learning_rate": 1.1106272186061362e-05, + "loss": 2.532, + "step": 31480 + }, + { + "epoch": 0.9335171841176645, + "grad_norm": 0.07045043259859085, + "learning_rate": 1.109641252187954e-05, + "loss": 2.5269, + "step": 31481 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 0.07143251597881317, + "learning_rate": 1.1086557187022895e-05, + "loss": 2.5514, + "step": 31482 + }, + { + "epoch": 0.9335764908222874, + "grad_norm": 0.06809273362159729, + "learning_rate": 1.1076706181578744e-05, + "loss": 2.5235, + "step": 31483 + }, + { + "epoch": 0.933606144174599, + "grad_norm": 0.06956706196069717, + "learning_rate": 1.1066859505634241e-05, + "loss": 2.5316, + "step": 31484 + }, + { + "epoch": 0.9336357975269104, + "grad_norm": 0.06939532607793808, + "learning_rate": 1.105701715927665e-05, + "loss": 2.5231, + "step": 31485 + }, + { + "epoch": 0.9336654508792219, + "grad_norm": 0.06878562271595001, + "learning_rate": 1.1047179142593066e-05, + "loss": 2.5415, + "step": 31486 + }, + { + "epoch": 0.9336951042315333, + "grad_norm": 0.06746996194124222, + "learning_rate": 1.10373454556707e-05, + "loss": 2.5431, + "step": 31487 + }, + { + "epoch": 0.9337247575838449, + "grad_norm": 0.06903735548257828, + "learning_rate": 1.102751609859648e-05, + "loss": 2.5423, + "step": 31488 + }, + { + "epoch": 0.9337544109361563, + "grad_norm": 0.06582116335630417, + "learning_rate": 1.1017691071457614e-05, + "loss": 2.5629, + "step": 31489 + }, + { + "epoch": 0.9337840642884678, + "grad_norm": 0.06605074554681778, + "learning_rate": 1.1007870374340978e-05, + "loss": 2.5411, + "step": 31490 + }, + { + "epoch": 0.9338137176407793, + "grad_norm": 0.0718265175819397, + "learning_rate": 1.0998054007333613e-05, + "loss": 2.5463, + "step": 31491 + }, + { + "epoch": 0.9338433709930908, + "grad_norm": 0.06886828690767288, + "learning_rate": 1.0988241970522395e-05, + "loss": 2.5651, + "step": 31492 + }, + { + "epoch": 0.9338730243454022, + "grad_norm": 0.07138751447200775, + "learning_rate": 1.0978434263994253e-05, + "loss": 2.5241, + "step": 31493 + }, + { + "epoch": 0.9339026776977137, + "grad_norm": 0.06800193339586258, + "learning_rate": 1.0968630887836006e-05, + "loss": 2.5664, + "step": 31494 + }, + { + "epoch": 0.9339323310500252, + "grad_norm": 0.06849020719528198, + "learning_rate": 1.095883184213442e-05, + "loss": 2.5608, + "step": 31495 + }, + { + "epoch": 0.9339619844023367, + "grad_norm": 0.07130969315767288, + "learning_rate": 1.0949037126976369e-05, + "loss": 2.5583, + "step": 31496 + }, + { + "epoch": 0.9339916377546482, + "grad_norm": 0.07395097613334656, + "learning_rate": 1.093924674244856e-05, + "loss": 2.5369, + "step": 31497 + }, + { + "epoch": 0.9340212911069596, + "grad_norm": 0.06733924895524979, + "learning_rate": 1.0929460688637649e-05, + "loss": 2.5233, + "step": 31498 + }, + { + "epoch": 0.9340509444592712, + "grad_norm": 0.06570498645305634, + "learning_rate": 1.0919678965630287e-05, + "loss": 2.5039, + "step": 31499 + }, + { + "epoch": 0.9340805978115826, + "grad_norm": 0.06874261051416397, + "learning_rate": 1.0909901573513181e-05, + "loss": 2.5285, + "step": 31500 + }, + { + "epoch": 0.9341102511638941, + "grad_norm": 0.069646917283535, + "learning_rate": 1.0900128512372765e-05, + "loss": 2.5432, + "step": 31501 + }, + { + "epoch": 0.9341399045162055, + "grad_norm": 0.07222406566143036, + "learning_rate": 1.0890359782295745e-05, + "loss": 2.5609, + "step": 31502 + }, + { + "epoch": 0.9341695578685171, + "grad_norm": 0.06563045084476471, + "learning_rate": 1.0880595383368496e-05, + "loss": 2.4932, + "step": 31503 + }, + { + "epoch": 0.9341992112208285, + "grad_norm": 0.06540138274431229, + "learning_rate": 1.0870835315677507e-05, + "loss": 2.5255, + "step": 31504 + }, + { + "epoch": 0.93422886457314, + "grad_norm": 0.06902427226305008, + "learning_rate": 1.0861079579309263e-05, + "loss": 2.5424, + "step": 31505 + }, + { + "epoch": 0.9342585179254514, + "grad_norm": 0.06784432381391525, + "learning_rate": 1.0851328174350139e-05, + "loss": 2.5319, + "step": 31506 + }, + { + "epoch": 0.934288171277763, + "grad_norm": 0.07271792739629745, + "learning_rate": 1.0841581100886455e-05, + "loss": 2.5573, + "step": 31507 + }, + { + "epoch": 0.9343178246300744, + "grad_norm": 0.06882806122303009, + "learning_rate": 1.0831838359004531e-05, + "loss": 2.556, + "step": 31508 + }, + { + "epoch": 0.9343474779823859, + "grad_norm": 0.06743897497653961, + "learning_rate": 1.0822099948790632e-05, + "loss": 2.53, + "step": 31509 + }, + { + "epoch": 0.9343771313346974, + "grad_norm": 0.07114280760288239, + "learning_rate": 1.0812365870330964e-05, + "loss": 2.5467, + "step": 31510 + }, + { + "epoch": 0.9344067846870089, + "grad_norm": 0.06781420856714249, + "learning_rate": 1.0802636123711851e-05, + "loss": 2.5283, + "step": 31511 + }, + { + "epoch": 0.9344364380393203, + "grad_norm": 0.06800905615091324, + "learning_rate": 1.0792910709019276e-05, + "loss": 2.5553, + "step": 31512 + }, + { + "epoch": 0.9344660913916318, + "grad_norm": 0.0686277374625206, + "learning_rate": 1.0783189626339508e-05, + "loss": 2.5352, + "step": 31513 + }, + { + "epoch": 0.9344957447439433, + "grad_norm": 0.06719545274972916, + "learning_rate": 1.0773472875758583e-05, + "loss": 2.5364, + "step": 31514 + }, + { + "epoch": 0.9345253980962548, + "grad_norm": 0.06536419689655304, + "learning_rate": 1.0763760457362493e-05, + "loss": 2.5304, + "step": 31515 + }, + { + "epoch": 0.9345550514485662, + "grad_norm": 0.06861162185668945, + "learning_rate": 1.0754052371237278e-05, + "loss": 2.5288, + "step": 31516 + }, + { + "epoch": 0.9345847048008777, + "grad_norm": 0.06906560063362122, + "learning_rate": 1.074434861746898e-05, + "loss": 2.5558, + "step": 31517 + }, + { + "epoch": 0.9346143581531893, + "grad_norm": 0.06796597689390182, + "learning_rate": 1.0734649196143365e-05, + "loss": 2.5533, + "step": 31518 + }, + { + "epoch": 0.9346440115055007, + "grad_norm": 0.07097096741199493, + "learning_rate": 1.0724954107346419e-05, + "loss": 2.5641, + "step": 31519 + }, + { + "epoch": 0.9346736648578122, + "grad_norm": 0.06760250777006149, + "learning_rate": 1.0715263351163962e-05, + "loss": 2.5734, + "step": 31520 + }, + { + "epoch": 0.9347033182101236, + "grad_norm": 0.06864506006240845, + "learning_rate": 1.070557692768187e-05, + "loss": 2.503, + "step": 31521 + }, + { + "epoch": 0.9347329715624352, + "grad_norm": 0.06882523745298386, + "learning_rate": 1.0695894836985909e-05, + "loss": 2.5332, + "step": 31522 + }, + { + "epoch": 0.9347626249147466, + "grad_norm": 0.0713036060333252, + "learning_rate": 1.0686217079161786e-05, + "loss": 2.5322, + "step": 31523 + }, + { + "epoch": 0.9347922782670581, + "grad_norm": 0.06746109575033188, + "learning_rate": 1.0676543654295157e-05, + "loss": 2.5212, + "step": 31524 + }, + { + "epoch": 0.9348219316193696, + "grad_norm": 0.07158507406711578, + "learning_rate": 1.0666874562471785e-05, + "loss": 2.514, + "step": 31525 + }, + { + "epoch": 0.9348515849716811, + "grad_norm": 0.06752067804336548, + "learning_rate": 1.0657209803777156e-05, + "loss": 2.5067, + "step": 31526 + }, + { + "epoch": 0.9348812383239925, + "grad_norm": 0.06825969368219376, + "learning_rate": 1.0647549378296983e-05, + "loss": 2.5278, + "step": 31527 + }, + { + "epoch": 0.934910891676304, + "grad_norm": 0.07076099514961243, + "learning_rate": 1.0637893286116806e-05, + "loss": 2.4986, + "step": 31528 + }, + { + "epoch": 0.9349405450286155, + "grad_norm": 0.06884276121854782, + "learning_rate": 1.0628241527322003e-05, + "loss": 2.5111, + "step": 31529 + }, + { + "epoch": 0.934970198380927, + "grad_norm": 0.06881658732891083, + "learning_rate": 1.0618594101998114e-05, + "loss": 2.5495, + "step": 31530 + }, + { + "epoch": 0.9349998517332384, + "grad_norm": 0.07091314345598221, + "learning_rate": 1.0608951010230627e-05, + "loss": 2.5327, + "step": 31531 + }, + { + "epoch": 0.9350295050855499, + "grad_norm": 0.06862702965736389, + "learning_rate": 1.0599312252104864e-05, + "loss": 2.5295, + "step": 31532 + }, + { + "epoch": 0.9350591584378614, + "grad_norm": 0.06900297850370407, + "learning_rate": 1.0589677827706146e-05, + "loss": 2.536, + "step": 31533 + }, + { + "epoch": 0.9350888117901729, + "grad_norm": 0.06775488704442978, + "learning_rate": 1.0580047737119847e-05, + "loss": 2.5348, + "step": 31534 + }, + { + "epoch": 0.9351184651424843, + "grad_norm": 0.07242533564567566, + "learning_rate": 1.0570421980431289e-05, + "loss": 2.535, + "step": 31535 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 0.06895264983177185, + "learning_rate": 1.0560800557725624e-05, + "loss": 2.5338, + "step": 31536 + }, + { + "epoch": 0.9351777718471073, + "grad_norm": 0.06580788642168045, + "learning_rate": 1.0551183469088066e-05, + "loss": 2.5412, + "step": 31537 + }, + { + "epoch": 0.9352074251994188, + "grad_norm": 0.0677441656589508, + "learning_rate": 1.054157071460382e-05, + "loss": 2.546, + "step": 31538 + }, + { + "epoch": 0.9352370785517303, + "grad_norm": 0.065850630402565, + "learning_rate": 1.0531962294357989e-05, + "loss": 2.5409, + "step": 31539 + }, + { + "epoch": 0.9352667319040417, + "grad_norm": 0.06867434084415436, + "learning_rate": 1.0522358208435612e-05, + "loss": 2.5479, + "step": 31540 + }, + { + "epoch": 0.9352963852563533, + "grad_norm": 0.06756217032670975, + "learning_rate": 1.051275845692179e-05, + "loss": 2.5447, + "step": 31541 + }, + { + "epoch": 0.9353260386086647, + "grad_norm": 0.06637634336948395, + "learning_rate": 1.0503163039901508e-05, + "loss": 2.5405, + "step": 31542 + }, + { + "epoch": 0.9353556919609762, + "grad_norm": 0.06542426347732544, + "learning_rate": 1.0493571957459758e-05, + "loss": 2.5298, + "step": 31543 + }, + { + "epoch": 0.9353853453132877, + "grad_norm": 0.07183349132537842, + "learning_rate": 1.0483985209681413e-05, + "loss": 2.5241, + "step": 31544 + }, + { + "epoch": 0.9354149986655992, + "grad_norm": 0.0661798045039177, + "learning_rate": 1.0474402796651405e-05, + "loss": 2.5252, + "step": 31545 + }, + { + "epoch": 0.9354446520179106, + "grad_norm": 0.0667228177189827, + "learning_rate": 1.0464824718454558e-05, + "loss": 2.536, + "step": 31546 + }, + { + "epoch": 0.9354743053702221, + "grad_norm": 0.06748820841312408, + "learning_rate": 1.0455250975175801e-05, + "loss": 2.5495, + "step": 31547 + }, + { + "epoch": 0.9355039587225336, + "grad_norm": 0.06750667095184326, + "learning_rate": 1.0445681566899789e-05, + "loss": 2.5639, + "step": 31548 + }, + { + "epoch": 0.9355336120748451, + "grad_norm": 0.07090768218040466, + "learning_rate": 1.043611649371129e-05, + "loss": 2.544, + "step": 31549 + }, + { + "epoch": 0.9355632654271565, + "grad_norm": 0.06893833726644516, + "learning_rate": 1.0426555755695067e-05, + "loss": 2.5193, + "step": 31550 + }, + { + "epoch": 0.935592918779468, + "grad_norm": 0.06983517110347748, + "learning_rate": 1.0416999352935664e-05, + "loss": 2.5753, + "step": 31551 + }, + { + "epoch": 0.9356225721317795, + "grad_norm": 0.06695841252803802, + "learning_rate": 1.0407447285517791e-05, + "loss": 2.5271, + "step": 31552 + }, + { + "epoch": 0.935652225484091, + "grad_norm": 0.07101986557245255, + "learning_rate": 1.0397899553525992e-05, + "loss": 2.5484, + "step": 31553 + }, + { + "epoch": 0.9356818788364024, + "grad_norm": 0.06763001531362534, + "learning_rate": 1.038835615704481e-05, + "loss": 2.5483, + "step": 31554 + }, + { + "epoch": 0.935711532188714, + "grad_norm": 0.07023736089468002, + "learning_rate": 1.0378817096158788e-05, + "loss": 2.5088, + "step": 31555 + }, + { + "epoch": 0.9357411855410254, + "grad_norm": 0.06607724726200104, + "learning_rate": 1.0369282370952416e-05, + "loss": 2.5171, + "step": 31556 + }, + { + "epoch": 0.9357708388933369, + "grad_norm": 0.06781325489282608, + "learning_rate": 1.0359751981510068e-05, + "loss": 2.5359, + "step": 31557 + }, + { + "epoch": 0.9358004922456483, + "grad_norm": 0.06319873034954071, + "learning_rate": 1.0350225927916179e-05, + "loss": 2.5286, + "step": 31558 + }, + { + "epoch": 0.9358301455979599, + "grad_norm": 0.07035821676254272, + "learning_rate": 1.034070421025507e-05, + "loss": 2.528, + "step": 31559 + }, + { + "epoch": 0.9358597989502714, + "grad_norm": 0.07235580682754517, + "learning_rate": 1.0331186828611062e-05, + "loss": 2.5543, + "step": 31560 + }, + { + "epoch": 0.9358894523025828, + "grad_norm": 0.0656658411026001, + "learning_rate": 1.0321673783068474e-05, + "loss": 2.5724, + "step": 31561 + }, + { + "epoch": 0.9359191056548943, + "grad_norm": 0.06758996844291687, + "learning_rate": 1.0312165073711521e-05, + "loss": 2.5518, + "step": 31562 + }, + { + "epoch": 0.9359487590072058, + "grad_norm": 0.06639239937067032, + "learning_rate": 1.0302660700624355e-05, + "loss": 2.5152, + "step": 31563 + }, + { + "epoch": 0.9359784123595173, + "grad_norm": 0.06835371255874634, + "learning_rate": 1.0293160663891243e-05, + "loss": 2.5315, + "step": 31564 + }, + { + "epoch": 0.9360080657118287, + "grad_norm": 0.06739103049039841, + "learning_rate": 1.0283664963596229e-05, + "loss": 2.5138, + "step": 31565 + }, + { + "epoch": 0.9360377190641402, + "grad_norm": 0.06816653162240982, + "learning_rate": 1.027417359982341e-05, + "loss": 2.5367, + "step": 31566 + }, + { + "epoch": 0.9360673724164517, + "grad_norm": 0.06594505906105042, + "learning_rate": 1.0264686572656834e-05, + "loss": 2.527, + "step": 31567 + }, + { + "epoch": 0.9360970257687632, + "grad_norm": 0.0667024701833725, + "learning_rate": 1.0255203882180542e-05, + "loss": 2.519, + "step": 31568 + }, + { + "epoch": 0.9361266791210746, + "grad_norm": 0.06770344823598862, + "learning_rate": 1.0245725528478466e-05, + "loss": 2.5275, + "step": 31569 + }, + { + "epoch": 0.9361563324733861, + "grad_norm": 0.06909575313329697, + "learning_rate": 1.0236251511634543e-05, + "loss": 2.5328, + "step": 31570 + }, + { + "epoch": 0.9361859858256976, + "grad_norm": 0.0723075121641159, + "learning_rate": 1.0226781831732701e-05, + "loss": 2.5966, + "step": 31571 + }, + { + "epoch": 0.9362156391780091, + "grad_norm": 0.06819819658994675, + "learning_rate": 1.0217316488856765e-05, + "loss": 2.5266, + "step": 31572 + }, + { + "epoch": 0.9362452925303205, + "grad_norm": 0.06940119713544846, + "learning_rate": 1.0207855483090611e-05, + "loss": 2.5376, + "step": 31573 + }, + { + "epoch": 0.936274945882632, + "grad_norm": 0.06573360413312912, + "learning_rate": 1.0198398814517895e-05, + "loss": 2.5617, + "step": 31574 + }, + { + "epoch": 0.9363045992349435, + "grad_norm": 0.06643762439489365, + "learning_rate": 1.0188946483222494e-05, + "loss": 2.5562, + "step": 31575 + }, + { + "epoch": 0.936334252587255, + "grad_norm": 0.06735804677009583, + "learning_rate": 1.0179498489288009e-05, + "loss": 2.5756, + "step": 31576 + }, + { + "epoch": 0.9363639059395664, + "grad_norm": 0.06953713297843933, + "learning_rate": 1.0170054832798147e-05, + "loss": 2.5232, + "step": 31577 + }, + { + "epoch": 0.936393559291878, + "grad_norm": 0.06816928833723068, + "learning_rate": 1.0160615513836513e-05, + "loss": 2.5316, + "step": 31578 + }, + { + "epoch": 0.9364232126441894, + "grad_norm": 0.06870489567518234, + "learning_rate": 1.015118053248676e-05, + "loss": 2.5333, + "step": 31579 + }, + { + "epoch": 0.9364528659965009, + "grad_norm": 0.06735482811927795, + "learning_rate": 1.014174988883232e-05, + "loss": 2.5155, + "step": 31580 + }, + { + "epoch": 0.9364825193488124, + "grad_norm": 0.06499812752008438, + "learning_rate": 1.0132323582956793e-05, + "loss": 2.5186, + "step": 31581 + }, + { + "epoch": 0.9365121727011239, + "grad_norm": 0.06772524863481522, + "learning_rate": 1.0122901614943614e-05, + "loss": 2.5114, + "step": 31582 + }, + { + "epoch": 0.9365418260534354, + "grad_norm": 0.07044115662574768, + "learning_rate": 1.0113483984876325e-05, + "loss": 2.5138, + "step": 31583 + }, + { + "epoch": 0.9365714794057468, + "grad_norm": 0.06872561573982239, + "learning_rate": 1.0104070692838141e-05, + "loss": 2.5453, + "step": 31584 + }, + { + "epoch": 0.9366011327580583, + "grad_norm": 0.0706048309803009, + "learning_rate": 1.0094661738912436e-05, + "loss": 2.5554, + "step": 31585 + }, + { + "epoch": 0.9366307861103698, + "grad_norm": 0.06464327871799469, + "learning_rate": 1.0085257123182645e-05, + "loss": 2.5397, + "step": 31586 + }, + { + "epoch": 0.9366604394626813, + "grad_norm": 0.06540234386920929, + "learning_rate": 1.0075856845732034e-05, + "loss": 2.5097, + "step": 31587 + }, + { + "epoch": 0.9366900928149927, + "grad_norm": 0.06497880071401596, + "learning_rate": 1.006646090664376e-05, + "loss": 2.5106, + "step": 31588 + }, + { + "epoch": 0.9367197461673042, + "grad_norm": 0.06918345391750336, + "learning_rate": 1.0057069306001087e-05, + "loss": 2.5252, + "step": 31589 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 0.07179394364356995, + "learning_rate": 1.0047682043887174e-05, + "loss": 2.5132, + "step": 31590 + }, + { + "epoch": 0.9367790528719272, + "grad_norm": 0.06685595959424973, + "learning_rate": 1.0038299120385119e-05, + "loss": 2.5795, + "step": 31591 + }, + { + "epoch": 0.9368087062242386, + "grad_norm": 0.06781710684299469, + "learning_rate": 1.0028920535578078e-05, + "loss": 2.5415, + "step": 31592 + }, + { + "epoch": 0.9368383595765502, + "grad_norm": 0.07046233117580414, + "learning_rate": 1.0019546289549042e-05, + "loss": 2.543, + "step": 31593 + }, + { + "epoch": 0.9368680129288616, + "grad_norm": 0.07004653662443161, + "learning_rate": 1.0010176382380942e-05, + "loss": 2.5504, + "step": 31594 + }, + { + "epoch": 0.9368976662811731, + "grad_norm": 0.06756633520126343, + "learning_rate": 1.0000810814156935e-05, + "loss": 2.5476, + "step": 31595 + }, + { + "epoch": 0.9369273196334845, + "grad_norm": 0.07066429406404495, + "learning_rate": 9.99144958495979e-06, + "loss": 2.5571, + "step": 31596 + }, + { + "epoch": 0.936956972985796, + "grad_norm": 0.06788919866085052, + "learning_rate": 9.982092694872491e-06, + "loss": 2.5357, + "step": 31597 + }, + { + "epoch": 0.9369866263381075, + "grad_norm": 0.0688735842704773, + "learning_rate": 9.97274014397781e-06, + "loss": 2.5362, + "step": 31598 + }, + { + "epoch": 0.937016279690419, + "grad_norm": 0.06632256507873535, + "learning_rate": 9.963391932358678e-06, + "loss": 2.5487, + "step": 31599 + }, + { + "epoch": 0.9370459330427304, + "grad_norm": 0.06388144940137863, + "learning_rate": 9.954048060097809e-06, + "loss": 2.5552, + "step": 31600 + }, + { + "epoch": 0.937075586395042, + "grad_norm": 0.06783634424209595, + "learning_rate": 9.944708527277967e-06, + "loss": 2.5122, + "step": 31601 + }, + { + "epoch": 0.9371052397473535, + "grad_norm": 0.07021673023700714, + "learning_rate": 9.935373333981868e-06, + "loss": 2.5573, + "step": 31602 + }, + { + "epoch": 0.9371348930996649, + "grad_norm": 0.06707140803337097, + "learning_rate": 9.926042480292163e-06, + "loss": 2.5432, + "step": 31603 + }, + { + "epoch": 0.9371645464519764, + "grad_norm": 0.06977137923240662, + "learning_rate": 9.916715966291512e-06, + "loss": 2.5757, + "step": 31604 + }, + { + "epoch": 0.9371941998042879, + "grad_norm": 0.0645834356546402, + "learning_rate": 9.907393792062403e-06, + "loss": 2.5514, + "step": 31605 + }, + { + "epoch": 0.9372238531565994, + "grad_norm": 0.06548867374658585, + "learning_rate": 9.898075957687435e-06, + "loss": 2.4873, + "step": 31606 + }, + { + "epoch": 0.9372535065089108, + "grad_norm": 0.06906720250844955, + "learning_rate": 9.888762463249156e-06, + "loss": 2.5132, + "step": 31607 + }, + { + "epoch": 0.9372831598612223, + "grad_norm": 0.06829208880662918, + "learning_rate": 9.87945330883e-06, + "loss": 2.5244, + "step": 31608 + }, + { + "epoch": 0.9373128132135338, + "grad_norm": 0.0688839852809906, + "learning_rate": 9.870148494512399e-06, + "loss": 2.5233, + "step": 31609 + }, + { + "epoch": 0.9373424665658453, + "grad_norm": 0.06370358169078827, + "learning_rate": 9.860848020378732e-06, + "loss": 2.5363, + "step": 31610 + }, + { + "epoch": 0.9373721199181567, + "grad_norm": 0.06710845232009888, + "learning_rate": 9.851551886511433e-06, + "loss": 2.5125, + "step": 31611 + }, + { + "epoch": 0.9374017732704683, + "grad_norm": 0.06498537212610245, + "learning_rate": 9.842260092992772e-06, + "loss": 2.55, + "step": 31612 + }, + { + "epoch": 0.9374314266227797, + "grad_norm": 0.06765653938055038, + "learning_rate": 9.832972639905013e-06, + "loss": 2.5617, + "step": 31613 + }, + { + "epoch": 0.9374610799750912, + "grad_norm": 0.0656299963593483, + "learning_rate": 9.823689527330482e-06, + "loss": 2.5387, + "step": 31614 + }, + { + "epoch": 0.9374907333274026, + "grad_norm": 0.06726371496915817, + "learning_rate": 9.814410755351278e-06, + "loss": 2.5468, + "step": 31615 + }, + { + "epoch": 0.9375203866797142, + "grad_norm": 0.06643615663051605, + "learning_rate": 9.805136324049612e-06, + "loss": 2.5195, + "step": 31616 + }, + { + "epoch": 0.9375500400320256, + "grad_norm": 0.06748834252357483, + "learning_rate": 9.795866233507589e-06, + "loss": 2.5473, + "step": 31617 + }, + { + "epoch": 0.9375796933843371, + "grad_norm": 0.07010374963283539, + "learning_rate": 9.786600483807307e-06, + "loss": 2.5505, + "step": 31618 + }, + { + "epoch": 0.9376093467366485, + "grad_norm": 0.06716623902320862, + "learning_rate": 9.77733907503081e-06, + "loss": 2.5144, + "step": 31619 + }, + { + "epoch": 0.9376390000889601, + "grad_norm": 0.06715060770511627, + "learning_rate": 9.768082007260148e-06, + "loss": 2.5343, + "step": 31620 + }, + { + "epoch": 0.9376686534412715, + "grad_norm": 0.06771984696388245, + "learning_rate": 9.758829280577309e-06, + "loss": 2.5394, + "step": 31621 + }, + { + "epoch": 0.937698306793583, + "grad_norm": 0.06739537417888641, + "learning_rate": 9.749580895064114e-06, + "loss": 2.5407, + "step": 31622 + }, + { + "epoch": 0.9377279601458945, + "grad_norm": 0.06785542517900467, + "learning_rate": 9.7403368508025e-06, + "loss": 2.5511, + "step": 31623 + }, + { + "epoch": 0.937757613498206, + "grad_norm": 0.06498262286186218, + "learning_rate": 9.731097147874401e-06, + "loss": 2.5102, + "step": 31624 + }, + { + "epoch": 0.9377872668505175, + "grad_norm": 0.0660942867398262, + "learning_rate": 9.721861786361642e-06, + "loss": 2.5136, + "step": 31625 + }, + { + "epoch": 0.9378169202028289, + "grad_norm": 0.06936024874448776, + "learning_rate": 9.712630766345932e-06, + "loss": 2.5492, + "step": 31626 + }, + { + "epoch": 0.9378465735551405, + "grad_norm": 0.06789509952068329, + "learning_rate": 9.70340408790904e-06, + "loss": 2.5545, + "step": 31627 + }, + { + "epoch": 0.9378762269074519, + "grad_norm": 0.06674206256866455, + "learning_rate": 9.694181751132625e-06, + "loss": 2.533, + "step": 31628 + }, + { + "epoch": 0.9379058802597634, + "grad_norm": 0.06801088154315948, + "learning_rate": 9.684963756098397e-06, + "loss": 2.5482, + "step": 31629 + }, + { + "epoch": 0.9379355336120748, + "grad_norm": 0.06731293350458145, + "learning_rate": 9.675750102887958e-06, + "loss": 2.5161, + "step": 31630 + }, + { + "epoch": 0.9379651869643864, + "grad_norm": 0.06472467631101608, + "learning_rate": 9.666540791582968e-06, + "loss": 2.5193, + "step": 31631 + }, + { + "epoch": 0.9379948403166978, + "grad_norm": 0.06636624783277512, + "learning_rate": 9.657335822264857e-06, + "loss": 2.5527, + "step": 31632 + }, + { + "epoch": 0.9380244936690093, + "grad_norm": 0.0696115493774414, + "learning_rate": 9.648135195015229e-06, + "loss": 2.5176, + "step": 31633 + }, + { + "epoch": 0.9380541470213207, + "grad_norm": 0.0681639090180397, + "learning_rate": 9.638938909915573e-06, + "loss": 2.5341, + "step": 31634 + }, + { + "epoch": 0.9380838003736323, + "grad_norm": 0.0657864362001419, + "learning_rate": 9.629746967047215e-06, + "loss": 2.5507, + "step": 31635 + }, + { + "epoch": 0.9381134537259437, + "grad_norm": 0.066909059882164, + "learning_rate": 9.620559366491698e-06, + "loss": 2.5224, + "step": 31636 + }, + { + "epoch": 0.9381431070782552, + "grad_norm": 0.0704277902841568, + "learning_rate": 9.611376108330239e-06, + "loss": 2.5483, + "step": 31637 + }, + { + "epoch": 0.9381727604305666, + "grad_norm": 0.06985506415367126, + "learning_rate": 9.602197192644213e-06, + "loss": 2.5598, + "step": 31638 + }, + { + "epoch": 0.9382024137828782, + "grad_norm": 0.06611733138561249, + "learning_rate": 9.593022619514946e-06, + "loss": 2.5364, + "step": 31639 + }, + { + "epoch": 0.9382320671351896, + "grad_norm": 0.06852065771818161, + "learning_rate": 9.58385238902365e-06, + "loss": 2.5531, + "step": 31640 + }, + { + "epoch": 0.9382617204875011, + "grad_norm": 0.0691305547952652, + "learning_rate": 9.574686501251484e-06, + "loss": 2.5642, + "step": 31641 + }, + { + "epoch": 0.9382913738398125, + "grad_norm": 0.06654180586338043, + "learning_rate": 9.56552495627966e-06, + "loss": 2.512, + "step": 31642 + }, + { + "epoch": 0.9383210271921241, + "grad_norm": 0.06841102987527847, + "learning_rate": 9.556367754189277e-06, + "loss": 2.5124, + "step": 31643 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 0.06893604248762131, + "learning_rate": 9.547214895061496e-06, + "loss": 2.5315, + "step": 31644 + }, + { + "epoch": 0.938380333896747, + "grad_norm": 0.06782656162977219, + "learning_rate": 9.53806637897725e-06, + "loss": 2.5022, + "step": 31645 + }, + { + "epoch": 0.9384099872490586, + "grad_norm": 0.06661900877952576, + "learning_rate": 9.528922206017644e-06, + "loss": 2.5193, + "step": 31646 + }, + { + "epoch": 0.93843964060137, + "grad_norm": 0.06687295436859131, + "learning_rate": 9.519782376263608e-06, + "loss": 2.5705, + "step": 31647 + }, + { + "epoch": 0.9384692939536815, + "grad_norm": 0.06452544033527374, + "learning_rate": 9.510646889796082e-06, + "loss": 2.5417, + "step": 31648 + }, + { + "epoch": 0.9384989473059929, + "grad_norm": 0.06630221754312515, + "learning_rate": 9.501515746695999e-06, + "loss": 2.5354, + "step": 31649 + }, + { + "epoch": 0.9385286006583045, + "grad_norm": 0.06865781545639038, + "learning_rate": 9.492388947044184e-06, + "loss": 2.4969, + "step": 31650 + }, + { + "epoch": 0.9385582540106159, + "grad_norm": 0.06452783942222595, + "learning_rate": 9.483266490921406e-06, + "loss": 2.5099, + "step": 31651 + }, + { + "epoch": 0.9385879073629274, + "grad_norm": 0.06626959145069122, + "learning_rate": 9.474148378408543e-06, + "loss": 2.5377, + "step": 31652 + }, + { + "epoch": 0.9386175607152388, + "grad_norm": 0.07314497232437134, + "learning_rate": 9.46503460958631e-06, + "loss": 2.5665, + "step": 31653 + }, + { + "epoch": 0.9386472140675504, + "grad_norm": 0.06824610382318497, + "learning_rate": 9.455925184535364e-06, + "loss": 2.5537, + "step": 31654 + }, + { + "epoch": 0.9386768674198618, + "grad_norm": 0.068100705742836, + "learning_rate": 9.446820103336417e-06, + "loss": 2.554, + "step": 31655 + }, + { + "epoch": 0.9387065207721733, + "grad_norm": 0.06685584038496017, + "learning_rate": 9.437719366070074e-06, + "loss": 2.5568, + "step": 31656 + }, + { + "epoch": 0.9387361741244847, + "grad_norm": 0.06724338233470917, + "learning_rate": 9.428622972816937e-06, + "loss": 2.5403, + "step": 31657 + }, + { + "epoch": 0.9387658274767963, + "grad_norm": 0.06779829412698746, + "learning_rate": 9.419530923657549e-06, + "loss": 2.5532, + "step": 31658 + }, + { + "epoch": 0.9387954808291077, + "grad_norm": 0.07025904208421707, + "learning_rate": 9.41044321867246e-06, + "loss": 2.5409, + "step": 31659 + }, + { + "epoch": 0.9388251341814192, + "grad_norm": 0.06999441981315613, + "learning_rate": 9.40135985794205e-06, + "loss": 2.5287, + "step": 31660 + }, + { + "epoch": 0.9388547875337306, + "grad_norm": 0.06974857300519943, + "learning_rate": 9.392280841546808e-06, + "loss": 2.5449, + "step": 31661 + }, + { + "epoch": 0.9388844408860422, + "grad_norm": 0.0663716048002243, + "learning_rate": 9.383206169567116e-06, + "loss": 2.5098, + "step": 31662 + }, + { + "epoch": 0.9389140942383536, + "grad_norm": 0.0688234195113182, + "learning_rate": 9.374135842083354e-06, + "loss": 2.5273, + "step": 31663 + }, + { + "epoch": 0.9389437475906651, + "grad_norm": 0.06990578025579453, + "learning_rate": 9.365069859175845e-06, + "loss": 2.5561, + "step": 31664 + }, + { + "epoch": 0.9389734009429767, + "grad_norm": 0.06716780364513397, + "learning_rate": 9.356008220924861e-06, + "loss": 2.5353, + "step": 31665 + }, + { + "epoch": 0.9390030542952881, + "grad_norm": 0.06747828423976898, + "learning_rate": 9.346950927410669e-06, + "loss": 2.5305, + "step": 31666 + }, + { + "epoch": 0.9390327076475996, + "grad_norm": 0.0655013844370842, + "learning_rate": 9.337897978713427e-06, + "loss": 2.5486, + "step": 31667 + }, + { + "epoch": 0.939062360999911, + "grad_norm": 0.06919188052415848, + "learning_rate": 9.328849374913295e-06, + "loss": 2.5242, + "step": 31668 + }, + { + "epoch": 0.9390920143522226, + "grad_norm": 0.0659274086356163, + "learning_rate": 9.31980511609054e-06, + "loss": 2.5507, + "step": 31669 + }, + { + "epoch": 0.939121667704534, + "grad_norm": 0.06721080839633942, + "learning_rate": 9.310765202324988e-06, + "loss": 2.5356, + "step": 31670 + }, + { + "epoch": 0.9391513210568455, + "grad_norm": 0.06961005181074142, + "learning_rate": 9.301729633696909e-06, + "loss": 2.5965, + "step": 31671 + }, + { + "epoch": 0.9391809744091569, + "grad_norm": 0.06650007516145706, + "learning_rate": 9.292698410286237e-06, + "loss": 2.5696, + "step": 31672 + }, + { + "epoch": 0.9392106277614685, + "grad_norm": 0.06672004610300064, + "learning_rate": 9.283671532172911e-06, + "loss": 2.545, + "step": 31673 + }, + { + "epoch": 0.9392402811137799, + "grad_norm": 0.07002075761556625, + "learning_rate": 9.27464899943692e-06, + "loss": 2.5377, + "step": 31674 + }, + { + "epoch": 0.9392699344660914, + "grad_norm": 0.07074206322431564, + "learning_rate": 9.265630812158143e-06, + "loss": 2.549, + "step": 31675 + }, + { + "epoch": 0.9392995878184028, + "grad_norm": 0.06755543500185013, + "learning_rate": 9.256616970416409e-06, + "loss": 2.5274, + "step": 31676 + }, + { + "epoch": 0.9393292411707144, + "grad_norm": 0.07957915961742401, + "learning_rate": 9.247607474291652e-06, + "loss": 2.5599, + "step": 31677 + }, + { + "epoch": 0.9393588945230258, + "grad_norm": 0.07183413952589035, + "learning_rate": 9.238602323863476e-06, + "loss": 2.5444, + "step": 31678 + }, + { + "epoch": 0.9393885478753373, + "grad_norm": 0.07225670665502548, + "learning_rate": 9.22960151921176e-06, + "loss": 2.5388, + "step": 31679 + }, + { + "epoch": 0.9394182012276487, + "grad_norm": 0.06923767924308777, + "learning_rate": 9.22060506041622e-06, + "loss": 2.5413, + "step": 31680 + }, + { + "epoch": 0.9394478545799603, + "grad_norm": 0.06678576022386551, + "learning_rate": 9.211612947556403e-06, + "loss": 2.5445, + "step": 31681 + }, + { + "epoch": 0.9394775079322717, + "grad_norm": 0.0687960684299469, + "learning_rate": 9.20262518071202e-06, + "loss": 2.5032, + "step": 31682 + }, + { + "epoch": 0.9395071612845832, + "grad_norm": 0.06864601373672485, + "learning_rate": 9.193641759962568e-06, + "loss": 2.5331, + "step": 31683 + }, + { + "epoch": 0.9395368146368946, + "grad_norm": 0.06457400321960449, + "learning_rate": 9.184662685387758e-06, + "loss": 2.5347, + "step": 31684 + }, + { + "epoch": 0.9395664679892062, + "grad_norm": 0.065296970307827, + "learning_rate": 9.175687957066913e-06, + "loss": 2.5293, + "step": 31685 + }, + { + "epoch": 0.9395961213415177, + "grad_norm": 0.06500846892595291, + "learning_rate": 9.166717575079641e-06, + "loss": 2.5179, + "step": 31686 + }, + { + "epoch": 0.9396257746938291, + "grad_norm": 0.06604444235563278, + "learning_rate": 9.157751539505377e-06, + "loss": 2.5087, + "step": 31687 + }, + { + "epoch": 0.9396554280461407, + "grad_norm": 0.07033219188451767, + "learning_rate": 9.14878985042339e-06, + "loss": 2.5062, + "step": 31688 + }, + { + "epoch": 0.9396850813984521, + "grad_norm": 0.06542818248271942, + "learning_rate": 9.13983250791317e-06, + "loss": 2.5324, + "step": 31689 + }, + { + "epoch": 0.9397147347507636, + "grad_norm": 0.07169835269451141, + "learning_rate": 9.130879512053992e-06, + "loss": 2.5502, + "step": 31690 + }, + { + "epoch": 0.939744388103075, + "grad_norm": 0.06549045443534851, + "learning_rate": 9.121930862925176e-06, + "loss": 2.5667, + "step": 31691 + }, + { + "epoch": 0.9397740414553866, + "grad_norm": 0.06692100316286087, + "learning_rate": 9.11298656060583e-06, + "loss": 2.5705, + "step": 31692 + }, + { + "epoch": 0.939803694807698, + "grad_norm": 0.06574638932943344, + "learning_rate": 9.10404660517533e-06, + "loss": 2.5256, + "step": 31693 + }, + { + "epoch": 0.9398333481600095, + "grad_norm": 0.06878110021352768, + "learning_rate": 9.095110996712674e-06, + "loss": 2.5422, + "step": 31694 + }, + { + "epoch": 0.9398630015123209, + "grad_norm": 0.06796030700206757, + "learning_rate": 9.086179735297128e-06, + "loss": 2.5509, + "step": 31695 + }, + { + "epoch": 0.9398926548646325, + "grad_norm": 0.06692662090063095, + "learning_rate": 9.077252821007742e-06, + "loss": 2.5106, + "step": 31696 + }, + { + "epoch": 0.9399223082169439, + "grad_norm": 0.06606786698102951, + "learning_rate": 9.068330253923506e-06, + "loss": 2.5187, + "step": 31697 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 0.06603296846151352, + "learning_rate": 9.05941203412347e-06, + "loss": 2.5366, + "step": 31698 + }, + { + "epoch": 0.9399816149215668, + "grad_norm": 0.06502231955528259, + "learning_rate": 9.050498161686627e-06, + "loss": 2.5698, + "step": 31699 + }, + { + "epoch": 0.9400112682738784, + "grad_norm": 0.06729429215192795, + "learning_rate": 9.04158863669191e-06, + "loss": 2.5449, + "step": 31700 + }, + { + "epoch": 0.9400409216261898, + "grad_norm": 0.06815657764673233, + "learning_rate": 9.032683459218205e-06, + "loss": 2.5345, + "step": 31701 + }, + { + "epoch": 0.9400705749785013, + "grad_norm": 0.06863068044185638, + "learning_rate": 9.023782629344335e-06, + "loss": 2.5542, + "step": 31702 + }, + { + "epoch": 0.9401002283308127, + "grad_norm": 0.06640134006738663, + "learning_rate": 9.014886147149126e-06, + "loss": 2.5417, + "step": 31703 + }, + { + "epoch": 0.9401298816831243, + "grad_norm": 0.06575600802898407, + "learning_rate": 9.005994012711405e-06, + "loss": 2.5502, + "step": 31704 + }, + { + "epoch": 0.9401595350354357, + "grad_norm": 0.06917650997638702, + "learning_rate": 8.997106226109886e-06, + "loss": 2.5293, + "step": 31705 + }, + { + "epoch": 0.9401891883877472, + "grad_norm": 0.06603451818227768, + "learning_rate": 8.988222787423229e-06, + "loss": 2.5408, + "step": 31706 + }, + { + "epoch": 0.9402188417400588, + "grad_norm": 0.067379429936409, + "learning_rate": 8.979343696730202e-06, + "loss": 2.5436, + "step": 31707 + }, + { + "epoch": 0.9402484950923702, + "grad_norm": 0.07022435963153839, + "learning_rate": 8.970468954109302e-06, + "loss": 2.5752, + "step": 31708 + }, + { + "epoch": 0.9402781484446817, + "grad_norm": 0.06597758829593658, + "learning_rate": 8.961598559639239e-06, + "loss": 2.5283, + "step": 31709 + }, + { + "epoch": 0.9403078017969931, + "grad_norm": 0.06593058258295059, + "learning_rate": 8.95273251339851e-06, + "loss": 2.5324, + "step": 31710 + }, + { + "epoch": 0.9403374551493047, + "grad_norm": 0.06645357608795166, + "learning_rate": 8.943870815465605e-06, + "loss": 2.5547, + "step": 31711 + }, + { + "epoch": 0.9403671085016161, + "grad_norm": 0.06863027065992355, + "learning_rate": 8.935013465918962e-06, + "loss": 2.5922, + "step": 31712 + }, + { + "epoch": 0.9403967618539276, + "grad_norm": 0.06560083478689194, + "learning_rate": 8.92616046483713e-06, + "loss": 2.5523, + "step": 31713 + }, + { + "epoch": 0.940426415206239, + "grad_norm": 0.06776686757802963, + "learning_rate": 8.917311812298434e-06, + "loss": 2.5355, + "step": 31714 + }, + { + "epoch": 0.9404560685585506, + "grad_norm": 0.07044634222984314, + "learning_rate": 8.908467508381202e-06, + "loss": 2.5801, + "step": 31715 + }, + { + "epoch": 0.940485721910862, + "grad_norm": 0.06714919209480286, + "learning_rate": 8.899627553163813e-06, + "loss": 2.4946, + "step": 31716 + }, + { + "epoch": 0.9405153752631735, + "grad_norm": 0.06463653594255447, + "learning_rate": 8.89079194672454e-06, + "loss": 2.5226, + "step": 31717 + }, + { + "epoch": 0.940545028615485, + "grad_norm": 0.06568319350481033, + "learning_rate": 8.881960689141543e-06, + "loss": 2.5472, + "step": 31718 + }, + { + "epoch": 0.9405746819677965, + "grad_norm": 0.06773775070905685, + "learning_rate": 8.873133780493092e-06, + "loss": 2.5676, + "step": 31719 + }, + { + "epoch": 0.9406043353201079, + "grad_norm": 0.06854628026485443, + "learning_rate": 8.864311220857401e-06, + "loss": 2.5228, + "step": 31720 + }, + { + "epoch": 0.9406339886724194, + "grad_norm": 0.06493277847766876, + "learning_rate": 8.855493010312465e-06, + "loss": 2.5257, + "step": 31721 + }, + { + "epoch": 0.9406636420247309, + "grad_norm": 0.06651141494512558, + "learning_rate": 8.846679148936499e-06, + "loss": 2.5536, + "step": 31722 + }, + { + "epoch": 0.9406932953770424, + "grad_norm": 0.06674366444349289, + "learning_rate": 8.837869636807494e-06, + "loss": 2.5111, + "step": 31723 + }, + { + "epoch": 0.9407229487293538, + "grad_norm": 0.07180924713611603, + "learning_rate": 8.82906447400339e-06, + "loss": 2.5408, + "step": 31724 + }, + { + "epoch": 0.9407526020816653, + "grad_norm": 0.0656685158610344, + "learning_rate": 8.820263660602345e-06, + "loss": 2.5138, + "step": 31725 + }, + { + "epoch": 0.9407822554339769, + "grad_norm": 0.07041987776756287, + "learning_rate": 8.811467196682077e-06, + "loss": 2.5584, + "step": 31726 + }, + { + "epoch": 0.9408119087862883, + "grad_norm": 0.06897158920764923, + "learning_rate": 8.802675082320577e-06, + "loss": 2.5317, + "step": 31727 + }, + { + "epoch": 0.9408415621385998, + "grad_norm": 0.06416566669940948, + "learning_rate": 8.793887317595729e-06, + "loss": 2.528, + "step": 31728 + }, + { + "epoch": 0.9408712154909112, + "grad_norm": 0.06711307913064957, + "learning_rate": 8.785103902585356e-06, + "loss": 2.5604, + "step": 31729 + }, + { + "epoch": 0.9409008688432228, + "grad_norm": 0.07086119055747986, + "learning_rate": 8.776324837367178e-06, + "loss": 2.5429, + "step": 31730 + }, + { + "epoch": 0.9409305221955342, + "grad_norm": 0.06836820393800735, + "learning_rate": 8.767550122018964e-06, + "loss": 2.5288, + "step": 31731 + }, + { + "epoch": 0.9409601755478457, + "grad_norm": 0.06618522852659225, + "learning_rate": 8.758779756618373e-06, + "loss": 2.5338, + "step": 31732 + }, + { + "epoch": 0.9409898289001571, + "grad_norm": 0.06793434172868729, + "learning_rate": 8.750013741243123e-06, + "loss": 2.5023, + "step": 31733 + }, + { + "epoch": 0.9410194822524687, + "grad_norm": 0.06952640414237976, + "learning_rate": 8.741252075970874e-06, + "loss": 2.5321, + "step": 31734 + }, + { + "epoch": 0.9410491356047801, + "grad_norm": 0.06632808595895767, + "learning_rate": 8.732494760879173e-06, + "loss": 2.5489, + "step": 31735 + }, + { + "epoch": 0.9410787889570916, + "grad_norm": 0.0677325651049614, + "learning_rate": 8.723741796045514e-06, + "loss": 2.5462, + "step": 31736 + }, + { + "epoch": 0.941108442309403, + "grad_norm": 0.06707078963518143, + "learning_rate": 8.714993181547448e-06, + "loss": 2.5537, + "step": 31737 + }, + { + "epoch": 0.9411380956617146, + "grad_norm": 0.06760702282190323, + "learning_rate": 8.706248917462411e-06, + "loss": 2.5548, + "step": 31738 + }, + { + "epoch": 0.941167749014026, + "grad_norm": 0.06593518704175949, + "learning_rate": 8.697509003867899e-06, + "loss": 2.542, + "step": 31739 + }, + { + "epoch": 0.9411974023663375, + "grad_norm": 0.06664740294218063, + "learning_rate": 8.688773440841235e-06, + "loss": 2.5233, + "step": 31740 + }, + { + "epoch": 0.941227055718649, + "grad_norm": 0.07146203517913818, + "learning_rate": 8.680042228459861e-06, + "loss": 2.5511, + "step": 31741 + }, + { + "epoch": 0.9412567090709605, + "grad_norm": 0.0649779811501503, + "learning_rate": 8.671315366801047e-06, + "loss": 2.5642, + "step": 31742 + }, + { + "epoch": 0.9412863624232719, + "grad_norm": 0.07103484123945236, + "learning_rate": 8.662592855942064e-06, + "loss": 2.5894, + "step": 31743 + }, + { + "epoch": 0.9413160157755834, + "grad_norm": 0.07102333009243011, + "learning_rate": 8.653874695960185e-06, + "loss": 2.5055, + "step": 31744 + }, + { + "epoch": 0.9413456691278949, + "grad_norm": 0.07682935148477554, + "learning_rate": 8.645160886932568e-06, + "loss": 2.5689, + "step": 31745 + }, + { + "epoch": 0.9413753224802064, + "grad_norm": 0.07219309359788895, + "learning_rate": 8.636451428936431e-06, + "loss": 2.5433, + "step": 31746 + }, + { + "epoch": 0.9414049758325179, + "grad_norm": 0.0775725468993187, + "learning_rate": 8.627746322048768e-06, + "loss": 2.5458, + "step": 31747 + }, + { + "epoch": 0.9414346291848293, + "grad_norm": 0.06703674048185349, + "learning_rate": 8.619045566346794e-06, + "loss": 2.5249, + "step": 31748 + }, + { + "epoch": 0.9414642825371409, + "grad_norm": 0.06925906985998154, + "learning_rate": 8.61034916190756e-06, + "loss": 2.5675, + "step": 31749 + }, + { + "epoch": 0.9414939358894523, + "grad_norm": 0.07566402107477188, + "learning_rate": 8.601657108807948e-06, + "loss": 2.5352, + "step": 31750 + }, + { + "epoch": 0.9415235892417638, + "grad_norm": 0.07011726498603821, + "learning_rate": 8.592969407125062e-06, + "loss": 2.4967, + "step": 31751 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 0.06683618575334549, + "learning_rate": 8.584286056935786e-06, + "loss": 2.5325, + "step": 31752 + }, + { + "epoch": 0.9415828959463868, + "grad_norm": 0.06891682744026184, + "learning_rate": 8.575607058316948e-06, + "loss": 2.5498, + "step": 31753 + }, + { + "epoch": 0.9416125492986982, + "grad_norm": 0.06535831838846207, + "learning_rate": 8.566932411345484e-06, + "loss": 2.5165, + "step": 31754 + }, + { + "epoch": 0.9416422026510097, + "grad_norm": 0.06783857941627502, + "learning_rate": 8.558262116098225e-06, + "loss": 2.5502, + "step": 31755 + }, + { + "epoch": 0.9416718560033212, + "grad_norm": 0.06548982113599777, + "learning_rate": 8.549596172651941e-06, + "loss": 2.5719, + "step": 31756 + }, + { + "epoch": 0.9417015093556327, + "grad_norm": 0.0659765750169754, + "learning_rate": 8.540934581083294e-06, + "loss": 2.5066, + "step": 31757 + }, + { + "epoch": 0.9417311627079441, + "grad_norm": 0.06640452146530151, + "learning_rate": 8.532277341468997e-06, + "loss": 2.5235, + "step": 31758 + }, + { + "epoch": 0.9417608160602556, + "grad_norm": 0.06822682172060013, + "learning_rate": 8.523624453885769e-06, + "loss": 2.5383, + "step": 31759 + }, + { + "epoch": 0.941790469412567, + "grad_norm": 0.06344366818666458, + "learning_rate": 8.514975918410217e-06, + "loss": 2.5431, + "step": 31760 + }, + { + "epoch": 0.9418201227648786, + "grad_norm": 0.06848958134651184, + "learning_rate": 8.506331735118944e-06, + "loss": 2.5425, + "step": 31761 + }, + { + "epoch": 0.94184977611719, + "grad_norm": 0.06796753406524658, + "learning_rate": 8.497691904088389e-06, + "loss": 2.5153, + "step": 31762 + }, + { + "epoch": 0.9418794294695015, + "grad_norm": 0.06881365925073624, + "learning_rate": 8.489056425395214e-06, + "loss": 2.5764, + "step": 31763 + }, + { + "epoch": 0.941909082821813, + "grad_norm": 0.07011646032333374, + "learning_rate": 8.480425299115746e-06, + "loss": 2.5352, + "step": 31764 + }, + { + "epoch": 0.9419387361741245, + "grad_norm": 0.06907004117965698, + "learning_rate": 8.471798525326536e-06, + "loss": 2.514, + "step": 31765 + }, + { + "epoch": 0.9419683895264359, + "grad_norm": 0.06670688092708588, + "learning_rate": 8.463176104103908e-06, + "loss": 2.539, + "step": 31766 + }, + { + "epoch": 0.9419980428787474, + "grad_norm": 0.06694044172763824, + "learning_rate": 8.454558035524196e-06, + "loss": 2.5556, + "step": 31767 + }, + { + "epoch": 0.942027696231059, + "grad_norm": 0.07060294598340988, + "learning_rate": 8.445944319663778e-06, + "loss": 2.5704, + "step": 31768 + }, + { + "epoch": 0.9420573495833704, + "grad_norm": 0.06936817616224289, + "learning_rate": 8.437334956598874e-06, + "loss": 2.5228, + "step": 31769 + }, + { + "epoch": 0.9420870029356819, + "grad_norm": 0.06748778373003006, + "learning_rate": 8.428729946405755e-06, + "loss": 2.5469, + "step": 31770 + }, + { + "epoch": 0.9421166562879933, + "grad_norm": 0.06819569319486618, + "learning_rate": 8.420129289160584e-06, + "loss": 2.5214, + "step": 31771 + }, + { + "epoch": 0.9421463096403049, + "grad_norm": 0.06941469013690948, + "learning_rate": 8.411532984939574e-06, + "loss": 2.515, + "step": 31772 + }, + { + "epoch": 0.9421759629926163, + "grad_norm": 0.07031675428152084, + "learning_rate": 8.402941033818833e-06, + "loss": 2.5515, + "step": 31773 + }, + { + "epoch": 0.9422056163449278, + "grad_norm": 0.0700923502445221, + "learning_rate": 8.394353435874414e-06, + "loss": 2.5466, + "step": 31774 + }, + { + "epoch": 0.9422352696972393, + "grad_norm": 0.07017253339290619, + "learning_rate": 8.385770191182363e-06, + "loss": 2.5658, + "step": 31775 + }, + { + "epoch": 0.9422649230495508, + "grad_norm": 0.06591267138719559, + "learning_rate": 8.377191299818732e-06, + "loss": 2.5612, + "step": 31776 + }, + { + "epoch": 0.9422945764018622, + "grad_norm": 0.06587008386850357, + "learning_rate": 8.36861676185946e-06, + "loss": 2.5111, + "step": 31777 + }, + { + "epoch": 0.9423242297541737, + "grad_norm": 0.06542050838470459, + "learning_rate": 8.36004657738043e-06, + "loss": 2.524, + "step": 31778 + }, + { + "epoch": 0.9423538831064852, + "grad_norm": 0.0668853297829628, + "learning_rate": 8.351480746457585e-06, + "loss": 2.5524, + "step": 31779 + }, + { + "epoch": 0.9423835364587967, + "grad_norm": 0.0690445676445961, + "learning_rate": 8.342919269166804e-06, + "loss": 2.5317, + "step": 31780 + }, + { + "epoch": 0.9424131898111081, + "grad_norm": 0.06378549337387085, + "learning_rate": 8.334362145583863e-06, + "loss": 2.5029, + "step": 31781 + }, + { + "epoch": 0.9424428431634196, + "grad_norm": 0.06731249392032623, + "learning_rate": 8.325809375784533e-06, + "loss": 2.5257, + "step": 31782 + }, + { + "epoch": 0.9424724965157311, + "grad_norm": 0.0675477609038353, + "learning_rate": 8.31726095984453e-06, + "loss": 2.5514, + "step": 31783 + }, + { + "epoch": 0.9425021498680426, + "grad_norm": 0.06689120829105377, + "learning_rate": 8.308716897839575e-06, + "loss": 2.5596, + "step": 31784 + }, + { + "epoch": 0.942531803220354, + "grad_norm": 0.06713060289621353, + "learning_rate": 8.300177189845382e-06, + "loss": 2.5559, + "step": 31785 + }, + { + "epoch": 0.9425614565726655, + "grad_norm": 0.06853076815605164, + "learning_rate": 8.291641835937447e-06, + "loss": 2.4943, + "step": 31786 + }, + { + "epoch": 0.942591109924977, + "grad_norm": 0.06718866527080536, + "learning_rate": 8.283110836191487e-06, + "loss": 2.5279, + "step": 31787 + }, + { + "epoch": 0.9426207632772885, + "grad_norm": 0.06724555790424347, + "learning_rate": 8.274584190682944e-06, + "loss": 2.5276, + "step": 31788 + }, + { + "epoch": 0.9426504166296, + "grad_norm": 0.06756800413131714, + "learning_rate": 8.266061899487365e-06, + "loss": 2.5728, + "step": 31789 + }, + { + "epoch": 0.9426800699819115, + "grad_norm": 0.06829673051834106, + "learning_rate": 8.257543962680247e-06, + "loss": 2.5354, + "step": 31790 + }, + { + "epoch": 0.942709723334223, + "grad_norm": 0.06838005036115646, + "learning_rate": 8.249030380336974e-06, + "loss": 2.5797, + "step": 31791 + }, + { + "epoch": 0.9427393766865344, + "grad_norm": 0.06818490475416183, + "learning_rate": 8.240521152532932e-06, + "loss": 2.5241, + "step": 31792 + }, + { + "epoch": 0.9427690300388459, + "grad_norm": 0.06725644320249557, + "learning_rate": 8.232016279343447e-06, + "loss": 2.5439, + "step": 31793 + }, + { + "epoch": 0.9427986833911574, + "grad_norm": 0.06728542596101761, + "learning_rate": 8.223515760843902e-06, + "loss": 2.5159, + "step": 31794 + }, + { + "epoch": 0.9428283367434689, + "grad_norm": 0.07118690758943558, + "learning_rate": 8.215019597109575e-06, + "loss": 2.5438, + "step": 31795 + }, + { + "epoch": 0.9428579900957803, + "grad_norm": 0.06869164854288101, + "learning_rate": 8.206527788215624e-06, + "loss": 2.5195, + "step": 31796 + }, + { + "epoch": 0.9428876434480918, + "grad_norm": 0.06760451197624207, + "learning_rate": 8.198040334237266e-06, + "loss": 2.5476, + "step": 31797 + }, + { + "epoch": 0.9429172968004033, + "grad_norm": 0.06871994584798813, + "learning_rate": 8.189557235249723e-06, + "loss": 2.583, + "step": 31798 + }, + { + "epoch": 0.9429469501527148, + "grad_norm": 0.06554999947547913, + "learning_rate": 8.181078491328043e-06, + "loss": 2.5371, + "step": 31799 + }, + { + "epoch": 0.9429766035050262, + "grad_norm": 0.0678628459572792, + "learning_rate": 8.172604102547333e-06, + "loss": 2.54, + "step": 31800 + }, + { + "epoch": 0.9430062568573377, + "grad_norm": 0.06780710816383362, + "learning_rate": 8.164134068982643e-06, + "loss": 2.5435, + "step": 31801 + }, + { + "epoch": 0.9430359102096492, + "grad_norm": 0.06775008887052536, + "learning_rate": 8.15566839070897e-06, + "loss": 2.5243, + "step": 31802 + }, + { + "epoch": 0.9430655635619607, + "grad_norm": 0.06889232248067856, + "learning_rate": 8.14720706780131e-06, + "loss": 2.5397, + "step": 31803 + }, + { + "epoch": 0.9430952169142721, + "grad_norm": 0.06736502796411514, + "learning_rate": 8.138750100334436e-06, + "loss": 2.5547, + "step": 31804 + }, + { + "epoch": 0.9431248702665836, + "grad_norm": 0.06838095188140869, + "learning_rate": 8.130297488383454e-06, + "loss": 2.5558, + "step": 31805 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 0.0669737160205841, + "learning_rate": 8.121849232023081e-06, + "loss": 2.5592, + "step": 31806 + }, + { + "epoch": 0.9431841769712066, + "grad_norm": 0.07898399233818054, + "learning_rate": 8.113405331328206e-06, + "loss": 2.5402, + "step": 31807 + }, + { + "epoch": 0.943213830323518, + "grad_norm": 0.06748397648334503, + "learning_rate": 8.104965786373542e-06, + "loss": 2.526, + "step": 31808 + }, + { + "epoch": 0.9432434836758296, + "grad_norm": 0.06745931506156921, + "learning_rate": 8.096530597233808e-06, + "loss": 2.5105, + "step": 31809 + }, + { + "epoch": 0.9432731370281411, + "grad_norm": 0.0675371065735817, + "learning_rate": 8.088099763983781e-06, + "loss": 2.5436, + "step": 31810 + }, + { + "epoch": 0.9433027903804525, + "grad_norm": 0.06536499410867691, + "learning_rate": 8.079673286698063e-06, + "loss": 2.5082, + "step": 31811 + }, + { + "epoch": 0.943332443732764, + "grad_norm": 0.06897798180580139, + "learning_rate": 8.071251165451265e-06, + "loss": 2.531, + "step": 31812 + }, + { + "epoch": 0.9433620970850755, + "grad_norm": 0.06692041456699371, + "learning_rate": 8.062833400317937e-06, + "loss": 2.5501, + "step": 31813 + }, + { + "epoch": 0.943391750437387, + "grad_norm": 0.06646526604890823, + "learning_rate": 8.054419991372685e-06, + "loss": 2.541, + "step": 31814 + }, + { + "epoch": 0.9434214037896984, + "grad_norm": 0.06653095781803131, + "learning_rate": 8.046010938690008e-06, + "loss": 2.5595, + "step": 31815 + }, + { + "epoch": 0.9434510571420099, + "grad_norm": 0.06814513355493546, + "learning_rate": 8.037606242344287e-06, + "loss": 2.5549, + "step": 31816 + }, + { + "epoch": 0.9434807104943214, + "grad_norm": 0.06447342038154602, + "learning_rate": 8.02920590241002e-06, + "loss": 2.4994, + "step": 31817 + }, + { + "epoch": 0.9435103638466329, + "grad_norm": 0.06658580899238586, + "learning_rate": 8.020809918961592e-06, + "loss": 2.558, + "step": 31818 + }, + { + "epoch": 0.9435400171989443, + "grad_norm": 0.06609829515218735, + "learning_rate": 8.012418292073387e-06, + "loss": 2.4805, + "step": 31819 + }, + { + "epoch": 0.9435696705512558, + "grad_norm": 0.06592500954866409, + "learning_rate": 8.004031021819624e-06, + "loss": 2.5623, + "step": 31820 + }, + { + "epoch": 0.9435993239035673, + "grad_norm": 0.06852608174085617, + "learning_rate": 7.995648108274578e-06, + "loss": 2.5685, + "step": 31821 + }, + { + "epoch": 0.9436289772558788, + "grad_norm": 0.06639429181814194, + "learning_rate": 7.987269551512633e-06, + "loss": 2.5517, + "step": 31822 + }, + { + "epoch": 0.9436586306081902, + "grad_norm": 0.0664939358830452, + "learning_rate": 7.978895351607785e-06, + "loss": 2.5117, + "step": 31823 + }, + { + "epoch": 0.9436882839605018, + "grad_norm": 0.06742003560066223, + "learning_rate": 7.970525508634307e-06, + "loss": 2.549, + "step": 31824 + }, + { + "epoch": 0.9437179373128132, + "grad_norm": 0.06408987194299698, + "learning_rate": 7.962160022666253e-06, + "loss": 2.5352, + "step": 31825 + }, + { + "epoch": 0.9437475906651247, + "grad_norm": 0.06835512071847916, + "learning_rate": 7.95379889377773e-06, + "loss": 2.5547, + "step": 31826 + }, + { + "epoch": 0.9437772440174361, + "grad_norm": 0.06647002696990967, + "learning_rate": 7.945442122042789e-06, + "loss": 2.4995, + "step": 31827 + }, + { + "epoch": 0.9438068973697477, + "grad_norm": 0.06791126728057861, + "learning_rate": 7.937089707535427e-06, + "loss": 2.5265, + "step": 31828 + }, + { + "epoch": 0.9438365507220591, + "grad_norm": 0.06777840107679367, + "learning_rate": 7.928741650329586e-06, + "loss": 2.5239, + "step": 31829 + }, + { + "epoch": 0.9438662040743706, + "grad_norm": 0.07200175523757935, + "learning_rate": 7.920397950499148e-06, + "loss": 2.5346, + "step": 31830 + }, + { + "epoch": 0.9438958574266821, + "grad_norm": 0.06983726471662521, + "learning_rate": 7.912058608118111e-06, + "loss": 2.5531, + "step": 31831 + }, + { + "epoch": 0.9439255107789936, + "grad_norm": 0.06532520800828934, + "learning_rate": 7.903723623260251e-06, + "loss": 2.4962, + "step": 31832 + }, + { + "epoch": 0.9439551641313051, + "grad_norm": 0.06846638768911362, + "learning_rate": 7.895392995999395e-06, + "loss": 2.5382, + "step": 31833 + }, + { + "epoch": 0.9439848174836165, + "grad_norm": 0.06968111544847488, + "learning_rate": 7.88706672640932e-06, + "loss": 2.5195, + "step": 31834 + }, + { + "epoch": 0.944014470835928, + "grad_norm": 0.06444822996854782, + "learning_rate": 7.878744814563687e-06, + "loss": 2.5506, + "step": 31835 + }, + { + "epoch": 0.9440441241882395, + "grad_norm": 0.07076691836118698, + "learning_rate": 7.870427260536273e-06, + "loss": 2.5295, + "step": 31836 + }, + { + "epoch": 0.944073777540551, + "grad_norm": 0.06921759992837906, + "learning_rate": 7.862114064400683e-06, + "loss": 2.5424, + "step": 31837 + }, + { + "epoch": 0.9441034308928624, + "grad_norm": 0.0677177906036377, + "learning_rate": 7.853805226230582e-06, + "loss": 2.5607, + "step": 31838 + }, + { + "epoch": 0.944133084245174, + "grad_norm": 0.06491319835186005, + "learning_rate": 7.845500746099466e-06, + "loss": 2.5326, + "step": 31839 + }, + { + "epoch": 0.9441627375974854, + "grad_norm": 0.06740950047969818, + "learning_rate": 7.837200624080943e-06, + "loss": 2.5353, + "step": 31840 + }, + { + "epoch": 0.9441923909497969, + "grad_norm": 0.06953539699316025, + "learning_rate": 7.828904860248453e-06, + "loss": 2.5368, + "step": 31841 + }, + { + "epoch": 0.9442220443021083, + "grad_norm": 0.0680302157998085, + "learning_rate": 7.820613454675551e-06, + "loss": 2.5551, + "step": 31842 + }, + { + "epoch": 0.9442516976544199, + "grad_norm": 0.06531453132629395, + "learning_rate": 7.812326407435566e-06, + "loss": 2.5156, + "step": 31843 + }, + { + "epoch": 0.9442813510067313, + "grad_norm": 0.06627152115106583, + "learning_rate": 7.804043718601883e-06, + "loss": 2.5327, + "step": 31844 + }, + { + "epoch": 0.9443110043590428, + "grad_norm": 0.07039661705493927, + "learning_rate": 7.795765388247945e-06, + "loss": 2.534, + "step": 31845 + }, + { + "epoch": 0.9443406577113542, + "grad_norm": 0.06830950826406479, + "learning_rate": 7.787491416446967e-06, + "loss": 2.5467, + "step": 31846 + }, + { + "epoch": 0.9443703110636658, + "grad_norm": 0.07011673599481583, + "learning_rate": 7.779221803272173e-06, + "loss": 2.5554, + "step": 31847 + }, + { + "epoch": 0.9443999644159772, + "grad_norm": 0.06926742941141129, + "learning_rate": 7.770956548796948e-06, + "loss": 2.5366, + "step": 31848 + }, + { + "epoch": 0.9444296177682887, + "grad_norm": 0.07013574242591858, + "learning_rate": 7.762695653094343e-06, + "loss": 2.5629, + "step": 31849 + }, + { + "epoch": 0.9444592711206001, + "grad_norm": 0.06960560381412506, + "learning_rate": 7.754439116237577e-06, + "loss": 2.5398, + "step": 31850 + }, + { + "epoch": 0.9444889244729117, + "grad_norm": 0.06942286342382431, + "learning_rate": 7.74618693829976e-06, + "loss": 2.5479, + "step": 31851 + }, + { + "epoch": 0.9445185778252232, + "grad_norm": 0.06680970638990402, + "learning_rate": 7.737939119353887e-06, + "loss": 2.5089, + "step": 31852 + }, + { + "epoch": 0.9445482311775346, + "grad_norm": 0.06832500547170639, + "learning_rate": 7.729695659473123e-06, + "loss": 2.5665, + "step": 31853 + }, + { + "epoch": 0.9445778845298461, + "grad_norm": 0.06920761615037918, + "learning_rate": 7.721456558730412e-06, + "loss": 2.5136, + "step": 31854 + }, + { + "epoch": 0.9446075378821576, + "grad_norm": 0.07083208113908768, + "learning_rate": 7.713221817198634e-06, + "loss": 2.5463, + "step": 31855 + }, + { + "epoch": 0.9446371912344691, + "grad_norm": 0.06749963760375977, + "learning_rate": 7.704991434950848e-06, + "loss": 2.5288, + "step": 31856 + }, + { + "epoch": 0.9446668445867805, + "grad_norm": 0.06872262060642242, + "learning_rate": 7.696765412059826e-06, + "loss": 2.5446, + "step": 31857 + }, + { + "epoch": 0.944696497939092, + "grad_norm": 0.06766634434461594, + "learning_rate": 7.688543748598453e-06, + "loss": 2.5414, + "step": 31858 + }, + { + "epoch": 0.9447261512914035, + "grad_norm": 0.06735796481370926, + "learning_rate": 7.680326444639508e-06, + "loss": 2.5516, + "step": 31859 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 0.06771458685398102, + "learning_rate": 7.672113500255817e-06, + "loss": 2.5264, + "step": 31860 + }, + { + "epoch": 0.9447854579960264, + "grad_norm": 0.06432458013296127, + "learning_rate": 7.663904915520047e-06, + "loss": 2.5431, + "step": 31861 + }, + { + "epoch": 0.944815111348338, + "grad_norm": 0.06792749464511871, + "learning_rate": 7.655700690504918e-06, + "loss": 2.5501, + "step": 31862 + }, + { + "epoch": 0.9448447647006494, + "grad_norm": 0.07006552815437317, + "learning_rate": 7.647500825283038e-06, + "loss": 2.5275, + "step": 31863 + }, + { + "epoch": 0.9448744180529609, + "grad_norm": 0.06713931262493134, + "learning_rate": 7.63930531992707e-06, + "loss": 2.5323, + "step": 31864 + }, + { + "epoch": 0.9449040714052723, + "grad_norm": 0.06557415425777435, + "learning_rate": 7.631114174509569e-06, + "loss": 2.5456, + "step": 31865 + }, + { + "epoch": 0.9449337247575839, + "grad_norm": 0.07010958343744278, + "learning_rate": 7.62292738910314e-06, + "loss": 2.5374, + "step": 31866 + }, + { + "epoch": 0.9449633781098953, + "grad_norm": 0.06792470067739487, + "learning_rate": 7.614744963780118e-06, + "loss": 2.5277, + "step": 31867 + }, + { + "epoch": 0.9449930314622068, + "grad_norm": 0.068634994328022, + "learning_rate": 7.606566898613055e-06, + "loss": 2.5733, + "step": 31868 + }, + { + "epoch": 0.9450226848145182, + "grad_norm": 0.06415104866027832, + "learning_rate": 7.598393193674336e-06, + "loss": 2.5438, + "step": 31869 + }, + { + "epoch": 0.9450523381668298, + "grad_norm": 0.06820251792669296, + "learning_rate": 7.590223849036404e-06, + "loss": 2.5386, + "step": 31870 + }, + { + "epoch": 0.9450819915191412, + "grad_norm": 0.0658760741353035, + "learning_rate": 7.582058864771535e-06, + "loss": 2.5487, + "step": 31871 + }, + { + "epoch": 0.9451116448714527, + "grad_norm": 0.06902852654457092, + "learning_rate": 7.57389824095206e-06, + "loss": 2.5167, + "step": 31872 + }, + { + "epoch": 0.9451412982237642, + "grad_norm": 0.06803132593631744, + "learning_rate": 7.565741977650253e-06, + "loss": 2.5603, + "step": 31873 + }, + { + "epoch": 0.9451709515760757, + "grad_norm": 0.06817726045846939, + "learning_rate": 7.557590074938337e-06, + "loss": 2.5231, + "step": 31874 + }, + { + "epoch": 0.9452006049283872, + "grad_norm": 0.06502429395914078, + "learning_rate": 7.549442532888473e-06, + "loss": 2.5356, + "step": 31875 + }, + { + "epoch": 0.9452302582806986, + "grad_norm": 0.06472232937812805, + "learning_rate": 7.541299351572828e-06, + "loss": 2.519, + "step": 31876 + }, + { + "epoch": 0.9452599116330102, + "grad_norm": 0.06417983025312424, + "learning_rate": 7.533160531063565e-06, + "loss": 2.5344, + "step": 31877 + }, + { + "epoch": 0.9452895649853216, + "grad_norm": 0.06833616644144058, + "learning_rate": 7.525026071432628e-06, + "loss": 2.5257, + "step": 31878 + }, + { + "epoch": 0.9453192183376331, + "grad_norm": 0.06317717581987381, + "learning_rate": 7.516895972752125e-06, + "loss": 2.5151, + "step": 31879 + }, + { + "epoch": 0.9453488716899445, + "grad_norm": 0.06646420061588287, + "learning_rate": 7.508770235094053e-06, + "loss": 2.5363, + "step": 31880 + }, + { + "epoch": 0.9453785250422561, + "grad_norm": 0.06547480821609497, + "learning_rate": 7.500648858530357e-06, + "loss": 2.5237, + "step": 31881 + }, + { + "epoch": 0.9454081783945675, + "grad_norm": 0.06895972043275833, + "learning_rate": 7.492531843132866e-06, + "loss": 2.5435, + "step": 31882 + }, + { + "epoch": 0.945437831746879, + "grad_norm": 0.06348984688520432, + "learning_rate": 7.484419188973634e-06, + "loss": 2.4927, + "step": 31883 + }, + { + "epoch": 0.9454674850991904, + "grad_norm": 0.06544213742017746, + "learning_rate": 7.476310896124383e-06, + "loss": 2.5714, + "step": 31884 + }, + { + "epoch": 0.945497138451502, + "grad_norm": 0.0665966346859932, + "learning_rate": 7.468206964656943e-06, + "loss": 2.5014, + "step": 31885 + }, + { + "epoch": 0.9455267918038134, + "grad_norm": 0.06542080640792847, + "learning_rate": 7.46010739464309e-06, + "loss": 2.509, + "step": 31886 + }, + { + "epoch": 0.9455564451561249, + "grad_norm": 0.06953215599060059, + "learning_rate": 7.4520121861545445e-06, + "loss": 2.5224, + "step": 31887 + }, + { + "epoch": 0.9455860985084363, + "grad_norm": 0.06913026422262192, + "learning_rate": 7.443921339262971e-06, + "loss": 2.5499, + "step": 31888 + }, + { + "epoch": 0.9456157518607479, + "grad_norm": 0.0658111572265625, + "learning_rate": 7.43583485403998e-06, + "loss": 2.5497, + "step": 31889 + }, + { + "epoch": 0.9456454052130593, + "grad_norm": 0.07036654651165009, + "learning_rate": 7.4277527305572356e-06, + "loss": 2.5269, + "step": 31890 + }, + { + "epoch": 0.9456750585653708, + "grad_norm": 0.06751290708780289, + "learning_rate": 7.419674968886292e-06, + "loss": 2.5811, + "step": 31891 + }, + { + "epoch": 0.9457047119176822, + "grad_norm": 0.06910483539104462, + "learning_rate": 7.411601569098703e-06, + "loss": 2.5251, + "step": 31892 + }, + { + "epoch": 0.9457343652699938, + "grad_norm": 0.068580761551857, + "learning_rate": 7.403532531265911e-06, + "loss": 2.5424, + "step": 31893 + }, + { + "epoch": 0.9457640186223053, + "grad_norm": 0.06564954668283463, + "learning_rate": 7.395467855459359e-06, + "loss": 2.5747, + "step": 31894 + }, + { + "epoch": 0.9457936719746167, + "grad_norm": 0.067128986120224, + "learning_rate": 7.387407541750491e-06, + "loss": 2.525, + "step": 31895 + }, + { + "epoch": 0.9458233253269283, + "grad_norm": 0.07173166424036026, + "learning_rate": 7.379351590210748e-06, + "loss": 2.5793, + "step": 31896 + }, + { + "epoch": 0.9458529786792397, + "grad_norm": 0.06500130146741867, + "learning_rate": 7.371300000911352e-06, + "loss": 2.5573, + "step": 31897 + }, + { + "epoch": 0.9458826320315512, + "grad_norm": 0.06721637398004532, + "learning_rate": 7.36325277392369e-06, + "loss": 2.5339, + "step": 31898 + }, + { + "epoch": 0.9459122853838626, + "grad_norm": 0.06757788360118866, + "learning_rate": 7.355209909318983e-06, + "loss": 2.5537, + "step": 31899 + }, + { + "epoch": 0.9459419387361742, + "grad_norm": 0.06670232117176056, + "learning_rate": 7.347171407168452e-06, + "loss": 2.5218, + "step": 31900 + }, + { + "epoch": 0.9459715920884856, + "grad_norm": 0.06785144656896591, + "learning_rate": 7.339137267543261e-06, + "loss": 2.5402, + "step": 31901 + }, + { + "epoch": 0.9460012454407971, + "grad_norm": 0.06646358966827393, + "learning_rate": 7.331107490514577e-06, + "loss": 2.521, + "step": 31902 + }, + { + "epoch": 0.9460308987931085, + "grad_norm": 0.06892906129360199, + "learning_rate": 7.323082076153509e-06, + "loss": 2.5441, + "step": 31903 + }, + { + "epoch": 0.9460605521454201, + "grad_norm": 0.07103396952152252, + "learning_rate": 7.315061024531111e-06, + "loss": 2.5228, + "step": 31904 + }, + { + "epoch": 0.9460902054977315, + "grad_norm": 0.06690704077482224, + "learning_rate": 7.307044335718438e-06, + "loss": 2.5466, + "step": 31905 + }, + { + "epoch": 0.946119858850043, + "grad_norm": 0.06754567474126816, + "learning_rate": 7.299032009786432e-06, + "loss": 2.5312, + "step": 31906 + }, + { + "epoch": 0.9461495122023544, + "grad_norm": 0.06685535609722137, + "learning_rate": 7.291024046806039e-06, + "loss": 2.5433, + "step": 31907 + }, + { + "epoch": 0.946179165554666, + "grad_norm": 0.06657250225543976, + "learning_rate": 7.283020446848254e-06, + "loss": 2.5453, + "step": 31908 + }, + { + "epoch": 0.9462088189069774, + "grad_norm": 0.06800395250320435, + "learning_rate": 7.275021209983857e-06, + "loss": 2.5963, + "step": 31909 + }, + { + "epoch": 0.9462384722592889, + "grad_norm": 0.06614258885383606, + "learning_rate": 7.267026336283788e-06, + "loss": 2.5343, + "step": 31910 + }, + { + "epoch": 0.9462681256116003, + "grad_norm": 0.06674990803003311, + "learning_rate": 7.259035825818716e-06, + "loss": 2.5737, + "step": 31911 + }, + { + "epoch": 0.9462977789639119, + "grad_norm": 0.06426975876092911, + "learning_rate": 7.251049678659472e-06, + "loss": 2.5636, + "step": 31912 + }, + { + "epoch": 0.9463274323162233, + "grad_norm": 0.06594013422727585, + "learning_rate": 7.243067894876776e-06, + "loss": 2.5038, + "step": 31913 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 0.06454664468765259, + "learning_rate": 7.235090474541295e-06, + "loss": 2.564, + "step": 31914 + }, + { + "epoch": 0.9463867390208464, + "grad_norm": 0.06569993495941162, + "learning_rate": 7.227117417723639e-06, + "loss": 2.5197, + "step": 31915 + }, + { + "epoch": 0.9464163923731578, + "grad_norm": 0.06618946045637131, + "learning_rate": 7.2191487244944735e-06, + "loss": 2.4554, + "step": 31916 + }, + { + "epoch": 0.9464460457254693, + "grad_norm": 0.06458709388971329, + "learning_rate": 7.211184394924297e-06, + "loss": 2.5176, + "step": 31917 + }, + { + "epoch": 0.9464756990777807, + "grad_norm": 0.06449282169342041, + "learning_rate": 7.203224429083721e-06, + "loss": 2.5316, + "step": 31918 + }, + { + "epoch": 0.9465053524300923, + "grad_norm": 0.0681324452161789, + "learning_rate": 7.1952688270431335e-06, + "loss": 2.5522, + "step": 31919 + }, + { + "epoch": 0.9465350057824037, + "grad_norm": 0.06886223703622818, + "learning_rate": 7.187317588873032e-06, + "loss": 2.563, + "step": 31920 + }, + { + "epoch": 0.9465646591347152, + "grad_norm": 0.06549406796693802, + "learning_rate": 7.1793707146438625e-06, + "loss": 2.5552, + "step": 31921 + }, + { + "epoch": 0.9465943124870266, + "grad_norm": 0.0690184086561203, + "learning_rate": 7.171428204425901e-06, + "loss": 2.5298, + "step": 31922 + }, + { + "epoch": 0.9466239658393382, + "grad_norm": 0.06506501883268356, + "learning_rate": 7.16349005828959e-06, + "loss": 2.5396, + "step": 31923 + }, + { + "epoch": 0.9466536191916496, + "grad_norm": 0.06790028512477875, + "learning_rate": 7.155556276305153e-06, + "loss": 2.5461, + "step": 31924 + }, + { + "epoch": 0.9466832725439611, + "grad_norm": 0.0662865862250328, + "learning_rate": 7.147626858542811e-06, + "loss": 2.5503, + "step": 31925 + }, + { + "epoch": 0.9467129258962725, + "grad_norm": 0.0710223913192749, + "learning_rate": 7.139701805072896e-06, + "loss": 2.5446, + "step": 31926 + }, + { + "epoch": 0.9467425792485841, + "grad_norm": 0.06808800250291824, + "learning_rate": 7.131781115965519e-06, + "loss": 2.5653, + "step": 31927 + }, + { + "epoch": 0.9467722326008955, + "grad_norm": 0.06894423067569733, + "learning_rate": 7.12386479129079e-06, + "loss": 2.531, + "step": 31928 + }, + { + "epoch": 0.946801885953207, + "grad_norm": 0.06755667924880981, + "learning_rate": 7.115952831118822e-06, + "loss": 2.5596, + "step": 31929 + }, + { + "epoch": 0.9468315393055184, + "grad_norm": 0.06626955419778824, + "learning_rate": 7.108045235519722e-06, + "loss": 2.5419, + "step": 31930 + }, + { + "epoch": 0.94686119265783, + "grad_norm": 0.06965704262256622, + "learning_rate": 7.100142004563492e-06, + "loss": 2.5175, + "step": 31931 + }, + { + "epoch": 0.9468908460101414, + "grad_norm": 0.07020043581724167, + "learning_rate": 7.092243138320131e-06, + "loss": 2.5815, + "step": 31932 + }, + { + "epoch": 0.9469204993624529, + "grad_norm": 0.06926001608371735, + "learning_rate": 7.0843486368595274e-06, + "loss": 2.5366, + "step": 31933 + }, + { + "epoch": 0.9469501527147645, + "grad_norm": 0.06820975244045258, + "learning_rate": 7.07645850025157e-06, + "loss": 2.5212, + "step": 31934 + }, + { + "epoch": 0.9469798060670759, + "grad_norm": 0.0675380602478981, + "learning_rate": 7.068572728566258e-06, + "loss": 2.5419, + "step": 31935 + }, + { + "epoch": 0.9470094594193874, + "grad_norm": 0.06788758188486099, + "learning_rate": 7.060691321873314e-06, + "loss": 2.5715, + "step": 31936 + }, + { + "epoch": 0.9470391127716988, + "grad_norm": 0.06801863014698029, + "learning_rate": 7.0528142802426256e-06, + "loss": 2.5137, + "step": 31937 + }, + { + "epoch": 0.9470687661240104, + "grad_norm": 0.06890568882226944, + "learning_rate": 7.044941603743804e-06, + "loss": 2.565, + "step": 31938 + }, + { + "epoch": 0.9470984194763218, + "grad_norm": 0.06709090620279312, + "learning_rate": 7.037073292446683e-06, + "loss": 2.5573, + "step": 31939 + }, + { + "epoch": 0.9471280728286333, + "grad_norm": 0.07038576900959015, + "learning_rate": 7.029209346420873e-06, + "loss": 2.5697, + "step": 31940 + }, + { + "epoch": 0.9471577261809447, + "grad_norm": 0.06477131694555283, + "learning_rate": 7.02134976573604e-06, + "loss": 2.5422, + "step": 31941 + }, + { + "epoch": 0.9471873795332563, + "grad_norm": 0.06722529232501984, + "learning_rate": 7.013494550461796e-06, + "loss": 2.5131, + "step": 31942 + }, + { + "epoch": 0.9472170328855677, + "grad_norm": 0.06976433098316193, + "learning_rate": 7.005643700667641e-06, + "loss": 2.5127, + "step": 31943 + }, + { + "epoch": 0.9472466862378792, + "grad_norm": 0.0656774714589119, + "learning_rate": 6.99779721642313e-06, + "loss": 2.5484, + "step": 31944 + }, + { + "epoch": 0.9472763395901906, + "grad_norm": 0.0673498883843422, + "learning_rate": 6.989955097797762e-06, + "loss": 2.5497, + "step": 31945 + }, + { + "epoch": 0.9473059929425022, + "grad_norm": 0.0658969134092331, + "learning_rate": 6.9821173448609834e-06, + "loss": 2.542, + "step": 31946 + }, + { + "epoch": 0.9473356462948136, + "grad_norm": 0.06905118376016617, + "learning_rate": 6.97428395768207e-06, + "loss": 2.5378, + "step": 31947 + }, + { + "epoch": 0.9473652996471251, + "grad_norm": 0.06727038323879242, + "learning_rate": 6.966454936330635e-06, + "loss": 2.5617, + "step": 31948 + }, + { + "epoch": 0.9473949529994365, + "grad_norm": 0.06321204453706741, + "learning_rate": 6.958630280875788e-06, + "loss": 2.5581, + "step": 31949 + }, + { + "epoch": 0.9474246063517481, + "grad_norm": 0.06527575850486755, + "learning_rate": 6.9508099913869195e-06, + "loss": 2.5412, + "step": 31950 + }, + { + "epoch": 0.9474542597040595, + "grad_norm": 0.06735765188932419, + "learning_rate": 6.942994067933306e-06, + "loss": 2.5639, + "step": 31951 + }, + { + "epoch": 0.947483913056371, + "grad_norm": 0.07061909884214401, + "learning_rate": 6.935182510584059e-06, + "loss": 2.5119, + "step": 31952 + }, + { + "epoch": 0.9475135664086825, + "grad_norm": 0.06582894176244736, + "learning_rate": 6.927375319408458e-06, + "loss": 2.5214, + "step": 31953 + }, + { + "epoch": 0.947543219760994, + "grad_norm": 0.06543569266796112, + "learning_rate": 6.919572494475557e-06, + "loss": 2.5399, + "step": 31954 + }, + { + "epoch": 0.9475728731133055, + "grad_norm": 0.06665422767400742, + "learning_rate": 6.911774035854468e-06, + "loss": 2.5193, + "step": 31955 + }, + { + "epoch": 0.9476025264656169, + "grad_norm": 0.06923314183950424, + "learning_rate": 6.903979943614302e-06, + "loss": 2.5025, + "step": 31956 + }, + { + "epoch": 0.9476321798179285, + "grad_norm": 0.06754622608423233, + "learning_rate": 6.89619021782395e-06, + "loss": 2.5479, + "step": 31957 + }, + { + "epoch": 0.9476618331702399, + "grad_norm": 0.06742644309997559, + "learning_rate": 6.888404858552522e-06, + "loss": 2.5265, + "step": 31958 + }, + { + "epoch": 0.9476914865225514, + "grad_norm": 0.06768602877855301, + "learning_rate": 6.880623865868907e-06, + "loss": 2.5387, + "step": 31959 + }, + { + "epoch": 0.9477211398748628, + "grad_norm": 0.06769279390573502, + "learning_rate": 6.872847239841995e-06, + "loss": 2.5048, + "step": 31960 + }, + { + "epoch": 0.9477507932271744, + "grad_norm": 0.06941349804401398, + "learning_rate": 6.8650749805406755e-06, + "loss": 2.5596, + "step": 31961 + }, + { + "epoch": 0.9477804465794858, + "grad_norm": 0.0711037665605545, + "learning_rate": 6.857307088033726e-06, + "loss": 2.5431, + "step": 31962 + }, + { + "epoch": 0.9478100999317973, + "grad_norm": 0.06623999774456024, + "learning_rate": 6.849543562390037e-06, + "loss": 2.5464, + "step": 31963 + }, + { + "epoch": 0.9478397532841087, + "grad_norm": 0.06444722414016724, + "learning_rate": 6.841784403678275e-06, + "loss": 2.5242, + "step": 31964 + }, + { + "epoch": 0.9478694066364203, + "grad_norm": 0.06504026800394058, + "learning_rate": 6.834029611967163e-06, + "loss": 2.5113, + "step": 31965 + }, + { + "epoch": 0.9478990599887317, + "grad_norm": 0.06556054204702377, + "learning_rate": 6.8262791873253125e-06, + "loss": 2.5412, + "step": 31966 + }, + { + "epoch": 0.9479287133410432, + "grad_norm": 0.062259893864393234, + "learning_rate": 6.818533129821503e-06, + "loss": 2.5374, + "step": 31967 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 0.06891227513551712, + "learning_rate": 6.810791439524178e-06, + "loss": 2.5627, + "step": 31968 + }, + { + "epoch": 0.9479880200456662, + "grad_norm": 0.06601134687662125, + "learning_rate": 6.80305411650195e-06, + "loss": 2.5385, + "step": 31969 + }, + { + "epoch": 0.9480176733979776, + "grad_norm": 0.0688304454088211, + "learning_rate": 6.79532116082332e-06, + "loss": 2.5522, + "step": 31970 + }, + { + "epoch": 0.9480473267502891, + "grad_norm": 0.06889818608760834, + "learning_rate": 6.787592572556789e-06, + "loss": 2.5394, + "step": 31971 + }, + { + "epoch": 0.9480769801026006, + "grad_norm": 0.06882395595312119, + "learning_rate": 6.779868351770746e-06, + "loss": 2.5475, + "step": 31972 + }, + { + "epoch": 0.9481066334549121, + "grad_norm": 0.06652622669935226, + "learning_rate": 6.772148498533692e-06, + "loss": 2.5578, + "step": 31973 + }, + { + "epoch": 0.9481362868072235, + "grad_norm": 0.06844393163919449, + "learning_rate": 6.764433012913962e-06, + "loss": 2.5309, + "step": 31974 + }, + { + "epoch": 0.948165940159535, + "grad_norm": 0.06671645492315292, + "learning_rate": 6.756721894979778e-06, + "loss": 2.5385, + "step": 31975 + }, + { + "epoch": 0.9481955935118466, + "grad_norm": 0.0694141611456871, + "learning_rate": 6.749015144799475e-06, + "loss": 2.5375, + "step": 31976 + }, + { + "epoch": 0.948225246864158, + "grad_norm": 0.07074514031410217, + "learning_rate": 6.741312762441332e-06, + "loss": 2.5662, + "step": 31977 + }, + { + "epoch": 0.9482549002164695, + "grad_norm": 0.06747876852750778, + "learning_rate": 6.733614747973571e-06, + "loss": 2.5227, + "step": 31978 + }, + { + "epoch": 0.9482845535687809, + "grad_norm": 0.06518881022930145, + "learning_rate": 6.72592110146425e-06, + "loss": 2.5386, + "step": 31979 + }, + { + "epoch": 0.9483142069210925, + "grad_norm": 0.06632694602012634, + "learning_rate": 6.718231822981591e-06, + "loss": 2.558, + "step": 31980 + }, + { + "epoch": 0.9483438602734039, + "grad_norm": 0.06459105014801025, + "learning_rate": 6.710546912593651e-06, + "loss": 2.5478, + "step": 31981 + }, + { + "epoch": 0.9483735136257154, + "grad_norm": 0.06445962935686111, + "learning_rate": 6.702866370368488e-06, + "loss": 2.5431, + "step": 31982 + }, + { + "epoch": 0.9484031669780268, + "grad_norm": 0.06727731972932816, + "learning_rate": 6.695190196374157e-06, + "loss": 2.5608, + "step": 31983 + }, + { + "epoch": 0.9484328203303384, + "grad_norm": 0.06833931803703308, + "learning_rate": 6.687518390678549e-06, + "loss": 2.5447, + "step": 31984 + }, + { + "epoch": 0.9484624736826498, + "grad_norm": 0.07145795226097107, + "learning_rate": 6.679850953349664e-06, + "loss": 2.5369, + "step": 31985 + }, + { + "epoch": 0.9484921270349613, + "grad_norm": 0.06637517362833023, + "learning_rate": 6.672187884455394e-06, + "loss": 2.5474, + "step": 31986 + }, + { + "epoch": 0.9485217803872728, + "grad_norm": 0.06577986478805542, + "learning_rate": 6.664529184063517e-06, + "loss": 2.5348, + "step": 31987 + }, + { + "epoch": 0.9485514337395843, + "grad_norm": 0.06722749769687653, + "learning_rate": 6.656874852241978e-06, + "loss": 2.5384, + "step": 31988 + }, + { + "epoch": 0.9485810870918957, + "grad_norm": 0.06541546434164047, + "learning_rate": 6.649224889058448e-06, + "loss": 2.5641, + "step": 31989 + }, + { + "epoch": 0.9486107404442072, + "grad_norm": 0.06417384743690491, + "learning_rate": 6.641579294580758e-06, + "loss": 2.5494, + "step": 31990 + }, + { + "epoch": 0.9486403937965187, + "grad_norm": 0.06767868995666504, + "learning_rate": 6.633938068876521e-06, + "loss": 2.5516, + "step": 31991 + }, + { + "epoch": 0.9486700471488302, + "grad_norm": 0.06533174961805344, + "learning_rate": 6.626301212013464e-06, + "loss": 2.5155, + "step": 31992 + }, + { + "epoch": 0.9486997005011416, + "grad_norm": 0.0760856419801712, + "learning_rate": 6.618668724059196e-06, + "loss": 2.5259, + "step": 31993 + }, + { + "epoch": 0.9487293538534531, + "grad_norm": 0.06398870795965195, + "learning_rate": 6.611040605081331e-06, + "loss": 2.5362, + "step": 31994 + }, + { + "epoch": 0.9487590072057646, + "grad_norm": 0.0680941790342331, + "learning_rate": 6.60341685514737e-06, + "loss": 2.5522, + "step": 31995 + }, + { + "epoch": 0.9487886605580761, + "grad_norm": 0.06599275022745132, + "learning_rate": 6.595797474324816e-06, + "loss": 2.5068, + "step": 31996 + }, + { + "epoch": 0.9488183139103876, + "grad_norm": 0.06850440800189972, + "learning_rate": 6.5881824626812245e-06, + "loss": 2.5562, + "step": 31997 + }, + { + "epoch": 0.948847967262699, + "grad_norm": 0.06625790894031525, + "learning_rate": 6.580571820283931e-06, + "loss": 2.527, + "step": 31998 + }, + { + "epoch": 0.9488776206150106, + "grad_norm": 0.06724673509597778, + "learning_rate": 6.572965547200383e-06, + "loss": 2.5656, + "step": 31999 + }, + { + "epoch": 0.948907273967322, + "grad_norm": 0.0676591694355011, + "learning_rate": 6.5653636434979124e-06, + "loss": 2.5047, + "step": 32000 + }, + { + "epoch": 0.9489369273196335, + "grad_norm": 0.06653999537229538, + "learning_rate": 6.557766109243801e-06, + "loss": 2.5188, + "step": 32001 + }, + { + "epoch": 0.948966580671945, + "grad_norm": 0.06545662134885788, + "learning_rate": 6.5501729445054395e-06, + "loss": 2.5142, + "step": 32002 + }, + { + "epoch": 0.9489962340242565, + "grad_norm": 0.06688223779201508, + "learning_rate": 6.542584149349995e-06, + "loss": 2.5384, + "step": 32003 + }, + { + "epoch": 0.9490258873765679, + "grad_norm": 0.0651736631989479, + "learning_rate": 6.534999723844637e-06, + "loss": 2.5251, + "step": 32004 + }, + { + "epoch": 0.9490555407288794, + "grad_norm": 0.06642723083496094, + "learning_rate": 6.527419668056534e-06, + "loss": 2.5449, + "step": 32005 + }, + { + "epoch": 0.9490851940811909, + "grad_norm": 0.06660476326942444, + "learning_rate": 6.5198439820528535e-06, + "loss": 2.5372, + "step": 32006 + }, + { + "epoch": 0.9491148474335024, + "grad_norm": 0.06685617566108704, + "learning_rate": 6.512272665900709e-06, + "loss": 2.5187, + "step": 32007 + }, + { + "epoch": 0.9491445007858138, + "grad_norm": 0.06978414952754974, + "learning_rate": 6.5047057196671035e-06, + "loss": 2.5143, + "step": 32008 + }, + { + "epoch": 0.9491741541381253, + "grad_norm": 0.06243472546339035, + "learning_rate": 6.497143143418982e-06, + "loss": 2.5297, + "step": 32009 + }, + { + "epoch": 0.9492038074904368, + "grad_norm": 0.0652628168463707, + "learning_rate": 6.489584937223347e-06, + "loss": 2.529, + "step": 32010 + }, + { + "epoch": 0.9492334608427483, + "grad_norm": 0.06854589283466339, + "learning_rate": 6.482031101147146e-06, + "loss": 2.5528, + "step": 32011 + }, + { + "epoch": 0.9492631141950597, + "grad_norm": 0.06521289050579071, + "learning_rate": 6.4744816352573235e-06, + "loss": 2.5433, + "step": 32012 + }, + { + "epoch": 0.9492927675473712, + "grad_norm": 0.06554025411605835, + "learning_rate": 6.4669365396206045e-06, + "loss": 2.5361, + "step": 32013 + }, + { + "epoch": 0.9493224208996827, + "grad_norm": 0.06780992448329926, + "learning_rate": 6.459395814303936e-06, + "loss": 2.5503, + "step": 32014 + }, + { + "epoch": 0.9493520742519942, + "grad_norm": 0.06600484997034073, + "learning_rate": 6.451859459374043e-06, + "loss": 2.5309, + "step": 32015 + }, + { + "epoch": 0.9493817276043056, + "grad_norm": 0.06404769420623779, + "learning_rate": 6.444327474897649e-06, + "loss": 2.543, + "step": 32016 + }, + { + "epoch": 0.9494113809566171, + "grad_norm": 0.06336823850870132, + "learning_rate": 6.436799860941423e-06, + "loss": 2.5434, + "step": 32017 + }, + { + "epoch": 0.9494410343089287, + "grad_norm": 0.0665382444858551, + "learning_rate": 6.429276617572088e-06, + "loss": 2.5523, + "step": 32018 + }, + { + "epoch": 0.9494706876612401, + "grad_norm": 0.0667283907532692, + "learning_rate": 6.421757744856205e-06, + "loss": 2.5436, + "step": 32019 + }, + { + "epoch": 0.9495003410135516, + "grad_norm": 0.06587208807468414, + "learning_rate": 6.414243242860385e-06, + "loss": 2.545, + "step": 32020 + }, + { + "epoch": 0.949529994365863, + "grad_norm": 0.06486727297306061, + "learning_rate": 6.406733111651187e-06, + "loss": 2.5534, + "step": 32021 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 0.06685209274291992, + "learning_rate": 6.399227351295056e-06, + "loss": 2.5371, + "step": 32022 + }, + { + "epoch": 0.949589301070486, + "grad_norm": 0.07041823863983154, + "learning_rate": 6.3917259618584965e-06, + "loss": 2.4894, + "step": 32023 + }, + { + "epoch": 0.9496189544227975, + "grad_norm": 0.06681525707244873, + "learning_rate": 6.384228943407899e-06, + "loss": 2.4734, + "step": 32024 + }, + { + "epoch": 0.949648607775109, + "grad_norm": 0.07520713657140732, + "learning_rate": 6.376736296009711e-06, + "loss": 2.5321, + "step": 32025 + }, + { + "epoch": 0.9496782611274205, + "grad_norm": 0.06992822885513306, + "learning_rate": 6.3692480197302675e-06, + "loss": 2.5568, + "step": 32026 + }, + { + "epoch": 0.9497079144797319, + "grad_norm": 0.06672010570764542, + "learning_rate": 6.361764114635849e-06, + "loss": 2.5168, + "step": 32027 + }, + { + "epoch": 0.9497375678320434, + "grad_norm": 0.0692058801651001, + "learning_rate": 6.3542845807927354e-06, + "loss": 2.5539, + "step": 32028 + }, + { + "epoch": 0.9497672211843549, + "grad_norm": 0.06863080710172653, + "learning_rate": 6.3468094182672076e-06, + "loss": 2.5359, + "step": 32029 + }, + { + "epoch": 0.9497968745366664, + "grad_norm": 0.06591363251209259, + "learning_rate": 6.3393386271253796e-06, + "loss": 2.5781, + "step": 32030 + }, + { + "epoch": 0.9498265278889778, + "grad_norm": 0.0685005709528923, + "learning_rate": 6.331872207433476e-06, + "loss": 2.5509, + "step": 32031 + }, + { + "epoch": 0.9498561812412893, + "grad_norm": 0.06931332498788834, + "learning_rate": 6.324410159257554e-06, + "loss": 2.5733, + "step": 32032 + }, + { + "epoch": 0.9498858345936008, + "grad_norm": 0.062455710023641586, + "learning_rate": 6.316952482663674e-06, + "loss": 2.4941, + "step": 32033 + }, + { + "epoch": 0.9499154879459123, + "grad_norm": 0.06857660412788391, + "learning_rate": 6.309499177718003e-06, + "loss": 2.5144, + "step": 32034 + }, + { + "epoch": 0.9499451412982237, + "grad_norm": 0.06683612614870071, + "learning_rate": 6.3020502444863796e-06, + "loss": 2.5393, + "step": 32035 + }, + { + "epoch": 0.9499747946505352, + "grad_norm": 0.0662626326084137, + "learning_rate": 6.294605683034915e-06, + "loss": 2.5591, + "step": 32036 + }, + { + "epoch": 0.9500044480028467, + "grad_norm": 0.06769027560949326, + "learning_rate": 6.287165493429392e-06, + "loss": 2.5318, + "step": 32037 + }, + { + "epoch": 0.9500341013551582, + "grad_norm": 0.06749431043863297, + "learning_rate": 6.279729675735813e-06, + "loss": 2.5392, + "step": 32038 + }, + { + "epoch": 0.9500637547074697, + "grad_norm": 0.06607232987880707, + "learning_rate": 6.272298230019957e-06, + "loss": 2.5242, + "step": 32039 + }, + { + "epoch": 0.9500934080597812, + "grad_norm": 0.06823623180389404, + "learning_rate": 6.264871156347662e-06, + "loss": 2.5297, + "step": 32040 + }, + { + "epoch": 0.9501230614120927, + "grad_norm": 0.0683874562382698, + "learning_rate": 6.2574484547846534e-06, + "loss": 2.5357, + "step": 32041 + }, + { + "epoch": 0.9501527147644041, + "grad_norm": 0.06502443552017212, + "learning_rate": 6.250030125396711e-06, + "loss": 2.5369, + "step": 32042 + }, + { + "epoch": 0.9501823681167156, + "grad_norm": 0.0679459497332573, + "learning_rate": 6.242616168249504e-06, + "loss": 2.5403, + "step": 32043 + }, + { + "epoch": 0.9502120214690271, + "grad_norm": 0.06312788277864456, + "learning_rate": 6.2352065834087034e-06, + "loss": 2.5322, + "step": 32044 + }, + { + "epoch": 0.9502416748213386, + "grad_norm": 0.06865878403186798, + "learning_rate": 6.227801370939867e-06, + "loss": 2.5666, + "step": 32045 + }, + { + "epoch": 0.95027132817365, + "grad_norm": 0.06519458442926407, + "learning_rate": 6.2204005309086095e-06, + "loss": 2.5413, + "step": 32046 + }, + { + "epoch": 0.9503009815259615, + "grad_norm": 0.0711425319314003, + "learning_rate": 6.213004063380434e-06, + "loss": 2.5136, + "step": 32047 + }, + { + "epoch": 0.950330634878273, + "grad_norm": 0.06543353199958801, + "learning_rate": 6.205611968420899e-06, + "loss": 2.5504, + "step": 32048 + }, + { + "epoch": 0.9503602882305845, + "grad_norm": 0.06508180499076843, + "learning_rate": 6.198224246095452e-06, + "loss": 2.5227, + "step": 32049 + }, + { + "epoch": 0.9503899415828959, + "grad_norm": 0.06584279984235764, + "learning_rate": 6.190840896469429e-06, + "loss": 2.5355, + "step": 32050 + }, + { + "epoch": 0.9504195949352074, + "grad_norm": 0.06497020274400711, + "learning_rate": 6.1834619196083355e-06, + "loss": 2.5295, + "step": 32051 + }, + { + "epoch": 0.9504492482875189, + "grad_norm": 0.06434569507837296, + "learning_rate": 6.176087315577394e-06, + "loss": 2.5505, + "step": 32052 + }, + { + "epoch": 0.9504789016398304, + "grad_norm": 0.06605096161365509, + "learning_rate": 6.168717084441999e-06, + "loss": 2.536, + "step": 32053 + }, + { + "epoch": 0.9505085549921418, + "grad_norm": 0.0685291588306427, + "learning_rate": 6.161351226267375e-06, + "loss": 2.5597, + "step": 32054 + }, + { + "epoch": 0.9505382083444534, + "grad_norm": 0.0679091289639473, + "learning_rate": 6.153989741118749e-06, + "loss": 2.5261, + "step": 32055 + }, + { + "epoch": 0.9505678616967648, + "grad_norm": 0.06676547974348068, + "learning_rate": 6.146632629061288e-06, + "loss": 2.5631, + "step": 32056 + }, + { + "epoch": 0.9505975150490763, + "grad_norm": 0.06633847951889038, + "learning_rate": 6.139279890160221e-06, + "loss": 2.5913, + "step": 32057 + }, + { + "epoch": 0.9506271684013877, + "grad_norm": 0.06628794968128204, + "learning_rate": 6.13193152448055e-06, + "loss": 2.5836, + "step": 32058 + }, + { + "epoch": 0.9506568217536993, + "grad_norm": 0.07193739712238312, + "learning_rate": 6.124587532087389e-06, + "loss": 2.5376, + "step": 32059 + }, + { + "epoch": 0.9506864751060108, + "grad_norm": 0.06700853258371353, + "learning_rate": 6.117247913045798e-06, + "loss": 2.5627, + "step": 32060 + }, + { + "epoch": 0.9507161284583222, + "grad_norm": 0.0689983144402504, + "learning_rate": 6.109912667420781e-06, + "loss": 2.5435, + "step": 32061 + }, + { + "epoch": 0.9507457818106337, + "grad_norm": 0.06643148511648178, + "learning_rate": 6.102581795277229e-06, + "loss": 2.5341, + "step": 32062 + }, + { + "epoch": 0.9507754351629452, + "grad_norm": 0.06608591228723526, + "learning_rate": 6.095255296680091e-06, + "loss": 2.5523, + "step": 32063 + }, + { + "epoch": 0.9508050885152567, + "grad_norm": 0.06948389112949371, + "learning_rate": 6.0879331716942595e-06, + "loss": 2.5533, + "step": 32064 + }, + { + "epoch": 0.9508347418675681, + "grad_norm": 0.06501644104719162, + "learning_rate": 6.080615420384517e-06, + "loss": 2.5412, + "step": 32065 + }, + { + "epoch": 0.9508643952198796, + "grad_norm": 0.06443460285663605, + "learning_rate": 6.073302042815754e-06, + "loss": 2.5444, + "step": 32066 + }, + { + "epoch": 0.9508940485721911, + "grad_norm": 0.06701544672250748, + "learning_rate": 6.065993039052642e-06, + "loss": 2.5182, + "step": 32067 + }, + { + "epoch": 0.9509237019245026, + "grad_norm": 0.06617366522550583, + "learning_rate": 6.058688409159963e-06, + "loss": 2.5302, + "step": 32068 + }, + { + "epoch": 0.950953355276814, + "grad_norm": 0.06755053251981735, + "learning_rate": 6.0513881532023866e-06, + "loss": 2.5624, + "step": 32069 + }, + { + "epoch": 0.9509830086291255, + "grad_norm": 0.06589730829000473, + "learning_rate": 6.044092271244583e-06, + "loss": 2.5826, + "step": 32070 + }, + { + "epoch": 0.951012661981437, + "grad_norm": 0.06677823513746262, + "learning_rate": 6.036800763351058e-06, + "loss": 2.5324, + "step": 32071 + }, + { + "epoch": 0.9510423153337485, + "grad_norm": 0.06640223413705826, + "learning_rate": 6.029513629586536e-06, + "loss": 2.5426, + "step": 32072 + }, + { + "epoch": 0.9510719686860599, + "grad_norm": 0.0683915913105011, + "learning_rate": 6.02223087001541e-06, + "loss": 2.5587, + "step": 32073 + }, + { + "epoch": 0.9511016220383715, + "grad_norm": 0.06882651150226593, + "learning_rate": 6.014952484702241e-06, + "loss": 2.5465, + "step": 32074 + }, + { + "epoch": 0.9511312753906829, + "grad_norm": 0.0661214143037796, + "learning_rate": 6.00767847371142e-06, + "loss": 2.5576, + "step": 32075 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 0.06943850964307785, + "learning_rate": 6.000408837107396e-06, + "loss": 2.5286, + "step": 32076 + }, + { + "epoch": 0.9511905820953058, + "grad_norm": 0.0658908411860466, + "learning_rate": 5.993143574954562e-06, + "loss": 2.5655, + "step": 32077 + }, + { + "epoch": 0.9512202354476174, + "grad_norm": 0.06461436301469803, + "learning_rate": 5.985882687317256e-06, + "loss": 2.4974, + "step": 32078 + }, + { + "epoch": 0.9512498887999288, + "grad_norm": 0.0659797191619873, + "learning_rate": 5.9786261742597605e-06, + "loss": 2.5477, + "step": 32079 + }, + { + "epoch": 0.9512795421522403, + "grad_norm": 0.06667875498533249, + "learning_rate": 5.9713740358462995e-06, + "loss": 2.4857, + "step": 32080 + }, + { + "epoch": 0.9513091955045518, + "grad_norm": 0.06717726588249207, + "learning_rate": 5.964126272141157e-06, + "loss": 2.5376, + "step": 32081 + }, + { + "epoch": 0.9513388488568633, + "grad_norm": 0.06871692091226578, + "learning_rate": 5.9568828832084475e-06, + "loss": 2.4997, + "step": 32082 + }, + { + "epoch": 0.9513685022091748, + "grad_norm": 0.06531774997711182, + "learning_rate": 5.949643869112342e-06, + "loss": 2.5146, + "step": 32083 + }, + { + "epoch": 0.9513981555614862, + "grad_norm": 0.06526858359575272, + "learning_rate": 5.942409229916956e-06, + "loss": 2.5381, + "step": 32084 + }, + { + "epoch": 0.9514278089137977, + "grad_norm": 0.06747940182685852, + "learning_rate": 5.93517896568635e-06, + "loss": 2.5195, + "step": 32085 + }, + { + "epoch": 0.9514574622661092, + "grad_norm": 0.06823557615280151, + "learning_rate": 5.927953076484527e-06, + "loss": 2.544, + "step": 32086 + }, + { + "epoch": 0.9514871156184207, + "grad_norm": 0.06484037637710571, + "learning_rate": 5.920731562375492e-06, + "loss": 2.5338, + "step": 32087 + }, + { + "epoch": 0.9515167689707321, + "grad_norm": 0.06675698608160019, + "learning_rate": 5.913514423423138e-06, + "loss": 2.5095, + "step": 32088 + }, + { + "epoch": 0.9515464223230437, + "grad_norm": 0.06709473580121994, + "learning_rate": 5.906301659691471e-06, + "loss": 2.5241, + "step": 32089 + }, + { + "epoch": 0.9515760756753551, + "grad_norm": 0.06611557304859161, + "learning_rate": 5.899093271244271e-06, + "loss": 2.5114, + "step": 32090 + }, + { + "epoch": 0.9516057290276666, + "grad_norm": 0.06531909853219986, + "learning_rate": 5.891889258145433e-06, + "loss": 2.5128, + "step": 32091 + }, + { + "epoch": 0.951635382379978, + "grad_norm": 0.066154845058918, + "learning_rate": 5.8846896204587384e-06, + "loss": 2.5424, + "step": 32092 + }, + { + "epoch": 0.9516650357322896, + "grad_norm": 0.06361556053161621, + "learning_rate": 5.877494358247915e-06, + "loss": 2.5396, + "step": 32093 + }, + { + "epoch": 0.951694689084601, + "grad_norm": 0.06573472172021866, + "learning_rate": 5.870303471576743e-06, + "loss": 2.5338, + "step": 32094 + }, + { + "epoch": 0.9517243424369125, + "grad_norm": 0.06683588027954102, + "learning_rate": 5.863116960508841e-06, + "loss": 2.5156, + "step": 32095 + }, + { + "epoch": 0.9517539957892239, + "grad_norm": 0.06907366216182709, + "learning_rate": 5.855934825107823e-06, + "loss": 2.5354, + "step": 32096 + }, + { + "epoch": 0.9517836491415355, + "grad_norm": 0.07101724296808243, + "learning_rate": 5.84875706543736e-06, + "loss": 2.5425, + "step": 32097 + }, + { + "epoch": 0.9518133024938469, + "grad_norm": 0.06822318583726883, + "learning_rate": 5.841583681560902e-06, + "loss": 2.5377, + "step": 32098 + }, + { + "epoch": 0.9518429558461584, + "grad_norm": 0.06673994660377502, + "learning_rate": 5.834414673542121e-06, + "loss": 2.5698, + "step": 32099 + }, + { + "epoch": 0.9518726091984698, + "grad_norm": 0.06652829051017761, + "learning_rate": 5.827250041444354e-06, + "loss": 2.5319, + "step": 32100 + }, + { + "epoch": 0.9519022625507814, + "grad_norm": 0.06643762439489365, + "learning_rate": 5.820089785331162e-06, + "loss": 2.5199, + "step": 32101 + }, + { + "epoch": 0.9519319159030929, + "grad_norm": 0.0747041180729866, + "learning_rate": 5.812933905265827e-06, + "loss": 2.5198, + "step": 32102 + }, + { + "epoch": 0.9519615692554043, + "grad_norm": 0.06862099468708038, + "learning_rate": 5.805782401311854e-06, + "loss": 2.5525, + "step": 32103 + }, + { + "epoch": 0.9519912226077158, + "grad_norm": 0.06821055710315704, + "learning_rate": 5.798635273532471e-06, + "loss": 2.5377, + "step": 32104 + }, + { + "epoch": 0.9520208759600273, + "grad_norm": 0.06994247436523438, + "learning_rate": 5.791492521991016e-06, + "loss": 2.5513, + "step": 32105 + }, + { + "epoch": 0.9520505293123388, + "grad_norm": 0.06350883096456528, + "learning_rate": 5.78435414675077e-06, + "loss": 2.5134, + "step": 32106 + }, + { + "epoch": 0.9520801826646502, + "grad_norm": 0.06963921338319778, + "learning_rate": 5.777220147874851e-06, + "loss": 2.5357, + "step": 32107 + }, + { + "epoch": 0.9521098360169618, + "grad_norm": 0.06531517207622528, + "learning_rate": 5.770090525426486e-06, + "loss": 2.5296, + "step": 32108 + }, + { + "epoch": 0.9521394893692732, + "grad_norm": 0.06852347403764725, + "learning_rate": 5.76296527946879e-06, + "loss": 2.5712, + "step": 32109 + }, + { + "epoch": 0.9521691427215847, + "grad_norm": 0.06641726940870285, + "learning_rate": 5.755844410064881e-06, + "loss": 2.508, + "step": 32110 + }, + { + "epoch": 0.9521987960738961, + "grad_norm": 0.06802774965763092, + "learning_rate": 5.748727917277818e-06, + "loss": 2.5214, + "step": 32111 + }, + { + "epoch": 0.9522284494262077, + "grad_norm": 0.06742309778928757, + "learning_rate": 5.741615801170608e-06, + "loss": 2.5242, + "step": 32112 + }, + { + "epoch": 0.9522581027785191, + "grad_norm": 0.06384263932704926, + "learning_rate": 5.7345080618061986e-06, + "loss": 2.5766, + "step": 32113 + }, + { + "epoch": 0.9522877561308306, + "grad_norm": 0.06681142002344131, + "learning_rate": 5.727404699247596e-06, + "loss": 2.5243, + "step": 32114 + }, + { + "epoch": 0.952317409483142, + "grad_norm": 0.06493714451789856, + "learning_rate": 5.720305713557639e-06, + "loss": 2.5304, + "step": 32115 + }, + { + "epoch": 0.9523470628354536, + "grad_norm": 0.06336437165737152, + "learning_rate": 5.713211104799221e-06, + "loss": 2.5358, + "step": 32116 + }, + { + "epoch": 0.952376716187765, + "grad_norm": 0.06440922617912292, + "learning_rate": 5.706120873035126e-06, + "loss": 2.5669, + "step": 32117 + }, + { + "epoch": 0.9524063695400765, + "grad_norm": 0.06571047753095627, + "learning_rate": 5.699035018328247e-06, + "loss": 2.5193, + "step": 32118 + }, + { + "epoch": 0.9524360228923879, + "grad_norm": 0.06770751625299454, + "learning_rate": 5.691953540741202e-06, + "loss": 2.5045, + "step": 32119 + }, + { + "epoch": 0.9524656762446995, + "grad_norm": 0.06608550250530243, + "learning_rate": 5.684876440336772e-06, + "loss": 2.5367, + "step": 32120 + }, + { + "epoch": 0.9524953295970109, + "grad_norm": 0.0684109702706337, + "learning_rate": 5.677803717177632e-06, + "loss": 2.5518, + "step": 32121 + }, + { + "epoch": 0.9525249829493224, + "grad_norm": 0.0656551942229271, + "learning_rate": 5.670735371326397e-06, + "loss": 2.5135, + "step": 32122 + }, + { + "epoch": 0.952554636301634, + "grad_norm": 0.06468766182661057, + "learning_rate": 5.663671402845627e-06, + "loss": 2.4859, + "step": 32123 + }, + { + "epoch": 0.9525842896539454, + "grad_norm": 0.06710977852344513, + "learning_rate": 5.656611811797885e-06, + "loss": 2.5548, + "step": 32124 + }, + { + "epoch": 0.9526139430062569, + "grad_norm": 0.06686509400606155, + "learning_rate": 5.649556598245731e-06, + "loss": 2.5523, + "step": 32125 + }, + { + "epoch": 0.9526435963585683, + "grad_norm": 0.06733260303735733, + "learning_rate": 5.64250576225156e-06, + "loss": 2.5482, + "step": 32126 + }, + { + "epoch": 0.9526732497108799, + "grad_norm": 0.06949810683727264, + "learning_rate": 5.635459303877877e-06, + "loss": 2.5738, + "step": 32127 + }, + { + "epoch": 0.9527029030631913, + "grad_norm": 0.0680556371808052, + "learning_rate": 5.628417223187077e-06, + "loss": 2.5317, + "step": 32128 + }, + { + "epoch": 0.9527325564155028, + "grad_norm": 0.06815698742866516, + "learning_rate": 5.621379520241499e-06, + "loss": 2.5912, + "step": 32129 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 0.06822899729013443, + "learning_rate": 5.614346195103482e-06, + "loss": 2.5756, + "step": 32130 + }, + { + "epoch": 0.9527918631201258, + "grad_norm": 0.06720975041389465, + "learning_rate": 5.607317247835253e-06, + "loss": 2.5231, + "step": 32131 + }, + { + "epoch": 0.9528215164724372, + "grad_norm": 0.0654999241232872, + "learning_rate": 5.600292678499097e-06, + "loss": 2.528, + "step": 32132 + }, + { + "epoch": 0.9528511698247487, + "grad_norm": 0.06564410775899887, + "learning_rate": 5.593272487157186e-06, + "loss": 2.5153, + "step": 32133 + }, + { + "epoch": 0.9528808231770601, + "grad_norm": 0.06605276465415955, + "learning_rate": 5.586256673871748e-06, + "loss": 2.5196, + "step": 32134 + }, + { + "epoch": 0.9529104765293717, + "grad_norm": 0.06804930418729782, + "learning_rate": 5.5792452387049e-06, + "loss": 2.5353, + "step": 32135 + }, + { + "epoch": 0.9529401298816831, + "grad_norm": 0.06699980050325394, + "learning_rate": 5.572238181718647e-06, + "loss": 2.545, + "step": 32136 + }, + { + "epoch": 0.9529697832339946, + "grad_norm": 0.06683623790740967, + "learning_rate": 5.565235502975108e-06, + "loss": 2.5121, + "step": 32137 + }, + { + "epoch": 0.952999436586306, + "grad_norm": 0.06553396582603455, + "learning_rate": 5.558237202536287e-06, + "loss": 2.499, + "step": 32138 + }, + { + "epoch": 0.9530290899386176, + "grad_norm": 0.06610818952322006, + "learning_rate": 5.551243280464191e-06, + "loss": 2.5385, + "step": 32139 + }, + { + "epoch": 0.953058743290929, + "grad_norm": 0.06468375027179718, + "learning_rate": 5.544253736820659e-06, + "loss": 2.5223, + "step": 32140 + }, + { + "epoch": 0.9530883966432405, + "grad_norm": 0.0665413960814476, + "learning_rate": 5.537268571667586e-06, + "loss": 2.5127, + "step": 32141 + }, + { + "epoch": 0.953118049995552, + "grad_norm": 0.06821367889642715, + "learning_rate": 5.5302877850669235e-06, + "loss": 2.5461, + "step": 32142 + }, + { + "epoch": 0.9531477033478635, + "grad_norm": 0.06254573911428452, + "learning_rate": 5.523311377080398e-06, + "loss": 2.5009, + "step": 32143 + }, + { + "epoch": 0.953177356700175, + "grad_norm": 0.0681966096162796, + "learning_rate": 5.51633934776985e-06, + "loss": 2.5517, + "step": 32144 + }, + { + "epoch": 0.9532070100524864, + "grad_norm": 0.06517521291971207, + "learning_rate": 5.5093716971970074e-06, + "loss": 2.5131, + "step": 32145 + }, + { + "epoch": 0.953236663404798, + "grad_norm": 0.06565985828638077, + "learning_rate": 5.502408425423544e-06, + "loss": 2.5315, + "step": 32146 + }, + { + "epoch": 0.9532663167571094, + "grad_norm": 0.06819969415664673, + "learning_rate": 5.495449532511187e-06, + "loss": 2.5164, + "step": 32147 + }, + { + "epoch": 0.9532959701094209, + "grad_norm": 0.06744913756847382, + "learning_rate": 5.488495018521444e-06, + "loss": 2.5236, + "step": 32148 + }, + { + "epoch": 0.9533256234617323, + "grad_norm": 0.06389105319976807, + "learning_rate": 5.481544883515987e-06, + "loss": 2.5847, + "step": 32149 + }, + { + "epoch": 0.9533552768140439, + "grad_norm": 0.06861405819654465, + "learning_rate": 5.474599127556324e-06, + "loss": 2.5445, + "step": 32150 + }, + { + "epoch": 0.9533849301663553, + "grad_norm": 0.06883050501346588, + "learning_rate": 5.4676577507039585e-06, + "loss": 2.5251, + "step": 32151 + }, + { + "epoch": 0.9534145835186668, + "grad_norm": 0.06512640416622162, + "learning_rate": 5.4607207530203985e-06, + "loss": 2.5234, + "step": 32152 + }, + { + "epoch": 0.9534442368709782, + "grad_norm": 0.06562556326389313, + "learning_rate": 5.453788134567039e-06, + "loss": 2.4954, + "step": 32153 + }, + { + "epoch": 0.9534738902232898, + "grad_norm": 0.06808437407016754, + "learning_rate": 5.446859895405221e-06, + "loss": 2.5454, + "step": 32154 + }, + { + "epoch": 0.9535035435756012, + "grad_norm": 0.06526318192481995, + "learning_rate": 5.439936035596338e-06, + "loss": 2.5817, + "step": 32155 + }, + { + "epoch": 0.9535331969279127, + "grad_norm": 0.06543814390897751, + "learning_rate": 5.4330165552017865e-06, + "loss": 2.5118, + "step": 32156 + }, + { + "epoch": 0.9535628502802241, + "grad_norm": 0.06468557566404343, + "learning_rate": 5.426101454282739e-06, + "loss": 2.5852, + "step": 32157 + }, + { + "epoch": 0.9535925036325357, + "grad_norm": 0.06498657166957855, + "learning_rate": 5.419190732900425e-06, + "loss": 2.5466, + "step": 32158 + }, + { + "epoch": 0.9536221569848471, + "grad_norm": 0.0682656541466713, + "learning_rate": 5.412284391116129e-06, + "loss": 2.558, + "step": 32159 + }, + { + "epoch": 0.9536518103371586, + "grad_norm": 0.06299419701099396, + "learning_rate": 5.405382428990913e-06, + "loss": 2.5288, + "step": 32160 + }, + { + "epoch": 0.95368146368947, + "grad_norm": 0.06663999706506729, + "learning_rate": 5.398484846585949e-06, + "loss": 2.5241, + "step": 32161 + }, + { + "epoch": 0.9537111170417816, + "grad_norm": 0.06649467349052429, + "learning_rate": 5.391591643962301e-06, + "loss": 2.5634, + "step": 32162 + }, + { + "epoch": 0.9537407703940931, + "grad_norm": 0.06563524156808853, + "learning_rate": 5.384702821180976e-06, + "loss": 2.5472, + "step": 32163 + }, + { + "epoch": 0.9537704237464045, + "grad_norm": 0.0629756897687912, + "learning_rate": 5.3778183783030345e-06, + "loss": 2.4985, + "step": 32164 + }, + { + "epoch": 0.9538000770987161, + "grad_norm": 0.06649000197649002, + "learning_rate": 5.370938315389373e-06, + "loss": 2.4958, + "step": 32165 + }, + { + "epoch": 0.9538297304510275, + "grad_norm": 0.0664638802409172, + "learning_rate": 5.364062632500944e-06, + "loss": 2.5307, + "step": 32166 + }, + { + "epoch": 0.953859383803339, + "grad_norm": 0.06505363434553146, + "learning_rate": 5.357191329698697e-06, + "loss": 2.5248, + "step": 32167 + }, + { + "epoch": 0.9538890371556504, + "grad_norm": 0.06539380550384521, + "learning_rate": 5.350324407043417e-06, + "loss": 2.5451, + "step": 32168 + }, + { + "epoch": 0.953918690507962, + "grad_norm": 0.06578373908996582, + "learning_rate": 5.343461864595889e-06, + "loss": 2.5388, + "step": 32169 + }, + { + "epoch": 0.9539483438602734, + "grad_norm": 0.0683429017663002, + "learning_rate": 5.336603702417009e-06, + "loss": 2.5424, + "step": 32170 + }, + { + "epoch": 0.9539779972125849, + "grad_norm": 0.06468649208545685, + "learning_rate": 5.329749920567339e-06, + "loss": 2.5265, + "step": 32171 + }, + { + "epoch": 0.9540076505648963, + "grad_norm": 0.06771666556596756, + "learning_rate": 5.32290051910761e-06, + "loss": 2.5371, + "step": 32172 + }, + { + "epoch": 0.9540373039172079, + "grad_norm": 0.06588641554117203, + "learning_rate": 5.316055498098549e-06, + "loss": 2.4896, + "step": 32173 + }, + { + "epoch": 0.9540669572695193, + "grad_norm": 0.06674139946699142, + "learning_rate": 5.309214857600719e-06, + "loss": 2.5204, + "step": 32174 + }, + { + "epoch": 0.9540966106218308, + "grad_norm": 0.06659495085477829, + "learning_rate": 5.302378597674684e-06, + "loss": 2.5219, + "step": 32175 + }, + { + "epoch": 0.9541262639741422, + "grad_norm": 0.06440412998199463, + "learning_rate": 5.295546718381061e-06, + "loss": 2.5464, + "step": 32176 + }, + { + "epoch": 0.9541559173264538, + "grad_norm": 0.06457650661468506, + "learning_rate": 5.288719219780247e-06, + "loss": 2.5635, + "step": 32177 + }, + { + "epoch": 0.9541855706787652, + "grad_norm": 0.06550365686416626, + "learning_rate": 5.281896101932693e-06, + "loss": 2.5593, + "step": 32178 + }, + { + "epoch": 0.9542152240310767, + "grad_norm": 0.06713941693305969, + "learning_rate": 5.275077364898906e-06, + "loss": 2.5253, + "step": 32179 + }, + { + "epoch": 0.9542448773833881, + "grad_norm": 0.06657905876636505, + "learning_rate": 5.268263008739227e-06, + "loss": 2.5267, + "step": 32180 + }, + { + "epoch": 0.9542745307356997, + "grad_norm": 0.06725993007421494, + "learning_rate": 5.261453033514052e-06, + "loss": 2.5601, + "step": 32181 + }, + { + "epoch": 0.9543041840880111, + "grad_norm": 0.06704019755125046, + "learning_rate": 5.254647439283556e-06, + "loss": 2.5457, + "step": 32182 + }, + { + "epoch": 0.9543338374403226, + "grad_norm": 0.06611408293247223, + "learning_rate": 5.247846226108133e-06, + "loss": 2.5491, + "step": 32183 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 0.0633096694946289, + "learning_rate": 5.241049394047903e-06, + "loss": 2.5272, + "step": 32184 + }, + { + "epoch": 0.9543931441449456, + "grad_norm": 0.06315792351961136, + "learning_rate": 5.2342569431631515e-06, + "loss": 2.5379, + "step": 32185 + }, + { + "epoch": 0.9544227974972571, + "grad_norm": 0.06337106227874756, + "learning_rate": 5.227468873513941e-06, + "loss": 2.5542, + "step": 32186 + }, + { + "epoch": 0.9544524508495685, + "grad_norm": 0.0651412233710289, + "learning_rate": 5.220685185160446e-06, + "loss": 2.5355, + "step": 32187 + }, + { + "epoch": 0.9544821042018801, + "grad_norm": 0.06604085862636566, + "learning_rate": 5.213905878162728e-06, + "loss": 2.5042, + "step": 32188 + }, + { + "epoch": 0.9545117575541915, + "grad_norm": 0.06552884727716446, + "learning_rate": 5.207130952580741e-06, + "loss": 2.5488, + "step": 32189 + }, + { + "epoch": 0.954541410906503, + "grad_norm": 0.06730469316244125, + "learning_rate": 5.2003604084746025e-06, + "loss": 2.5305, + "step": 32190 + }, + { + "epoch": 0.9545710642588144, + "grad_norm": 0.06453204900026321, + "learning_rate": 5.193594245904154e-06, + "loss": 2.5444, + "step": 32191 + }, + { + "epoch": 0.954600717611126, + "grad_norm": 0.06585942953824997, + "learning_rate": 5.186832464929403e-06, + "loss": 2.5303, + "step": 32192 + }, + { + "epoch": 0.9546303709634374, + "grad_norm": 0.06854752451181412, + "learning_rate": 5.180075065610135e-06, + "loss": 2.5437, + "step": 32193 + }, + { + "epoch": 0.9546600243157489, + "grad_norm": 0.06733502447605133, + "learning_rate": 5.1733220480063015e-06, + "loss": 2.5432, + "step": 32194 + }, + { + "epoch": 0.9546896776680603, + "grad_norm": 0.06894142925739288, + "learning_rate": 5.166573412177577e-06, + "loss": 2.5644, + "step": 32195 + }, + { + "epoch": 0.9547193310203719, + "grad_norm": 0.06796851754188538, + "learning_rate": 5.159829158183804e-06, + "loss": 2.5072, + "step": 32196 + }, + { + "epoch": 0.9547489843726833, + "grad_norm": 0.06363086402416229, + "learning_rate": 5.153089286084711e-06, + "loss": 2.5221, + "step": 32197 + }, + { + "epoch": 0.9547786377249948, + "grad_norm": 0.06635437905788422, + "learning_rate": 5.1463537959399175e-06, + "loss": 2.56, + "step": 32198 + }, + { + "epoch": 0.9548082910773062, + "grad_norm": 0.06608734279870987, + "learning_rate": 5.139622687809098e-06, + "loss": 2.5212, + "step": 32199 + }, + { + "epoch": 0.9548379444296178, + "grad_norm": 0.06601852923631668, + "learning_rate": 5.132895961751871e-06, + "loss": 2.5253, + "step": 32200 + }, + { + "epoch": 0.9548675977819292, + "grad_norm": 0.06484343856573105, + "learning_rate": 5.126173617827801e-06, + "loss": 2.5171, + "step": 32201 + }, + { + "epoch": 0.9548972511342407, + "grad_norm": 0.07027506828308105, + "learning_rate": 5.119455656096395e-06, + "loss": 2.5574, + "step": 32202 + }, + { + "epoch": 0.9549269044865522, + "grad_norm": 0.06605445593595505, + "learning_rate": 5.112742076617216e-06, + "loss": 2.5559, + "step": 32203 + }, + { + "epoch": 0.9549565578388637, + "grad_norm": 0.062309976667165756, + "learning_rate": 5.106032879449551e-06, + "loss": 2.5094, + "step": 32204 + }, + { + "epoch": 0.9549862111911752, + "grad_norm": 0.066920705139637, + "learning_rate": 5.099328064652964e-06, + "loss": 2.5459, + "step": 32205 + }, + { + "epoch": 0.9550158645434866, + "grad_norm": 0.06811513751745224, + "learning_rate": 5.092627632286795e-06, + "loss": 2.5154, + "step": 32206 + }, + { + "epoch": 0.9550455178957982, + "grad_norm": 0.06458775699138641, + "learning_rate": 5.08593158241033e-06, + "loss": 2.5428, + "step": 32207 + }, + { + "epoch": 0.9550751712481096, + "grad_norm": 0.061987537890672684, + "learning_rate": 5.0792399150829115e-06, + "loss": 2.5442, + "step": 32208 + }, + { + "epoch": 0.9551048246004211, + "grad_norm": 0.06594956666231155, + "learning_rate": 5.072552630363769e-06, + "loss": 2.5356, + "step": 32209 + }, + { + "epoch": 0.9551344779527325, + "grad_norm": 0.06722607463598251, + "learning_rate": 5.065869728312078e-06, + "loss": 2.5483, + "step": 32210 + }, + { + "epoch": 0.9551641313050441, + "grad_norm": 0.06598801910877228, + "learning_rate": 5.059191208987124e-06, + "loss": 2.5248, + "step": 32211 + }, + { + "epoch": 0.9551937846573555, + "grad_norm": 0.06870207190513611, + "learning_rate": 5.052517072447971e-06, + "loss": 2.5757, + "step": 32212 + }, + { + "epoch": 0.955223438009667, + "grad_norm": 0.06820274889469147, + "learning_rate": 5.045847318753738e-06, + "loss": 2.5465, + "step": 32213 + }, + { + "epoch": 0.9552530913619784, + "grad_norm": 0.06528688967227936, + "learning_rate": 5.03918194796349e-06, + "loss": 2.537, + "step": 32214 + }, + { + "epoch": 0.95528274471429, + "grad_norm": 0.06798725575208664, + "learning_rate": 5.032520960136233e-06, + "loss": 2.5329, + "step": 32215 + }, + { + "epoch": 0.9553123980666014, + "grad_norm": 0.06166369467973709, + "learning_rate": 5.025864355330978e-06, + "loss": 2.5147, + "step": 32216 + }, + { + "epoch": 0.9553420514189129, + "grad_norm": 0.06738865375518799, + "learning_rate": 5.019212133606621e-06, + "loss": 2.5439, + "step": 32217 + }, + { + "epoch": 0.9553717047712244, + "grad_norm": 0.06649528443813324, + "learning_rate": 5.012564295022115e-06, + "loss": 2.5572, + "step": 32218 + }, + { + "epoch": 0.9554013581235359, + "grad_norm": 0.0647856667637825, + "learning_rate": 5.005920839636302e-06, + "loss": 2.5482, + "step": 32219 + }, + { + "epoch": 0.9554310114758473, + "grad_norm": 0.06495585292577744, + "learning_rate": 4.999281767508079e-06, + "loss": 2.53, + "step": 32220 + }, + { + "epoch": 0.9554606648281588, + "grad_norm": 0.06314598768949509, + "learning_rate": 4.992647078696122e-06, + "loss": 2.5375, + "step": 32221 + }, + { + "epoch": 0.9554903181804703, + "grad_norm": 0.06634258478879929, + "learning_rate": 4.986016773259272e-06, + "loss": 2.554, + "step": 32222 + }, + { + "epoch": 0.9555199715327818, + "grad_norm": 0.06284081190824509, + "learning_rate": 4.979390851256205e-06, + "loss": 2.5632, + "step": 32223 + }, + { + "epoch": 0.9555496248850932, + "grad_norm": 0.06531115621328354, + "learning_rate": 4.9727693127456504e-06, + "loss": 2.5651, + "step": 32224 + }, + { + "epoch": 0.9555792782374047, + "grad_norm": 0.06509844213724136, + "learning_rate": 4.9661521577861744e-06, + "loss": 2.5531, + "step": 32225 + }, + { + "epoch": 0.9556089315897163, + "grad_norm": 0.06676653772592545, + "learning_rate": 4.959539386436341e-06, + "loss": 2.5331, + "step": 32226 + }, + { + "epoch": 0.9556385849420277, + "grad_norm": 0.06601954251527786, + "learning_rate": 4.952930998754768e-06, + "loss": 2.5589, + "step": 32227 + }, + { + "epoch": 0.9556682382943392, + "grad_norm": 0.06581224501132965, + "learning_rate": 4.946326994800021e-06, + "loss": 2.5704, + "step": 32228 + }, + { + "epoch": 0.9556978916466506, + "grad_norm": 0.07029484957456589, + "learning_rate": 4.939727374630443e-06, + "loss": 2.5442, + "step": 32229 + }, + { + "epoch": 0.9557275449989622, + "grad_norm": 0.07099553197622299, + "learning_rate": 4.933132138304597e-06, + "loss": 2.5168, + "step": 32230 + }, + { + "epoch": 0.9557571983512736, + "grad_norm": 0.06529000401496887, + "learning_rate": 4.926541285880825e-06, + "loss": 2.5253, + "step": 32231 + }, + { + "epoch": 0.9557868517035851, + "grad_norm": 0.0660625547170639, + "learning_rate": 4.9199548174175265e-06, + "loss": 2.5565, + "step": 32232 + }, + { + "epoch": 0.9558165050558965, + "grad_norm": 0.06357857584953308, + "learning_rate": 4.913372732972987e-06, + "loss": 2.5309, + "step": 32233 + }, + { + "epoch": 0.9558461584082081, + "grad_norm": 0.06679175049066544, + "learning_rate": 4.906795032605549e-06, + "loss": 2.556, + "step": 32234 + }, + { + "epoch": 0.9558758117605195, + "grad_norm": 0.06519518792629242, + "learning_rate": 4.900221716373388e-06, + "loss": 2.4983, + "step": 32235 + }, + { + "epoch": 0.955905465112831, + "grad_norm": 0.06412205845117569, + "learning_rate": 4.893652784334846e-06, + "loss": 2.5394, + "step": 32236 + }, + { + "epoch": 0.9559351184651425, + "grad_norm": 0.06600727140903473, + "learning_rate": 4.8870882365478764e-06, + "loss": 2.5265, + "step": 32237 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 0.06681997328996658, + "learning_rate": 4.880528073070767e-06, + "loss": 2.5569, + "step": 32238 + }, + { + "epoch": 0.9559944251697654, + "grad_norm": 0.06350047886371613, + "learning_rate": 4.873972293961582e-06, + "loss": 2.547, + "step": 32239 + }, + { + "epoch": 0.9560240785220769, + "grad_norm": 0.06325475126504898, + "learning_rate": 4.86742089927833e-06, + "loss": 2.5549, + "step": 32240 + }, + { + "epoch": 0.9560537318743884, + "grad_norm": 0.06356595456600189, + "learning_rate": 4.860873889079076e-06, + "loss": 2.5553, + "step": 32241 + }, + { + "epoch": 0.9560833852266999, + "grad_norm": 0.06541120260953903, + "learning_rate": 4.854331263421718e-06, + "loss": 2.5193, + "step": 32242 + }, + { + "epoch": 0.9561130385790113, + "grad_norm": 0.0695115476846695, + "learning_rate": 4.8477930223643214e-06, + "loss": 2.5767, + "step": 32243 + }, + { + "epoch": 0.9561426919313228, + "grad_norm": 0.06681887060403824, + "learning_rate": 4.841259165964618e-06, + "loss": 2.5604, + "step": 32244 + }, + { + "epoch": 0.9561723452836343, + "grad_norm": 0.06571616232395172, + "learning_rate": 4.834729694280615e-06, + "loss": 2.5382, + "step": 32245 + }, + { + "epoch": 0.9562019986359458, + "grad_norm": 0.06493344157934189, + "learning_rate": 4.828204607370101e-06, + "loss": 2.5591, + "step": 32246 + }, + { + "epoch": 0.9562316519882573, + "grad_norm": 0.06423606723546982, + "learning_rate": 4.821683905290808e-06, + "loss": 2.5378, + "step": 32247 + }, + { + "epoch": 0.9562613053405687, + "grad_norm": 0.06294326484203339, + "learning_rate": 4.815167588100522e-06, + "loss": 2.5345, + "step": 32248 + }, + { + "epoch": 0.9562909586928803, + "grad_norm": 0.06980212032794952, + "learning_rate": 4.808655655856864e-06, + "loss": 2.5404, + "step": 32249 + }, + { + "epoch": 0.9563206120451917, + "grad_norm": 0.06688446551561356, + "learning_rate": 4.8021481086176214e-06, + "loss": 2.5529, + "step": 32250 + }, + { + "epoch": 0.9563502653975032, + "grad_norm": 0.06420490890741348, + "learning_rate": 4.795644946440303e-06, + "loss": 2.5458, + "step": 32251 + }, + { + "epoch": 0.9563799187498147, + "grad_norm": 0.06751663982868195, + "learning_rate": 4.789146169382586e-06, + "loss": 2.4869, + "step": 32252 + }, + { + "epoch": 0.9564095721021262, + "grad_norm": 0.0678834617137909, + "learning_rate": 4.782651777501979e-06, + "loss": 2.5783, + "step": 32253 + }, + { + "epoch": 0.9564392254544376, + "grad_norm": 0.06846710294485092, + "learning_rate": 4.776161770855991e-06, + "loss": 2.5569, + "step": 32254 + }, + { + "epoch": 0.9564688788067491, + "grad_norm": 0.06945926696062088, + "learning_rate": 4.769676149502078e-06, + "loss": 2.5487, + "step": 32255 + }, + { + "epoch": 0.9564985321590606, + "grad_norm": 0.06538527458906174, + "learning_rate": 4.763194913497693e-06, + "loss": 2.5441, + "step": 32256 + }, + { + "epoch": 0.9565281855113721, + "grad_norm": 0.0656287893652916, + "learning_rate": 4.756718062900234e-06, + "loss": 2.5401, + "step": 32257 + }, + { + "epoch": 0.9565578388636835, + "grad_norm": 0.06614791601896286, + "learning_rate": 4.750245597767044e-06, + "loss": 2.529, + "step": 32258 + }, + { + "epoch": 0.956587492215995, + "grad_norm": 0.06393659114837646, + "learning_rate": 4.743777518155468e-06, + "loss": 2.5293, + "step": 32259 + }, + { + "epoch": 0.9566171455683065, + "grad_norm": 0.06376799941062927, + "learning_rate": 4.73731382412268e-06, + "loss": 2.5676, + "step": 32260 + }, + { + "epoch": 0.956646798920618, + "grad_norm": 0.06834016740322113, + "learning_rate": 4.730854515726024e-06, + "loss": 2.549, + "step": 32261 + }, + { + "epoch": 0.9566764522729294, + "grad_norm": 0.06245956942439079, + "learning_rate": 4.7243995930226765e-06, + "loss": 2.5234, + "step": 32262 + }, + { + "epoch": 0.9567061056252409, + "grad_norm": 0.0658082515001297, + "learning_rate": 4.717949056069759e-06, + "loss": 2.5605, + "step": 32263 + }, + { + "epoch": 0.9567357589775524, + "grad_norm": 0.06683991849422455, + "learning_rate": 4.711502904924448e-06, + "loss": 2.5462, + "step": 32264 + }, + { + "epoch": 0.9567654123298639, + "grad_norm": 0.06668619811534882, + "learning_rate": 4.705061139643807e-06, + "loss": 2.5278, + "step": 32265 + }, + { + "epoch": 0.9567950656821753, + "grad_norm": 0.06616088002920151, + "learning_rate": 4.698623760284793e-06, + "loss": 2.4967, + "step": 32266 + }, + { + "epoch": 0.9568247190344868, + "grad_norm": 0.06431955844163895, + "learning_rate": 4.692190766904525e-06, + "loss": 2.5545, + "step": 32267 + }, + { + "epoch": 0.9568543723867984, + "grad_norm": 0.06675106287002563, + "learning_rate": 4.685762159559959e-06, + "loss": 2.5176, + "step": 32268 + }, + { + "epoch": 0.9568840257391098, + "grad_norm": 0.0648069977760315, + "learning_rate": 4.679337938307937e-06, + "loss": 2.5296, + "step": 32269 + }, + { + "epoch": 0.9569136790914213, + "grad_norm": 0.0663604587316513, + "learning_rate": 4.672918103205415e-06, + "loss": 2.5348, + "step": 32270 + }, + { + "epoch": 0.9569433324437328, + "grad_norm": 0.0676090344786644, + "learning_rate": 4.666502654309235e-06, + "loss": 2.5646, + "step": 32271 + }, + { + "epoch": 0.9569729857960443, + "grad_norm": 0.06595481932163239, + "learning_rate": 4.660091591676186e-06, + "loss": 2.5433, + "step": 32272 + }, + { + "epoch": 0.9570026391483557, + "grad_norm": 0.0658254325389862, + "learning_rate": 4.653684915363055e-06, + "loss": 2.5688, + "step": 32273 + }, + { + "epoch": 0.9570322925006672, + "grad_norm": 0.07176610827445984, + "learning_rate": 4.647282625426575e-06, + "loss": 2.5217, + "step": 32274 + }, + { + "epoch": 0.9570619458529787, + "grad_norm": 0.06645657122135162, + "learning_rate": 4.640884721923422e-06, + "loss": 2.5271, + "step": 32275 + }, + { + "epoch": 0.9570915992052902, + "grad_norm": 0.0677407830953598, + "learning_rate": 4.634491204910274e-06, + "loss": 2.5588, + "step": 32276 + }, + { + "epoch": 0.9571212525576016, + "grad_norm": 0.0661105066537857, + "learning_rate": 4.6281020744436965e-06, + "loss": 2.532, + "step": 32277 + }, + { + "epoch": 0.9571509059099131, + "grad_norm": 0.06628943234682083, + "learning_rate": 4.621717330580366e-06, + "loss": 2.5495, + "step": 32278 + }, + { + "epoch": 0.9571805592622246, + "grad_norm": 0.06584330648183823, + "learning_rate": 4.615336973376682e-06, + "loss": 2.5568, + "step": 32279 + }, + { + "epoch": 0.9572102126145361, + "grad_norm": 0.06799114495515823, + "learning_rate": 4.608961002889267e-06, + "loss": 2.5058, + "step": 32280 + }, + { + "epoch": 0.9572398659668475, + "grad_norm": 0.06667203456163406, + "learning_rate": 4.602589419174574e-06, + "loss": 2.5086, + "step": 32281 + }, + { + "epoch": 0.957269519319159, + "grad_norm": 0.06500092893838882, + "learning_rate": 4.596222222288948e-06, + "loss": 2.536, + "step": 32282 + }, + { + "epoch": 0.9572991726714705, + "grad_norm": 0.06640854477882385, + "learning_rate": 4.589859412288788e-06, + "loss": 2.4974, + "step": 32283 + }, + { + "epoch": 0.957328826023782, + "grad_norm": 0.06382618099451065, + "learning_rate": 4.583500989230493e-06, + "loss": 2.5515, + "step": 32284 + }, + { + "epoch": 0.9573584793760934, + "grad_norm": 0.06800932437181473, + "learning_rate": 4.577146953170297e-06, + "loss": 2.5485, + "step": 32285 + }, + { + "epoch": 0.957388132728405, + "grad_norm": 0.06615062057971954, + "learning_rate": 4.570797304164542e-06, + "loss": 2.5619, + "step": 32286 + }, + { + "epoch": 0.9574177860807164, + "grad_norm": 0.06654947251081467, + "learning_rate": 4.564452042269407e-06, + "loss": 2.5574, + "step": 32287 + }, + { + "epoch": 0.9574474394330279, + "grad_norm": 0.06443783640861511, + "learning_rate": 4.558111167541068e-06, + "loss": 2.5384, + "step": 32288 + }, + { + "epoch": 0.9574770927853394, + "grad_norm": 0.06329163163900375, + "learning_rate": 4.551774680035703e-06, + "loss": 2.5442, + "step": 32289 + }, + { + "epoch": 0.9575067461376509, + "grad_norm": 0.06521092355251312, + "learning_rate": 4.545442579809433e-06, + "loss": 2.5743, + "step": 32290 + }, + { + "epoch": 0.9575363994899624, + "grad_norm": 0.06563668698072433, + "learning_rate": 4.539114866918326e-06, + "loss": 2.5276, + "step": 32291 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 0.06795983016490936, + "learning_rate": 4.532791541418391e-06, + "loss": 2.5362, + "step": 32292 + }, + { + "epoch": 0.9575957061945853, + "grad_norm": 0.0670904740691185, + "learning_rate": 4.526472603365583e-06, + "loss": 2.5297, + "step": 32293 + }, + { + "epoch": 0.9576253595468968, + "grad_norm": 0.06794915348291397, + "learning_rate": 4.52015805281597e-06, + "loss": 2.583, + "step": 32294 + }, + { + "epoch": 0.9576550128992083, + "grad_norm": 0.06634565442800522, + "learning_rate": 4.513847889825396e-06, + "loss": 2.5293, + "step": 32295 + }, + { + "epoch": 0.9576846662515197, + "grad_norm": 0.06755814701318741, + "learning_rate": 4.507542114449703e-06, + "loss": 2.5046, + "step": 32296 + }, + { + "epoch": 0.9577143196038312, + "grad_norm": 0.06397943943738937, + "learning_rate": 4.501240726744793e-06, + "loss": 2.5343, + "step": 32297 + }, + { + "epoch": 0.9577439729561427, + "grad_norm": 0.0666223093867302, + "learning_rate": 4.494943726766454e-06, + "loss": 2.56, + "step": 32298 + }, + { + "epoch": 0.9577736263084542, + "grad_norm": 0.06645537912845612, + "learning_rate": 4.488651114570419e-06, + "loss": 2.558, + "step": 32299 + }, + { + "epoch": 0.9578032796607656, + "grad_norm": 0.0639631375670433, + "learning_rate": 4.482362890212477e-06, + "loss": 2.5498, + "step": 32300 + }, + { + "epoch": 0.9578329330130771, + "grad_norm": 0.06561940163373947, + "learning_rate": 4.47607905374825e-06, + "loss": 2.5432, + "step": 32301 + }, + { + "epoch": 0.9578625863653886, + "grad_norm": 0.06482579559087753, + "learning_rate": 4.469799605233415e-06, + "loss": 2.5408, + "step": 32302 + }, + { + "epoch": 0.9578922397177001, + "grad_norm": 0.06240519508719444, + "learning_rate": 4.46352454472354e-06, + "loss": 2.5265, + "step": 32303 + }, + { + "epoch": 0.9579218930700115, + "grad_norm": 0.06391768157482147, + "learning_rate": 4.457253872274191e-06, + "loss": 2.5504, + "step": 32304 + }, + { + "epoch": 0.957951546422323, + "grad_norm": 0.0665244534611702, + "learning_rate": 4.450987587940991e-06, + "loss": 2.5241, + "step": 32305 + }, + { + "epoch": 0.9579811997746345, + "grad_norm": 0.06762176007032394, + "learning_rate": 4.444725691779283e-06, + "loss": 2.5286, + "step": 32306 + }, + { + "epoch": 0.958010853126946, + "grad_norm": 0.06825270503759384, + "learning_rate": 4.438468183844635e-06, + "loss": 2.5432, + "step": 32307 + }, + { + "epoch": 0.9580405064792574, + "grad_norm": 0.0644456222653389, + "learning_rate": 4.432215064192391e-06, + "loss": 2.511, + "step": 32308 + }, + { + "epoch": 0.958070159831569, + "grad_norm": 0.06480983644723892, + "learning_rate": 4.425966332877895e-06, + "loss": 2.495, + "step": 32309 + }, + { + "epoch": 0.9580998131838805, + "grad_norm": 0.06837784498929977, + "learning_rate": 4.419721989956549e-06, + "loss": 2.5219, + "step": 32310 + }, + { + "epoch": 0.9581294665361919, + "grad_norm": 0.0652819350361824, + "learning_rate": 4.413482035483696e-06, + "loss": 2.5187, + "step": 32311 + }, + { + "epoch": 0.9581591198885034, + "grad_norm": 0.0706719309091568, + "learning_rate": 4.407246469514514e-06, + "loss": 2.5612, + "step": 32312 + }, + { + "epoch": 0.9581887732408149, + "grad_norm": 0.0653422474861145, + "learning_rate": 4.401015292104238e-06, + "loss": 2.544, + "step": 32313 + }, + { + "epoch": 0.9582184265931264, + "grad_norm": 0.0655537098646164, + "learning_rate": 4.394788503307989e-06, + "loss": 2.5818, + "step": 32314 + }, + { + "epoch": 0.9582480799454378, + "grad_norm": 0.06799811124801636, + "learning_rate": 4.388566103181002e-06, + "loss": 2.5233, + "step": 32315 + }, + { + "epoch": 0.9582777332977493, + "grad_norm": 0.06495124846696854, + "learning_rate": 4.382348091778287e-06, + "loss": 2.5511, + "step": 32316 + }, + { + "epoch": 0.9583073866500608, + "grad_norm": 0.06722752004861832, + "learning_rate": 4.376134469154969e-06, + "loss": 2.5216, + "step": 32317 + }, + { + "epoch": 0.9583370400023723, + "grad_norm": 0.0636153519153595, + "learning_rate": 4.369925235366057e-06, + "loss": 2.5347, + "step": 32318 + }, + { + "epoch": 0.9583666933546837, + "grad_norm": 0.06480538100004196, + "learning_rate": 4.363720390466563e-06, + "loss": 2.5438, + "step": 32319 + }, + { + "epoch": 0.9583963467069953, + "grad_norm": 0.06420541554689407, + "learning_rate": 4.357519934511333e-06, + "loss": 2.5593, + "step": 32320 + }, + { + "epoch": 0.9584260000593067, + "grad_norm": 0.06659215688705444, + "learning_rate": 4.351323867555379e-06, + "loss": 2.5477, + "step": 32321 + }, + { + "epoch": 0.9584556534116182, + "grad_norm": 0.0657455250620842, + "learning_rate": 4.345132189653544e-06, + "loss": 2.5294, + "step": 32322 + }, + { + "epoch": 0.9584853067639296, + "grad_norm": 0.06542161852121353, + "learning_rate": 4.338944900860619e-06, + "loss": 2.5323, + "step": 32323 + }, + { + "epoch": 0.9585149601162412, + "grad_norm": 0.06572264432907104, + "learning_rate": 4.332762001231449e-06, + "loss": 2.5525, + "step": 32324 + }, + { + "epoch": 0.9585446134685526, + "grad_norm": 0.064530149102211, + "learning_rate": 4.3265834908207125e-06, + "loss": 2.5447, + "step": 32325 + }, + { + "epoch": 0.9585742668208641, + "grad_norm": 0.06566492468118668, + "learning_rate": 4.320409369683143e-06, + "loss": 2.5575, + "step": 32326 + }, + { + "epoch": 0.9586039201731755, + "grad_norm": 0.06630934774875641, + "learning_rate": 4.314239637873474e-06, + "loss": 2.5419, + "step": 32327 + }, + { + "epoch": 0.9586335735254871, + "grad_norm": 0.06422706693410873, + "learning_rate": 4.308074295446274e-06, + "loss": 2.5379, + "step": 32328 + }, + { + "epoch": 0.9586632268777985, + "grad_norm": 0.06403202563524246, + "learning_rate": 4.301913342456165e-06, + "loss": 2.5296, + "step": 32329 + }, + { + "epoch": 0.95869288023011, + "grad_norm": 0.06480485945940018, + "learning_rate": 4.295756778957716e-06, + "loss": 2.5032, + "step": 32330 + }, + { + "epoch": 0.9587225335824215, + "grad_norm": 0.0662432536482811, + "learning_rate": 4.289604605005437e-06, + "loss": 2.5315, + "step": 32331 + }, + { + "epoch": 0.958752186934733, + "grad_norm": 0.06417661905288696, + "learning_rate": 4.283456820653731e-06, + "loss": 2.5255, + "step": 32332 + }, + { + "epoch": 0.9587818402870445, + "grad_norm": 0.06844236701726913, + "learning_rate": 4.277313425957164e-06, + "loss": 2.5597, + "step": 32333 + }, + { + "epoch": 0.9588114936393559, + "grad_norm": 0.06622782349586487, + "learning_rate": 4.271174420970081e-06, + "loss": 2.5528, + "step": 32334 + }, + { + "epoch": 0.9588411469916674, + "grad_norm": 0.06252530217170715, + "learning_rate": 4.265039805746773e-06, + "loss": 2.5158, + "step": 32335 + }, + { + "epoch": 0.9588708003439789, + "grad_norm": 0.06581734120845795, + "learning_rate": 4.258909580341697e-06, + "loss": 2.5196, + "step": 32336 + }, + { + "epoch": 0.9589004536962904, + "grad_norm": 0.06629317998886108, + "learning_rate": 4.2527837448090305e-06, + "loss": 2.4909, + "step": 32337 + }, + { + "epoch": 0.9589301070486018, + "grad_norm": 0.06485940515995026, + "learning_rate": 4.2466622992031186e-06, + "loss": 2.5188, + "step": 32338 + }, + { + "epoch": 0.9589597604009134, + "grad_norm": 0.06189234182238579, + "learning_rate": 4.240545243578032e-06, + "loss": 2.5549, + "step": 32339 + }, + { + "epoch": 0.9589894137532248, + "grad_norm": 0.06746809929609299, + "learning_rate": 4.234432577988057e-06, + "loss": 2.5006, + "step": 32340 + }, + { + "epoch": 0.9590190671055363, + "grad_norm": 0.06498724967241287, + "learning_rate": 4.228324302487263e-06, + "loss": 2.5625, + "step": 32341 + }, + { + "epoch": 0.9590487204578477, + "grad_norm": 0.06473533064126968, + "learning_rate": 4.222220417129774e-06, + "loss": 2.5157, + "step": 32342 + }, + { + "epoch": 0.9590783738101593, + "grad_norm": 0.0642741471529007, + "learning_rate": 4.216120921969602e-06, + "loss": 2.5083, + "step": 32343 + }, + { + "epoch": 0.9591080271624707, + "grad_norm": 0.06267037987709045, + "learning_rate": 4.210025817060759e-06, + "loss": 2.5349, + "step": 32344 + }, + { + "epoch": 0.9591376805147822, + "grad_norm": 0.061934519559144974, + "learning_rate": 4.203935102457257e-06, + "loss": 2.5354, + "step": 32345 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 0.06527429074048996, + "learning_rate": 4.197848778213054e-06, + "loss": 2.5306, + "step": 32346 + }, + { + "epoch": 0.9591969872194052, + "grad_norm": 0.0626491978764534, + "learning_rate": 4.191766844381939e-06, + "loss": 2.5153, + "step": 32347 + }, + { + "epoch": 0.9592266405717166, + "grad_norm": 0.06847653537988663, + "learning_rate": 4.1856893010178695e-06, + "loss": 2.5105, + "step": 32348 + }, + { + "epoch": 0.9592562939240281, + "grad_norm": 0.06425224244594574, + "learning_rate": 4.179616148174581e-06, + "loss": 2.5321, + "step": 32349 + }, + { + "epoch": 0.9592859472763396, + "grad_norm": 0.06532678753137589, + "learning_rate": 4.173547385905974e-06, + "loss": 2.5131, + "step": 32350 + }, + { + "epoch": 0.9593156006286511, + "grad_norm": 0.0642295777797699, + "learning_rate": 4.167483014265672e-06, + "loss": 2.5452, + "step": 32351 + }, + { + "epoch": 0.9593452539809626, + "grad_norm": 0.06832771748304367, + "learning_rate": 4.16142303330741e-06, + "loss": 2.513, + "step": 32352 + }, + { + "epoch": 0.959374907333274, + "grad_norm": 0.06546410173177719, + "learning_rate": 4.155367443084867e-06, + "loss": 2.5714, + "step": 32353 + }, + { + "epoch": 0.9594045606855856, + "grad_norm": 0.0647437646985054, + "learning_rate": 4.149316243651668e-06, + "loss": 2.5382, + "step": 32354 + }, + { + "epoch": 0.959434214037897, + "grad_norm": 0.06568499654531479, + "learning_rate": 4.143269435061325e-06, + "loss": 2.5165, + "step": 32355 + }, + { + "epoch": 0.9594638673902085, + "grad_norm": 0.06586594134569168, + "learning_rate": 4.1372270173675175e-06, + "loss": 2.5395, + "step": 32356 + }, + { + "epoch": 0.9594935207425199, + "grad_norm": 0.06528204679489136, + "learning_rate": 4.131188990623646e-06, + "loss": 2.5504, + "step": 32357 + }, + { + "epoch": 0.9595231740948315, + "grad_norm": 0.06851024180650711, + "learning_rate": 4.125155354883225e-06, + "loss": 2.5378, + "step": 32358 + }, + { + "epoch": 0.9595528274471429, + "grad_norm": 0.06584624201059341, + "learning_rate": 4.119126110199656e-06, + "loss": 2.5291, + "step": 32359 + }, + { + "epoch": 0.9595824807994544, + "grad_norm": 0.06545580178499222, + "learning_rate": 4.113101256626339e-06, + "loss": 2.5453, + "step": 32360 + }, + { + "epoch": 0.9596121341517658, + "grad_norm": 0.06599273532629013, + "learning_rate": 4.107080794216622e-06, + "loss": 2.5608, + "step": 32361 + }, + { + "epoch": 0.9596417875040774, + "grad_norm": 0.0632951483130455, + "learning_rate": 4.10106472302385e-06, + "loss": 2.5531, + "step": 32362 + }, + { + "epoch": 0.9596714408563888, + "grad_norm": 0.06401167064905167, + "learning_rate": 4.09505304310126e-06, + "loss": 2.5232, + "step": 32363 + }, + { + "epoch": 0.9597010942087003, + "grad_norm": 0.06404407322406769, + "learning_rate": 4.0890457545020855e-06, + "loss": 2.5103, + "step": 32364 + }, + { + "epoch": 0.9597307475610117, + "grad_norm": 0.06690572202205658, + "learning_rate": 4.083042857279562e-06, + "loss": 2.5186, + "step": 32365 + }, + { + "epoch": 0.9597604009133233, + "grad_norm": 0.06378068774938583, + "learning_rate": 4.077044351486758e-06, + "loss": 2.547, + "step": 32366 + }, + { + "epoch": 0.9597900542656347, + "grad_norm": 0.06762780249118805, + "learning_rate": 4.07105023717691e-06, + "loss": 2.5239, + "step": 32367 + }, + { + "epoch": 0.9598197076179462, + "grad_norm": 0.06375154852867126, + "learning_rate": 4.065060514403029e-06, + "loss": 2.5352, + "step": 32368 + }, + { + "epoch": 0.9598493609702576, + "grad_norm": 0.06505093723535538, + "learning_rate": 4.0590751832181306e-06, + "loss": 2.5305, + "step": 32369 + }, + { + "epoch": 0.9598790143225692, + "grad_norm": 0.06502366811037064, + "learning_rate": 4.0530942436752815e-06, + "loss": 2.5422, + "step": 32370 + }, + { + "epoch": 0.9599086676748807, + "grad_norm": 0.06492778658866882, + "learning_rate": 4.0471176958273844e-06, + "loss": 2.543, + "step": 32371 + }, + { + "epoch": 0.9599383210271921, + "grad_norm": 0.0727492943406105, + "learning_rate": 4.0411455397273974e-06, + "loss": 2.5332, + "step": 32372 + }, + { + "epoch": 0.9599679743795037, + "grad_norm": 0.06439470499753952, + "learning_rate": 4.035177775428223e-06, + "loss": 2.5373, + "step": 32373 + }, + { + "epoch": 0.9599976277318151, + "grad_norm": 0.063758485019207, + "learning_rate": 4.02921440298265e-06, + "loss": 2.5409, + "step": 32374 + }, + { + "epoch": 0.9600272810841266, + "grad_norm": 0.06732311099767685, + "learning_rate": 4.023255422443528e-06, + "loss": 2.5404, + "step": 32375 + }, + { + "epoch": 0.960056934436438, + "grad_norm": 0.06579578667879105, + "learning_rate": 4.017300833863591e-06, + "loss": 2.5587, + "step": 32376 + }, + { + "epoch": 0.9600865877887496, + "grad_norm": 0.06620094180107117, + "learning_rate": 4.0113506372955745e-06, + "loss": 2.5242, + "step": 32377 + }, + { + "epoch": 0.960116241141061, + "grad_norm": 0.0690641701221466, + "learning_rate": 4.00540483279227e-06, + "loss": 2.5433, + "step": 32378 + }, + { + "epoch": 0.9601458944933725, + "grad_norm": 0.06318948417901993, + "learning_rate": 3.999463420406191e-06, + "loss": 2.5655, + "step": 32379 + }, + { + "epoch": 0.9601755478456839, + "grad_norm": 0.06811878085136414, + "learning_rate": 3.9935264001900175e-06, + "loss": 2.4999, + "step": 32380 + }, + { + "epoch": 0.9602052011979955, + "grad_norm": 0.06346192955970764, + "learning_rate": 3.987593772196263e-06, + "loss": 2.5353, + "step": 32381 + }, + { + "epoch": 0.9602348545503069, + "grad_norm": 0.06581150740385056, + "learning_rate": 3.981665536477552e-06, + "loss": 2.5222, + "step": 32382 + }, + { + "epoch": 0.9602645079026184, + "grad_norm": 0.06379302591085434, + "learning_rate": 3.9757416930862875e-06, + "loss": 2.5473, + "step": 32383 + }, + { + "epoch": 0.9602941612549298, + "grad_norm": 0.06631406396627426, + "learning_rate": 3.969822242074983e-06, + "loss": 2.5009, + "step": 32384 + }, + { + "epoch": 0.9603238146072414, + "grad_norm": 0.06595753133296967, + "learning_rate": 3.963907183496041e-06, + "loss": 2.5321, + "step": 32385 + }, + { + "epoch": 0.9603534679595528, + "grad_norm": 0.0625755712389946, + "learning_rate": 3.957996517401863e-06, + "loss": 2.5164, + "step": 32386 + }, + { + "epoch": 0.9603831213118643, + "grad_norm": 0.06574859470129013, + "learning_rate": 3.952090243844742e-06, + "loss": 2.5468, + "step": 32387 + }, + { + "epoch": 0.9604127746641757, + "grad_norm": 0.06473871320486069, + "learning_rate": 3.946188362877079e-06, + "loss": 2.5354, + "step": 32388 + }, + { + "epoch": 0.9604424280164873, + "grad_norm": 0.06495699286460876, + "learning_rate": 3.940290874551e-06, + "loss": 2.5377, + "step": 32389 + }, + { + "epoch": 0.9604720813687987, + "grad_norm": 0.06283394247293472, + "learning_rate": 3.934397778918797e-06, + "loss": 2.5145, + "step": 32390 + }, + { + "epoch": 0.9605017347211102, + "grad_norm": 0.0639565959572792, + "learning_rate": 3.928509076032705e-06, + "loss": 2.5479, + "step": 32391 + }, + { + "epoch": 0.9605313880734218, + "grad_norm": 0.06436222046613693, + "learning_rate": 3.922624765944738e-06, + "loss": 2.5092, + "step": 32392 + }, + { + "epoch": 0.9605610414257332, + "grad_norm": 0.06264916062355042, + "learning_rate": 3.916744848707132e-06, + "loss": 2.5254, + "step": 32393 + }, + { + "epoch": 0.9605906947780447, + "grad_norm": 0.06651891022920609, + "learning_rate": 3.910869324371902e-06, + "loss": 2.5617, + "step": 32394 + }, + { + "epoch": 0.9606203481303561, + "grad_norm": 0.06453605741262436, + "learning_rate": 3.904998192991061e-06, + "loss": 2.5047, + "step": 32395 + }, + { + "epoch": 0.9606500014826677, + "grad_norm": 0.06511323153972626, + "learning_rate": 3.899131454616623e-06, + "loss": 2.5455, + "step": 32396 + }, + { + "epoch": 0.9606796548349791, + "grad_norm": 0.06745511293411255, + "learning_rate": 3.89326910930049e-06, + "loss": 2.5447, + "step": 32397 + }, + { + "epoch": 0.9607093081872906, + "grad_norm": 0.06822985410690308, + "learning_rate": 3.887411157094623e-06, + "loss": 2.553, + "step": 32398 + }, + { + "epoch": 0.960738961539602, + "grad_norm": 0.06390157341957092, + "learning_rate": 3.881557598050922e-06, + "loss": 2.5732, + "step": 32399 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 0.07036928832530975, + "learning_rate": 3.875708432221181e-06, + "loss": 2.5336, + "step": 32400 + }, + { + "epoch": 0.960798268244225, + "grad_norm": 0.06612421572208405, + "learning_rate": 3.869863659657191e-06, + "loss": 2.5395, + "step": 32401 + }, + { + "epoch": 0.9608279215965365, + "grad_norm": 0.0664508193731308, + "learning_rate": 3.864023280410744e-06, + "loss": 2.5511, + "step": 32402 + }, + { + "epoch": 0.9608575749488479, + "grad_norm": 0.06386978924274445, + "learning_rate": 3.858187294533466e-06, + "loss": 2.5357, + "step": 32403 + }, + { + "epoch": 0.9608872283011595, + "grad_norm": 0.06916443258523941, + "learning_rate": 3.852355702077148e-06, + "loss": 2.5646, + "step": 32404 + }, + { + "epoch": 0.9609168816534709, + "grad_norm": 0.062034230679273605, + "learning_rate": 3.846528503093416e-06, + "loss": 2.5411, + "step": 32405 + }, + { + "epoch": 0.9609465350057824, + "grad_norm": 0.0641658678650856, + "learning_rate": 3.840705697633784e-06, + "loss": 2.5439, + "step": 32406 + }, + { + "epoch": 0.9609761883580938, + "grad_norm": 0.06468237936496735, + "learning_rate": 3.834887285749877e-06, + "loss": 2.5345, + "step": 32407 + }, + { + "epoch": 0.9610058417104054, + "grad_norm": 0.06705733388662338, + "learning_rate": 3.82907326749321e-06, + "loss": 2.5207, + "step": 32408 + }, + { + "epoch": 0.9610354950627168, + "grad_norm": 0.0633302628993988, + "learning_rate": 3.823263642915242e-06, + "loss": 2.5097, + "step": 32409 + }, + { + "epoch": 0.9610651484150283, + "grad_norm": 0.06189274787902832, + "learning_rate": 3.817458412067487e-06, + "loss": 2.556, + "step": 32410 + }, + { + "epoch": 0.9610948017673397, + "grad_norm": 0.06588771939277649, + "learning_rate": 3.811657575001293e-06, + "loss": 2.5422, + "step": 32411 + }, + { + "epoch": 0.9611244551196513, + "grad_norm": 0.0660383403301239, + "learning_rate": 3.8058611317680624e-06, + "loss": 2.518, + "step": 32412 + }, + { + "epoch": 0.9611541084719628, + "grad_norm": 0.0628012865781784, + "learning_rate": 3.8000690824190333e-06, + "loss": 2.5243, + "step": 32413 + }, + { + "epoch": 0.9611837618242742, + "grad_norm": 0.06474490463733673, + "learning_rate": 3.794281427005608e-06, + "loss": 2.5172, + "step": 32414 + }, + { + "epoch": 0.9612134151765858, + "grad_norm": 0.06515640020370483, + "learning_rate": 3.788498165579024e-06, + "loss": 2.5359, + "step": 32415 + }, + { + "epoch": 0.9612430685288972, + "grad_norm": 0.06473138928413391, + "learning_rate": 3.7827192981904068e-06, + "loss": 2.5293, + "step": 32416 + }, + { + "epoch": 0.9612727218812087, + "grad_norm": 0.062319811433553696, + "learning_rate": 3.7769448248910488e-06, + "loss": 2.5164, + "step": 32417 + }, + { + "epoch": 0.9613023752335201, + "grad_norm": 0.06747367233037949, + "learning_rate": 3.7711747457319645e-06, + "loss": 2.5462, + "step": 32418 + }, + { + "epoch": 0.9613320285858317, + "grad_norm": 0.06350972503423691, + "learning_rate": 3.7654090607643352e-06, + "loss": 2.5199, + "step": 32419 + }, + { + "epoch": 0.9613616819381431, + "grad_norm": 0.06380739063024521, + "learning_rate": 3.7596477700391764e-06, + "loss": 2.5676, + "step": 32420 + }, + { + "epoch": 0.9613913352904546, + "grad_norm": 0.06612663716077805, + "learning_rate": 3.7538908736075018e-06, + "loss": 2.54, + "step": 32421 + }, + { + "epoch": 0.961420988642766, + "grad_norm": 0.06673207134008408, + "learning_rate": 3.7481383715203265e-06, + "loss": 2.5261, + "step": 32422 + }, + { + "epoch": 0.9614506419950776, + "grad_norm": 0.06583560258150101, + "learning_rate": 3.7423902638285544e-06, + "loss": 2.5168, + "step": 32423 + }, + { + "epoch": 0.961480295347389, + "grad_norm": 0.07101704180240631, + "learning_rate": 3.7366465505830895e-06, + "loss": 2.544, + "step": 32424 + }, + { + "epoch": 0.9615099486997005, + "grad_norm": 0.06519395858049393, + "learning_rate": 3.730907231834779e-06, + "loss": 2.5224, + "step": 32425 + }, + { + "epoch": 0.9615396020520119, + "grad_norm": 0.06879696995019913, + "learning_rate": 3.7251723076344725e-06, + "loss": 2.506, + "step": 32426 + }, + { + "epoch": 0.9615692554043235, + "grad_norm": 0.06776273995637894, + "learning_rate": 3.719441778033017e-06, + "loss": 2.5215, + "step": 32427 + }, + { + "epoch": 0.9615989087566349, + "grad_norm": 0.06555262207984924, + "learning_rate": 3.7137156430810393e-06, + "loss": 2.5311, + "step": 32428 + }, + { + "epoch": 0.9616285621089464, + "grad_norm": 0.06807514280080795, + "learning_rate": 3.707993902829276e-06, + "loss": 2.5337, + "step": 32429 + }, + { + "epoch": 0.9616582154612578, + "grad_norm": 0.06510942429304123, + "learning_rate": 3.7022765573284655e-06, + "loss": 2.5454, + "step": 32430 + }, + { + "epoch": 0.9616878688135694, + "grad_norm": 0.06542417407035828, + "learning_rate": 3.696563606629122e-06, + "loss": 2.5314, + "step": 32431 + }, + { + "epoch": 0.9617175221658808, + "grad_norm": 0.06706037372350693, + "learning_rate": 3.690855050781983e-06, + "loss": 2.529, + "step": 32432 + }, + { + "epoch": 0.9617471755181923, + "grad_norm": 0.06497764587402344, + "learning_rate": 3.685150889837452e-06, + "loss": 2.5476, + "step": 32433 + }, + { + "epoch": 0.9617768288705039, + "grad_norm": 0.06417404860258102, + "learning_rate": 3.6794511238461558e-06, + "loss": 2.4814, + "step": 32434 + }, + { + "epoch": 0.9618064822228153, + "grad_norm": 0.06876364350318909, + "learning_rate": 3.673755752858443e-06, + "loss": 2.5504, + "step": 32435 + }, + { + "epoch": 0.9618361355751268, + "grad_norm": 0.06674408167600632, + "learning_rate": 3.668064776924829e-06, + "loss": 2.5648, + "step": 32436 + }, + { + "epoch": 0.9618657889274382, + "grad_norm": 0.06406379491090775, + "learning_rate": 3.662378196095717e-06, + "loss": 2.5132, + "step": 32437 + }, + { + "epoch": 0.9618954422797498, + "grad_norm": 0.06937404721975327, + "learning_rate": 3.656696010421401e-06, + "loss": 2.535, + "step": 32438 + }, + { + "epoch": 0.9619250956320612, + "grad_norm": 0.06638436019420624, + "learning_rate": 3.6510182199522844e-06, + "loss": 2.5846, + "step": 32439 + }, + { + "epoch": 0.9619547489843727, + "grad_norm": 0.06461837142705917, + "learning_rate": 3.6453448247386056e-06, + "loss": 2.4929, + "step": 32440 + }, + { + "epoch": 0.9619844023366841, + "grad_norm": 0.06448782235383987, + "learning_rate": 3.6396758248305463e-06, + "loss": 2.5495, + "step": 32441 + }, + { + "epoch": 0.9620140556889957, + "grad_norm": 0.06439630687236786, + "learning_rate": 3.6340112202783993e-06, + "loss": 2.5355, + "step": 32442 + }, + { + "epoch": 0.9620437090413071, + "grad_norm": 0.06743235886096954, + "learning_rate": 3.6283510111322914e-06, + "loss": 2.4911, + "step": 32443 + }, + { + "epoch": 0.9620733623936186, + "grad_norm": 0.06386861205101013, + "learning_rate": 3.6226951974423493e-06, + "loss": 2.5498, + "step": 32444 + }, + { + "epoch": 0.96210301574593, + "grad_norm": 0.06867143511772156, + "learning_rate": 3.6170437792585885e-06, + "loss": 2.5139, + "step": 32445 + }, + { + "epoch": 0.9621326690982416, + "grad_norm": 0.06489935517311096, + "learning_rate": 3.6113967566311356e-06, + "loss": 2.5033, + "step": 32446 + }, + { + "epoch": 0.962162322450553, + "grad_norm": 0.0657137855887413, + "learning_rate": 3.6057541296099503e-06, + "loss": 2.5154, + "step": 32447 + }, + { + "epoch": 0.9621919758028645, + "grad_norm": 0.06544540077447891, + "learning_rate": 3.6001158982450487e-06, + "loss": 2.5275, + "step": 32448 + }, + { + "epoch": 0.962221629155176, + "grad_norm": 0.06526869535446167, + "learning_rate": 3.594482062586335e-06, + "loss": 2.5297, + "step": 32449 + }, + { + "epoch": 0.9622512825074875, + "grad_norm": 0.06527189910411835, + "learning_rate": 3.588852622683658e-06, + "loss": 2.5046, + "step": 32450 + }, + { + "epoch": 0.9622809358597989, + "grad_norm": 0.06755494326353073, + "learning_rate": 3.5832275785868673e-06, + "loss": 2.5229, + "step": 32451 + }, + { + "epoch": 0.9623105892121104, + "grad_norm": 0.06584207713603973, + "learning_rate": 3.5776069303458116e-06, + "loss": 2.5118, + "step": 32452 + }, + { + "epoch": 0.9623402425644219, + "grad_norm": 0.06459221988916397, + "learning_rate": 3.571990678010284e-06, + "loss": 2.5066, + "step": 32453 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 0.06297080218791962, + "learning_rate": 3.566378821630023e-06, + "loss": 2.5421, + "step": 32454 + }, + { + "epoch": 0.9623995492690449, + "grad_norm": 0.06720593571662903, + "learning_rate": 3.5607713612546554e-06, + "loss": 2.5331, + "step": 32455 + }, + { + "epoch": 0.9624292026213563, + "grad_norm": 0.06604715436697006, + "learning_rate": 3.555168296933864e-06, + "loss": 2.5366, + "step": 32456 + }, + { + "epoch": 0.9624588559736679, + "grad_norm": 0.06719545274972916, + "learning_rate": 3.549569628717331e-06, + "loss": 2.5631, + "step": 32457 + }, + { + "epoch": 0.9624885093259793, + "grad_norm": 0.06912344694137573, + "learning_rate": 3.543975356654516e-06, + "loss": 2.4981, + "step": 32458 + }, + { + "epoch": 0.9625181626782908, + "grad_norm": 0.06599237024784088, + "learning_rate": 3.5383854807949922e-06, + "loss": 2.5465, + "step": 32459 + }, + { + "epoch": 0.9625478160306022, + "grad_norm": 0.06520121544599533, + "learning_rate": 3.53280000118833e-06, + "loss": 2.5173, + "step": 32460 + }, + { + "epoch": 0.9625774693829138, + "grad_norm": 0.0628771260380745, + "learning_rate": 3.527218917883934e-06, + "loss": 2.554, + "step": 32461 + }, + { + "epoch": 0.9626071227352252, + "grad_norm": 0.06586451828479767, + "learning_rate": 3.52164223093121e-06, + "loss": 2.5426, + "step": 32462 + }, + { + "epoch": 0.9626367760875367, + "grad_norm": 0.0650096982717514, + "learning_rate": 3.5160699403795626e-06, + "loss": 2.5196, + "step": 32463 + }, + { + "epoch": 0.9626664294398481, + "grad_norm": 0.06365075707435608, + "learning_rate": 3.510502046278341e-06, + "loss": 2.5582, + "step": 32464 + }, + { + "epoch": 0.9626960827921597, + "grad_norm": 0.06680110096931458, + "learning_rate": 3.5049385486768392e-06, + "loss": 2.5553, + "step": 32465 + }, + { + "epoch": 0.9627257361444711, + "grad_norm": 0.06518396735191345, + "learning_rate": 3.4993794476243514e-06, + "loss": 2.5546, + "step": 32466 + }, + { + "epoch": 0.9627553894967826, + "grad_norm": 0.06498414278030396, + "learning_rate": 3.4938247431700596e-06, + "loss": 2.5182, + "step": 32467 + }, + { + "epoch": 0.962785042849094, + "grad_norm": 0.0632714107632637, + "learning_rate": 3.4882744353632033e-06, + "loss": 2.5251, + "step": 32468 + }, + { + "epoch": 0.9628146962014056, + "grad_norm": 0.06371932476758957, + "learning_rate": 3.4827285242529095e-06, + "loss": 2.5478, + "step": 32469 + }, + { + "epoch": 0.962844349553717, + "grad_norm": 0.06523259729146957, + "learning_rate": 3.4771870098882497e-06, + "loss": 2.5434, + "step": 32470 + }, + { + "epoch": 0.9628740029060285, + "grad_norm": 0.06271175295114517, + "learning_rate": 3.471649892318296e-06, + "loss": 2.5359, + "step": 32471 + }, + { + "epoch": 0.96290365625834, + "grad_norm": 0.06461238861083984, + "learning_rate": 3.466117171592176e-06, + "loss": 2.5233, + "step": 32472 + }, + { + "epoch": 0.9629333096106515, + "grad_norm": 0.07101111114025116, + "learning_rate": 3.4605888477587944e-06, + "loss": 2.5169, + "step": 32473 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.0656847432255745, + "learning_rate": 3.4550649208671127e-06, + "loss": 2.4921, + "step": 32474 + }, + { + "epoch": 0.9629926163152744, + "grad_norm": 0.0689450204372406, + "learning_rate": 3.4495453909660913e-06, + "loss": 2.5184, + "step": 32475 + }, + { + "epoch": 0.963022269667586, + "grad_norm": 0.06522511690855026, + "learning_rate": 3.4440302581045245e-06, + "loss": 2.5162, + "step": 32476 + }, + { + "epoch": 0.9630519230198974, + "grad_norm": 0.06900659203529358, + "learning_rate": 3.4385195223313738e-06, + "loss": 2.5356, + "step": 32477 + }, + { + "epoch": 0.9630815763722089, + "grad_norm": 0.06545436382293701, + "learning_rate": 3.4330131836953214e-06, + "loss": 2.544, + "step": 32478 + }, + { + "epoch": 0.9631112297245203, + "grad_norm": 0.06801118701696396, + "learning_rate": 3.427511242245163e-06, + "loss": 2.5536, + "step": 32479 + }, + { + "epoch": 0.9631408830768319, + "grad_norm": 0.06488422304391861, + "learning_rate": 3.4220136980296355e-06, + "loss": 2.5098, + "step": 32480 + }, + { + "epoch": 0.9631705364291433, + "grad_norm": 0.06569714099168777, + "learning_rate": 3.416520551097424e-06, + "loss": 2.5226, + "step": 32481 + }, + { + "epoch": 0.9632001897814548, + "grad_norm": 0.06570020318031311, + "learning_rate": 3.4110318014971555e-06, + "loss": 2.5416, + "step": 32482 + }, + { + "epoch": 0.9632298431337663, + "grad_norm": 0.06639596074819565, + "learning_rate": 3.4055474492774574e-06, + "loss": 2.5353, + "step": 32483 + }, + { + "epoch": 0.9632594964860778, + "grad_norm": 0.06481441110372543, + "learning_rate": 3.400067494486847e-06, + "loss": 2.545, + "step": 32484 + }, + { + "epoch": 0.9632891498383892, + "grad_norm": 0.06397530436515808, + "learning_rate": 3.3945919371738967e-06, + "loss": 2.555, + "step": 32485 + }, + { + "epoch": 0.9633188031907007, + "grad_norm": 0.06464844197034836, + "learning_rate": 3.389120777387067e-06, + "loss": 2.5406, + "step": 32486 + }, + { + "epoch": 0.9633484565430122, + "grad_norm": 0.06488664448261261, + "learning_rate": 3.3836540151748196e-06, + "loss": 2.5713, + "step": 32487 + }, + { + "epoch": 0.9633781098953237, + "grad_norm": 0.06341666728258133, + "learning_rate": 3.3781916505855603e-06, + "loss": 2.5233, + "step": 32488 + }, + { + "epoch": 0.9634077632476351, + "grad_norm": 0.06536922603845596, + "learning_rate": 3.3727336836676947e-06, + "loss": 2.5456, + "step": 32489 + }, + { + "epoch": 0.9634374165999466, + "grad_norm": 0.06404060870409012, + "learning_rate": 3.367280114469462e-06, + "loss": 2.5442, + "step": 32490 + }, + { + "epoch": 0.9634670699522581, + "grad_norm": 0.06523656100034714, + "learning_rate": 3.3618309430392124e-06, + "loss": 2.5785, + "step": 32491 + }, + { + "epoch": 0.9634967233045696, + "grad_norm": 0.06659774482250214, + "learning_rate": 3.356386169425185e-06, + "loss": 2.5307, + "step": 32492 + }, + { + "epoch": 0.963526376656881, + "grad_norm": 0.06784182786941528, + "learning_rate": 3.3509457936756193e-06, + "loss": 2.6034, + "step": 32493 + }, + { + "epoch": 0.9635560300091925, + "grad_norm": 0.06382163614034653, + "learning_rate": 3.3455098158386987e-06, + "loss": 2.486, + "step": 32494 + }, + { + "epoch": 0.963585683361504, + "grad_norm": 0.06325028091669083, + "learning_rate": 3.3400782359625517e-06, + "loss": 2.5082, + "step": 32495 + }, + { + "epoch": 0.9636153367138155, + "grad_norm": 0.06472890824079514, + "learning_rate": 3.3346510540952503e-06, + "loss": 2.5436, + "step": 32496 + }, + { + "epoch": 0.963644990066127, + "grad_norm": 0.06525088101625443, + "learning_rate": 3.3292282702848674e-06, + "loss": 2.5295, + "step": 32497 + }, + { + "epoch": 0.9636746434184384, + "grad_norm": 0.06872387230396271, + "learning_rate": 3.3238098845794206e-06, + "loss": 2.5627, + "step": 32498 + }, + { + "epoch": 0.96370429677075, + "grad_norm": 0.06516062468290329, + "learning_rate": 3.3183958970268714e-06, + "loss": 2.5589, + "step": 32499 + }, + { + "epoch": 0.9637339501230614, + "grad_norm": 0.06776909530162811, + "learning_rate": 3.312986307675181e-06, + "loss": 2.5394, + "step": 32500 + }, + { + "epoch": 0.9637636034753729, + "grad_norm": 0.06415481120347977, + "learning_rate": 3.307581116572256e-06, + "loss": 2.5142, + "step": 32501 + }, + { + "epoch": 0.9637932568276844, + "grad_norm": 0.06630736589431763, + "learning_rate": 3.3021803237659463e-06, + "loss": 2.5274, + "step": 32502 + }, + { + "epoch": 0.9638229101799959, + "grad_norm": 0.06257439404726028, + "learning_rate": 3.2967839293041034e-06, + "loss": 2.5271, + "step": 32503 + }, + { + "epoch": 0.9638525635323073, + "grad_norm": 0.06762483716011047, + "learning_rate": 3.2913919332344666e-06, + "loss": 2.5709, + "step": 32504 + }, + { + "epoch": 0.9638822168846188, + "grad_norm": 0.06233648955821991, + "learning_rate": 3.2860043356048308e-06, + "loss": 2.5056, + "step": 32505 + }, + { + "epoch": 0.9639118702369303, + "grad_norm": 0.06546627730131149, + "learning_rate": 3.2806211364629357e-06, + "loss": 2.5646, + "step": 32506 + }, + { + "epoch": 0.9639415235892418, + "grad_norm": 0.06395124644041061, + "learning_rate": 3.2752423358563545e-06, + "loss": 2.5474, + "step": 32507 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 0.07024111598730087, + "learning_rate": 3.269867933832771e-06, + "loss": 2.584, + "step": 32508 + }, + { + "epoch": 0.9640008302938647, + "grad_norm": 0.07034123688936234, + "learning_rate": 3.264497930439758e-06, + "loss": 2.5283, + "step": 32509 + }, + { + "epoch": 0.9640304836461762, + "grad_norm": 0.06527067720890045, + "learning_rate": 3.2591323257248896e-06, + "loss": 2.5142, + "step": 32510 + }, + { + "epoch": 0.9640601369984877, + "grad_norm": 0.06319994479417801, + "learning_rate": 3.2537711197356825e-06, + "loss": 2.555, + "step": 32511 + }, + { + "epoch": 0.9640897903507991, + "grad_norm": 0.06299643218517303, + "learning_rate": 3.2484143125195987e-06, + "loss": 2.5353, + "step": 32512 + }, + { + "epoch": 0.9641194437031106, + "grad_norm": 0.06313206255435944, + "learning_rate": 3.243061904124045e-06, + "loss": 2.5095, + "step": 32513 + }, + { + "epoch": 0.9641490970554221, + "grad_norm": 0.06586383283138275, + "learning_rate": 3.2377138945964833e-06, + "loss": 2.5653, + "step": 32514 + }, + { + "epoch": 0.9641787504077336, + "grad_norm": 0.0634898841381073, + "learning_rate": 3.2323702839842096e-06, + "loss": 2.5377, + "step": 32515 + }, + { + "epoch": 0.964208403760045, + "grad_norm": 0.06441643089056015, + "learning_rate": 3.2270310723345185e-06, + "loss": 2.5662, + "step": 32516 + }, + { + "epoch": 0.9642380571123566, + "grad_norm": 0.06458833068609238, + "learning_rate": 3.2216962596948174e-06, + "loss": 2.5451, + "step": 32517 + }, + { + "epoch": 0.9642677104646681, + "grad_norm": 0.06183059886097908, + "learning_rate": 3.2163658461122346e-06, + "loss": 2.4929, + "step": 32518 + }, + { + "epoch": 0.9642973638169795, + "grad_norm": 0.06608500331640244, + "learning_rate": 3.2110398316340105e-06, + "loss": 2.5445, + "step": 32519 + }, + { + "epoch": 0.964327017169291, + "grad_norm": 0.06639789789915085, + "learning_rate": 3.205718216307274e-06, + "loss": 2.5429, + "step": 32520 + }, + { + "epoch": 0.9643566705216025, + "grad_norm": 0.06677420437335968, + "learning_rate": 3.2004010001792094e-06, + "loss": 2.5289, + "step": 32521 + }, + { + "epoch": 0.964386323873914, + "grad_norm": 0.0660509392619133, + "learning_rate": 3.1950881832968348e-06, + "loss": 2.5151, + "step": 32522 + }, + { + "epoch": 0.9644159772262254, + "grad_norm": 0.0712403729557991, + "learning_rate": 3.189779765707279e-06, + "loss": 2.5385, + "step": 32523 + }, + { + "epoch": 0.9644456305785369, + "grad_norm": 0.0662865862250328, + "learning_rate": 3.1844757474574494e-06, + "loss": 2.5331, + "step": 32524 + }, + { + "epoch": 0.9644752839308484, + "grad_norm": 0.06672242283821106, + "learning_rate": 3.1791761285944187e-06, + "loss": 2.5271, + "step": 32525 + }, + { + "epoch": 0.9645049372831599, + "grad_norm": 0.06344763934612274, + "learning_rate": 3.1738809091649833e-06, + "loss": 2.5553, + "step": 32526 + }, + { + "epoch": 0.9645345906354713, + "grad_norm": 0.06474115699529648, + "learning_rate": 3.168590089216161e-06, + "loss": 2.5508, + "step": 32527 + }, + { + "epoch": 0.9645642439877828, + "grad_norm": 0.06706910580396652, + "learning_rate": 3.163303668794748e-06, + "loss": 2.5293, + "step": 32528 + }, + { + "epoch": 0.9645938973400943, + "grad_norm": 0.06415098160505295, + "learning_rate": 3.1580216479475397e-06, + "loss": 2.5475, + "step": 32529 + }, + { + "epoch": 0.9646235506924058, + "grad_norm": 0.06327623128890991, + "learning_rate": 3.152744026721388e-06, + "loss": 2.4989, + "step": 32530 + }, + { + "epoch": 0.9646532040447172, + "grad_norm": 0.06610599905252457, + "learning_rate": 3.147470805162922e-06, + "loss": 2.5198, + "step": 32531 + }, + { + "epoch": 0.9646828573970287, + "grad_norm": 0.06735745817422867, + "learning_rate": 3.1422019833189374e-06, + "loss": 2.4927, + "step": 32532 + }, + { + "epoch": 0.9647125107493402, + "grad_norm": 0.06593135744333267, + "learning_rate": 3.1369375612360085e-06, + "loss": 2.5559, + "step": 32533 + }, + { + "epoch": 0.9647421641016517, + "grad_norm": 0.06393643468618393, + "learning_rate": 3.1316775389607645e-06, + "loss": 2.5715, + "step": 32534 + }, + { + "epoch": 0.9647718174539631, + "grad_norm": 0.06584085524082184, + "learning_rate": 3.1264219165398346e-06, + "loss": 2.53, + "step": 32535 + }, + { + "epoch": 0.9648014708062747, + "grad_norm": 0.06463102251291275, + "learning_rate": 3.1211706940196817e-06, + "loss": 2.5361, + "step": 32536 + }, + { + "epoch": 0.9648311241585861, + "grad_norm": 0.06574886292219162, + "learning_rate": 3.1159238714468797e-06, + "loss": 2.5479, + "step": 32537 + }, + { + "epoch": 0.9648607775108976, + "grad_norm": 0.06622714549303055, + "learning_rate": 3.110681448867836e-06, + "loss": 2.5441, + "step": 32538 + }, + { + "epoch": 0.9648904308632091, + "grad_norm": 0.06896892189979553, + "learning_rate": 3.105443426329013e-06, + "loss": 2.5164, + "step": 32539 + }, + { + "epoch": 0.9649200842155206, + "grad_norm": 0.06268735975027084, + "learning_rate": 3.100209803876819e-06, + "loss": 2.546, + "step": 32540 + }, + { + "epoch": 0.9649497375678321, + "grad_norm": 0.0631946548819542, + "learning_rate": 3.094980581557494e-06, + "loss": 2.5539, + "step": 32541 + }, + { + "epoch": 0.9649793909201435, + "grad_norm": 0.06421448290348053, + "learning_rate": 3.089755759417445e-06, + "loss": 2.5228, + "step": 32542 + }, + { + "epoch": 0.965009044272455, + "grad_norm": 0.06730366498231888, + "learning_rate": 3.084535337502914e-06, + "loss": 2.567, + "step": 32543 + }, + { + "epoch": 0.9650386976247665, + "grad_norm": 0.06303848326206207, + "learning_rate": 3.079319315860085e-06, + "loss": 2.5238, + "step": 32544 + }, + { + "epoch": 0.965068350977078, + "grad_norm": 0.06373811513185501, + "learning_rate": 3.0741076945352e-06, + "loss": 2.5618, + "step": 32545 + }, + { + "epoch": 0.9650980043293894, + "grad_norm": 0.06606902927160263, + "learning_rate": 3.0689004735743873e-06, + "loss": 2.5883, + "step": 32546 + }, + { + "epoch": 0.965127657681701, + "grad_norm": 0.06912354379892349, + "learning_rate": 3.0636976530237227e-06, + "loss": 2.5442, + "step": 32547 + }, + { + "epoch": 0.9651573110340124, + "grad_norm": 0.06543929129838943, + "learning_rate": 3.05849923292939e-06, + "loss": 2.4933, + "step": 32548 + }, + { + "epoch": 0.9651869643863239, + "grad_norm": 0.06418556720018387, + "learning_rate": 3.0533052133372985e-06, + "loss": 2.5345, + "step": 32549 + }, + { + "epoch": 0.9652166177386353, + "grad_norm": 0.06533406674861908, + "learning_rate": 3.0481155942934657e-06, + "loss": 2.5219, + "step": 32550 + }, + { + "epoch": 0.9652462710909469, + "grad_norm": 0.06480146199464798, + "learning_rate": 3.0429303758439107e-06, + "loss": 2.5248, + "step": 32551 + }, + { + "epoch": 0.9652759244432583, + "grad_norm": 0.06572217494249344, + "learning_rate": 3.0377495580344862e-06, + "loss": 2.5385, + "step": 32552 + }, + { + "epoch": 0.9653055777955698, + "grad_norm": 0.06638147681951523, + "learning_rate": 3.032573140911099e-06, + "loss": 2.5235, + "step": 32553 + }, + { + "epoch": 0.9653352311478812, + "grad_norm": 0.06790818274021149, + "learning_rate": 3.0274011245195466e-06, + "loss": 2.5421, + "step": 32554 + }, + { + "epoch": 0.9653648845001928, + "grad_norm": 0.06559647619724274, + "learning_rate": 3.0222335089056807e-06, + "loss": 2.5329, + "step": 32555 + }, + { + "epoch": 0.9653945378525042, + "grad_norm": 0.06500402837991714, + "learning_rate": 3.0170702941152985e-06, + "loss": 2.541, + "step": 32556 + }, + { + "epoch": 0.9654241912048157, + "grad_norm": 0.06384722143411636, + "learning_rate": 3.011911480193974e-06, + "loss": 2.5255, + "step": 32557 + }, + { + "epoch": 0.9654538445571272, + "grad_norm": 0.06691243499517441, + "learning_rate": 3.0067570671875597e-06, + "loss": 2.6046, + "step": 32558 + }, + { + "epoch": 0.9654834979094387, + "grad_norm": 0.06468790024518967, + "learning_rate": 3.0016070551415753e-06, + "loss": 2.5239, + "step": 32559 + }, + { + "epoch": 0.9655131512617502, + "grad_norm": 0.06224510446190834, + "learning_rate": 2.9964614441017057e-06, + "loss": 2.5508, + "step": 32560 + }, + { + "epoch": 0.9655428046140616, + "grad_norm": 0.06417325884103775, + "learning_rate": 2.9913202341134147e-06, + "loss": 2.513, + "step": 32561 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 0.06448010355234146, + "learning_rate": 2.986183425222333e-06, + "loss": 2.5345, + "step": 32562 + }, + { + "epoch": 0.9656021113186846, + "grad_norm": 0.06773446500301361, + "learning_rate": 2.9810510174739236e-06, + "loss": 2.5465, + "step": 32563 + }, + { + "epoch": 0.9656317646709961, + "grad_norm": 0.06334009021520615, + "learning_rate": 2.9759230109136504e-06, + "loss": 2.5582, + "step": 32564 + }, + { + "epoch": 0.9656614180233075, + "grad_norm": 0.0648350641131401, + "learning_rate": 2.9707994055868103e-06, + "loss": 2.5337, + "step": 32565 + }, + { + "epoch": 0.965691071375619, + "grad_norm": 0.06349126249551773, + "learning_rate": 2.965680201538923e-06, + "loss": 2.5046, + "step": 32566 + }, + { + "epoch": 0.9657207247279305, + "grad_norm": 0.06568138301372528, + "learning_rate": 2.9605653988151736e-06, + "loss": 2.5436, + "step": 32567 + }, + { + "epoch": 0.965750378080242, + "grad_norm": 0.06955871731042862, + "learning_rate": 2.955454997460971e-06, + "loss": 2.5172, + "step": 32568 + }, + { + "epoch": 0.9657800314325534, + "grad_norm": 0.06292733550071716, + "learning_rate": 2.950348997521557e-06, + "loss": 2.5523, + "step": 32569 + }, + { + "epoch": 0.965809684784865, + "grad_norm": 0.06557868421077728, + "learning_rate": 2.945247399042117e-06, + "loss": 2.5599, + "step": 32570 + }, + { + "epoch": 0.9658393381371764, + "grad_norm": 0.06663057953119278, + "learning_rate": 2.940150202067782e-06, + "loss": 2.5345, + "step": 32571 + }, + { + "epoch": 0.9658689914894879, + "grad_norm": 0.06707506626844406, + "learning_rate": 2.935057406643793e-06, + "loss": 2.5402, + "step": 32572 + }, + { + "epoch": 0.9658986448417993, + "grad_norm": 0.0664525106549263, + "learning_rate": 2.9299690128151703e-06, + "loss": 2.5364, + "step": 32573 + }, + { + "epoch": 0.9659282981941109, + "grad_norm": 0.06525326520204544, + "learning_rate": 2.924885020626988e-06, + "loss": 2.5596, + "step": 32574 + }, + { + "epoch": 0.9659579515464223, + "grad_norm": 0.06466599553823471, + "learning_rate": 2.9198054301242673e-06, + "loss": 2.5544, + "step": 32575 + }, + { + "epoch": 0.9659876048987338, + "grad_norm": 0.06601020693778992, + "learning_rate": 2.914730241352026e-06, + "loss": 2.5345, + "step": 32576 + }, + { + "epoch": 0.9660172582510452, + "grad_norm": 0.06745747476816177, + "learning_rate": 2.909659454355118e-06, + "loss": 2.5335, + "step": 32577 + }, + { + "epoch": 0.9660469116033568, + "grad_norm": 0.06742499023675919, + "learning_rate": 2.9045930691785074e-06, + "loss": 2.5233, + "step": 32578 + }, + { + "epoch": 0.9660765649556683, + "grad_norm": 0.06430694460868835, + "learning_rate": 2.899531085867102e-06, + "loss": 2.5339, + "step": 32579 + }, + { + "epoch": 0.9661062183079797, + "grad_norm": 0.06688482314348221, + "learning_rate": 2.894473504465589e-06, + "loss": 2.5307, + "step": 32580 + }, + { + "epoch": 0.9661358716602912, + "grad_norm": 0.06948831677436829, + "learning_rate": 2.889420325018932e-06, + "loss": 2.5699, + "step": 32581 + }, + { + "epoch": 0.9661655250126027, + "grad_norm": 0.0675167590379715, + "learning_rate": 2.884371547571707e-06, + "loss": 2.5385, + "step": 32582 + }, + { + "epoch": 0.9661951783649142, + "grad_norm": 0.0663541853427887, + "learning_rate": 2.8793271721687662e-06, + "loss": 2.5044, + "step": 32583 + }, + { + "epoch": 0.9662248317172256, + "grad_norm": 0.06599648296833038, + "learning_rate": 2.874287198854686e-06, + "loss": 2.5206, + "step": 32584 + }, + { + "epoch": 0.9662544850695372, + "grad_norm": 0.06354472041130066, + "learning_rate": 2.869251627674152e-06, + "loss": 2.5039, + "step": 32585 + }, + { + "epoch": 0.9662841384218486, + "grad_norm": 0.06467371433973312, + "learning_rate": 2.8642204586716848e-06, + "loss": 2.5148, + "step": 32586 + }, + { + "epoch": 0.9663137917741601, + "grad_norm": 0.06452576071023941, + "learning_rate": 2.859193691891915e-06, + "loss": 2.5289, + "step": 32587 + }, + { + "epoch": 0.9663434451264715, + "grad_norm": 0.06314677745103836, + "learning_rate": 2.8541713273792514e-06, + "loss": 2.5193, + "step": 32588 + }, + { + "epoch": 0.966373098478783, + "grad_norm": 0.06949084252119064, + "learning_rate": 2.8491533651783253e-06, + "loss": 2.5693, + "step": 32589 + }, + { + "epoch": 0.9664027518310945, + "grad_norm": 0.06488756090402603, + "learning_rate": 2.8441398053334344e-06, + "loss": 2.5587, + "step": 32590 + }, + { + "epoch": 0.966432405183406, + "grad_norm": 0.06592468917369843, + "learning_rate": 2.8391306478889877e-06, + "loss": 2.5655, + "step": 32591 + }, + { + "epoch": 0.9664620585357174, + "grad_norm": 0.06562443822622299, + "learning_rate": 2.8341258928893943e-06, + "loss": 2.5331, + "step": 32592 + }, + { + "epoch": 0.966491711888029, + "grad_norm": 0.0619359165430069, + "learning_rate": 2.8291255403789516e-06, + "loss": 2.5511, + "step": 32593 + }, + { + "epoch": 0.9665213652403404, + "grad_norm": 0.06715019792318344, + "learning_rate": 2.824129590401958e-06, + "loss": 2.5755, + "step": 32594 + }, + { + "epoch": 0.9665510185926519, + "grad_norm": 0.06517447531223297, + "learning_rate": 2.8191380430026557e-06, + "loss": 2.4996, + "step": 32595 + }, + { + "epoch": 0.9665806719449633, + "grad_norm": 0.06536684185266495, + "learning_rate": 2.8141508982252316e-06, + "loss": 2.5728, + "step": 32596 + }, + { + "epoch": 0.9666103252972749, + "grad_norm": 0.06644191592931747, + "learning_rate": 2.8091681561138173e-06, + "loss": 2.5741, + "step": 32597 + }, + { + "epoch": 0.9666399786495863, + "grad_norm": 0.06361952424049377, + "learning_rate": 2.8041898167125433e-06, + "loss": 2.5504, + "step": 32598 + }, + { + "epoch": 0.9666696320018978, + "grad_norm": 0.06741900742053986, + "learning_rate": 2.799215880065542e-06, + "loss": 2.5303, + "step": 32599 + }, + { + "epoch": 0.9666992853542093, + "grad_norm": 0.06382578611373901, + "learning_rate": 2.794246346216778e-06, + "loss": 2.5616, + "step": 32600 + }, + { + "epoch": 0.9667289387065208, + "grad_norm": 0.06564022600650787, + "learning_rate": 2.789281215210382e-06, + "loss": 2.5283, + "step": 32601 + }, + { + "epoch": 0.9667585920588323, + "grad_norm": 0.06533855199813843, + "learning_rate": 2.7843204870901527e-06, + "loss": 2.5313, + "step": 32602 + }, + { + "epoch": 0.9667882454111437, + "grad_norm": 0.06667469441890717, + "learning_rate": 2.7793641619001664e-06, + "loss": 2.4936, + "step": 32603 + }, + { + "epoch": 0.9668178987634553, + "grad_norm": 0.06889751553535461, + "learning_rate": 2.7744122396842765e-06, + "loss": 2.5546, + "step": 32604 + }, + { + "epoch": 0.9668475521157667, + "grad_norm": 0.06586887687444687, + "learning_rate": 2.769464720486281e-06, + "loss": 2.5739, + "step": 32605 + }, + { + "epoch": 0.9668772054680782, + "grad_norm": 0.06472368538379669, + "learning_rate": 2.7645216043500342e-06, + "loss": 2.5331, + "step": 32606 + }, + { + "epoch": 0.9669068588203896, + "grad_norm": 0.06360740214586258, + "learning_rate": 2.7595828913193345e-06, + "loss": 2.5398, + "step": 32607 + }, + { + "epoch": 0.9669365121727012, + "grad_norm": 0.06763347238302231, + "learning_rate": 2.754648581437813e-06, + "loss": 2.5433, + "step": 32608 + }, + { + "epoch": 0.9669661655250126, + "grad_norm": 0.06574800610542297, + "learning_rate": 2.7497186747492687e-06, + "loss": 2.532, + "step": 32609 + }, + { + "epoch": 0.9669958188773241, + "grad_norm": 0.06345134228467941, + "learning_rate": 2.7447931712972775e-06, + "loss": 2.5231, + "step": 32610 + }, + { + "epoch": 0.9670254722296355, + "grad_norm": 0.064519964158535, + "learning_rate": 2.7398720711255263e-06, + "loss": 2.5352, + "step": 32611 + }, + { + "epoch": 0.9670551255819471, + "grad_norm": 0.06603792309761047, + "learning_rate": 2.7349553742775924e-06, + "loss": 2.5294, + "step": 32612 + }, + { + "epoch": 0.9670847789342585, + "grad_norm": 0.06783373653888702, + "learning_rate": 2.73004308079694e-06, + "loss": 2.5203, + "step": 32613 + }, + { + "epoch": 0.96711443228657, + "grad_norm": 0.06635984778404236, + "learning_rate": 2.725135190727146e-06, + "loss": 2.5412, + "step": 32614 + }, + { + "epoch": 0.9671440856388814, + "grad_norm": 0.06340411305427551, + "learning_rate": 2.7202317041115644e-06, + "loss": 2.5385, + "step": 32615 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 0.06769825518131256, + "learning_rate": 2.715332620993771e-06, + "loss": 2.5498, + "step": 32616 + }, + { + "epoch": 0.9672033923435044, + "grad_norm": 0.06609814614057541, + "learning_rate": 2.71043794141701e-06, + "loss": 2.5918, + "step": 32617 + }, + { + "epoch": 0.9672330456958159, + "grad_norm": 0.06590025871992111, + "learning_rate": 2.7055476654246902e-06, + "loss": 2.5485, + "step": 32618 + }, + { + "epoch": 0.9672626990481273, + "grad_norm": 0.06625641137361526, + "learning_rate": 2.7006617930601106e-06, + "loss": 2.5502, + "step": 32619 + }, + { + "epoch": 0.9672923524004389, + "grad_norm": 0.06452503800392151, + "learning_rate": 2.6957803243665703e-06, + "loss": 2.5143, + "step": 32620 + }, + { + "epoch": 0.9673220057527504, + "grad_norm": 0.06531329452991486, + "learning_rate": 2.6909032593872008e-06, + "loss": 2.5074, + "step": 32621 + }, + { + "epoch": 0.9673516591050618, + "grad_norm": 0.062283072620630264, + "learning_rate": 2.686030598165301e-06, + "loss": 2.5277, + "step": 32622 + }, + { + "epoch": 0.9673813124573734, + "grad_norm": 0.06621887534856796, + "learning_rate": 2.6811623407438923e-06, + "loss": 2.5395, + "step": 32623 + }, + { + "epoch": 0.9674109658096848, + "grad_norm": 0.0679536834359169, + "learning_rate": 2.6762984871662177e-06, + "loss": 2.5416, + "step": 32624 + }, + { + "epoch": 0.9674406191619963, + "grad_norm": 0.0647626742720604, + "learning_rate": 2.6714390374752427e-06, + "loss": 2.5251, + "step": 32625 + }, + { + "epoch": 0.9674702725143077, + "grad_norm": 0.06606363505125046, + "learning_rate": 2.6665839917140446e-06, + "loss": 2.556, + "step": 32626 + }, + { + "epoch": 0.9674999258666193, + "grad_norm": 0.06781627237796783, + "learning_rate": 2.6617333499256436e-06, + "loss": 2.5543, + "step": 32627 + }, + { + "epoch": 0.9675295792189307, + "grad_norm": 0.06738734990358353, + "learning_rate": 2.65688711215295e-06, + "loss": 2.5542, + "step": 32628 + }, + { + "epoch": 0.9675592325712422, + "grad_norm": 0.06633587926626205, + "learning_rate": 2.6520452784388748e-06, + "loss": 2.5467, + "step": 32629 + }, + { + "epoch": 0.9675888859235536, + "grad_norm": 0.06499504297971725, + "learning_rate": 2.6472078488263276e-06, + "loss": 2.5259, + "step": 32630 + }, + { + "epoch": 0.9676185392758652, + "grad_norm": 0.069928377866745, + "learning_rate": 2.642374823358107e-06, + "loss": 2.5109, + "step": 32631 + }, + { + "epoch": 0.9676481926281766, + "grad_norm": 0.06264713406562805, + "learning_rate": 2.6375462020770125e-06, + "loss": 2.5615, + "step": 32632 + }, + { + "epoch": 0.9676778459804881, + "grad_norm": 0.06336048245429993, + "learning_rate": 2.6327219850257877e-06, + "loss": 2.5089, + "step": 32633 + }, + { + "epoch": 0.9677074993327995, + "grad_norm": 0.06425493955612183, + "learning_rate": 2.6279021722472317e-06, + "loss": 2.5411, + "step": 32634 + }, + { + "epoch": 0.9677371526851111, + "grad_norm": 0.06363224238157272, + "learning_rate": 2.623086763783977e-06, + "loss": 2.5584, + "step": 32635 + }, + { + "epoch": 0.9677668060374225, + "grad_norm": 0.0650290995836258, + "learning_rate": 2.6182757596786566e-06, + "loss": 2.535, + "step": 32636 + }, + { + "epoch": 0.967796459389734, + "grad_norm": 0.0662105455994606, + "learning_rate": 2.6134691599739025e-06, + "loss": 2.5499, + "step": 32637 + }, + { + "epoch": 0.9678261127420454, + "grad_norm": 0.06561915576457977, + "learning_rate": 2.6086669647122363e-06, + "loss": 2.5249, + "step": 32638 + }, + { + "epoch": 0.967855766094357, + "grad_norm": 0.06258652359247208, + "learning_rate": 2.60386917393618e-06, + "loss": 2.5141, + "step": 32639 + }, + { + "epoch": 0.9678854194466684, + "grad_norm": 0.06824417412281036, + "learning_rate": 2.5990757876882543e-06, + "loss": 2.5603, + "step": 32640 + }, + { + "epoch": 0.9679150727989799, + "grad_norm": 0.0655285194516182, + "learning_rate": 2.594286806010926e-06, + "loss": 2.5327, + "step": 32641 + }, + { + "epoch": 0.9679447261512915, + "grad_norm": 0.0647033229470253, + "learning_rate": 2.589502228946494e-06, + "loss": 2.5331, + "step": 32642 + }, + { + "epoch": 0.9679743795036029, + "grad_norm": 0.06397746503353119, + "learning_rate": 2.584722056537425e-06, + "loss": 2.5659, + "step": 32643 + }, + { + "epoch": 0.9680040328559144, + "grad_norm": 0.06594201922416687, + "learning_rate": 2.579946288826074e-06, + "loss": 2.5103, + "step": 32644 + }, + { + "epoch": 0.9680336862082258, + "grad_norm": 0.06406240165233612, + "learning_rate": 2.575174925854573e-06, + "loss": 2.5253, + "step": 32645 + }, + { + "epoch": 0.9680633395605374, + "grad_norm": 0.06288935989141464, + "learning_rate": 2.570407967665389e-06, + "loss": 2.4992, + "step": 32646 + }, + { + "epoch": 0.9680929929128488, + "grad_norm": 0.0648626983165741, + "learning_rate": 2.5656454143005436e-06, + "loss": 2.5102, + "step": 32647 + }, + { + "epoch": 0.9681226462651603, + "grad_norm": 0.0636029839515686, + "learning_rate": 2.5608872658023365e-06, + "loss": 2.5543, + "step": 32648 + }, + { + "epoch": 0.9681522996174717, + "grad_norm": 0.06579703837633133, + "learning_rate": 2.5561335222128444e-06, + "loss": 2.5438, + "step": 32649 + }, + { + "epoch": 0.9681819529697833, + "grad_norm": 0.06751711666584015, + "learning_rate": 2.5513841835742013e-06, + "loss": 2.5801, + "step": 32650 + }, + { + "epoch": 0.9682116063220947, + "grad_norm": 0.06515257805585861, + "learning_rate": 2.5466392499284284e-06, + "loss": 2.5147, + "step": 32651 + }, + { + "epoch": 0.9682412596744062, + "grad_norm": 0.06471733003854752, + "learning_rate": 2.541898721317548e-06, + "loss": 2.53, + "step": 32652 + }, + { + "epoch": 0.9682709130267176, + "grad_norm": 0.06501542776823044, + "learning_rate": 2.537162597783527e-06, + "loss": 2.5098, + "step": 32653 + }, + { + "epoch": 0.9683005663790292, + "grad_norm": 0.06753426045179367, + "learning_rate": 2.5324308793683306e-06, + "loss": 2.5268, + "step": 32654 + }, + { + "epoch": 0.9683302197313406, + "grad_norm": 0.06608182191848755, + "learning_rate": 2.527703566113815e-06, + "loss": 2.5454, + "step": 32655 + }, + { + "epoch": 0.9683598730836521, + "grad_norm": 0.06609975546598434, + "learning_rate": 2.5229806580618907e-06, + "loss": 2.5463, + "step": 32656 + }, + { + "epoch": 0.9683895264359635, + "grad_norm": 0.06475022435188293, + "learning_rate": 2.5182621552544137e-06, + "loss": 2.5389, + "step": 32657 + }, + { + "epoch": 0.9684191797882751, + "grad_norm": 0.06716041266918182, + "learning_rate": 2.513548057733017e-06, + "loss": 2.5315, + "step": 32658 + }, + { + "epoch": 0.9684488331405865, + "grad_norm": 0.06729814410209656, + "learning_rate": 2.508838365539612e-06, + "loss": 2.5264, + "step": 32659 + }, + { + "epoch": 0.968478486492898, + "grad_norm": 0.06541581451892853, + "learning_rate": 2.504133078715831e-06, + "loss": 2.5456, + "step": 32660 + }, + { + "epoch": 0.9685081398452094, + "grad_norm": 0.06671890616416931, + "learning_rate": 2.499432197303364e-06, + "loss": 2.5783, + "step": 32661 + }, + { + "epoch": 0.968537793197521, + "grad_norm": 0.06839468330144882, + "learning_rate": 2.4947357213438436e-06, + "loss": 2.5772, + "step": 32662 + }, + { + "epoch": 0.9685674465498325, + "grad_norm": 0.0644761249423027, + "learning_rate": 2.490043650878737e-06, + "loss": 2.5302, + "step": 32663 + }, + { + "epoch": 0.9685970999021439, + "grad_norm": 0.06638195365667343, + "learning_rate": 2.485355985949789e-06, + "loss": 2.5329, + "step": 32664 + }, + { + "epoch": 0.9686267532544555, + "grad_norm": 0.0642995685338974, + "learning_rate": 2.480672726598354e-06, + "loss": 2.533, + "step": 32665 + }, + { + "epoch": 0.9686564066067669, + "grad_norm": 0.06237165629863739, + "learning_rate": 2.475993872865956e-06, + "loss": 2.5325, + "step": 32666 + }, + { + "epoch": 0.9686860599590784, + "grad_norm": 0.06332381069660187, + "learning_rate": 2.471319424794061e-06, + "loss": 2.5332, + "step": 32667 + }, + { + "epoch": 0.9687157133113898, + "grad_norm": 0.0660770907998085, + "learning_rate": 2.4666493824240243e-06, + "loss": 2.5321, + "step": 32668 + }, + { + "epoch": 0.9687453666637014, + "grad_norm": 0.06556762754917145, + "learning_rate": 2.4619837457972025e-06, + "loss": 2.5201, + "step": 32669 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 0.06711690872907639, + "learning_rate": 2.457322514954896e-06, + "loss": 2.5405, + "step": 32670 + }, + { + "epoch": 0.9688046733683243, + "grad_norm": 0.06625986099243164, + "learning_rate": 2.4526656899384047e-06, + "loss": 2.5221, + "step": 32671 + }, + { + "epoch": 0.9688343267206357, + "grad_norm": 0.06794201582670212, + "learning_rate": 2.448013270789029e-06, + "loss": 2.5587, + "step": 32672 + }, + { + "epoch": 0.9688639800729473, + "grad_norm": 0.06453446298837662, + "learning_rate": 2.443365257547847e-06, + "loss": 2.5498, + "step": 32673 + }, + { + "epoch": 0.9688936334252587, + "grad_norm": 0.06309568136930466, + "learning_rate": 2.4387216502560483e-06, + "loss": 2.5316, + "step": 32674 + }, + { + "epoch": 0.9689232867775702, + "grad_norm": 0.06662967801094055, + "learning_rate": 2.4340824489548217e-06, + "loss": 2.5352, + "step": 32675 + }, + { + "epoch": 0.9689529401298816, + "grad_norm": 0.0659695640206337, + "learning_rate": 2.4294476536851352e-06, + "loss": 2.5412, + "step": 32676 + }, + { + "epoch": 0.9689825934821932, + "grad_norm": 0.0645715519785881, + "learning_rate": 2.4248172644881773e-06, + "loss": 2.5385, + "step": 32677 + }, + { + "epoch": 0.9690122468345046, + "grad_norm": 0.06478283554315567, + "learning_rate": 2.4201912814048044e-06, + "loss": 2.5454, + "step": 32678 + }, + { + "epoch": 0.9690419001868161, + "grad_norm": 0.06585042178630829, + "learning_rate": 2.4155697044760948e-06, + "loss": 2.5207, + "step": 32679 + }, + { + "epoch": 0.9690715535391276, + "grad_norm": 0.06255020201206207, + "learning_rate": 2.410952533742905e-06, + "loss": 2.5334, + "step": 32680 + }, + { + "epoch": 0.9691012068914391, + "grad_norm": 0.06293068081140518, + "learning_rate": 2.406339769246091e-06, + "loss": 2.5278, + "step": 32681 + }, + { + "epoch": 0.9691308602437505, + "grad_norm": 0.06270986795425415, + "learning_rate": 2.4017314110266196e-06, + "loss": 2.5545, + "step": 32682 + }, + { + "epoch": 0.969160513596062, + "grad_norm": 0.06905248761177063, + "learning_rate": 2.3971274591251814e-06, + "loss": 2.4981, + "step": 32683 + }, + { + "epoch": 0.9691901669483736, + "grad_norm": 0.06467234343290329, + "learning_rate": 2.3925279135826315e-06, + "loss": 2.5241, + "step": 32684 + }, + { + "epoch": 0.969219820300685, + "grad_norm": 0.0631491094827652, + "learning_rate": 2.3879327744396606e-06, + "loss": 2.5237, + "step": 32685 + }, + { + "epoch": 0.9692494736529965, + "grad_norm": 0.06551531702280045, + "learning_rate": 2.3833420417369577e-06, + "loss": 2.4986, + "step": 32686 + }, + { + "epoch": 0.9692791270053079, + "grad_norm": 0.06583403795957565, + "learning_rate": 2.3787557155151574e-06, + "loss": 2.5251, + "step": 32687 + }, + { + "epoch": 0.9693087803576195, + "grad_norm": 0.062038321048021317, + "learning_rate": 2.3741737958148935e-06, + "loss": 2.5232, + "step": 32688 + }, + { + "epoch": 0.9693384337099309, + "grad_norm": 0.0658080130815506, + "learning_rate": 2.3695962826767446e-06, + "loss": 2.5283, + "step": 32689 + }, + { + "epoch": 0.9693680870622424, + "grad_norm": 0.0655980110168457, + "learning_rate": 2.3650231761412343e-06, + "loss": 2.556, + "step": 32690 + }, + { + "epoch": 0.9693977404145538, + "grad_norm": 0.067759670317173, + "learning_rate": 2.36045447624883e-06, + "loss": 2.539, + "step": 32691 + }, + { + "epoch": 0.9694273937668654, + "grad_norm": 0.06593041121959686, + "learning_rate": 2.3558901830400547e-06, + "loss": 2.5499, + "step": 32692 + }, + { + "epoch": 0.9694570471191768, + "grad_norm": 0.06556539237499237, + "learning_rate": 2.3513302965552653e-06, + "loss": 2.537, + "step": 32693 + }, + { + "epoch": 0.9694867004714883, + "grad_norm": 0.06353693455457687, + "learning_rate": 2.3467748168348736e-06, + "loss": 2.5149, + "step": 32694 + }, + { + "epoch": 0.9695163538237997, + "grad_norm": 0.06493895500898361, + "learning_rate": 2.3422237439192917e-06, + "loss": 2.5364, + "step": 32695 + }, + { + "epoch": 0.9695460071761113, + "grad_norm": 0.06734340637922287, + "learning_rate": 2.3376770778486544e-06, + "loss": 2.5456, + "step": 32696 + }, + { + "epoch": 0.9695756605284227, + "grad_norm": 0.06779489666223526, + "learning_rate": 2.3331348186633183e-06, + "loss": 2.5461, + "step": 32697 + }, + { + "epoch": 0.9696053138807342, + "grad_norm": 0.06677161902189255, + "learning_rate": 2.3285969664034734e-06, + "loss": 2.5678, + "step": 32698 + }, + { + "epoch": 0.9696349672330457, + "grad_norm": 0.06217261776328087, + "learning_rate": 2.324063521109365e-06, + "loss": 2.5472, + "step": 32699 + }, + { + "epoch": 0.9696646205853572, + "grad_norm": 0.06567151099443436, + "learning_rate": 2.3195344828211285e-06, + "loss": 2.532, + "step": 32700 + }, + { + "epoch": 0.9696942739376686, + "grad_norm": 0.06375259906053543, + "learning_rate": 2.3150098515787866e-06, + "loss": 2.5113, + "step": 32701 + }, + { + "epoch": 0.9697239272899801, + "grad_norm": 0.06471813470125198, + "learning_rate": 2.310489627422474e-06, + "loss": 2.5093, + "step": 32702 + }, + { + "epoch": 0.9697535806422916, + "grad_norm": 0.06456665694713593, + "learning_rate": 2.3059738103922145e-06, + "loss": 2.5546, + "step": 32703 + }, + { + "epoch": 0.9697832339946031, + "grad_norm": 0.0642302930355072, + "learning_rate": 2.3014624005279763e-06, + "loss": 2.5176, + "step": 32704 + }, + { + "epoch": 0.9698128873469146, + "grad_norm": 0.06385542452335358, + "learning_rate": 2.2969553978697267e-06, + "loss": 2.551, + "step": 32705 + }, + { + "epoch": 0.969842540699226, + "grad_norm": 0.06516771763563156, + "learning_rate": 2.292452802457379e-06, + "loss": 2.524, + "step": 32706 + }, + { + "epoch": 0.9698721940515376, + "grad_norm": 0.06533287465572357, + "learning_rate": 2.287954614330734e-06, + "loss": 2.5329, + "step": 32707 + }, + { + "epoch": 0.969901847403849, + "grad_norm": 0.06849952042102814, + "learning_rate": 2.28346083352976e-06, + "loss": 2.5747, + "step": 32708 + }, + { + "epoch": 0.9699315007561605, + "grad_norm": 0.06421324610710144, + "learning_rate": 2.2789714600940926e-06, + "loss": 2.553, + "step": 32709 + }, + { + "epoch": 0.969961154108472, + "grad_norm": 0.06509006768465042, + "learning_rate": 2.274486494063588e-06, + "loss": 2.5678, + "step": 32710 + }, + { + "epoch": 0.9699908074607835, + "grad_norm": 0.06432649493217468, + "learning_rate": 2.2700059354779925e-06, + "loss": 2.5511, + "step": 32711 + }, + { + "epoch": 0.9700204608130949, + "grad_norm": 0.06556490063667297, + "learning_rate": 2.265529784376885e-06, + "loss": 2.5365, + "step": 32712 + }, + { + "epoch": 0.9700501141654064, + "grad_norm": 0.06378716975450516, + "learning_rate": 2.261058040799957e-06, + "loss": 2.5574, + "step": 32713 + }, + { + "epoch": 0.9700797675177179, + "grad_norm": 0.06534811854362488, + "learning_rate": 2.256590704786787e-06, + "loss": 2.5441, + "step": 32714 + }, + { + "epoch": 0.9701094208700294, + "grad_norm": 0.06548649072647095, + "learning_rate": 2.2521277763769555e-06, + "loss": 2.5539, + "step": 32715 + }, + { + "epoch": 0.9701390742223408, + "grad_norm": 0.06479736417531967, + "learning_rate": 2.2476692556099853e-06, + "loss": 2.5449, + "step": 32716 + }, + { + "epoch": 0.9701687275746523, + "grad_norm": 0.06628664582967758, + "learning_rate": 2.24321514252529e-06, + "loss": 2.5554, + "step": 32717 + }, + { + "epoch": 0.9701983809269638, + "grad_norm": 0.06397459656000137, + "learning_rate": 2.2387654371623935e-06, + "loss": 2.5368, + "step": 32718 + }, + { + "epoch": 0.9702280342792753, + "grad_norm": 0.06146066635847092, + "learning_rate": 2.2343201395606526e-06, + "loss": 2.5081, + "step": 32719 + }, + { + "epoch": 0.9702576876315867, + "grad_norm": 0.06342428177595139, + "learning_rate": 2.229879249759481e-06, + "loss": 2.5272, + "step": 32720 + }, + { + "epoch": 0.9702873409838982, + "grad_norm": 0.06264179199934006, + "learning_rate": 2.225442767798125e-06, + "loss": 2.5457, + "step": 32721 + }, + { + "epoch": 0.9703169943362097, + "grad_norm": 0.06508640944957733, + "learning_rate": 2.221010693715941e-06, + "loss": 2.5414, + "step": 32722 + }, + { + "epoch": 0.9703466476885212, + "grad_norm": 0.0648244246840477, + "learning_rate": 2.2165830275521215e-06, + "loss": 2.5494, + "step": 32723 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 0.06646164506673813, + "learning_rate": 2.212159769345967e-06, + "loss": 2.54, + "step": 32724 + }, + { + "epoch": 0.9704059543931441, + "grad_norm": 0.06229761615395546, + "learning_rate": 2.207740919136558e-06, + "loss": 2.529, + "step": 32725 + }, + { + "epoch": 0.9704356077454557, + "grad_norm": 0.06727048754692078, + "learning_rate": 2.20332647696303e-06, + "loss": 2.5477, + "step": 32726 + }, + { + "epoch": 0.9704652610977671, + "grad_norm": 0.06524845957756042, + "learning_rate": 2.198916442864518e-06, + "loss": 2.537, + "step": 32727 + }, + { + "epoch": 0.9704949144500786, + "grad_norm": 0.06443770229816437, + "learning_rate": 2.194510816879991e-06, + "loss": 2.5106, + "step": 32728 + }, + { + "epoch": 0.97052456780239, + "grad_norm": 0.06441058218479156, + "learning_rate": 2.1901095990485843e-06, + "loss": 2.5527, + "step": 32729 + }, + { + "epoch": 0.9705542211547016, + "grad_norm": 0.06460976600646973, + "learning_rate": 2.1857127894091557e-06, + "loss": 2.5057, + "step": 32730 + }, + { + "epoch": 0.970583874507013, + "grad_norm": 0.06575679033994675, + "learning_rate": 2.1813203880007293e-06, + "loss": 2.5212, + "step": 32731 + }, + { + "epoch": 0.9706135278593245, + "grad_norm": 0.06503846496343613, + "learning_rate": 2.176932394862108e-06, + "loss": 2.5239, + "step": 32732 + }, + { + "epoch": 0.970643181211636, + "grad_norm": 0.06314343959093094, + "learning_rate": 2.17254881003226e-06, + "loss": 2.5255, + "step": 32733 + }, + { + "epoch": 0.9706728345639475, + "grad_norm": 0.0637459009885788, + "learning_rate": 2.168169633549877e-06, + "loss": 2.5261, + "step": 32734 + }, + { + "epoch": 0.9707024879162589, + "grad_norm": 0.0712905153632164, + "learning_rate": 2.163794865453872e-06, + "loss": 2.5314, + "step": 32735 + }, + { + "epoch": 0.9707321412685704, + "grad_norm": 0.06521287560462952, + "learning_rate": 2.159424505782881e-06, + "loss": 2.5422, + "step": 32736 + }, + { + "epoch": 0.9707617946208819, + "grad_norm": 0.06220785155892372, + "learning_rate": 2.1550585545756505e-06, + "loss": 2.5157, + "step": 32737 + }, + { + "epoch": 0.9707914479731934, + "grad_norm": 0.0664554089307785, + "learning_rate": 2.150697011870817e-06, + "loss": 2.5392, + "step": 32738 + }, + { + "epoch": 0.9708211013255048, + "grad_norm": 0.06542952358722687, + "learning_rate": 2.1463398777070152e-06, + "loss": 2.5412, + "step": 32739 + }, + { + "epoch": 0.9708507546778163, + "grad_norm": 0.06460581719875336, + "learning_rate": 2.141987152122826e-06, + "loss": 2.5298, + "step": 32740 + }, + { + "epoch": 0.9708804080301278, + "grad_norm": 0.06498418003320694, + "learning_rate": 2.13763883515683e-06, + "loss": 2.5592, + "step": 32741 + }, + { + "epoch": 0.9709100613824393, + "grad_norm": 0.06256327033042908, + "learning_rate": 2.1332949268474954e-06, + "loss": 2.5247, + "step": 32742 + }, + { + "epoch": 0.9709397147347507, + "grad_norm": 0.06988664716482162, + "learning_rate": 2.128955427233237e-06, + "loss": 2.5322, + "step": 32743 + }, + { + "epoch": 0.9709693680870622, + "grad_norm": 0.06330208480358124, + "learning_rate": 2.1246203363525783e-06, + "loss": 2.5154, + "step": 32744 + }, + { + "epoch": 0.9709990214393737, + "grad_norm": 0.06695694476366043, + "learning_rate": 2.120289654243879e-06, + "loss": 2.5205, + "step": 32745 + }, + { + "epoch": 0.9710286747916852, + "grad_norm": 0.06484358012676239, + "learning_rate": 2.1159633809454403e-06, + "loss": 2.5358, + "step": 32746 + }, + { + "epoch": 0.9710583281439967, + "grad_norm": 0.06525216996669769, + "learning_rate": 2.1116415164956215e-06, + "loss": 2.5385, + "step": 32747 + }, + { + "epoch": 0.9710879814963082, + "grad_norm": 0.06449339538812637, + "learning_rate": 2.1073240609326694e-06, + "loss": 2.532, + "step": 32748 + }, + { + "epoch": 0.9711176348486197, + "grad_norm": 0.06595269590616226, + "learning_rate": 2.103011014294831e-06, + "loss": 2.5209, + "step": 32749 + }, + { + "epoch": 0.9711472882009311, + "grad_norm": 0.06490274518728256, + "learning_rate": 2.0987023766202984e-06, + "loss": 2.571, + "step": 32750 + }, + { + "epoch": 0.9711769415532426, + "grad_norm": 0.06637565046548843, + "learning_rate": 2.0943981479472627e-06, + "loss": 2.5423, + "step": 32751 + }, + { + "epoch": 0.971206594905554, + "grad_norm": 0.06598957628011703, + "learning_rate": 2.09009832831375e-06, + "loss": 2.5397, + "step": 32752 + }, + { + "epoch": 0.9712362482578656, + "grad_norm": 0.06825917214155197, + "learning_rate": 2.08580291775784e-06, + "loss": 2.5576, + "step": 32753 + }, + { + "epoch": 0.971265901610177, + "grad_norm": 0.06350596994161606, + "learning_rate": 2.0815119163176686e-06, + "loss": 2.5487, + "step": 32754 + }, + { + "epoch": 0.9712955549624885, + "grad_norm": 0.06375438719987869, + "learning_rate": 2.077225324031151e-06, + "loss": 2.5309, + "step": 32755 + }, + { + "epoch": 0.9713252083148, + "grad_norm": 0.0650763213634491, + "learning_rate": 2.072943140936312e-06, + "loss": 2.502, + "step": 32756 + }, + { + "epoch": 0.9713548616671115, + "grad_norm": 0.06612654030323029, + "learning_rate": 2.0686653670709543e-06, + "loss": 2.5385, + "step": 32757 + }, + { + "epoch": 0.9713845150194229, + "grad_norm": 0.0644347220659256, + "learning_rate": 2.064392002473103e-06, + "loss": 2.5476, + "step": 32758 + }, + { + "epoch": 0.9714141683717344, + "grad_norm": 0.06856691092252731, + "learning_rate": 2.0601230471805068e-06, + "loss": 2.5375, + "step": 32759 + }, + { + "epoch": 0.9714438217240459, + "grad_norm": 0.06407599151134491, + "learning_rate": 2.055858501230967e-06, + "loss": 2.5129, + "step": 32760 + }, + { + "epoch": 0.9714734750763574, + "grad_norm": 0.06604321300983429, + "learning_rate": 2.0515983646622883e-06, + "loss": 2.5442, + "step": 32761 + }, + { + "epoch": 0.9715031284286688, + "grad_norm": 0.06399407237768173, + "learning_rate": 2.047342637512217e-06, + "loss": 2.5256, + "step": 32762 + }, + { + "epoch": 0.9715327817809803, + "grad_norm": 0.06354159116744995, + "learning_rate": 2.0430913198183354e-06, + "loss": 2.4931, + "step": 32763 + }, + { + "epoch": 0.9715624351332918, + "grad_norm": 0.06424612551927567, + "learning_rate": 2.03884441161839e-06, + "loss": 2.5102, + "step": 32764 + }, + { + "epoch": 0.9715920884856033, + "grad_norm": 0.06705914437770844, + "learning_rate": 2.034601912949963e-06, + "loss": 2.5382, + "step": 32765 + }, + { + "epoch": 0.9716217418379147, + "grad_norm": 0.06440237164497375, + "learning_rate": 2.0303638238505784e-06, + "loss": 2.518, + "step": 32766 + }, + { + "epoch": 0.9716513951902263, + "grad_norm": 0.06407897174358368, + "learning_rate": 2.0261301443578185e-06, + "loss": 2.5236, + "step": 32767 + }, + { + "epoch": 0.9716810485425378, + "grad_norm": 0.0633280947804451, + "learning_rate": 2.021900874509153e-06, + "loss": 2.5213, + "step": 32768 + }, + { + "epoch": 0.9717107018948492, + "grad_norm": 0.06582637876272202, + "learning_rate": 2.0176760143419957e-06, + "loss": 2.565, + "step": 32769 + }, + { + "epoch": 0.9717403552471607, + "grad_norm": 0.06690748035907745, + "learning_rate": 2.013455563893818e-06, + "loss": 2.5252, + "step": 32770 + }, + { + "epoch": 0.9717700085994722, + "grad_norm": 0.06256440281867981, + "learning_rate": 2.009239523202033e-06, + "loss": 2.5492, + "step": 32771 + }, + { + "epoch": 0.9717996619517837, + "grad_norm": 0.06384745985269547, + "learning_rate": 2.0050278923038343e-06, + "loss": 2.5918, + "step": 32772 + }, + { + "epoch": 0.9718293153040951, + "grad_norm": 0.06669602543115616, + "learning_rate": 2.000820671236636e-06, + "loss": 2.5483, + "step": 32773 + }, + { + "epoch": 0.9718589686564066, + "grad_norm": 0.06206238269805908, + "learning_rate": 1.996617860037575e-06, + "loss": 2.5377, + "step": 32774 + }, + { + "epoch": 0.9718886220087181, + "grad_norm": 0.0662902444601059, + "learning_rate": 1.9924194587440103e-06, + "loss": 2.5277, + "step": 32775 + }, + { + "epoch": 0.9719182753610296, + "grad_norm": 0.06314956396818161, + "learning_rate": 1.9882254673930233e-06, + "loss": 2.5548, + "step": 32776 + }, + { + "epoch": 0.971947928713341, + "grad_norm": 0.06551118195056915, + "learning_rate": 1.9840358860218068e-06, + "loss": 2.5352, + "step": 32777 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 0.0649043396115303, + "learning_rate": 1.9798507146674417e-06, + "loss": 2.5253, + "step": 32778 + }, + { + "epoch": 0.972007235417964, + "grad_norm": 0.06367231160402298, + "learning_rate": 1.9756699533669543e-06, + "loss": 2.5762, + "step": 32779 + }, + { + "epoch": 0.9720368887702755, + "grad_norm": 0.06634121388196945, + "learning_rate": 1.9714936021574257e-06, + "loss": 2.5425, + "step": 32780 + }, + { + "epoch": 0.9720665421225869, + "grad_norm": 0.06436040997505188, + "learning_rate": 1.9673216610757717e-06, + "loss": 2.5303, + "step": 32781 + }, + { + "epoch": 0.9720961954748985, + "grad_norm": 0.06417646259069443, + "learning_rate": 1.9631541301590173e-06, + "loss": 2.5023, + "step": 32782 + }, + { + "epoch": 0.9721258488272099, + "grad_norm": 0.06544022262096405, + "learning_rate": 1.9589910094439665e-06, + "loss": 2.5039, + "step": 32783 + }, + { + "epoch": 0.9721555021795214, + "grad_norm": 0.06416019052267075, + "learning_rate": 1.9548322989675903e-06, + "loss": 2.5566, + "step": 32784 + }, + { + "epoch": 0.9721851555318328, + "grad_norm": 0.06234142929315567, + "learning_rate": 1.9506779987666366e-06, + "loss": 2.5665, + "step": 32785 + }, + { + "epoch": 0.9722148088841444, + "grad_norm": 0.06216654181480408, + "learning_rate": 1.9465281088779096e-06, + "loss": 2.5555, + "step": 32786 + }, + { + "epoch": 0.9722444622364559, + "grad_norm": 0.0640089139342308, + "learning_rate": 1.9423826293381574e-06, + "loss": 2.568, + "step": 32787 + }, + { + "epoch": 0.9722741155887673, + "grad_norm": 0.06325320154428482, + "learning_rate": 1.938241560184073e-06, + "loss": 2.5228, + "step": 32788 + }, + { + "epoch": 0.9723037689410788, + "grad_norm": 0.06398067623376846, + "learning_rate": 1.9341049014524047e-06, + "loss": 2.5156, + "step": 32789 + }, + { + "epoch": 0.9723334222933903, + "grad_norm": 0.06353580951690674, + "learning_rate": 1.9299726531797344e-06, + "loss": 2.5166, + "step": 32790 + }, + { + "epoch": 0.9723630756457018, + "grad_norm": 0.06413354724645615, + "learning_rate": 1.925844815402644e-06, + "loss": 2.5399, + "step": 32791 + }, + { + "epoch": 0.9723927289980132, + "grad_norm": 0.06355319917201996, + "learning_rate": 1.92172138815766e-06, + "loss": 2.5906, + "step": 32792 + }, + { + "epoch": 0.9724223823503247, + "grad_norm": 0.06465492397546768, + "learning_rate": 1.917602371481364e-06, + "loss": 2.5234, + "step": 32793 + }, + { + "epoch": 0.9724520357026362, + "grad_norm": 0.06384018808603287, + "learning_rate": 1.9134877654101714e-06, + "loss": 2.5657, + "step": 32794 + }, + { + "epoch": 0.9724816890549477, + "grad_norm": 0.06768842041492462, + "learning_rate": 1.9093775699805526e-06, + "loss": 2.5543, + "step": 32795 + }, + { + "epoch": 0.9725113424072591, + "grad_norm": 0.06499013304710388, + "learning_rate": 1.9052717852288681e-06, + "loss": 2.5535, + "step": 32796 + }, + { + "epoch": 0.9725409957595706, + "grad_norm": 0.06572301685810089, + "learning_rate": 1.9011704111914773e-06, + "loss": 2.5089, + "step": 32797 + }, + { + "epoch": 0.9725706491118821, + "grad_norm": 0.06406401842832565, + "learning_rate": 1.8970734479047957e-06, + "loss": 2.5433, + "step": 32798 + }, + { + "epoch": 0.9726003024641936, + "grad_norm": 0.06487303227186203, + "learning_rate": 1.892980895404961e-06, + "loss": 2.5106, + "step": 32799 + }, + { + "epoch": 0.972629955816505, + "grad_norm": 0.061748065054416656, + "learning_rate": 1.8888927537282773e-06, + "loss": 2.5104, + "step": 32800 + }, + { + "epoch": 0.9726596091688166, + "grad_norm": 0.06264705210924149, + "learning_rate": 1.8848090229109937e-06, + "loss": 2.5626, + "step": 32801 + }, + { + "epoch": 0.972689262521128, + "grad_norm": 0.06418372690677643, + "learning_rate": 1.8807297029891924e-06, + "loss": 2.5552, + "step": 32802 + }, + { + "epoch": 0.9727189158734395, + "grad_norm": 0.0636928528547287, + "learning_rate": 1.8766547939990664e-06, + "loss": 2.5413, + "step": 32803 + }, + { + "epoch": 0.9727485692257509, + "grad_norm": 0.0668659657239914, + "learning_rate": 1.8725842959766425e-06, + "loss": 2.5262, + "step": 32804 + }, + { + "epoch": 0.9727782225780625, + "grad_norm": 0.06715520471334457, + "learning_rate": 1.8685182089580033e-06, + "loss": 2.5217, + "step": 32805 + }, + { + "epoch": 0.9728078759303739, + "grad_norm": 0.06334105879068375, + "learning_rate": 1.8644565329791197e-06, + "loss": 2.5496, + "step": 32806 + }, + { + "epoch": 0.9728375292826854, + "grad_norm": 0.06565114110708237, + "learning_rate": 1.8603992680759629e-06, + "loss": 2.5249, + "step": 32807 + }, + { + "epoch": 0.9728671826349969, + "grad_norm": 0.06394849717617035, + "learning_rate": 1.8563464142845043e-06, + "loss": 2.5575, + "step": 32808 + }, + { + "epoch": 0.9728968359873084, + "grad_norm": 0.06298615783452988, + "learning_rate": 1.852297971640604e-06, + "loss": 2.5228, + "step": 32809 + }, + { + "epoch": 0.9729264893396199, + "grad_norm": 0.06395905464887619, + "learning_rate": 1.8482539401800669e-06, + "loss": 2.5382, + "step": 32810 + }, + { + "epoch": 0.9729561426919313, + "grad_norm": 0.06564734876155853, + "learning_rate": 1.8442143199388084e-06, + "loss": 2.5459, + "step": 32811 + }, + { + "epoch": 0.9729857960442428, + "grad_norm": 0.06393289566040039, + "learning_rate": 1.8401791109525222e-06, + "loss": 2.5543, + "step": 32812 + }, + { + "epoch": 0.9730154493965543, + "grad_norm": 0.06666212528944016, + "learning_rate": 1.8361483132569022e-06, + "loss": 2.5477, + "step": 32813 + }, + { + "epoch": 0.9730451027488658, + "grad_norm": 0.06423927843570709, + "learning_rate": 1.8321219268877531e-06, + "loss": 2.5648, + "step": 32814 + }, + { + "epoch": 0.9730747561011772, + "grad_norm": 0.06550022959709167, + "learning_rate": 1.8280999518806575e-06, + "loss": 2.5555, + "step": 32815 + }, + { + "epoch": 0.9731044094534888, + "grad_norm": 0.06283202767372131, + "learning_rate": 1.824082388271253e-06, + "loss": 2.5266, + "step": 32816 + }, + { + "epoch": 0.9731340628058002, + "grad_norm": 0.060741763561964035, + "learning_rate": 1.8200692360951232e-06, + "loss": 2.5446, + "step": 32817 + }, + { + "epoch": 0.9731637161581117, + "grad_norm": 0.06334444880485535, + "learning_rate": 1.8160604953877946e-06, + "loss": 2.5327, + "step": 32818 + }, + { + "epoch": 0.9731933695104231, + "grad_norm": 0.0660315528512001, + "learning_rate": 1.812056166184739e-06, + "loss": 2.5362, + "step": 32819 + }, + { + "epoch": 0.9732230228627347, + "grad_norm": 0.06408943980932236, + "learning_rate": 1.808056248521428e-06, + "loss": 2.4885, + "step": 32820 + }, + { + "epoch": 0.9732526762150461, + "grad_norm": 0.06436772644519806, + "learning_rate": 1.8040607424333333e-06, + "loss": 2.5315, + "step": 32821 + }, + { + "epoch": 0.9732823295673576, + "grad_norm": 0.063447505235672, + "learning_rate": 1.8000696479557598e-06, + "loss": 2.5422, + "step": 32822 + }, + { + "epoch": 0.973311982919669, + "grad_norm": 0.06442609429359436, + "learning_rate": 1.7960829651240685e-06, + "loss": 2.5569, + "step": 32823 + }, + { + "epoch": 0.9733416362719806, + "grad_norm": 0.06487623602151871, + "learning_rate": 1.7921006939736195e-06, + "loss": 2.5603, + "step": 32824 + }, + { + "epoch": 0.973371289624292, + "grad_norm": 0.06696148216724396, + "learning_rate": 1.7881228345396073e-06, + "loss": 2.5428, + "step": 32825 + }, + { + "epoch": 0.9734009429766035, + "grad_norm": 0.06344510614871979, + "learning_rate": 1.7841493868573367e-06, + "loss": 2.5349, + "step": 32826 + }, + { + "epoch": 0.9734305963289149, + "grad_norm": 0.062198664993047714, + "learning_rate": 1.7801803509618908e-06, + "loss": 2.5592, + "step": 32827 + }, + { + "epoch": 0.9734602496812265, + "grad_norm": 0.0635727271437645, + "learning_rate": 1.7762157268884638e-06, + "loss": 2.5059, + "step": 32828 + }, + { + "epoch": 0.973489903033538, + "grad_norm": 0.06381437182426453, + "learning_rate": 1.772255514672194e-06, + "loss": 2.5582, + "step": 32829 + }, + { + "epoch": 0.9735195563858494, + "grad_norm": 0.06486161798238754, + "learning_rate": 1.7682997143481095e-06, + "loss": 2.5395, + "step": 32830 + }, + { + "epoch": 0.973549209738161, + "grad_norm": 0.06402655690908432, + "learning_rate": 1.7643483259512371e-06, + "loss": 2.5473, + "step": 32831 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 0.06248673424124718, + "learning_rate": 1.760401349516605e-06, + "loss": 2.5592, + "step": 32832 + }, + { + "epoch": 0.9736085164427839, + "grad_norm": 0.06560187041759491, + "learning_rate": 1.7564587850791291e-06, + "loss": 2.5238, + "step": 32833 + }, + { + "epoch": 0.9736381697950953, + "grad_norm": 0.06515049189329147, + "learning_rate": 1.7525206326737264e-06, + "loss": 2.5315, + "step": 32834 + }, + { + "epoch": 0.9736678231474069, + "grad_norm": 0.06109359487891197, + "learning_rate": 1.7485868923352577e-06, + "loss": 2.5118, + "step": 32835 + }, + { + "epoch": 0.9736974764997183, + "grad_norm": 0.06445585936307907, + "learning_rate": 1.7446575640986395e-06, + "loss": 2.5289, + "step": 32836 + }, + { + "epoch": 0.9737271298520298, + "grad_norm": 0.0631716251373291, + "learning_rate": 1.7407326479985664e-06, + "loss": 2.5321, + "step": 32837 + }, + { + "epoch": 0.9737567832043412, + "grad_norm": 0.06304392963647842, + "learning_rate": 1.7368121440698436e-06, + "loss": 2.4995, + "step": 32838 + }, + { + "epoch": 0.9737864365566528, + "grad_norm": 0.06639717519283295, + "learning_rate": 1.7328960523471105e-06, + "loss": 2.5793, + "step": 32839 + }, + { + "epoch": 0.9738160899089642, + "grad_norm": 0.0641787126660347, + "learning_rate": 1.728984372865172e-06, + "loss": 2.5357, + "step": 32840 + }, + { + "epoch": 0.9738457432612757, + "grad_norm": 0.06745252758264542, + "learning_rate": 1.7250771056586122e-06, + "loss": 2.52, + "step": 32841 + }, + { + "epoch": 0.9738753966135871, + "grad_norm": 0.06326131522655487, + "learning_rate": 1.7211742507619588e-06, + "loss": 2.5445, + "step": 32842 + }, + { + "epoch": 0.9739050499658987, + "grad_norm": 0.06267719715833664, + "learning_rate": 1.717275808209906e-06, + "loss": 2.5345, + "step": 32843 + }, + { + "epoch": 0.9739347033182101, + "grad_norm": 0.06333336234092712, + "learning_rate": 1.713381778036871e-06, + "loss": 2.5472, + "step": 32844 + }, + { + "epoch": 0.9739643566705216, + "grad_norm": 0.06613572686910629, + "learning_rate": 1.7094921602773817e-06, + "loss": 2.5358, + "step": 32845 + }, + { + "epoch": 0.973994010022833, + "grad_norm": 0.0656941756606102, + "learning_rate": 1.7056069549658548e-06, + "loss": 2.5312, + "step": 32846 + }, + { + "epoch": 0.9740236633751446, + "grad_norm": 0.06301349401473999, + "learning_rate": 1.701726162136763e-06, + "loss": 2.5337, + "step": 32847 + }, + { + "epoch": 0.974053316727456, + "grad_norm": 0.06648773699998856, + "learning_rate": 1.6978497818243565e-06, + "loss": 2.5289, + "step": 32848 + }, + { + "epoch": 0.9740829700797675, + "grad_norm": 0.06191141903400421, + "learning_rate": 1.6939778140630525e-06, + "loss": 2.5439, + "step": 32849 + }, + { + "epoch": 0.974112623432079, + "grad_norm": 0.06431109458208084, + "learning_rate": 1.6901102588871009e-06, + "loss": 2.5137, + "step": 32850 + }, + { + "epoch": 0.9741422767843905, + "grad_norm": 0.0640365406870842, + "learning_rate": 1.6862471163307525e-06, + "loss": 2.5208, + "step": 32851 + }, + { + "epoch": 0.974171930136702, + "grad_norm": 0.06252530962228775, + "learning_rate": 1.6823883864282018e-06, + "loss": 2.5561, + "step": 32852 + }, + { + "epoch": 0.9742015834890134, + "grad_norm": 0.06699861586093903, + "learning_rate": 1.678534069213644e-06, + "loss": 2.5081, + "step": 32853 + }, + { + "epoch": 0.974231236841325, + "grad_norm": 0.06765581667423248, + "learning_rate": 1.6746841647212185e-06, + "loss": 2.574, + "step": 32854 + }, + { + "epoch": 0.9742608901936364, + "grad_norm": 0.06657109409570694, + "learning_rate": 1.6708386729849535e-06, + "loss": 2.5267, + "step": 32855 + }, + { + "epoch": 0.9742905435459479, + "grad_norm": 0.0644323006272316, + "learning_rate": 1.6669975940390436e-06, + "loss": 2.5252, + "step": 32856 + }, + { + "epoch": 0.9743201968982593, + "grad_norm": 0.06610174477100372, + "learning_rate": 1.663160927917351e-06, + "loss": 2.587, + "step": 32857 + }, + { + "epoch": 0.9743498502505709, + "grad_norm": 0.06313152611255646, + "learning_rate": 1.659328674653904e-06, + "loss": 2.4883, + "step": 32858 + }, + { + "epoch": 0.9743795036028823, + "grad_norm": 0.06479774415493011, + "learning_rate": 1.655500834282675e-06, + "loss": 2.5692, + "step": 32859 + }, + { + "epoch": 0.9744091569551938, + "grad_norm": 0.06299405544996262, + "learning_rate": 1.6516774068374708e-06, + "loss": 2.4998, + "step": 32860 + }, + { + "epoch": 0.9744388103075052, + "grad_norm": 0.06374860554933548, + "learning_rate": 1.6478583923522638e-06, + "loss": 2.5474, + "step": 32861 + }, + { + "epoch": 0.9744684636598168, + "grad_norm": 0.06355102360248566, + "learning_rate": 1.6440437908607497e-06, + "loss": 2.5346, + "step": 32862 + }, + { + "epoch": 0.9744981170121282, + "grad_norm": 0.06569627672433853, + "learning_rate": 1.6402336023967902e-06, + "loss": 2.5403, + "step": 32863 + }, + { + "epoch": 0.9745277703644397, + "grad_norm": 0.06411157548427582, + "learning_rate": 1.6364278269941357e-06, + "loss": 2.5352, + "step": 32864 + }, + { + "epoch": 0.9745574237167511, + "grad_norm": 0.06313169747591019, + "learning_rate": 1.6326264646864263e-06, + "loss": 2.5256, + "step": 32865 + }, + { + "epoch": 0.9745870770690627, + "grad_norm": 0.06641606986522675, + "learning_rate": 1.6288295155073573e-06, + "loss": 2.543, + "step": 32866 + }, + { + "epoch": 0.9746167304213741, + "grad_norm": 0.06489428132772446, + "learning_rate": 1.6250369794905129e-06, + "loss": 2.5631, + "step": 32867 + }, + { + "epoch": 0.9746463837736856, + "grad_norm": 0.06537880748510361, + "learning_rate": 1.6212488566695882e-06, + "loss": 2.5764, + "step": 32868 + }, + { + "epoch": 0.974676037125997, + "grad_norm": 0.06618751585483551, + "learning_rate": 1.617465147078001e-06, + "loss": 2.5564, + "step": 32869 + }, + { + "epoch": 0.9747056904783086, + "grad_norm": 0.06222984194755554, + "learning_rate": 1.6136858507493358e-06, + "loss": 2.5137, + "step": 32870 + }, + { + "epoch": 0.9747353438306201, + "grad_norm": 0.064595527946949, + "learning_rate": 1.60991096771701e-06, + "loss": 2.5087, + "step": 32871 + }, + { + "epoch": 0.9747649971829315, + "grad_norm": 0.06551358103752136, + "learning_rate": 1.6061404980144412e-06, + "loss": 2.524, + "step": 32872 + }, + { + "epoch": 0.9747946505352431, + "grad_norm": 0.06435701251029968, + "learning_rate": 1.6023744416751029e-06, + "loss": 2.5378, + "step": 32873 + }, + { + "epoch": 0.9748243038875545, + "grad_norm": 0.06386137008666992, + "learning_rate": 1.5986127987322463e-06, + "loss": 2.5604, + "step": 32874 + }, + { + "epoch": 0.974853957239866, + "grad_norm": 0.07730988413095474, + "learning_rate": 1.5948555692192336e-06, + "loss": 2.5517, + "step": 32875 + }, + { + "epoch": 0.9748836105921774, + "grad_norm": 0.06425131112337112, + "learning_rate": 1.5911027531693156e-06, + "loss": 2.5207, + "step": 32876 + }, + { + "epoch": 0.974913263944489, + "grad_norm": 0.06638596951961517, + "learning_rate": 1.5873543506157439e-06, + "loss": 2.5489, + "step": 32877 + }, + { + "epoch": 0.9749429172968004, + "grad_norm": 0.06606696546077728, + "learning_rate": 1.583610361591714e-06, + "loss": 2.5529, + "step": 32878 + }, + { + "epoch": 0.9749725706491119, + "grad_norm": 0.06482552736997604, + "learning_rate": 1.5798707861303662e-06, + "loss": 2.5303, + "step": 32879 + }, + { + "epoch": 0.9750022240014233, + "grad_norm": 0.06518887728452682, + "learning_rate": 1.5761356242647851e-06, + "loss": 2.5549, + "step": 32880 + }, + { + "epoch": 0.9750318773537349, + "grad_norm": 0.0671958327293396, + "learning_rate": 1.5724048760281106e-06, + "loss": 2.5372, + "step": 32881 + }, + { + "epoch": 0.9750615307060463, + "grad_norm": 0.06365033984184265, + "learning_rate": 1.5686785414533167e-06, + "loss": 2.5421, + "step": 32882 + }, + { + "epoch": 0.9750911840583578, + "grad_norm": 0.06757307797670364, + "learning_rate": 1.5649566205734877e-06, + "loss": 2.5523, + "step": 32883 + }, + { + "epoch": 0.9751208374106692, + "grad_norm": 0.066974937915802, + "learning_rate": 1.561239113421431e-06, + "loss": 2.5436, + "step": 32884 + }, + { + "epoch": 0.9751504907629808, + "grad_norm": 0.06281649321317673, + "learning_rate": 1.557526020030231e-06, + "loss": 2.5053, + "step": 32885 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 0.06409399956464767, + "learning_rate": 1.5538173404326395e-06, + "loss": 2.5345, + "step": 32886 + }, + { + "epoch": 0.9752097974676037, + "grad_norm": 0.06374851614236832, + "learning_rate": 1.55011307466163e-06, + "loss": 2.5274, + "step": 32887 + }, + { + "epoch": 0.9752394508199151, + "grad_norm": 0.06821352243423462, + "learning_rate": 1.546413222749843e-06, + "loss": 2.5393, + "step": 32888 + }, + { + "epoch": 0.9752691041722267, + "grad_norm": 0.06400422006845474, + "learning_rate": 1.542717784730141e-06, + "loss": 2.5713, + "step": 32889 + }, + { + "epoch": 0.9752987575245381, + "grad_norm": 0.06684406846761703, + "learning_rate": 1.5390267606352759e-06, + "loss": 2.559, + "step": 32890 + }, + { + "epoch": 0.9753284108768496, + "grad_norm": 0.06696882098913193, + "learning_rate": 1.5353401504978882e-06, + "loss": 2.5216, + "step": 32891 + }, + { + "epoch": 0.9753580642291612, + "grad_norm": 0.06542935967445374, + "learning_rate": 1.5316579543505626e-06, + "loss": 2.5447, + "step": 32892 + }, + { + "epoch": 0.9753877175814726, + "grad_norm": 0.06462964415550232, + "learning_rate": 1.5279801722259957e-06, + "loss": 2.5308, + "step": 32893 + }, + { + "epoch": 0.9754173709337841, + "grad_norm": 0.06374839693307877, + "learning_rate": 1.5243068041567721e-06, + "loss": 2.5034, + "step": 32894 + }, + { + "epoch": 0.9754470242860955, + "grad_norm": 0.0669814869761467, + "learning_rate": 1.5206378501753104e-06, + "loss": 2.5119, + "step": 32895 + }, + { + "epoch": 0.9754766776384071, + "grad_norm": 0.0671275183558464, + "learning_rate": 1.5169733103141959e-06, + "loss": 2.5284, + "step": 32896 + }, + { + "epoch": 0.9755063309907185, + "grad_norm": 0.06610316783189774, + "learning_rate": 1.5133131846058468e-06, + "loss": 2.5904, + "step": 32897 + }, + { + "epoch": 0.97553598434303, + "grad_norm": 0.06134732812643051, + "learning_rate": 1.5096574730826818e-06, + "loss": 2.5009, + "step": 32898 + }, + { + "epoch": 0.9755656376953414, + "grad_norm": 0.06491456925868988, + "learning_rate": 1.5060061757770082e-06, + "loss": 2.5405, + "step": 32899 + }, + { + "epoch": 0.975595291047653, + "grad_norm": 0.0642072781920433, + "learning_rate": 1.5023592927213004e-06, + "loss": 2.583, + "step": 32900 + }, + { + "epoch": 0.9756249443999644, + "grad_norm": 0.06456662714481354, + "learning_rate": 1.4987168239476989e-06, + "loss": 2.5858, + "step": 32901 + }, + { + "epoch": 0.9756545977522759, + "grad_norm": 0.06561250239610672, + "learning_rate": 1.495078769488567e-06, + "loss": 2.5501, + "step": 32902 + }, + { + "epoch": 0.9756842511045873, + "grad_norm": 0.0659613087773323, + "learning_rate": 1.4914451293760455e-06, + "loss": 2.5553, + "step": 32903 + }, + { + "epoch": 0.9757139044568989, + "grad_norm": 0.06394175440073013, + "learning_rate": 1.4878159036423866e-06, + "loss": 2.5826, + "step": 32904 + }, + { + "epoch": 0.9757435578092103, + "grad_norm": 0.062389764934778214, + "learning_rate": 1.48419109231962e-06, + "loss": 2.5451, + "step": 32905 + }, + { + "epoch": 0.9757732111615218, + "grad_norm": 0.06548643112182617, + "learning_rate": 1.4805706954399978e-06, + "loss": 2.5214, + "step": 32906 + }, + { + "epoch": 0.9758028645138332, + "grad_norm": 0.06215332821011543, + "learning_rate": 1.4769547130354387e-06, + "loss": 2.5195, + "step": 32907 + }, + { + "epoch": 0.9758325178661448, + "grad_norm": 0.06909608095884323, + "learning_rate": 1.4733431451380285e-06, + "loss": 2.563, + "step": 32908 + }, + { + "epoch": 0.9758621712184562, + "grad_norm": 0.06273764371871948, + "learning_rate": 1.4697359917797416e-06, + "loss": 2.5398, + "step": 32909 + }, + { + "epoch": 0.9758918245707677, + "grad_norm": 0.06283947825431824, + "learning_rate": 1.4661332529924964e-06, + "loss": 2.5553, + "step": 32910 + }, + { + "epoch": 0.9759214779230792, + "grad_norm": 0.06646761298179626, + "learning_rate": 1.4625349288082124e-06, + "loss": 2.5322, + "step": 32911 + }, + { + "epoch": 0.9759511312753907, + "grad_norm": 0.06367625296115875, + "learning_rate": 1.4589410192587526e-06, + "loss": 2.5543, + "step": 32912 + }, + { + "epoch": 0.9759807846277022, + "grad_norm": 0.06329236179590225, + "learning_rate": 1.4553515243759808e-06, + "loss": 2.5373, + "step": 32913 + }, + { + "epoch": 0.9760104379800136, + "grad_norm": 0.064116470515728, + "learning_rate": 1.4517664441915934e-06, + "loss": 2.5534, + "step": 32914 + }, + { + "epoch": 0.9760400913323252, + "grad_norm": 0.06442850083112717, + "learning_rate": 1.4481857787374542e-06, + "loss": 2.5144, + "step": 32915 + }, + { + "epoch": 0.9760697446846366, + "grad_norm": 0.06986177712678909, + "learning_rate": 1.4446095280451487e-06, + "loss": 2.5444, + "step": 32916 + }, + { + "epoch": 0.9760993980369481, + "grad_norm": 0.06230446696281433, + "learning_rate": 1.441037692146374e-06, + "loss": 2.5064, + "step": 32917 + }, + { + "epoch": 0.9761290513892595, + "grad_norm": 0.0670555979013443, + "learning_rate": 1.4374702710728826e-06, + "loss": 2.552, + "step": 32918 + }, + { + "epoch": 0.9761587047415711, + "grad_norm": 0.06578955054283142, + "learning_rate": 1.4339072648560935e-06, + "loss": 2.5376, + "step": 32919 + }, + { + "epoch": 0.9761883580938825, + "grad_norm": 0.06385234743356705, + "learning_rate": 1.4303486735276483e-06, + "loss": 2.5303, + "step": 32920 + }, + { + "epoch": 0.976218011446194, + "grad_norm": 0.06470921635627747, + "learning_rate": 1.4267944971190773e-06, + "loss": 2.5952, + "step": 32921 + }, + { + "epoch": 0.9762476647985054, + "grad_norm": 0.06508026272058487, + "learning_rate": 1.4232447356617995e-06, + "loss": 2.5819, + "step": 32922 + }, + { + "epoch": 0.976277318150817, + "grad_norm": 0.06641914695501328, + "learning_rate": 1.41969938918729e-06, + "loss": 2.5311, + "step": 32923 + }, + { + "epoch": 0.9763069715031284, + "grad_norm": 0.0634770542383194, + "learning_rate": 1.416158457726857e-06, + "loss": 2.53, + "step": 32924 + }, + { + "epoch": 0.9763366248554399, + "grad_norm": 0.06348579376935959, + "learning_rate": 1.4126219413119757e-06, + "loss": 2.5049, + "step": 32925 + }, + { + "epoch": 0.9763662782077513, + "grad_norm": 0.06499126553535461, + "learning_rate": 1.4090898399738982e-06, + "loss": 2.5632, + "step": 32926 + }, + { + "epoch": 0.9763959315600629, + "grad_norm": 0.0640670657157898, + "learning_rate": 1.4055621537439333e-06, + "loss": 2.5575, + "step": 32927 + }, + { + "epoch": 0.9764255849123743, + "grad_norm": 0.06219841539859772, + "learning_rate": 1.402038882653278e-06, + "loss": 2.5232, + "step": 32928 + }, + { + "epoch": 0.9764552382646858, + "grad_norm": 0.06424997001886368, + "learning_rate": 1.39852002673313e-06, + "loss": 2.5712, + "step": 32929 + }, + { + "epoch": 0.9764848916169973, + "grad_norm": 0.06409594416618347, + "learning_rate": 1.395005586014686e-06, + "loss": 2.5248, + "step": 32930 + }, + { + "epoch": 0.9765145449693088, + "grad_norm": 0.06444636732339859, + "learning_rate": 1.3914955605290326e-06, + "loss": 2.5503, + "step": 32931 + }, + { + "epoch": 0.9765441983216202, + "grad_norm": 0.0650663822889328, + "learning_rate": 1.3879899503073112e-06, + "loss": 2.5437, + "step": 32932 + }, + { + "epoch": 0.9765738516739317, + "grad_norm": 0.06726177036762238, + "learning_rate": 1.3844887553804974e-06, + "loss": 2.5335, + "step": 32933 + }, + { + "epoch": 0.9766035050262433, + "grad_norm": 0.06320024281740189, + "learning_rate": 1.3809919757796218e-06, + "loss": 2.5443, + "step": 32934 + }, + { + "epoch": 0.9766331583785547, + "grad_norm": 0.0627439022064209, + "learning_rate": 1.3774996115356597e-06, + "loss": 2.5608, + "step": 32935 + }, + { + "epoch": 0.9766628117308662, + "grad_norm": 0.06406913697719574, + "learning_rate": 1.3740116626795308e-06, + "loss": 2.5195, + "step": 32936 + }, + { + "epoch": 0.9766924650831776, + "grad_norm": 0.06445419788360596, + "learning_rate": 1.3705281292421546e-06, + "loss": 2.5157, + "step": 32937 + }, + { + "epoch": 0.9767221184354892, + "grad_norm": 0.06398559361696243, + "learning_rate": 1.3670490112542844e-06, + "loss": 2.5255, + "step": 32938 + }, + { + "epoch": 0.9767517717878006, + "grad_norm": 0.06354381889104843, + "learning_rate": 1.3635743087467844e-06, + "loss": 2.5333, + "step": 32939 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 0.06393877416849136, + "learning_rate": 1.3601040217504633e-06, + "loss": 2.5318, + "step": 32940 + }, + { + "epoch": 0.9768110784924235, + "grad_norm": 0.063652403652668, + "learning_rate": 1.3566381502959634e-06, + "loss": 2.5507, + "step": 32941 + }, + { + "epoch": 0.9768407318447351, + "grad_norm": 0.06439093500375748, + "learning_rate": 1.3531766944140377e-06, + "loss": 2.5318, + "step": 32942 + }, + { + "epoch": 0.9768703851970465, + "grad_norm": 0.06385219097137451, + "learning_rate": 1.3497196541353284e-06, + "loss": 2.5416, + "step": 32943 + }, + { + "epoch": 0.976900038549358, + "grad_norm": 0.06358131766319275, + "learning_rate": 1.3462670294904777e-06, + "loss": 2.5548, + "step": 32944 + }, + { + "epoch": 0.9769296919016695, + "grad_norm": 0.06273514777421951, + "learning_rate": 1.3428188205099612e-06, + "loss": 2.5504, + "step": 32945 + }, + { + "epoch": 0.976959345253981, + "grad_norm": 0.06512884050607681, + "learning_rate": 1.339375027224421e-06, + "loss": 2.5612, + "step": 32946 + }, + { + "epoch": 0.9769889986062924, + "grad_norm": 0.06310838460922241, + "learning_rate": 1.3359356496642771e-06, + "loss": 2.5195, + "step": 32947 + }, + { + "epoch": 0.9770186519586039, + "grad_norm": 0.06266199797391891, + "learning_rate": 1.332500687860061e-06, + "loss": 2.5244, + "step": 32948 + }, + { + "epoch": 0.9770483053109154, + "grad_norm": 0.06201054900884628, + "learning_rate": 1.329070141842137e-06, + "loss": 2.5294, + "step": 32949 + }, + { + "epoch": 0.9770779586632269, + "grad_norm": 0.06276663392782211, + "learning_rate": 1.3256440116408697e-06, + "loss": 2.5558, + "step": 32950 + }, + { + "epoch": 0.9771076120155383, + "grad_norm": 0.06623149663209915, + "learning_rate": 1.3222222972866238e-06, + "loss": 2.5194, + "step": 32951 + }, + { + "epoch": 0.9771372653678498, + "grad_norm": 0.062097493559122086, + "learning_rate": 1.3188049988097083e-06, + "loss": 2.5192, + "step": 32952 + }, + { + "epoch": 0.9771669187201613, + "grad_norm": 0.06574445962905884, + "learning_rate": 1.315392116240377e-06, + "loss": 2.5934, + "step": 32953 + }, + { + "epoch": 0.9771965720724728, + "grad_norm": 0.06694795936346054, + "learning_rate": 1.3119836496088278e-06, + "loss": 2.5415, + "step": 32954 + }, + { + "epoch": 0.9772262254247843, + "grad_norm": 0.06493385881185532, + "learning_rate": 1.3085795989452587e-06, + "loss": 2.5377, + "step": 32955 + }, + { + "epoch": 0.9772558787770957, + "grad_norm": 0.06489590555429459, + "learning_rate": 1.3051799642798124e-06, + "loss": 2.5435, + "step": 32956 + }, + { + "epoch": 0.9772855321294073, + "grad_norm": 0.06212369725108147, + "learning_rate": 1.3017847456426312e-06, + "loss": 2.5226, + "step": 32957 + }, + { + "epoch": 0.9773151854817187, + "grad_norm": 0.06485150754451752, + "learning_rate": 1.298393943063747e-06, + "loss": 2.5456, + "step": 32958 + }, + { + "epoch": 0.9773448388340302, + "grad_norm": 0.06709251552820206, + "learning_rate": 1.2950075565731357e-06, + "loss": 2.5275, + "step": 32959 + }, + { + "epoch": 0.9773744921863416, + "grad_norm": 0.06416898220777512, + "learning_rate": 1.2916255862008841e-06, + "loss": 2.5649, + "step": 32960 + }, + { + "epoch": 0.9774041455386532, + "grad_norm": 0.06739481538534164, + "learning_rate": 1.2882480319768575e-06, + "loss": 2.5602, + "step": 32961 + }, + { + "epoch": 0.9774337988909646, + "grad_norm": 0.06354238837957382, + "learning_rate": 1.2848748939310318e-06, + "loss": 2.5334, + "step": 32962 + }, + { + "epoch": 0.9774634522432761, + "grad_norm": 0.061837028712034225, + "learning_rate": 1.2815061720932165e-06, + "loss": 2.5405, + "step": 32963 + }, + { + "epoch": 0.9774931055955876, + "grad_norm": 0.062419649213552475, + "learning_rate": 1.2781418664932765e-06, + "loss": 2.5528, + "step": 32964 + }, + { + "epoch": 0.9775227589478991, + "grad_norm": 0.0664181262254715, + "learning_rate": 1.2747819771610214e-06, + "loss": 2.5151, + "step": 32965 + }, + { + "epoch": 0.9775524123002105, + "grad_norm": 0.06619825959205627, + "learning_rate": 1.2714265041260942e-06, + "loss": 2.5393, + "step": 32966 + }, + { + "epoch": 0.977582065652522, + "grad_norm": 0.0640801414847374, + "learning_rate": 1.2680754474183597e-06, + "loss": 2.5304, + "step": 32967 + }, + { + "epoch": 0.9776117190048335, + "grad_norm": 0.06303410232067108, + "learning_rate": 1.2647288070674058e-06, + "loss": 2.5335, + "step": 32968 + }, + { + "epoch": 0.977641372357145, + "grad_norm": 0.0626748576760292, + "learning_rate": 1.2613865831028747e-06, + "loss": 2.5459, + "step": 32969 + }, + { + "epoch": 0.9776710257094564, + "grad_norm": 0.0631072148680687, + "learning_rate": 1.25804877555441e-06, + "loss": 2.5542, + "step": 32970 + }, + { + "epoch": 0.9777006790617679, + "grad_norm": 0.06612029671669006, + "learning_rate": 1.254715384451488e-06, + "loss": 2.5447, + "step": 32971 + }, + { + "epoch": 0.9777303324140794, + "grad_norm": 0.06632888317108154, + "learning_rate": 1.251386409823696e-06, + "loss": 2.531, + "step": 32972 + }, + { + "epoch": 0.9777599857663909, + "grad_norm": 0.06841697543859482, + "learning_rate": 1.2480618517005104e-06, + "loss": 2.5378, + "step": 32973 + }, + { + "epoch": 0.9777896391187023, + "grad_norm": 0.06739402562379837, + "learning_rate": 1.2447417101112968e-06, + "loss": 2.5515, + "step": 32974 + }, + { + "epoch": 0.9778192924710138, + "grad_norm": 0.06509960442781448, + "learning_rate": 1.2414259850855315e-06, + "loss": 2.5411, + "step": 32975 + }, + { + "epoch": 0.9778489458233254, + "grad_norm": 0.06333781778812408, + "learning_rate": 1.2381146766525243e-06, + "loss": 2.5545, + "step": 32976 + }, + { + "epoch": 0.9778785991756368, + "grad_norm": 0.06456490606069565, + "learning_rate": 1.2348077848416406e-06, + "loss": 2.5558, + "step": 32977 + }, + { + "epoch": 0.9779082525279483, + "grad_norm": 0.06283383071422577, + "learning_rate": 1.2315053096821905e-06, + "loss": 2.5062, + "step": 32978 + }, + { + "epoch": 0.9779379058802598, + "grad_norm": 0.06448521465063095, + "learning_rate": 1.228207251203317e-06, + "loss": 2.5051, + "step": 32979 + }, + { + "epoch": 0.9779675592325713, + "grad_norm": 0.06645093113183975, + "learning_rate": 1.224913609434275e-06, + "loss": 2.5472, + "step": 32980 + }, + { + "epoch": 0.9779972125848827, + "grad_norm": 0.06538379192352295, + "learning_rate": 1.221624384404263e-06, + "loss": 2.545, + "step": 32981 + }, + { + "epoch": 0.9780268659371942, + "grad_norm": 0.06288297474384308, + "learning_rate": 1.2183395761423687e-06, + "loss": 2.5581, + "step": 32982 + }, + { + "epoch": 0.9780565192895057, + "grad_norm": 0.06455624103546143, + "learning_rate": 1.2150591846776803e-06, + "loss": 2.5607, + "step": 32983 + }, + { + "epoch": 0.9780861726418172, + "grad_norm": 0.06458109617233276, + "learning_rate": 1.2117832100392857e-06, + "loss": 2.5583, + "step": 32984 + }, + { + "epoch": 0.9781158259941286, + "grad_norm": 0.0655582919716835, + "learning_rate": 1.2085116522561613e-06, + "loss": 2.5236, + "step": 32985 + }, + { + "epoch": 0.9781454793464401, + "grad_norm": 0.06446325033903122, + "learning_rate": 1.2052445113572841e-06, + "loss": 2.5315, + "step": 32986 + }, + { + "epoch": 0.9781751326987516, + "grad_norm": 0.06415681540966034, + "learning_rate": 1.2019817873715756e-06, + "loss": 2.5272, + "step": 32987 + }, + { + "epoch": 0.9782047860510631, + "grad_norm": 0.06353525072336197, + "learning_rate": 1.1987234803279567e-06, + "loss": 2.5208, + "step": 32988 + }, + { + "epoch": 0.9782344394033745, + "grad_norm": 0.06387978792190552, + "learning_rate": 1.1954695902552382e-06, + "loss": 2.5113, + "step": 32989 + }, + { + "epoch": 0.978264092755686, + "grad_norm": 0.06567198038101196, + "learning_rate": 1.19222011718223e-06, + "loss": 2.5614, + "step": 32990 + }, + { + "epoch": 0.9782937461079975, + "grad_norm": 0.06433935463428497, + "learning_rate": 1.188975061137798e-06, + "loss": 2.5523, + "step": 32991 + }, + { + "epoch": 0.978323399460309, + "grad_norm": 0.06730762124061584, + "learning_rate": 1.1857344221505305e-06, + "loss": 2.5075, + "step": 32992 + }, + { + "epoch": 0.9783530528126204, + "grad_norm": 0.06603941321372986, + "learning_rate": 1.1824982002492934e-06, + "loss": 2.5491, + "step": 32993 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 0.06494691222906113, + "learning_rate": 1.1792663954625637e-06, + "loss": 2.5246, + "step": 32994 + }, + { + "epoch": 0.9784123595172435, + "grad_norm": 0.06128855422139168, + "learning_rate": 1.1760390078190963e-06, + "loss": 2.5093, + "step": 32995 + }, + { + "epoch": 0.9784420128695549, + "grad_norm": 0.06266222149133682, + "learning_rate": 1.172816037347424e-06, + "loss": 2.5589, + "step": 32996 + }, + { + "epoch": 0.9784716662218664, + "grad_norm": 0.06498503684997559, + "learning_rate": 1.1695974840760792e-06, + "loss": 2.5526, + "step": 32997 + }, + { + "epoch": 0.9785013195741779, + "grad_norm": 0.06200229749083519, + "learning_rate": 1.166383348033595e-06, + "loss": 2.5134, + "step": 32998 + }, + { + "epoch": 0.9785309729264894, + "grad_norm": 0.06334537267684937, + "learning_rate": 1.1631736292484484e-06, + "loss": 2.5073, + "step": 32999 + }, + { + "epoch": 0.9785606262788008, + "grad_norm": 0.06585591286420822, + "learning_rate": 1.1599683277489502e-06, + "loss": 2.5494, + "step": 33000 + }, + { + "epoch": 0.9785902796311123, + "grad_norm": 0.06699937582015991, + "learning_rate": 1.1567674435635779e-06, + "loss": 2.5457, + "step": 33001 + }, + { + "epoch": 0.9786199329834238, + "grad_norm": 0.06287400424480438, + "learning_rate": 1.1535709767206415e-06, + "loss": 2.5303, + "step": 33002 + }, + { + "epoch": 0.9786495863357353, + "grad_norm": 0.06654573976993561, + "learning_rate": 1.1503789272485077e-06, + "loss": 2.5485, + "step": 33003 + }, + { + "epoch": 0.9786792396880467, + "grad_norm": 0.06453979015350342, + "learning_rate": 1.147191295175376e-06, + "loss": 2.5527, + "step": 33004 + }, + { + "epoch": 0.9787088930403582, + "grad_norm": 0.06768717616796494, + "learning_rate": 1.1440080805294463e-06, + "loss": 2.5574, + "step": 33005 + }, + { + "epoch": 0.9787385463926697, + "grad_norm": 0.0619388148188591, + "learning_rate": 1.1408292833389732e-06, + "loss": 2.5201, + "step": 33006 + }, + { + "epoch": 0.9787681997449812, + "grad_norm": 0.06322218477725983, + "learning_rate": 1.1376549036321016e-06, + "loss": 2.5217, + "step": 33007 + }, + { + "epoch": 0.9787978530972926, + "grad_norm": 0.06363005936145782, + "learning_rate": 1.1344849414369195e-06, + "loss": 2.5303, + "step": 33008 + }, + { + "epoch": 0.9788275064496041, + "grad_norm": 0.06396674364805222, + "learning_rate": 1.1313193967815161e-06, + "loss": 2.5482, + "step": 33009 + }, + { + "epoch": 0.9788571598019156, + "grad_norm": 0.06506488472223282, + "learning_rate": 1.128158269693924e-06, + "loss": 2.5636, + "step": 33010 + }, + { + "epoch": 0.9788868131542271, + "grad_norm": 0.06456077843904495, + "learning_rate": 1.1250015602020658e-06, + "loss": 2.5483, + "step": 33011 + }, + { + "epoch": 0.9789164665065385, + "grad_norm": 0.06521043181419373, + "learning_rate": 1.1218492683339743e-06, + "loss": 2.5399, + "step": 33012 + }, + { + "epoch": 0.97894611985885, + "grad_norm": 0.0663144662976265, + "learning_rate": 1.1187013941175717e-06, + "loss": 2.5213, + "step": 33013 + }, + { + "epoch": 0.9789757732111615, + "grad_norm": 0.06325515359640121, + "learning_rate": 1.115557937580669e-06, + "loss": 2.55, + "step": 33014 + }, + { + "epoch": 0.979005426563473, + "grad_norm": 0.06240811198949814, + "learning_rate": 1.112418898751133e-06, + "loss": 2.5271, + "step": 33015 + }, + { + "epoch": 0.9790350799157845, + "grad_norm": 0.06436561793088913, + "learning_rate": 1.1092842776567747e-06, + "loss": 2.5224, + "step": 33016 + }, + { + "epoch": 0.979064733268096, + "grad_norm": 0.0633295476436615, + "learning_rate": 1.106154074325294e-06, + "loss": 2.5507, + "step": 33017 + }, + { + "epoch": 0.9790943866204075, + "grad_norm": 0.06430821865797043, + "learning_rate": 1.1030282887845022e-06, + "loss": 2.4958, + "step": 33018 + }, + { + "epoch": 0.9791240399727189, + "grad_norm": 0.06239418685436249, + "learning_rate": 1.0999069210619883e-06, + "loss": 2.5194, + "step": 33019 + }, + { + "epoch": 0.9791536933250304, + "grad_norm": 0.06451159715652466, + "learning_rate": 1.0967899711854523e-06, + "loss": 2.5615, + "step": 33020 + }, + { + "epoch": 0.9791833466773419, + "grad_norm": 0.06687730550765991, + "learning_rate": 1.0936774391824833e-06, + "loss": 2.5695, + "step": 33021 + }, + { + "epoch": 0.9792130000296534, + "grad_norm": 0.06512849032878876, + "learning_rate": 1.0905693250806703e-06, + "loss": 2.5253, + "step": 33022 + }, + { + "epoch": 0.9792426533819648, + "grad_norm": 0.06508314609527588, + "learning_rate": 1.087465628907436e-06, + "loss": 2.5093, + "step": 33023 + }, + { + "epoch": 0.9792723067342763, + "grad_norm": 0.06184525415301323, + "learning_rate": 1.084366350690369e-06, + "loss": 2.5437, + "step": 33024 + }, + { + "epoch": 0.9793019600865878, + "grad_norm": 0.06400884687900543, + "learning_rate": 1.0812714904568922e-06, + "loss": 2.5389, + "step": 33025 + }, + { + "epoch": 0.9793316134388993, + "grad_norm": 0.06344828754663467, + "learning_rate": 1.0781810482343723e-06, + "loss": 2.5541, + "step": 33026 + }, + { + "epoch": 0.9793612667912107, + "grad_norm": 0.06515462696552277, + "learning_rate": 1.0750950240501766e-06, + "loss": 2.5694, + "step": 33027 + }, + { + "epoch": 0.9793909201435222, + "grad_norm": 0.06538063287734985, + "learning_rate": 1.072013417931672e-06, + "loss": 2.5246, + "step": 33028 + }, + { + "epoch": 0.9794205734958337, + "grad_norm": 0.06195791810750961, + "learning_rate": 1.0689362299061144e-06, + "loss": 2.4949, + "step": 33029 + }, + { + "epoch": 0.9794502268481452, + "grad_norm": 0.0647604912519455, + "learning_rate": 1.0658634600008155e-06, + "loss": 2.5291, + "step": 33030 + }, + { + "epoch": 0.9794798802004566, + "grad_norm": 0.0629238411784172, + "learning_rate": 1.0627951082428643e-06, + "loss": 2.5369, + "step": 33031 + }, + { + "epoch": 0.9795095335527682, + "grad_norm": 0.06234359368681908, + "learning_rate": 1.059731174659573e-06, + "loss": 2.5135, + "step": 33032 + }, + { + "epoch": 0.9795391869050796, + "grad_norm": 0.07348611950874329, + "learning_rate": 1.0566716592779746e-06, + "loss": 2.5102, + "step": 33033 + }, + { + "epoch": 0.9795688402573911, + "grad_norm": 0.06556185334920883, + "learning_rate": 1.0536165621251592e-06, + "loss": 2.4951, + "step": 33034 + }, + { + "epoch": 0.9795984936097025, + "grad_norm": 0.06610032171010971, + "learning_rate": 1.0505658832282715e-06, + "loss": 2.545, + "step": 33035 + }, + { + "epoch": 0.9796281469620141, + "grad_norm": 0.06456608325242996, + "learning_rate": 1.0475196226142348e-06, + "loss": 2.5167, + "step": 33036 + }, + { + "epoch": 0.9796578003143256, + "grad_norm": 0.06365307420492172, + "learning_rate": 1.044477780310027e-06, + "loss": 2.5045, + "step": 33037 + }, + { + "epoch": 0.979687453666637, + "grad_norm": 0.06485175341367722, + "learning_rate": 1.0414403563426822e-06, + "loss": 2.5695, + "step": 33038 + }, + { + "epoch": 0.9797171070189485, + "grad_norm": 0.06405708193778992, + "learning_rate": 1.038407350738957e-06, + "loss": 2.529, + "step": 33039 + }, + { + "epoch": 0.97974676037126, + "grad_norm": 0.06292905658483505, + "learning_rate": 1.0353787635258294e-06, + "loss": 2.4861, + "step": 33040 + }, + { + "epoch": 0.9797764137235715, + "grad_norm": 0.06336135417222977, + "learning_rate": 1.032354594730056e-06, + "loss": 2.5597, + "step": 33041 + }, + { + "epoch": 0.9798060670758829, + "grad_norm": 0.0679798424243927, + "learning_rate": 1.0293348443784488e-06, + "loss": 2.5333, + "step": 33042 + }, + { + "epoch": 0.9798357204281944, + "grad_norm": 0.06568588316440582, + "learning_rate": 1.0263195124977088e-06, + "loss": 2.5889, + "step": 33043 + }, + { + "epoch": 0.9798653737805059, + "grad_norm": 0.06393127143383026, + "learning_rate": 1.0233085991145364e-06, + "loss": 2.5582, + "step": 33044 + }, + { + "epoch": 0.9798950271328174, + "grad_norm": 0.06792263686656952, + "learning_rate": 1.0203021042556326e-06, + "loss": 2.5272, + "step": 33045 + }, + { + "epoch": 0.9799246804851288, + "grad_norm": 0.0642179474234581, + "learning_rate": 1.017300027947643e-06, + "loss": 2.5363, + "step": 33046 + }, + { + "epoch": 0.9799543338374404, + "grad_norm": 0.0649385079741478, + "learning_rate": 1.0143023702170461e-06, + "loss": 2.5649, + "step": 33047 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 0.06813428550958633, + "learning_rate": 1.0113091310904876e-06, + "loss": 2.4779, + "step": 33048 + }, + { + "epoch": 0.9800136405420633, + "grad_norm": 0.06170688942074776, + "learning_rate": 1.0083203105943905e-06, + "loss": 2.5194, + "step": 33049 + }, + { + "epoch": 0.9800432938943747, + "grad_norm": 0.06266272813081741, + "learning_rate": 1.0053359087553448e-06, + "loss": 2.5511, + "step": 33050 + }, + { + "epoch": 0.9800729472466863, + "grad_norm": 0.06477442383766174, + "learning_rate": 1.0023559255996627e-06, + "loss": 2.5316, + "step": 33051 + }, + { + "epoch": 0.9801026005989977, + "grad_norm": 0.06287795305252075, + "learning_rate": 9.993803611537677e-07, + "loss": 2.5169, + "step": 33052 + }, + { + "epoch": 0.9801322539513092, + "grad_norm": 0.06124351918697357, + "learning_rate": 9.964092154439719e-07, + "loss": 2.5314, + "step": 33053 + }, + { + "epoch": 0.9801619073036206, + "grad_norm": 0.06405092775821686, + "learning_rate": 9.934424884966987e-07, + "loss": 2.5456, + "step": 33054 + }, + { + "epoch": 0.9801915606559322, + "grad_norm": 0.06496522575616837, + "learning_rate": 9.90480180338149e-07, + "loss": 2.5741, + "step": 33055 + }, + { + "epoch": 0.9802212140082436, + "grad_norm": 0.06399475038051605, + "learning_rate": 9.875222909944692e-07, + "loss": 2.5213, + "step": 33056 + }, + { + "epoch": 0.9802508673605551, + "grad_norm": 0.06131041795015335, + "learning_rate": 9.845688204920267e-07, + "loss": 2.5581, + "step": 33057 + }, + { + "epoch": 0.9802805207128666, + "grad_norm": 0.06443066895008087, + "learning_rate": 9.816197688568007e-07, + "loss": 2.5182, + "step": 33058 + }, + { + "epoch": 0.9803101740651781, + "grad_norm": 0.06422913074493408, + "learning_rate": 9.786751361149926e-07, + "loss": 2.5324, + "step": 33059 + }, + { + "epoch": 0.9803398274174896, + "grad_norm": 0.06302464008331299, + "learning_rate": 9.75734922292748e-07, + "loss": 2.5513, + "step": 33060 + }, + { + "epoch": 0.980369480769801, + "grad_norm": 0.06315618008375168, + "learning_rate": 9.727991274159352e-07, + "loss": 2.5394, + "step": 33061 + }, + { + "epoch": 0.9803991341221125, + "grad_norm": 0.06592244654893875, + "learning_rate": 9.698677515107001e-07, + "loss": 2.5139, + "step": 33062 + }, + { + "epoch": 0.980428787474424, + "grad_norm": 0.06628365814685822, + "learning_rate": 9.669407946029663e-07, + "loss": 2.5641, + "step": 33063 + }, + { + "epoch": 0.9804584408267355, + "grad_norm": 0.06562967598438263, + "learning_rate": 9.640182567185463e-07, + "loss": 2.5514, + "step": 33064 + }, + { + "epoch": 0.9804880941790469, + "grad_norm": 0.06426148116588593, + "learning_rate": 9.61100137883475e-07, + "loss": 2.533, + "step": 33065 + }, + { + "epoch": 0.9805177475313585, + "grad_norm": 0.06334105879068375, + "learning_rate": 9.581864381235094e-07, + "loss": 2.5221, + "step": 33066 + }, + { + "epoch": 0.9805474008836699, + "grad_norm": 0.06376761943101883, + "learning_rate": 9.552771574644625e-07, + "loss": 2.5352, + "step": 33067 + }, + { + "epoch": 0.9805770542359814, + "grad_norm": 0.06313426792621613, + "learning_rate": 9.523722959320913e-07, + "loss": 2.5302, + "step": 33068 + }, + { + "epoch": 0.9806067075882928, + "grad_norm": 0.06514619290828705, + "learning_rate": 9.494718535520974e-07, + "loss": 2.5709, + "step": 33069 + }, + { + "epoch": 0.9806363609406044, + "grad_norm": 0.06275063008069992, + "learning_rate": 9.465758303502381e-07, + "loss": 2.5597, + "step": 33070 + }, + { + "epoch": 0.9806660142929158, + "grad_norm": 0.06308812648057938, + "learning_rate": 9.436842263520484e-07, + "loss": 2.5035, + "step": 33071 + }, + { + "epoch": 0.9806956676452273, + "grad_norm": 0.06620181351900101, + "learning_rate": 9.407970415832301e-07, + "loss": 2.5114, + "step": 33072 + }, + { + "epoch": 0.9807253209975387, + "grad_norm": 0.06369036436080933, + "learning_rate": 9.379142760693183e-07, + "loss": 2.5383, + "step": 33073 + }, + { + "epoch": 0.9807549743498503, + "grad_norm": 0.06612145155668259, + "learning_rate": 9.350359298358479e-07, + "loss": 2.5555, + "step": 33074 + }, + { + "epoch": 0.9807846277021617, + "grad_norm": 0.06322197616100311, + "learning_rate": 9.321620029082989e-07, + "loss": 2.5356, + "step": 33075 + }, + { + "epoch": 0.9808142810544732, + "grad_norm": 0.06286550313234329, + "learning_rate": 9.292924953120951e-07, + "loss": 2.5267, + "step": 33076 + }, + { + "epoch": 0.9808439344067846, + "grad_norm": 0.06620782613754272, + "learning_rate": 9.264274070727163e-07, + "loss": 2.5583, + "step": 33077 + }, + { + "epoch": 0.9808735877590962, + "grad_norm": 0.06335797160863876, + "learning_rate": 9.2356673821542e-07, + "loss": 2.55, + "step": 33078 + }, + { + "epoch": 0.9809032411114077, + "grad_norm": 0.06328724324703217, + "learning_rate": 9.207104887656859e-07, + "loss": 2.4962, + "step": 33079 + }, + { + "epoch": 0.9809328944637191, + "grad_norm": 0.06355975568294525, + "learning_rate": 9.178586587486603e-07, + "loss": 2.5306, + "step": 33080 + }, + { + "epoch": 0.9809625478160306, + "grad_norm": 0.06678622961044312, + "learning_rate": 9.150112481896567e-07, + "loss": 2.5181, + "step": 33081 + }, + { + "epoch": 0.9809922011683421, + "grad_norm": 0.0623457133769989, + "learning_rate": 9.121682571139323e-07, + "loss": 2.5266, + "step": 33082 + }, + { + "epoch": 0.9810218545206536, + "grad_norm": 0.06490959972143173, + "learning_rate": 9.093296855466338e-07, + "loss": 2.5316, + "step": 33083 + }, + { + "epoch": 0.981051507872965, + "grad_norm": 0.06701183319091797, + "learning_rate": 9.064955335128522e-07, + "loss": 2.54, + "step": 33084 + }, + { + "epoch": 0.9810811612252766, + "grad_norm": 0.06365463137626648, + "learning_rate": 9.036658010377341e-07, + "loss": 2.5306, + "step": 33085 + }, + { + "epoch": 0.981110814577588, + "grad_norm": 0.06332647055387497, + "learning_rate": 9.008404881463705e-07, + "loss": 2.5124, + "step": 33086 + }, + { + "epoch": 0.9811404679298995, + "grad_norm": 0.0658080205321312, + "learning_rate": 8.98019594863686e-07, + "loss": 2.5615, + "step": 33087 + }, + { + "epoch": 0.9811701212822109, + "grad_norm": 0.0651259645819664, + "learning_rate": 8.95203121214716e-07, + "loss": 2.5315, + "step": 33088 + }, + { + "epoch": 0.9811997746345225, + "grad_norm": 0.0651441141963005, + "learning_rate": 8.923910672243851e-07, + "loss": 2.5477, + "step": 33089 + }, + { + "epoch": 0.9812294279868339, + "grad_norm": 0.06557019054889679, + "learning_rate": 8.895834329176177e-07, + "loss": 2.5238, + "step": 33090 + }, + { + "epoch": 0.9812590813391454, + "grad_norm": 0.06389883160591125, + "learning_rate": 8.867802183192275e-07, + "loss": 2.5288, + "step": 33091 + }, + { + "epoch": 0.9812887346914568, + "grad_norm": 0.06620018184185028, + "learning_rate": 8.839814234540833e-07, + "loss": 2.5324, + "step": 33092 + }, + { + "epoch": 0.9813183880437684, + "grad_norm": 0.06349707394838333, + "learning_rate": 8.811870483469985e-07, + "loss": 2.5435, + "step": 33093 + }, + { + "epoch": 0.9813480413960798, + "grad_norm": 0.06405189633369446, + "learning_rate": 8.783970930226204e-07, + "loss": 2.544, + "step": 33094 + }, + { + "epoch": 0.9813776947483913, + "grad_norm": 0.06796635687351227, + "learning_rate": 8.756115575057066e-07, + "loss": 2.5287, + "step": 33095 + }, + { + "epoch": 0.9814073481007027, + "grad_norm": 0.0650627613067627, + "learning_rate": 8.728304418209598e-07, + "loss": 2.5478, + "step": 33096 + }, + { + "epoch": 0.9814370014530143, + "grad_norm": 0.06571090221405029, + "learning_rate": 8.700537459929714e-07, + "loss": 2.5232, + "step": 33097 + }, + { + "epoch": 0.9814666548053257, + "grad_norm": 0.06256496161222458, + "learning_rate": 8.672814700463328e-07, + "loss": 2.5131, + "step": 33098 + }, + { + "epoch": 0.9814963081576372, + "grad_norm": 0.06436780095100403, + "learning_rate": 8.645136140055798e-07, + "loss": 2.5575, + "step": 33099 + }, + { + "epoch": 0.9815259615099488, + "grad_norm": 0.06321089714765549, + "learning_rate": 8.617501778952486e-07, + "loss": 2.5486, + "step": 33100 + }, + { + "epoch": 0.9815556148622602, + "grad_norm": 0.06359107047319412, + "learning_rate": 8.589911617397639e-07, + "loss": 2.5179, + "step": 33101 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 0.06467349827289581, + "learning_rate": 8.562365655636062e-07, + "loss": 2.5098, + "step": 33102 + }, + { + "epoch": 0.9816149215668831, + "grad_norm": 0.06838543713092804, + "learning_rate": 8.534863893911449e-07, + "loss": 2.5159, + "step": 33103 + }, + { + "epoch": 0.9816445749191947, + "grad_norm": 0.06409024447202682, + "learning_rate": 8.507406332467494e-07, + "loss": 2.554, + "step": 33104 + }, + { + "epoch": 0.9816742282715061, + "grad_norm": 0.0661793127655983, + "learning_rate": 8.479992971547334e-07, + "loss": 2.5517, + "step": 33105 + }, + { + "epoch": 0.9817038816238176, + "grad_norm": 0.06442052125930786, + "learning_rate": 8.452623811393557e-07, + "loss": 2.5279, + "step": 33106 + }, + { + "epoch": 0.981733534976129, + "grad_norm": 0.062324587255716324, + "learning_rate": 8.425298852248742e-07, + "loss": 2.5546, + "step": 33107 + }, + { + "epoch": 0.9817631883284406, + "grad_norm": 0.0637134537100792, + "learning_rate": 8.398018094354365e-07, + "loss": 2.5416, + "step": 33108 + }, + { + "epoch": 0.981792841680752, + "grad_norm": 0.06275546550750732, + "learning_rate": 8.370781537952454e-07, + "loss": 2.5173, + "step": 33109 + }, + { + "epoch": 0.9818224950330635, + "grad_norm": 0.06928198039531708, + "learning_rate": 8.343589183283928e-07, + "loss": 2.5393, + "step": 33110 + }, + { + "epoch": 0.9818521483853749, + "grad_norm": 0.06278246641159058, + "learning_rate": 8.31644103059026e-07, + "loss": 2.5427, + "step": 33111 + }, + { + "epoch": 0.9818818017376865, + "grad_norm": 0.06481795758008957, + "learning_rate": 8.289337080110704e-07, + "loss": 2.5554, + "step": 33112 + }, + { + "epoch": 0.9819114550899979, + "grad_norm": 0.06275667995214462, + "learning_rate": 8.262277332086177e-07, + "loss": 2.5136, + "step": 33113 + }, + { + "epoch": 0.9819411084423094, + "grad_norm": 0.06423404812812805, + "learning_rate": 8.235261786755932e-07, + "loss": 2.5267, + "step": 33114 + }, + { + "epoch": 0.9819707617946208, + "grad_norm": 0.06381735950708389, + "learning_rate": 8.208290444359223e-07, + "loss": 2.5195, + "step": 33115 + }, + { + "epoch": 0.9820004151469324, + "grad_norm": 0.06399356573820114, + "learning_rate": 8.181363305134748e-07, + "loss": 2.5281, + "step": 33116 + }, + { + "epoch": 0.9820300684992438, + "grad_norm": 0.06642325222492218, + "learning_rate": 8.15448036932176e-07, + "loss": 2.5763, + "step": 33117 + }, + { + "epoch": 0.9820597218515553, + "grad_norm": 0.0638599544763565, + "learning_rate": 8.127641637157291e-07, + "loss": 2.5379, + "step": 33118 + }, + { + "epoch": 0.9820893752038667, + "grad_norm": 0.0650644302368164, + "learning_rate": 8.100847108879483e-07, + "loss": 2.5482, + "step": 33119 + }, + { + "epoch": 0.9821190285561783, + "grad_norm": 0.06340496242046356, + "learning_rate": 8.074096784725371e-07, + "loss": 2.5438, + "step": 33120 + }, + { + "epoch": 0.9821486819084898, + "grad_norm": 0.06363343447446823, + "learning_rate": 8.047390664931986e-07, + "loss": 2.5222, + "step": 33121 + }, + { + "epoch": 0.9821783352608012, + "grad_norm": 0.06520646065473557, + "learning_rate": 8.020728749735806e-07, + "loss": 2.5503, + "step": 33122 + }, + { + "epoch": 0.9822079886131128, + "grad_norm": 0.06464777886867523, + "learning_rate": 7.994111039373309e-07, + "loss": 2.5387, + "step": 33123 + }, + { + "epoch": 0.9822376419654242, + "grad_norm": 0.06681495904922485, + "learning_rate": 7.967537534079305e-07, + "loss": 2.5447, + "step": 33124 + }, + { + "epoch": 0.9822672953177357, + "grad_norm": 0.06534942239522934, + "learning_rate": 7.941008234089719e-07, + "loss": 2.5448, + "step": 33125 + }, + { + "epoch": 0.9822969486700471, + "grad_norm": 0.0648394376039505, + "learning_rate": 7.914523139639918e-07, + "loss": 2.5859, + "step": 33126 + }, + { + "epoch": 0.9823266020223587, + "grad_norm": 0.06391390413045883, + "learning_rate": 7.888082250963046e-07, + "loss": 2.5183, + "step": 33127 + }, + { + "epoch": 0.9823562553746701, + "grad_norm": 0.06674113869667053, + "learning_rate": 7.861685568294474e-07, + "loss": 2.5627, + "step": 33128 + }, + { + "epoch": 0.9823859087269816, + "grad_norm": 0.06302453577518463, + "learning_rate": 7.835333091867903e-07, + "loss": 2.5418, + "step": 33129 + }, + { + "epoch": 0.982415562079293, + "grad_norm": 0.06346600502729416, + "learning_rate": 7.809024821916477e-07, + "loss": 2.5258, + "step": 33130 + }, + { + "epoch": 0.9824452154316046, + "grad_norm": 0.06207958981394768, + "learning_rate": 7.782760758672236e-07, + "loss": 2.5475, + "step": 33131 + }, + { + "epoch": 0.982474868783916, + "grad_norm": 0.06477762758731842, + "learning_rate": 7.756540902368881e-07, + "loss": 2.5323, + "step": 33132 + }, + { + "epoch": 0.9825045221362275, + "grad_norm": 0.06582403928041458, + "learning_rate": 7.730365253238447e-07, + "loss": 2.5474, + "step": 33133 + }, + { + "epoch": 0.9825341754885389, + "grad_norm": 0.0646086037158966, + "learning_rate": 7.704233811512417e-07, + "loss": 2.5403, + "step": 33134 + }, + { + "epoch": 0.9825638288408505, + "grad_norm": 0.06611604988574982, + "learning_rate": 7.678146577422273e-07, + "loss": 2.5384, + "step": 33135 + }, + { + "epoch": 0.9825934821931619, + "grad_norm": 0.0651087611913681, + "learning_rate": 7.652103551198941e-07, + "loss": 2.5566, + "step": 33136 + }, + { + "epoch": 0.9826231355454734, + "grad_norm": 0.06373044103384018, + "learning_rate": 7.626104733073347e-07, + "loss": 2.5562, + "step": 33137 + }, + { + "epoch": 0.9826527888977848, + "grad_norm": 0.06359432637691498, + "learning_rate": 7.600150123275861e-07, + "loss": 2.5155, + "step": 33138 + }, + { + "epoch": 0.9826824422500964, + "grad_norm": 0.0649486631155014, + "learning_rate": 7.574239722035747e-07, + "loss": 2.5654, + "step": 33139 + }, + { + "epoch": 0.9827120956024078, + "grad_norm": 0.06358549743890762, + "learning_rate": 7.548373529582264e-07, + "loss": 2.5382, + "step": 33140 + }, + { + "epoch": 0.9827417489547193, + "grad_norm": 0.06405548006296158, + "learning_rate": 7.522551546145229e-07, + "loss": 2.5367, + "step": 33141 + }, + { + "epoch": 0.9827714023070309, + "grad_norm": 0.063043013215065, + "learning_rate": 7.496773771952792e-07, + "loss": 2.5487, + "step": 33142 + }, + { + "epoch": 0.9828010556593423, + "grad_norm": 0.06317490339279175, + "learning_rate": 7.471040207233659e-07, + "loss": 2.5411, + "step": 33143 + }, + { + "epoch": 0.9828307090116538, + "grad_norm": 0.0636514350771904, + "learning_rate": 7.445350852215427e-07, + "loss": 2.5295, + "step": 33144 + }, + { + "epoch": 0.9828603623639652, + "grad_norm": 0.061832163482904434, + "learning_rate": 7.419705707125135e-07, + "loss": 2.5248, + "step": 33145 + }, + { + "epoch": 0.9828900157162768, + "grad_norm": 0.0612201988697052, + "learning_rate": 7.394104772190935e-07, + "loss": 2.563, + "step": 33146 + }, + { + "epoch": 0.9829196690685882, + "grad_norm": 0.06352457404136658, + "learning_rate": 7.368548047638202e-07, + "loss": 2.5323, + "step": 33147 + }, + { + "epoch": 0.9829493224208997, + "grad_norm": 0.06477905809879303, + "learning_rate": 7.343035533694531e-07, + "loss": 2.5003, + "step": 33148 + }, + { + "epoch": 0.9829789757732111, + "grad_norm": 0.06468510627746582, + "learning_rate": 7.317567230584743e-07, + "loss": 2.5332, + "step": 33149 + }, + { + "epoch": 0.9830086291255227, + "grad_norm": 0.06256064772605896, + "learning_rate": 7.292143138535323e-07, + "loss": 2.509, + "step": 33150 + }, + { + "epoch": 0.9830382824778341, + "grad_norm": 0.06370800733566284, + "learning_rate": 7.266763257770537e-07, + "loss": 2.5419, + "step": 33151 + }, + { + "epoch": 0.9830679358301456, + "grad_norm": 0.06579778343439102, + "learning_rate": 7.241427588516314e-07, + "loss": 2.5715, + "step": 33152 + }, + { + "epoch": 0.983097589182457, + "grad_norm": 0.06418462842702866, + "learning_rate": 7.216136130995255e-07, + "loss": 2.5033, + "step": 33153 + }, + { + "epoch": 0.9831272425347686, + "grad_norm": 0.06422942131757736, + "learning_rate": 7.190888885433289e-07, + "loss": 2.5189, + "step": 33154 + }, + { + "epoch": 0.98315689588708, + "grad_norm": 0.06374651193618774, + "learning_rate": 7.165685852052462e-07, + "loss": 2.5239, + "step": 33155 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 0.06475862860679626, + "learning_rate": 7.140527031076483e-07, + "loss": 2.5272, + "step": 33156 + }, + { + "epoch": 0.983216202591703, + "grad_norm": 0.06330592185258865, + "learning_rate": 7.115412422727952e-07, + "loss": 2.5671, + "step": 33157 + }, + { + "epoch": 0.9832458559440145, + "grad_norm": 0.06295972317457199, + "learning_rate": 7.090342027230023e-07, + "loss": 2.5383, + "step": 33158 + }, + { + "epoch": 0.9832755092963259, + "grad_norm": 0.06625370681285858, + "learning_rate": 7.065315844803632e-07, + "loss": 2.5412, + "step": 33159 + }, + { + "epoch": 0.9833051626486374, + "grad_norm": 0.0657382532954216, + "learning_rate": 7.040333875671378e-07, + "loss": 2.5043, + "step": 33160 + }, + { + "epoch": 0.9833348160009489, + "grad_norm": 0.06394302099943161, + "learning_rate": 7.015396120053641e-07, + "loss": 2.55, + "step": 33161 + }, + { + "epoch": 0.9833644693532604, + "grad_norm": 0.06558223068714142, + "learning_rate": 6.990502578171354e-07, + "loss": 2.5449, + "step": 33162 + }, + { + "epoch": 0.9833941227055719, + "grad_norm": 0.06255226582288742, + "learning_rate": 6.965653250246007e-07, + "loss": 2.542, + "step": 33163 + }, + { + "epoch": 0.9834237760578833, + "grad_norm": 0.06332790851593018, + "learning_rate": 6.940848136496314e-07, + "loss": 2.5473, + "step": 33164 + }, + { + "epoch": 0.9834534294101949, + "grad_norm": 0.06292598694562912, + "learning_rate": 6.916087237142099e-07, + "loss": 2.5475, + "step": 33165 + }, + { + "epoch": 0.9834830827625063, + "grad_norm": 0.06472670286893845, + "learning_rate": 6.891370552403741e-07, + "loss": 2.5274, + "step": 33166 + }, + { + "epoch": 0.9835127361148178, + "grad_norm": 0.06553792208433151, + "learning_rate": 6.866698082498846e-07, + "loss": 2.5193, + "step": 33167 + }, + { + "epoch": 0.9835423894671292, + "grad_norm": 0.061088111251592636, + "learning_rate": 6.84206982764668e-07, + "loss": 2.5278, + "step": 33168 + }, + { + "epoch": 0.9835720428194408, + "grad_norm": 0.06555169820785522, + "learning_rate": 6.817485788064847e-07, + "loss": 2.5479, + "step": 33169 + }, + { + "epoch": 0.9836016961717522, + "grad_norm": 0.06374754756689072, + "learning_rate": 6.792945963971509e-07, + "loss": 2.5467, + "step": 33170 + }, + { + "epoch": 0.9836313495240637, + "grad_norm": 0.06299372762441635, + "learning_rate": 6.768450355583155e-07, + "loss": 2.5492, + "step": 33171 + }, + { + "epoch": 0.9836610028763751, + "grad_norm": 0.06195421516895294, + "learning_rate": 6.743998963117947e-07, + "loss": 2.5397, + "step": 33172 + }, + { + "epoch": 0.9836906562286867, + "grad_norm": 0.06534828990697861, + "learning_rate": 6.719591786791268e-07, + "loss": 2.5155, + "step": 33173 + }, + { + "epoch": 0.9837203095809981, + "grad_norm": 0.0639006644487381, + "learning_rate": 6.69522882681961e-07, + "loss": 2.5349, + "step": 33174 + }, + { + "epoch": 0.9837499629333096, + "grad_norm": 0.0661972314119339, + "learning_rate": 6.670910083419468e-07, + "loss": 2.5318, + "step": 33175 + }, + { + "epoch": 0.983779616285621, + "grad_norm": 0.06370589882135391, + "learning_rate": 6.646635556804559e-07, + "loss": 2.553, + "step": 33176 + }, + { + "epoch": 0.9838092696379326, + "grad_norm": 0.06307566910982132, + "learning_rate": 6.622405247191377e-07, + "loss": 2.4883, + "step": 33177 + }, + { + "epoch": 0.983838922990244, + "grad_norm": 0.0637807548046112, + "learning_rate": 6.598219154794194e-07, + "loss": 2.5756, + "step": 33178 + }, + { + "epoch": 0.9838685763425555, + "grad_norm": 0.062775157392025, + "learning_rate": 6.574077279826174e-07, + "loss": 2.5298, + "step": 33179 + }, + { + "epoch": 0.983898229694867, + "grad_norm": 0.06369675695896149, + "learning_rate": 6.549979622502145e-07, + "loss": 2.5241, + "step": 33180 + }, + { + "epoch": 0.9839278830471785, + "grad_norm": 0.06551806628704071, + "learning_rate": 6.525926183035269e-07, + "loss": 2.5515, + "step": 33181 + }, + { + "epoch": 0.9839575363994899, + "grad_norm": 0.06655044108629227, + "learning_rate": 6.501916961638154e-07, + "loss": 2.4893, + "step": 33182 + }, + { + "epoch": 0.9839871897518014, + "grad_norm": 0.06316672265529633, + "learning_rate": 6.477951958523965e-07, + "loss": 2.563, + "step": 33183 + }, + { + "epoch": 0.984016843104113, + "grad_norm": 0.06597927957773209, + "learning_rate": 6.454031173904196e-07, + "loss": 2.5069, + "step": 33184 + }, + { + "epoch": 0.9840464964564244, + "grad_norm": 0.06372705101966858, + "learning_rate": 6.430154607991456e-07, + "loss": 2.523, + "step": 33185 + }, + { + "epoch": 0.9840761498087359, + "grad_norm": 0.06486086547374725, + "learning_rate": 6.406322260997244e-07, + "loss": 2.5222, + "step": 33186 + }, + { + "epoch": 0.9841058031610473, + "grad_norm": 0.06353703141212463, + "learning_rate": 6.382534133131391e-07, + "loss": 2.5468, + "step": 33187 + }, + { + "epoch": 0.9841354565133589, + "grad_norm": 0.0645950585603714, + "learning_rate": 6.358790224605949e-07, + "loss": 2.4914, + "step": 33188 + }, + { + "epoch": 0.9841651098656703, + "grad_norm": 0.06127103418111801, + "learning_rate": 6.335090535630195e-07, + "loss": 2.5351, + "step": 33189 + }, + { + "epoch": 0.9841947632179818, + "grad_norm": 0.06309087574481964, + "learning_rate": 6.311435066414517e-07, + "loss": 2.5288, + "step": 33190 + }, + { + "epoch": 0.9842244165702932, + "grad_norm": 0.0641813650727272, + "learning_rate": 6.287823817168193e-07, + "loss": 2.5409, + "step": 33191 + }, + { + "epoch": 0.9842540699226048, + "grad_norm": 0.06618737429380417, + "learning_rate": 6.264256788100497e-07, + "loss": 2.5965, + "step": 33192 + }, + { + "epoch": 0.9842837232749162, + "grad_norm": 0.06202520802617073, + "learning_rate": 6.2407339794196e-07, + "loss": 2.5083, + "step": 33193 + }, + { + "epoch": 0.9843133766272277, + "grad_norm": 0.06497082114219666, + "learning_rate": 6.217255391334775e-07, + "loss": 2.5062, + "step": 33194 + }, + { + "epoch": 0.9843430299795392, + "grad_norm": 0.062295470386743546, + "learning_rate": 6.193821024052526e-07, + "loss": 2.5385, + "step": 33195 + }, + { + "epoch": 0.9843726833318507, + "grad_norm": 0.0654282197356224, + "learning_rate": 6.170430877782129e-07, + "loss": 2.529, + "step": 33196 + }, + { + "epoch": 0.9844023366841621, + "grad_norm": 0.06447495520114899, + "learning_rate": 6.147084952728976e-07, + "loss": 2.5661, + "step": 33197 + }, + { + "epoch": 0.9844319900364736, + "grad_norm": 0.06336592137813568, + "learning_rate": 6.123783249100679e-07, + "loss": 2.5284, + "step": 33198 + }, + { + "epoch": 0.9844616433887851, + "grad_norm": 0.06541808694601059, + "learning_rate": 6.100525767103737e-07, + "loss": 2.495, + "step": 33199 + }, + { + "epoch": 0.9844912967410966, + "grad_norm": 0.066102035343647, + "learning_rate": 6.077312506943544e-07, + "loss": 2.5044, + "step": 33200 + }, + { + "epoch": 0.984520950093408, + "grad_norm": 0.06471860408782959, + "learning_rate": 6.054143468826045e-07, + "loss": 2.5437, + "step": 33201 + }, + { + "epoch": 0.9845506034457195, + "grad_norm": 0.06373757869005203, + "learning_rate": 6.031018652956633e-07, + "loss": 2.5581, + "step": 33202 + }, + { + "epoch": 0.9845802567980311, + "grad_norm": 0.06308876723051071, + "learning_rate": 6.007938059539031e-07, + "loss": 2.5655, + "step": 33203 + }, + { + "epoch": 0.9846099101503425, + "grad_norm": 0.06649011373519897, + "learning_rate": 5.984901688779187e-07, + "loss": 2.5522, + "step": 33204 + }, + { + "epoch": 0.984639563502654, + "grad_norm": 0.0634516105055809, + "learning_rate": 5.961909540879718e-07, + "loss": 2.5176, + "step": 33205 + }, + { + "epoch": 0.9846692168549654, + "grad_norm": 0.06293011456727982, + "learning_rate": 5.938961616044902e-07, + "loss": 2.5195, + "step": 33206 + }, + { + "epoch": 0.984698870207277, + "grad_norm": 0.06444993615150452, + "learning_rate": 5.916057914477913e-07, + "loss": 2.5392, + "step": 33207 + }, + { + "epoch": 0.9847285235595884, + "grad_norm": 0.062874935567379, + "learning_rate": 5.893198436381364e-07, + "loss": 2.5379, + "step": 33208 + }, + { + "epoch": 0.9847581769118999, + "grad_norm": 0.06369853019714355, + "learning_rate": 5.870383181957873e-07, + "loss": 2.5251, + "step": 33209 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 0.06378120929002762, + "learning_rate": 5.847612151408943e-07, + "loss": 2.5573, + "step": 33210 + }, + { + "epoch": 0.9848174836165229, + "grad_norm": 0.06089642271399498, + "learning_rate": 5.824885344937191e-07, + "loss": 2.5493, + "step": 33211 + }, + { + "epoch": 0.9848471369688343, + "grad_norm": 0.06538822501897812, + "learning_rate": 5.802202762743014e-07, + "loss": 2.5465, + "step": 33212 + }, + { + "epoch": 0.9848767903211458, + "grad_norm": 0.06325314939022064, + "learning_rate": 5.779564405027359e-07, + "loss": 2.5643, + "step": 33213 + }, + { + "epoch": 0.9849064436734573, + "grad_norm": 0.06460051983594894, + "learning_rate": 5.75697027199118e-07, + "loss": 2.5052, + "step": 33214 + }, + { + "epoch": 0.9849360970257688, + "grad_norm": 0.06460674852132797, + "learning_rate": 5.734420363834314e-07, + "loss": 2.541, + "step": 33215 + }, + { + "epoch": 0.9849657503780802, + "grad_norm": 0.06733046472072601, + "learning_rate": 5.711914680756048e-07, + "loss": 2.5069, + "step": 33216 + }, + { + "epoch": 0.9849954037303917, + "grad_norm": 0.06529494374990463, + "learning_rate": 5.689453222956775e-07, + "loss": 2.5523, + "step": 33217 + }, + { + "epoch": 0.9850250570827032, + "grad_norm": 0.06176915764808655, + "learning_rate": 5.667035990634117e-07, + "loss": 2.5274, + "step": 33218 + }, + { + "epoch": 0.9850547104350147, + "grad_norm": 0.06772331148386002, + "learning_rate": 5.644662983987359e-07, + "loss": 2.5053, + "step": 33219 + }, + { + "epoch": 0.9850843637873261, + "grad_norm": 0.06333643198013306, + "learning_rate": 5.622334203214119e-07, + "loss": 2.5334, + "step": 33220 + }, + { + "epoch": 0.9851140171396376, + "grad_norm": 0.06587503850460052, + "learning_rate": 5.600049648512573e-07, + "loss": 2.5581, + "step": 33221 + }, + { + "epoch": 0.9851436704919491, + "grad_norm": 0.06429735571146011, + "learning_rate": 5.577809320079786e-07, + "loss": 2.5252, + "step": 33222 + }, + { + "epoch": 0.9851733238442606, + "grad_norm": 0.06352350860834122, + "learning_rate": 5.55561321811282e-07, + "loss": 2.5173, + "step": 33223 + }, + { + "epoch": 0.9852029771965721, + "grad_norm": 0.06502634286880493, + "learning_rate": 5.533461342808189e-07, + "loss": 2.5589, + "step": 33224 + }, + { + "epoch": 0.9852326305488835, + "grad_norm": 0.06312978267669678, + "learning_rate": 5.511353694361843e-07, + "loss": 2.5246, + "step": 33225 + }, + { + "epoch": 0.9852622839011951, + "grad_norm": 0.06497370451688766, + "learning_rate": 5.489290272970294e-07, + "loss": 2.5212, + "step": 33226 + }, + { + "epoch": 0.9852919372535065, + "grad_norm": 0.06598173081874847, + "learning_rate": 5.467271078827829e-07, + "loss": 2.5233, + "step": 33227 + }, + { + "epoch": 0.985321590605818, + "grad_norm": 0.065599225461483, + "learning_rate": 5.445296112130404e-07, + "loss": 2.5359, + "step": 33228 + }, + { + "epoch": 0.9853512439581295, + "grad_norm": 0.06500548869371414, + "learning_rate": 5.423365373071754e-07, + "loss": 2.5573, + "step": 33229 + }, + { + "epoch": 0.985380897310441, + "grad_norm": 0.06414872407913208, + "learning_rate": 5.40147886184672e-07, + "loss": 2.493, + "step": 33230 + }, + { + "epoch": 0.9854105506627524, + "grad_norm": 0.06466306746006012, + "learning_rate": 5.379636578649038e-07, + "loss": 2.5554, + "step": 33231 + }, + { + "epoch": 0.9854402040150639, + "grad_norm": 0.06443983316421509, + "learning_rate": 5.357838523671888e-07, + "loss": 2.5062, + "step": 33232 + }, + { + "epoch": 0.9854698573673754, + "grad_norm": 0.06349067389965057, + "learning_rate": 5.33608469710789e-07, + "loss": 2.549, + "step": 33233 + }, + { + "epoch": 0.9854995107196869, + "grad_norm": 0.06564848124980927, + "learning_rate": 5.314375099150781e-07, + "loss": 2.5552, + "step": 33234 + }, + { + "epoch": 0.9855291640719983, + "grad_norm": 0.06365559250116348, + "learning_rate": 5.292709729992073e-07, + "loss": 2.5062, + "step": 33235 + }, + { + "epoch": 0.9855588174243098, + "grad_norm": 0.06391094624996185, + "learning_rate": 5.271088589823836e-07, + "loss": 2.5473, + "step": 33236 + }, + { + "epoch": 0.9855884707766213, + "grad_norm": 0.06321091949939728, + "learning_rate": 5.249511678837582e-07, + "loss": 2.5415, + "step": 33237 + }, + { + "epoch": 0.9856181241289328, + "grad_norm": 0.06175260618329048, + "learning_rate": 5.227978997223715e-07, + "loss": 2.5565, + "step": 33238 + }, + { + "epoch": 0.9856477774812442, + "grad_norm": 0.06539271771907806, + "learning_rate": 5.206490545173747e-07, + "loss": 2.5513, + "step": 33239 + }, + { + "epoch": 0.9856774308335557, + "grad_norm": 0.06744620949029922, + "learning_rate": 5.185046322877529e-07, + "loss": 2.5512, + "step": 33240 + }, + { + "epoch": 0.9857070841858672, + "grad_norm": 0.0654946118593216, + "learning_rate": 5.163646330525462e-07, + "loss": 2.5526, + "step": 33241 + }, + { + "epoch": 0.9857367375381787, + "grad_norm": 0.06204134598374367, + "learning_rate": 5.142290568306285e-07, + "loss": 2.4949, + "step": 33242 + }, + { + "epoch": 0.9857663908904901, + "grad_norm": 0.06582273542881012, + "learning_rate": 5.120979036409845e-07, + "loss": 2.5346, + "step": 33243 + }, + { + "epoch": 0.9857960442428017, + "grad_norm": 0.06351890414953232, + "learning_rate": 5.099711735024327e-07, + "loss": 2.5043, + "step": 33244 + }, + { + "epoch": 0.9858256975951132, + "grad_norm": 0.06565224379301071, + "learning_rate": 5.07848866433791e-07, + "loss": 2.5636, + "step": 33245 + }, + { + "epoch": 0.9858553509474246, + "grad_norm": 0.0653570145368576, + "learning_rate": 5.057309824538781e-07, + "loss": 2.5377, + "step": 33246 + }, + { + "epoch": 0.9858850042997361, + "grad_norm": 0.06550414860248566, + "learning_rate": 5.03617521581512e-07, + "loss": 2.5418, + "step": 33247 + }, + { + "epoch": 0.9859146576520476, + "grad_norm": 0.0656212791800499, + "learning_rate": 5.01508483835289e-07, + "loss": 2.5358, + "step": 33248 + }, + { + "epoch": 0.9859443110043591, + "grad_norm": 0.0650806725025177, + "learning_rate": 4.994038692340275e-07, + "loss": 2.5303, + "step": 33249 + }, + { + "epoch": 0.9859739643566705, + "grad_norm": 0.061624325811862946, + "learning_rate": 4.973036777962125e-07, + "loss": 2.5613, + "step": 33250 + }, + { + "epoch": 0.986003617708982, + "grad_norm": 0.06416232138872147, + "learning_rate": 4.952079095405515e-07, + "loss": 2.5317, + "step": 33251 + }, + { + "epoch": 0.9860332710612935, + "grad_norm": 0.06305255740880966, + "learning_rate": 4.931165644855296e-07, + "loss": 2.5315, + "step": 33252 + }, + { + "epoch": 0.986062924413605, + "grad_norm": 0.06493856757879257, + "learning_rate": 4.910296426496874e-07, + "loss": 2.5316, + "step": 33253 + }, + { + "epoch": 0.9860925777659164, + "grad_norm": 0.06326556950807571, + "learning_rate": 4.889471440515658e-07, + "loss": 2.5267, + "step": 33254 + }, + { + "epoch": 0.9861222311182279, + "grad_norm": 0.06223228573799133, + "learning_rate": 4.868690687095389e-07, + "loss": 2.5569, + "step": 33255 + }, + { + "epoch": 0.9861518844705394, + "grad_norm": 0.06511224061250687, + "learning_rate": 4.847954166420366e-07, + "loss": 2.528, + "step": 33256 + }, + { + "epoch": 0.9861815378228509, + "grad_norm": 0.06155794486403465, + "learning_rate": 4.827261878673772e-07, + "loss": 2.5251, + "step": 33257 + }, + { + "epoch": 0.9862111911751623, + "grad_norm": 0.06371162086725235, + "learning_rate": 4.806613824039352e-07, + "loss": 2.5373, + "step": 33258 + }, + { + "epoch": 0.9862408445274738, + "grad_norm": 0.06489061564207077, + "learning_rate": 4.786010002699736e-07, + "loss": 2.5428, + "step": 33259 + }, + { + "epoch": 0.9862704978797853, + "grad_norm": 0.062385547906160355, + "learning_rate": 4.765450414837558e-07, + "loss": 2.502, + "step": 33260 + }, + { + "epoch": 0.9863001512320968, + "grad_norm": 0.0641217976808548, + "learning_rate": 4.7449350606348916e-07, + "loss": 2.5553, + "step": 33261 + }, + { + "epoch": 0.9863298045844082, + "grad_norm": 0.06464950740337372, + "learning_rate": 4.7244639402732604e-07, + "loss": 2.5469, + "step": 33262 + }, + { + "epoch": 0.9863594579367198, + "grad_norm": 0.06315993517637253, + "learning_rate": 4.7040370539336297e-07, + "loss": 2.5264, + "step": 33263 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 0.06280064582824707, + "learning_rate": 4.6836544017969664e-07, + "loss": 2.5381, + "step": 33264 + }, + { + "epoch": 0.9864187646413427, + "grad_norm": 0.06447521597146988, + "learning_rate": 4.663315984044236e-07, + "loss": 2.5665, + "step": 33265 + }, + { + "epoch": 0.9864484179936542, + "grad_norm": 0.06411556154489517, + "learning_rate": 4.643021800855296e-07, + "loss": 2.5443, + "step": 33266 + }, + { + "epoch": 0.9864780713459657, + "grad_norm": 0.06314484775066376, + "learning_rate": 4.6227718524100013e-07, + "loss": 2.5861, + "step": 33267 + }, + { + "epoch": 0.9865077246982772, + "grad_norm": 0.06337081640958786, + "learning_rate": 4.602566138887654e-07, + "loss": 2.521, + "step": 33268 + }, + { + "epoch": 0.9865373780505886, + "grad_norm": 0.061904799193143845, + "learning_rate": 4.5824046604664436e-07, + "loss": 2.5238, + "step": 33269 + }, + { + "epoch": 0.9865670314029001, + "grad_norm": 0.06349284946918488, + "learning_rate": 4.5622874173262277e-07, + "loss": 2.5774, + "step": 33270 + }, + { + "epoch": 0.9865966847552116, + "grad_norm": 0.06317804008722305, + "learning_rate": 4.5422144096435303e-07, + "loss": 2.5408, + "step": 33271 + }, + { + "epoch": 0.9866263381075231, + "grad_norm": 0.06429918110370636, + "learning_rate": 4.5221856375976535e-07, + "loss": 2.556, + "step": 33272 + }, + { + "epoch": 0.9866559914598345, + "grad_norm": 0.06390101462602615, + "learning_rate": 4.502201101365122e-07, + "loss": 2.5032, + "step": 33273 + }, + { + "epoch": 0.986685644812146, + "grad_norm": 0.06374343484640121, + "learning_rate": 4.482260801123017e-07, + "loss": 2.5397, + "step": 33274 + }, + { + "epoch": 0.9867152981644575, + "grad_norm": 0.06474210321903229, + "learning_rate": 4.462364737047864e-07, + "loss": 2.5507, + "step": 33275 + }, + { + "epoch": 0.986744951516769, + "grad_norm": 0.0645197257399559, + "learning_rate": 4.4425129093161876e-07, + "loss": 2.5604, + "step": 33276 + }, + { + "epoch": 0.9867746048690804, + "grad_norm": 0.06122672185301781, + "learning_rate": 4.422705318103404e-07, + "loss": 2.5436, + "step": 33277 + }, + { + "epoch": 0.986804258221392, + "grad_norm": 0.06321120262145996, + "learning_rate": 4.402941963584928e-07, + "loss": 2.5271, + "step": 33278 + }, + { + "epoch": 0.9868339115737034, + "grad_norm": 0.06442289799451828, + "learning_rate": 4.3832228459361747e-07, + "loss": 2.5547, + "step": 33279 + }, + { + "epoch": 0.9868635649260149, + "grad_norm": 0.06573457270860672, + "learning_rate": 4.363547965330894e-07, + "loss": 2.5182, + "step": 33280 + }, + { + "epoch": 0.9868932182783263, + "grad_norm": 0.06265143305063248, + "learning_rate": 4.343917321944502e-07, + "loss": 2.4984, + "step": 33281 + }, + { + "epoch": 0.9869228716306379, + "grad_norm": 0.0682932585477829, + "learning_rate": 4.324330915950192e-07, + "loss": 2.536, + "step": 33282 + }, + { + "epoch": 0.9869525249829493, + "grad_norm": 0.062390103936195374, + "learning_rate": 4.30478874752116e-07, + "loss": 2.4974, + "step": 33283 + }, + { + "epoch": 0.9869821783352608, + "grad_norm": 0.06343141198158264, + "learning_rate": 4.2852908168306006e-07, + "loss": 2.5049, + "step": 33284 + }, + { + "epoch": 0.9870118316875722, + "grad_norm": 0.06108023226261139, + "learning_rate": 4.265837124051708e-07, + "loss": 2.5446, + "step": 33285 + }, + { + "epoch": 0.9870414850398838, + "grad_norm": 0.06554968655109406, + "learning_rate": 4.246427669356012e-07, + "loss": 2.5507, + "step": 33286 + }, + { + "epoch": 0.9870711383921953, + "grad_norm": 0.06658145785331726, + "learning_rate": 4.2270624529155974e-07, + "loss": 2.5581, + "step": 33287 + }, + { + "epoch": 0.9871007917445067, + "grad_norm": 0.0634208470582962, + "learning_rate": 4.2077414749025487e-07, + "loss": 2.5732, + "step": 33288 + }, + { + "epoch": 0.9871304450968182, + "grad_norm": 0.0662304013967514, + "learning_rate": 4.188464735487285e-07, + "loss": 2.5263, + "step": 33289 + }, + { + "epoch": 0.9871600984491297, + "grad_norm": 0.06485649943351746, + "learning_rate": 4.169232234840226e-07, + "loss": 2.5297, + "step": 33290 + }, + { + "epoch": 0.9871897518014412, + "grad_norm": 0.06517275422811508, + "learning_rate": 4.150043973132345e-07, + "loss": 2.4849, + "step": 33291 + }, + { + "epoch": 0.9872194051537526, + "grad_norm": 0.06368070840835571, + "learning_rate": 4.1308999505335067e-07, + "loss": 2.5478, + "step": 33292 + }, + { + "epoch": 0.9872490585060641, + "grad_norm": 0.0639728456735611, + "learning_rate": 4.111800167213575e-07, + "loss": 2.54, + "step": 33293 + }, + { + "epoch": 0.9872787118583756, + "grad_norm": 0.06760574132204056, + "learning_rate": 4.0927446233407493e-07, + "loss": 2.5329, + "step": 33294 + }, + { + "epoch": 0.9873083652106871, + "grad_norm": 0.06339247524738312, + "learning_rate": 4.0737333190837834e-07, + "loss": 2.492, + "step": 33295 + }, + { + "epoch": 0.9873380185629985, + "grad_norm": 0.06623277068138123, + "learning_rate": 4.054766254611986e-07, + "loss": 2.5425, + "step": 33296 + }, + { + "epoch": 0.98736767191531, + "grad_norm": 0.0638008639216423, + "learning_rate": 4.035843430092445e-07, + "loss": 2.5463, + "step": 33297 + }, + { + "epoch": 0.9873973252676215, + "grad_norm": 0.06193619221448898, + "learning_rate": 4.0169648456933606e-07, + "loss": 2.5246, + "step": 33298 + }, + { + "epoch": 0.987426978619933, + "grad_norm": 0.06562914699316025, + "learning_rate": 3.998130501581265e-07, + "loss": 2.5094, + "step": 33299 + }, + { + "epoch": 0.9874566319722444, + "grad_norm": 0.06830060482025146, + "learning_rate": 3.979340397923803e-07, + "loss": 2.5228, + "step": 33300 + }, + { + "epoch": 0.987486285324556, + "grad_norm": 0.0637848749756813, + "learning_rate": 3.960594534886397e-07, + "loss": 2.5429, + "step": 33301 + }, + { + "epoch": 0.9875159386768674, + "grad_norm": 0.06337427347898483, + "learning_rate": 3.941892912635581e-07, + "loss": 2.5329, + "step": 33302 + }, + { + "epoch": 0.9875455920291789, + "grad_norm": 0.06395973265171051, + "learning_rate": 3.923235531336777e-07, + "loss": 2.5484, + "step": 33303 + }, + { + "epoch": 0.9875752453814903, + "grad_norm": 0.06310342252254486, + "learning_rate": 3.90462239115541e-07, + "loss": 2.5258, + "step": 33304 + }, + { + "epoch": 0.9876048987338019, + "grad_norm": 0.06595390290021896, + "learning_rate": 3.8860534922563475e-07, + "loss": 2.5548, + "step": 33305 + }, + { + "epoch": 0.9876345520861133, + "grad_norm": 0.06292671710252762, + "learning_rate": 3.8675288348033467e-07, + "loss": 2.5379, + "step": 33306 + }, + { + "epoch": 0.9876642054384248, + "grad_norm": 0.06183329597115517, + "learning_rate": 3.849048418961276e-07, + "loss": 2.5075, + "step": 33307 + }, + { + "epoch": 0.9876938587907363, + "grad_norm": 0.062223631888628006, + "learning_rate": 3.830612244893339e-07, + "loss": 2.5259, + "step": 33308 + }, + { + "epoch": 0.9877235121430478, + "grad_norm": 0.0661604031920433, + "learning_rate": 3.812220312763293e-07, + "loss": 2.529, + "step": 33309 + }, + { + "epoch": 0.9877531654953593, + "grad_norm": 0.06271674484014511, + "learning_rate": 3.79387262273323e-07, + "loss": 2.5019, + "step": 33310 + }, + { + "epoch": 0.9877828188476707, + "grad_norm": 0.06617759168148041, + "learning_rate": 3.7755691749663536e-07, + "loss": 2.5245, + "step": 33311 + }, + { + "epoch": 0.9878124721999822, + "grad_norm": 0.06317102164030075, + "learning_rate": 3.7573099696242007e-07, + "loss": 2.542, + "step": 33312 + }, + { + "epoch": 0.9878421255522937, + "grad_norm": 0.06295079737901688, + "learning_rate": 3.7390950068683094e-07, + "loss": 2.5238, + "step": 33313 + }, + { + "epoch": 0.9878717789046052, + "grad_norm": 0.06226501241326332, + "learning_rate": 3.7209242868607716e-07, + "loss": 2.5208, + "step": 33314 + }, + { + "epoch": 0.9879014322569166, + "grad_norm": 0.06511492282152176, + "learning_rate": 3.702797809762015e-07, + "loss": 2.5321, + "step": 33315 + }, + { + "epoch": 0.9879310856092282, + "grad_norm": 0.06300899386405945, + "learning_rate": 3.684715575732467e-07, + "loss": 2.5367, + "step": 33316 + }, + { + "epoch": 0.9879607389615396, + "grad_norm": 0.06264898180961609, + "learning_rate": 3.666677584932554e-07, + "loss": 2.5334, + "step": 33317 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 0.06466744840145111, + "learning_rate": 3.6486838375215935e-07, + "loss": 2.5469, + "step": 33318 + }, + { + "epoch": 0.9880200456661625, + "grad_norm": 0.0616026446223259, + "learning_rate": 3.630734333658903e-07, + "loss": 2.5523, + "step": 33319 + }, + { + "epoch": 0.9880496990184741, + "grad_norm": 0.06365809589624405, + "learning_rate": 3.6128290735043534e-07, + "loss": 2.5808, + "step": 33320 + }, + { + "epoch": 0.9880793523707855, + "grad_norm": 0.06152057647705078, + "learning_rate": 3.5949680572155975e-07, + "loss": 2.4926, + "step": 33321 + }, + { + "epoch": 0.988109005723097, + "grad_norm": 0.06267350912094116, + "learning_rate": 3.5771512849508415e-07, + "loss": 2.5354, + "step": 33322 + }, + { + "epoch": 0.9881386590754084, + "grad_norm": 0.06341949105262756, + "learning_rate": 3.559378756867737e-07, + "loss": 2.5432, + "step": 33323 + }, + { + "epoch": 0.98816831242772, + "grad_norm": 0.06474824249744415, + "learning_rate": 3.5416504731244915e-07, + "loss": 2.5217, + "step": 33324 + }, + { + "epoch": 0.9881979657800314, + "grad_norm": 0.06338822841644287, + "learning_rate": 3.5239664338776454e-07, + "loss": 2.5316, + "step": 33325 + }, + { + "epoch": 0.9882276191323429, + "grad_norm": 0.06286352127790451, + "learning_rate": 3.506326639283186e-07, + "loss": 2.5328, + "step": 33326 + }, + { + "epoch": 0.9882572724846543, + "grad_norm": 0.06161364167928696, + "learning_rate": 3.4887310894982095e-07, + "loss": 2.522, + "step": 33327 + }, + { + "epoch": 0.9882869258369659, + "grad_norm": 0.06247856095433235, + "learning_rate": 3.471179784678147e-07, + "loss": 2.5515, + "step": 33328 + }, + { + "epoch": 0.9883165791892774, + "grad_norm": 0.06270230561494827, + "learning_rate": 3.4536727249784296e-07, + "loss": 2.5607, + "step": 33329 + }, + { + "epoch": 0.9883462325415888, + "grad_norm": 0.06377408653497696, + "learning_rate": 3.4362099105539336e-07, + "loss": 2.531, + "step": 33330 + }, + { + "epoch": 0.9883758858939004, + "grad_norm": 0.06341584771871567, + "learning_rate": 3.418791341559535e-07, + "loss": 2.5551, + "step": 33331 + }, + { + "epoch": 0.9884055392462118, + "grad_norm": 0.06381101161241531, + "learning_rate": 3.401417018149e-07, + "loss": 2.5253, + "step": 33332 + }, + { + "epoch": 0.9884351925985233, + "grad_norm": 0.06240994855761528, + "learning_rate": 3.3840869404772046e-07, + "loss": 2.5184, + "step": 33333 + }, + { + "epoch": 0.9884648459508347, + "grad_norm": 0.06501904875040054, + "learning_rate": 3.3668011086968043e-07, + "loss": 2.5412, + "step": 33334 + }, + { + "epoch": 0.9884944993031463, + "grad_norm": 0.06534022092819214, + "learning_rate": 3.3495595229610097e-07, + "loss": 2.5499, + "step": 33335 + }, + { + "epoch": 0.9885241526554577, + "grad_norm": 0.06739651411771774, + "learning_rate": 3.332362183422477e-07, + "loss": 2.5496, + "step": 33336 + }, + { + "epoch": 0.9885538060077692, + "grad_norm": 0.06404805928468704, + "learning_rate": 3.3152090902333063e-07, + "loss": 2.5272, + "step": 33337 + }, + { + "epoch": 0.9885834593600806, + "grad_norm": 0.06590147316455841, + "learning_rate": 3.2981002435461537e-07, + "loss": 2.525, + "step": 33338 + }, + { + "epoch": 0.9886131127123922, + "grad_norm": 0.06389220058917999, + "learning_rate": 3.281035643511454e-07, + "loss": 2.5564, + "step": 33339 + }, + { + "epoch": 0.9886427660647036, + "grad_norm": 0.06259249895811081, + "learning_rate": 3.264015290281308e-07, + "loss": 2.5152, + "step": 33340 + }, + { + "epoch": 0.9886724194170151, + "grad_norm": 0.06563172489404678, + "learning_rate": 3.2470391840055957e-07, + "loss": 2.5102, + "step": 33341 + }, + { + "epoch": 0.9887020727693265, + "grad_norm": 0.06553741544485092, + "learning_rate": 3.2301073248353076e-07, + "loss": 2.5492, + "step": 33342 + }, + { + "epoch": 0.9887317261216381, + "grad_norm": 0.06488877534866333, + "learning_rate": 3.2132197129197683e-07, + "loss": 2.5546, + "step": 33343 + }, + { + "epoch": 0.9887613794739495, + "grad_norm": 0.06370134651660919, + "learning_rate": 3.196376348409413e-07, + "loss": 2.5396, + "step": 33344 + }, + { + "epoch": 0.988791032826261, + "grad_norm": 0.06488668918609619, + "learning_rate": 3.1795772314524574e-07, + "loss": 2.5458, + "step": 33345 + }, + { + "epoch": 0.9888206861785724, + "grad_norm": 0.0635618045926094, + "learning_rate": 3.1628223621982255e-07, + "loss": 2.5507, + "step": 33346 + }, + { + "epoch": 0.988850339530884, + "grad_norm": 0.06712774932384491, + "learning_rate": 3.146111740794377e-07, + "loss": 2.5319, + "step": 33347 + }, + { + "epoch": 0.9888799928831954, + "grad_norm": 0.06669130176305771, + "learning_rate": 3.129445367390238e-07, + "loss": 2.567, + "step": 33348 + }, + { + "epoch": 0.9889096462355069, + "grad_norm": 0.06164420768618584, + "learning_rate": 3.1128232421318017e-07, + "loss": 2.5299, + "step": 33349 + }, + { + "epoch": 0.9889392995878185, + "grad_norm": 0.0663100853562355, + "learning_rate": 3.096245365167283e-07, + "loss": 2.5472, + "step": 33350 + }, + { + "epoch": 0.9889689529401299, + "grad_norm": 0.06315890699625015, + "learning_rate": 3.0797117366437865e-07, + "loss": 2.5497, + "step": 33351 + }, + { + "epoch": 0.9889986062924414, + "grad_norm": 0.06285639107227325, + "learning_rate": 3.0632223567061966e-07, + "loss": 2.54, + "step": 33352 + }, + { + "epoch": 0.9890282596447528, + "grad_norm": 0.06280886381864548, + "learning_rate": 3.046777225502173e-07, + "loss": 2.5501, + "step": 33353 + }, + { + "epoch": 0.9890579129970644, + "grad_norm": 0.060799188911914825, + "learning_rate": 3.0303763431765995e-07, + "loss": 2.5343, + "step": 33354 + }, + { + "epoch": 0.9890875663493758, + "grad_norm": 0.06378140300512314, + "learning_rate": 3.0140197098743605e-07, + "loss": 2.5447, + "step": 33355 + }, + { + "epoch": 0.9891172197016873, + "grad_norm": 0.06331004202365875, + "learning_rate": 2.997707325740895e-07, + "loss": 2.5386, + "step": 33356 + }, + { + "epoch": 0.9891468730539987, + "grad_norm": 0.06478182226419449, + "learning_rate": 2.981439190920532e-07, + "loss": 2.5236, + "step": 33357 + }, + { + "epoch": 0.9891765264063103, + "grad_norm": 0.06165295094251633, + "learning_rate": 2.9652153055570454e-07, + "loss": 2.5813, + "step": 33358 + }, + { + "epoch": 0.9892061797586217, + "grad_norm": 0.06625735759735107, + "learning_rate": 2.949035669794764e-07, + "loss": 2.5352, + "step": 33359 + }, + { + "epoch": 0.9892358331109332, + "grad_norm": 0.06637116521596909, + "learning_rate": 2.932900283776352e-07, + "loss": 2.5137, + "step": 33360 + }, + { + "epoch": 0.9892654864632446, + "grad_norm": 0.06287070363759995, + "learning_rate": 2.9168091476444724e-07, + "loss": 2.5371, + "step": 33361 + }, + { + "epoch": 0.9892951398155562, + "grad_norm": 0.06382980942726135, + "learning_rate": 2.9007622615423446e-07, + "loss": 2.5309, + "step": 33362 + }, + { + "epoch": 0.9893247931678676, + "grad_norm": 0.065238356590271, + "learning_rate": 2.8847596256115214e-07, + "loss": 2.5445, + "step": 33363 + }, + { + "epoch": 0.9893544465201791, + "grad_norm": 0.06274999678134918, + "learning_rate": 2.868801239994112e-07, + "loss": 2.5635, + "step": 33364 + }, + { + "epoch": 0.9893840998724905, + "grad_norm": 0.06575652956962585, + "learning_rate": 2.852887104830559e-07, + "loss": 2.5309, + "step": 33365 + }, + { + "epoch": 0.9894137532248021, + "grad_norm": 0.06451407819986343, + "learning_rate": 2.837017220262972e-07, + "loss": 2.5552, + "step": 33366 + }, + { + "epoch": 0.9894434065771135, + "grad_norm": 0.06417787820100784, + "learning_rate": 2.821191586431793e-07, + "loss": 2.5667, + "step": 33367 + }, + { + "epoch": 0.989473059929425, + "grad_norm": 0.06479158997535706, + "learning_rate": 2.8054102034758e-07, + "loss": 2.5672, + "step": 33368 + }, + { + "epoch": 0.9895027132817364, + "grad_norm": 0.062322139739990234, + "learning_rate": 2.789673071535992e-07, + "loss": 2.5459, + "step": 33369 + }, + { + "epoch": 0.989532366634048, + "grad_norm": 0.06366170942783356, + "learning_rate": 2.7739801907517015e-07, + "loss": 2.5317, + "step": 33370 + }, + { + "epoch": 0.9895620199863595, + "grad_norm": 0.06647960841655731, + "learning_rate": 2.758331561261151e-07, + "loss": 2.5475, + "step": 33371 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 0.06110381707549095, + "learning_rate": 2.742727183203675e-07, + "loss": 2.5263, + "step": 33372 + }, + { + "epoch": 0.9896213266909825, + "grad_norm": 0.06391549110412598, + "learning_rate": 2.7271670567169393e-07, + "loss": 2.52, + "step": 33373 + }, + { + "epoch": 0.9896509800432939, + "grad_norm": 0.06370129436254501, + "learning_rate": 2.7116511819391674e-07, + "loss": 2.5489, + "step": 33374 + }, + { + "epoch": 0.9896806333956054, + "grad_norm": 0.06124599277973175, + "learning_rate": 2.696179559007472e-07, + "loss": 2.5228, + "step": 33375 + }, + { + "epoch": 0.9897102867479168, + "grad_norm": 0.06515344977378845, + "learning_rate": 2.6807521880584107e-07, + "loss": 2.5478, + "step": 33376 + }, + { + "epoch": 0.9897399401002284, + "grad_norm": 0.06599663943052292, + "learning_rate": 2.6653690692296506e-07, + "loss": 2.5081, + "step": 33377 + }, + { + "epoch": 0.9897695934525398, + "grad_norm": 0.06673907488584518, + "learning_rate": 2.6500302026566394e-07, + "loss": 2.5741, + "step": 33378 + }, + { + "epoch": 0.9897992468048513, + "grad_norm": 0.06659798324108124, + "learning_rate": 2.6347355884748236e-07, + "loss": 2.5099, + "step": 33379 + }, + { + "epoch": 0.9898289001571627, + "grad_norm": 0.061897940933704376, + "learning_rate": 2.6194852268207616e-07, + "loss": 2.5404, + "step": 33380 + }, + { + "epoch": 0.9898585535094743, + "grad_norm": 0.06414566934108734, + "learning_rate": 2.6042791178287894e-07, + "loss": 2.5385, + "step": 33381 + }, + { + "epoch": 0.9898882068617857, + "grad_norm": 0.06441313773393631, + "learning_rate": 2.5891172616338e-07, + "loss": 2.5459, + "step": 33382 + }, + { + "epoch": 0.9899178602140972, + "grad_norm": 0.06283443421125412, + "learning_rate": 2.5739996583701297e-07, + "loss": 2.5344, + "step": 33383 + }, + { + "epoch": 0.9899475135664086, + "grad_norm": 0.0630260780453682, + "learning_rate": 2.558926308171561e-07, + "loss": 2.5327, + "step": 33384 + }, + { + "epoch": 0.9899771669187202, + "grad_norm": 0.06320879608392715, + "learning_rate": 2.54389721117132e-07, + "loss": 2.5165, + "step": 33385 + }, + { + "epoch": 0.9900068202710316, + "grad_norm": 0.0646626427769661, + "learning_rate": 2.5289123675026337e-07, + "loss": 2.5623, + "step": 33386 + }, + { + "epoch": 0.9900364736233431, + "grad_norm": 0.07280343025922775, + "learning_rate": 2.513971777298174e-07, + "loss": 2.5483, + "step": 33387 + }, + { + "epoch": 0.9900661269756545, + "grad_norm": 0.0646759644150734, + "learning_rate": 2.4990754406900574e-07, + "loss": 2.5346, + "step": 33388 + }, + { + "epoch": 0.9900957803279661, + "grad_norm": 0.06424381583929062, + "learning_rate": 2.484223357810955e-07, + "loss": 2.52, + "step": 33389 + }, + { + "epoch": 0.9901254336802775, + "grad_norm": 0.06214006245136261, + "learning_rate": 2.469415528791319e-07, + "loss": 2.5073, + "step": 33390 + }, + { + "epoch": 0.990155087032589, + "grad_norm": 0.06438543647527695, + "learning_rate": 2.454651953763265e-07, + "loss": 2.5647, + "step": 33391 + }, + { + "epoch": 0.9901847403849006, + "grad_norm": 0.06582359969615936, + "learning_rate": 2.4399326328572447e-07, + "loss": 2.5403, + "step": 33392 + }, + { + "epoch": 0.990214393737212, + "grad_norm": 0.062190745025873184, + "learning_rate": 2.425257566203154e-07, + "loss": 2.5326, + "step": 33393 + }, + { + "epoch": 0.9902440470895235, + "grad_norm": 0.06277115643024445, + "learning_rate": 2.410626753931444e-07, + "loss": 2.5509, + "step": 33394 + }, + { + "epoch": 0.9902737004418349, + "grad_norm": 0.06235882639884949, + "learning_rate": 2.396040196170901e-07, + "loss": 2.5516, + "step": 33395 + }, + { + "epoch": 0.9903033537941465, + "grad_norm": 0.06161965802311897, + "learning_rate": 2.3814978930514208e-07, + "loss": 2.5344, + "step": 33396 + }, + { + "epoch": 0.9903330071464579, + "grad_norm": 0.06245952844619751, + "learning_rate": 2.3669998447017893e-07, + "loss": 2.5711, + "step": 33397 + }, + { + "epoch": 0.9903626604987694, + "grad_norm": 0.06366080790758133, + "learning_rate": 2.3525460512502373e-07, + "loss": 2.5024, + "step": 33398 + }, + { + "epoch": 0.9903923138510808, + "grad_norm": 0.06272205710411072, + "learning_rate": 2.3381365128249955e-07, + "loss": 2.5691, + "step": 33399 + }, + { + "epoch": 0.9904219672033924, + "grad_norm": 0.06605342030525208, + "learning_rate": 2.3237712295531844e-07, + "loss": 2.5259, + "step": 33400 + }, + { + "epoch": 0.9904516205557038, + "grad_norm": 0.061525266617536545, + "learning_rate": 2.3094502015619245e-07, + "loss": 2.5336, + "step": 33401 + }, + { + "epoch": 0.9904812739080153, + "grad_norm": 0.06289894878864288, + "learning_rate": 2.2951734289783367e-07, + "loss": 2.5668, + "step": 33402 + }, + { + "epoch": 0.9905109272603267, + "grad_norm": 0.06557037681341171, + "learning_rate": 2.280940911929541e-07, + "loss": 2.4961, + "step": 33403 + }, + { + "epoch": 0.9905405806126383, + "grad_norm": 0.06333746016025543, + "learning_rate": 2.2667526505398828e-07, + "loss": 2.5291, + "step": 33404 + }, + { + "epoch": 0.9905702339649497, + "grad_norm": 0.06412909179925919, + "learning_rate": 2.2526086449364824e-07, + "loss": 2.5147, + "step": 33405 + }, + { + "epoch": 0.9905998873172612, + "grad_norm": 0.06424381583929062, + "learning_rate": 2.2385088952442402e-07, + "loss": 2.5399, + "step": 33406 + }, + { + "epoch": 0.9906295406695727, + "grad_norm": 0.06354490667581558, + "learning_rate": 2.2244534015875006e-07, + "loss": 2.5337, + "step": 33407 + }, + { + "epoch": 0.9906591940218842, + "grad_norm": 0.06344626098871231, + "learning_rate": 2.2104421640911643e-07, + "loss": 2.557, + "step": 33408 + }, + { + "epoch": 0.9906888473741956, + "grad_norm": 0.06251586973667145, + "learning_rate": 2.196475182879576e-07, + "loss": 2.5612, + "step": 33409 + }, + { + "epoch": 0.9907185007265071, + "grad_norm": 0.0613347589969635, + "learning_rate": 2.1825524580754152e-07, + "loss": 2.5365, + "step": 33410 + }, + { + "epoch": 0.9907481540788187, + "grad_norm": 0.28361237049102783, + "learning_rate": 2.1686739898030272e-07, + "loss": 2.5204, + "step": 33411 + }, + { + "epoch": 0.9907778074311301, + "grad_norm": 0.06656904518604279, + "learning_rate": 2.1548397781850913e-07, + "loss": 2.5334, + "step": 33412 + }, + { + "epoch": 0.9908074607834416, + "grad_norm": 0.06645222008228302, + "learning_rate": 2.1410498233437326e-07, + "loss": 2.5563, + "step": 33413 + }, + { + "epoch": 0.990837114135753, + "grad_norm": 0.06786248832941055, + "learning_rate": 2.127304125401075e-07, + "loss": 2.5192, + "step": 33414 + }, + { + "epoch": 0.9908667674880646, + "grad_norm": 0.06546814739704132, + "learning_rate": 2.1136026844792434e-07, + "loss": 2.5471, + "step": 33415 + }, + { + "epoch": 0.990896420840376, + "grad_norm": 0.06574031710624695, + "learning_rate": 2.099945500699252e-07, + "loss": 2.5063, + "step": 33416 + }, + { + "epoch": 0.9909260741926875, + "grad_norm": 0.06511153280735016, + "learning_rate": 2.086332574182115e-07, + "loss": 2.55, + "step": 33417 + }, + { + "epoch": 0.9909557275449989, + "grad_norm": 0.06449881196022034, + "learning_rate": 2.0727639050482917e-07, + "loss": 2.5013, + "step": 33418 + }, + { + "epoch": 0.9909853808973105, + "grad_norm": 0.0686754658818245, + "learning_rate": 2.0592394934182413e-07, + "loss": 2.5594, + "step": 33419 + }, + { + "epoch": 0.9910150342496219, + "grad_norm": 0.06369767338037491, + "learning_rate": 2.0457593394113128e-07, + "loss": 2.5344, + "step": 33420 + }, + { + "epoch": 0.9910446876019334, + "grad_norm": 0.06341932713985443, + "learning_rate": 2.032323443146855e-07, + "loss": 2.5139, + "step": 33421 + }, + { + "epoch": 0.9910743409542448, + "grad_norm": 0.06542066484689713, + "learning_rate": 2.0189318047447724e-07, + "loss": 2.5471, + "step": 33422 + }, + { + "epoch": 0.9911039943065564, + "grad_norm": 0.06334774941205978, + "learning_rate": 2.0055844243221932e-07, + "loss": 2.554, + "step": 33423 + }, + { + "epoch": 0.9911336476588678, + "grad_norm": 0.06379901617765427, + "learning_rate": 1.9922813019984663e-07, + "loss": 2.5533, + "step": 33424 + }, + { + "epoch": 0.9911633010111793, + "grad_norm": 0.0627954974770546, + "learning_rate": 1.9790224378907207e-07, + "loss": 2.5391, + "step": 33425 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 0.06356161832809448, + "learning_rate": 1.9658078321171947e-07, + "loss": 2.5366, + "step": 33426 + }, + { + "epoch": 0.9912226077158023, + "grad_norm": 0.06290891766548157, + "learning_rate": 1.9526374847939066e-07, + "loss": 2.5444, + "step": 33427 + }, + { + "epoch": 0.9912522610681137, + "grad_norm": 0.06232719123363495, + "learning_rate": 1.939511396037985e-07, + "loss": 2.5667, + "step": 33428 + }, + { + "epoch": 0.9912819144204252, + "grad_norm": 0.06521056592464447, + "learning_rate": 1.9264295659654486e-07, + "loss": 2.5351, + "step": 33429 + }, + { + "epoch": 0.9913115677727367, + "grad_norm": 0.06200703978538513, + "learning_rate": 1.913391994692315e-07, + "loss": 2.5563, + "step": 33430 + }, + { + "epoch": 0.9913412211250482, + "grad_norm": 0.06409083306789398, + "learning_rate": 1.900398682334048e-07, + "loss": 2.5211, + "step": 33431 + }, + { + "epoch": 0.9913708744773597, + "grad_norm": 0.06418272107839584, + "learning_rate": 1.8874496290055554e-07, + "loss": 2.5412, + "step": 33432 + }, + { + "epoch": 0.9914005278296711, + "grad_norm": 0.06553498655557632, + "learning_rate": 1.8745448348223006e-07, + "loss": 2.5332, + "step": 33433 + }, + { + "epoch": 0.9914301811819827, + "grad_norm": 0.06325999647378922, + "learning_rate": 1.8616842998969706e-07, + "loss": 2.5177, + "step": 33434 + }, + { + "epoch": 0.9914598345342941, + "grad_norm": 0.06235473230481148, + "learning_rate": 1.8488680243450294e-07, + "loss": 2.5354, + "step": 33435 + }, + { + "epoch": 0.9914894878866056, + "grad_norm": 0.06575626134872437, + "learning_rate": 1.8360960082786093e-07, + "loss": 2.5262, + "step": 33436 + }, + { + "epoch": 0.991519141238917, + "grad_norm": 0.06188749521970749, + "learning_rate": 1.8233682518120631e-07, + "loss": 2.5237, + "step": 33437 + }, + { + "epoch": 0.9915487945912286, + "grad_norm": 0.06440915912389755, + "learning_rate": 1.8106847550569682e-07, + "loss": 2.5282, + "step": 33438 + }, + { + "epoch": 0.99157844794354, + "grad_norm": 0.06372815370559692, + "learning_rate": 1.7980455181265675e-07, + "loss": 2.5394, + "step": 33439 + }, + { + "epoch": 0.9916081012958515, + "grad_norm": 0.0635833665728569, + "learning_rate": 1.7854505411324383e-07, + "loss": 2.554, + "step": 33440 + }, + { + "epoch": 0.991637754648163, + "grad_norm": 0.06598199158906937, + "learning_rate": 1.7728998241861581e-07, + "loss": 2.5114, + "step": 33441 + }, + { + "epoch": 0.9916674080004745, + "grad_norm": 0.06283647567033768, + "learning_rate": 1.7603933673987492e-07, + "loss": 2.5272, + "step": 33442 + }, + { + "epoch": 0.9916970613527859, + "grad_norm": 0.06425818800926208, + "learning_rate": 1.747931170880679e-07, + "loss": 2.5841, + "step": 33443 + }, + { + "epoch": 0.9917267147050974, + "grad_norm": 0.06355450302362442, + "learning_rate": 1.7355132347429692e-07, + "loss": 2.5641, + "step": 33444 + }, + { + "epoch": 0.9917563680574089, + "grad_norm": 0.06486191600561142, + "learning_rate": 1.7231395590949772e-07, + "loss": 2.4927, + "step": 33445 + }, + { + "epoch": 0.9917860214097204, + "grad_norm": 0.06342343240976334, + "learning_rate": 1.7108101440466151e-07, + "loss": 2.5196, + "step": 33446 + }, + { + "epoch": 0.9918156747620318, + "grad_norm": 0.0630570724606514, + "learning_rate": 1.6985249897066845e-07, + "loss": 2.5155, + "step": 33447 + }, + { + "epoch": 0.9918453281143433, + "grad_norm": 0.06568034738302231, + "learning_rate": 1.6862840961845427e-07, + "loss": 2.5404, + "step": 33448 + }, + { + "epoch": 0.9918749814666548, + "grad_norm": 0.06534357368946075, + "learning_rate": 1.6740874635884362e-07, + "loss": 2.5354, + "step": 33449 + }, + { + "epoch": 0.9919046348189663, + "grad_norm": 0.0638415664434433, + "learning_rate": 1.6619350920260568e-07, + "loss": 2.544, + "step": 33450 + }, + { + "epoch": 0.9919342881712777, + "grad_norm": 0.06526950001716614, + "learning_rate": 1.649826981605651e-07, + "loss": 2.5302, + "step": 33451 + }, + { + "epoch": 0.9919639415235892, + "grad_norm": 0.06366614252328873, + "learning_rate": 1.6377631324332453e-07, + "loss": 2.5613, + "step": 33452 + }, + { + "epoch": 0.9919935948759008, + "grad_norm": 0.062122005969285965, + "learning_rate": 1.6257435446170866e-07, + "loss": 2.5552, + "step": 33453 + }, + { + "epoch": 0.9920232482282122, + "grad_norm": 0.06487925350666046, + "learning_rate": 1.6137682182620905e-07, + "loss": 2.5532, + "step": 33454 + }, + { + "epoch": 0.9920529015805237, + "grad_norm": 0.061284344643354416, + "learning_rate": 1.601837153475949e-07, + "loss": 2.5407, + "step": 33455 + }, + { + "epoch": 0.9920825549328351, + "grad_norm": 0.0644504725933075, + "learning_rate": 1.589950350363023e-07, + "loss": 2.5002, + "step": 33456 + }, + { + "epoch": 0.9921122082851467, + "grad_norm": 0.0607696957886219, + "learning_rate": 1.5781078090293387e-07, + "loss": 2.5086, + "step": 33457 + }, + { + "epoch": 0.9921418616374581, + "grad_norm": 0.0629277154803276, + "learning_rate": 1.5663095295792574e-07, + "loss": 2.5706, + "step": 33458 + }, + { + "epoch": 0.9921715149897696, + "grad_norm": 0.06470983475446701, + "learning_rate": 1.5545555121176946e-07, + "loss": 2.5218, + "step": 33459 + }, + { + "epoch": 0.992201168342081, + "grad_norm": 0.06255801767110825, + "learning_rate": 1.5428457567484566e-07, + "loss": 2.5694, + "step": 33460 + }, + { + "epoch": 0.9922308216943926, + "grad_norm": 0.06326036155223846, + "learning_rate": 1.531180263575349e-07, + "loss": 2.5478, + "step": 33461 + }, + { + "epoch": 0.992260475046704, + "grad_norm": 0.06398481130599976, + "learning_rate": 1.5195590327010677e-07, + "loss": 2.4955, + "step": 33462 + }, + { + "epoch": 0.9922901283990155, + "grad_norm": 0.06249868497252464, + "learning_rate": 1.5079820642299735e-07, + "loss": 2.5173, + "step": 33463 + }, + { + "epoch": 0.992319781751327, + "grad_norm": 0.06327234953641891, + "learning_rate": 1.4964493582630967e-07, + "loss": 2.5584, + "step": 33464 + }, + { + "epoch": 0.9923494351036385, + "grad_norm": 0.0631176084280014, + "learning_rate": 1.484960914903133e-07, + "loss": 2.5148, + "step": 33465 + }, + { + "epoch": 0.9923790884559499, + "grad_norm": 0.06336142867803574, + "learning_rate": 1.4735167342516675e-07, + "loss": 2.5211, + "step": 33466 + }, + { + "epoch": 0.9924087418082614, + "grad_norm": 0.06248391792178154, + "learning_rate": 1.462116816410841e-07, + "loss": 2.5548, + "step": 33467 + }, + { + "epoch": 0.9924383951605729, + "grad_norm": 0.06518889963626862, + "learning_rate": 1.4507611614805738e-07, + "loss": 2.5383, + "step": 33468 + }, + { + "epoch": 0.9924680485128844, + "grad_norm": 0.0652036964893341, + "learning_rate": 1.4394497695618958e-07, + "loss": 2.5455, + "step": 33469 + }, + { + "epoch": 0.9924977018651958, + "grad_norm": 0.06326261162757874, + "learning_rate": 1.428182640754727e-07, + "loss": 2.5298, + "step": 33470 + }, + { + "epoch": 0.9925273552175073, + "grad_norm": 0.0666147992014885, + "learning_rate": 1.416959775158988e-07, + "loss": 2.5464, + "step": 33471 + }, + { + "epoch": 0.9925570085698188, + "grad_norm": 0.0636880025267601, + "learning_rate": 1.4057811728740432e-07, + "loss": 2.5349, + "step": 33472 + }, + { + "epoch": 0.9925866619221303, + "grad_norm": 0.06389322876930237, + "learning_rate": 1.3946468339992579e-07, + "loss": 2.559, + "step": 33473 + }, + { + "epoch": 0.9926163152744418, + "grad_norm": 0.06356436759233475, + "learning_rate": 1.3835567586323318e-07, + "loss": 2.5052, + "step": 33474 + }, + { + "epoch": 0.9926459686267533, + "grad_norm": 0.06584424525499344, + "learning_rate": 1.3725109468726293e-07, + "loss": 2.5286, + "step": 33475 + }, + { + "epoch": 0.9926756219790648, + "grad_norm": 0.06414630264043808, + "learning_rate": 1.3615093988167403e-07, + "loss": 2.5272, + "step": 33476 + }, + { + "epoch": 0.9927052753313762, + "grad_norm": 0.06273967027664185, + "learning_rate": 1.3505521145629195e-07, + "loss": 2.4965, + "step": 33477 + }, + { + "epoch": 0.9927349286836877, + "grad_norm": 0.0643828883767128, + "learning_rate": 1.3396390942083115e-07, + "loss": 2.5465, + "step": 33478 + }, + { + "epoch": 0.9927645820359992, + "grad_norm": 0.06458545476198196, + "learning_rate": 1.3287703378489503e-07, + "loss": 2.5663, + "step": 33479 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 0.06667469441890717, + "learning_rate": 1.3179458455814254e-07, + "loss": 2.5103, + "step": 33480 + }, + { + "epoch": 0.9928238887406221, + "grad_norm": 0.06175608932971954, + "learning_rate": 1.307165617501216e-07, + "loss": 2.5453, + "step": 33481 + }, + { + "epoch": 0.9928535420929336, + "grad_norm": 0.06641983985900879, + "learning_rate": 1.2964296537043562e-07, + "loss": 2.5455, + "step": 33482 + }, + { + "epoch": 0.9928831954452451, + "grad_norm": 0.06502782553434372, + "learning_rate": 1.2857379542852153e-07, + "loss": 2.5666, + "step": 33483 + }, + { + "epoch": 0.9929128487975566, + "grad_norm": 0.0617021843791008, + "learning_rate": 1.2750905193392726e-07, + "loss": 2.5533, + "step": 33484 + }, + { + "epoch": 0.992942502149868, + "grad_norm": 0.062442149966955185, + "learning_rate": 1.2644873489603416e-07, + "loss": 2.5621, + "step": 33485 + }, + { + "epoch": 0.9929721555021795, + "grad_norm": 0.06300043314695358, + "learning_rate": 1.253928443242236e-07, + "loss": 2.5719, + "step": 33486 + }, + { + "epoch": 0.993001808854491, + "grad_norm": 0.06441802531480789, + "learning_rate": 1.24341380227877e-07, + "loss": 2.5199, + "step": 33487 + }, + { + "epoch": 0.9930314622068025, + "grad_norm": 0.06264176219701767, + "learning_rate": 1.2329434261632022e-07, + "loss": 2.5442, + "step": 33488 + }, + { + "epoch": 0.9930611155591139, + "grad_norm": 0.06259927153587341, + "learning_rate": 1.222517314987681e-07, + "loss": 2.5356, + "step": 33489 + }, + { + "epoch": 0.9930907689114254, + "grad_norm": 0.06200070679187775, + "learning_rate": 1.2121354688443552e-07, + "loss": 2.5328, + "step": 33490 + }, + { + "epoch": 0.9931204222637369, + "grad_norm": 0.06650198251008987, + "learning_rate": 1.2017978878264834e-07, + "loss": 2.5554, + "step": 33491 + }, + { + "epoch": 0.9931500756160484, + "grad_norm": 0.06500398367643356, + "learning_rate": 1.1915045720239936e-07, + "loss": 2.5591, + "step": 33492 + }, + { + "epoch": 0.9931797289683598, + "grad_norm": 0.062039680778980255, + "learning_rate": 1.1812555215290343e-07, + "loss": 2.5657, + "step": 33493 + }, + { + "epoch": 0.9932093823206714, + "grad_norm": 0.0649947077035904, + "learning_rate": 1.1710507364320888e-07, + "loss": 2.5111, + "step": 33494 + }, + { + "epoch": 0.9932390356729829, + "grad_norm": 0.06572961807250977, + "learning_rate": 1.1608902168236401e-07, + "loss": 2.5248, + "step": 33495 + }, + { + "epoch": 0.9932686890252943, + "grad_norm": 0.0648757740855217, + "learning_rate": 1.1507739627930614e-07, + "loss": 2.5413, + "step": 33496 + }, + { + "epoch": 0.9932983423776058, + "grad_norm": 0.06301485747098923, + "learning_rate": 1.1407019744308356e-07, + "loss": 2.5495, + "step": 33497 + }, + { + "epoch": 0.9933279957299173, + "grad_norm": 0.06803081929683685, + "learning_rate": 1.1306742518257807e-07, + "loss": 2.5606, + "step": 33498 + }, + { + "epoch": 0.9933576490822288, + "grad_norm": 0.06319519877433777, + "learning_rate": 1.1206907950667145e-07, + "loss": 2.5109, + "step": 33499 + }, + { + "epoch": 0.9933873024345402, + "grad_norm": 0.0641995370388031, + "learning_rate": 1.1107516042418998e-07, + "loss": 2.5575, + "step": 33500 + }, + { + "epoch": 0.9934169557868517, + "grad_norm": 0.06464221328496933, + "learning_rate": 1.1008566794390439e-07, + "loss": 2.5373, + "step": 33501 + }, + { + "epoch": 0.9934466091391632, + "grad_norm": 0.06438874453306198, + "learning_rate": 1.0910060207464101e-07, + "loss": 2.5782, + "step": 33502 + }, + { + "epoch": 0.9934762624914747, + "grad_norm": 0.06230999156832695, + "learning_rate": 1.0811996282511504e-07, + "loss": 2.5305, + "step": 33503 + }, + { + "epoch": 0.9935059158437861, + "grad_norm": 0.06335221976041794, + "learning_rate": 1.0714375020398626e-07, + "loss": 2.5504, + "step": 33504 + }, + { + "epoch": 0.9935355691960976, + "grad_norm": 0.06614981591701508, + "learning_rate": 1.0617196421991438e-07, + "loss": 2.5633, + "step": 33505 + }, + { + "epoch": 0.9935652225484091, + "grad_norm": 0.06254303455352783, + "learning_rate": 1.0520460488144812e-07, + "loss": 2.5508, + "step": 33506 + }, + { + "epoch": 0.9935948759007206, + "grad_norm": 0.06401286274194717, + "learning_rate": 1.0424167219724723e-07, + "loss": 2.5485, + "step": 33507 + }, + { + "epoch": 0.993624529253032, + "grad_norm": 0.06147677078843117, + "learning_rate": 1.032831661757494e-07, + "loss": 2.5159, + "step": 33508 + }, + { + "epoch": 0.9936541826053435, + "grad_norm": 0.06469673663377762, + "learning_rate": 1.0232908682550335e-07, + "loss": 2.531, + "step": 33509 + }, + { + "epoch": 0.993683835957655, + "grad_norm": 0.06269349902868271, + "learning_rate": 1.0137943415494677e-07, + "loss": 2.5425, + "step": 33510 + }, + { + "epoch": 0.9937134893099665, + "grad_norm": 0.06289201229810715, + "learning_rate": 1.0043420817251736e-07, + "loss": 2.541, + "step": 33511 + }, + { + "epoch": 0.9937431426622779, + "grad_norm": 0.06384654343128204, + "learning_rate": 9.949340888648629e-08, + "loss": 2.5528, + "step": 33512 + }, + { + "epoch": 0.9937727960145895, + "grad_norm": 0.06344769150018692, + "learning_rate": 9.855703630529123e-08, + "loss": 2.5583, + "step": 33513 + }, + { + "epoch": 0.9938024493669009, + "grad_norm": 0.06556446105241776, + "learning_rate": 9.762509043714784e-08, + "loss": 2.5504, + "step": 33514 + }, + { + "epoch": 0.9938321027192124, + "grad_norm": 0.06433802843093872, + "learning_rate": 9.669757129038282e-08, + "loss": 2.5452, + "step": 33515 + }, + { + "epoch": 0.9938617560715239, + "grad_norm": 0.06325292587280273, + "learning_rate": 9.577447887315627e-08, + "loss": 2.5435, + "step": 33516 + }, + { + "epoch": 0.9938914094238354, + "grad_norm": 0.06410210579633713, + "learning_rate": 9.485581319362835e-08, + "loss": 2.5329, + "step": 33517 + }, + { + "epoch": 0.9939210627761469, + "grad_norm": 0.06130586937069893, + "learning_rate": 9.39415742599592e-08, + "loss": 2.5626, + "step": 33518 + }, + { + "epoch": 0.9939507161284583, + "grad_norm": 0.06284566223621368, + "learning_rate": 9.303176208025344e-08, + "loss": 2.5599, + "step": 33519 + }, + { + "epoch": 0.9939803694807698, + "grad_norm": 0.06380210071802139, + "learning_rate": 9.212637666261569e-08, + "loss": 2.5191, + "step": 33520 + }, + { + "epoch": 0.9940100228330813, + "grad_norm": 0.06422172486782074, + "learning_rate": 9.122541801492856e-08, + "loss": 2.5378, + "step": 33521 + }, + { + "epoch": 0.9940396761853928, + "grad_norm": 0.06218087673187256, + "learning_rate": 9.032888614529667e-08, + "loss": 2.5384, + "step": 33522 + }, + { + "epoch": 0.9940693295377042, + "grad_norm": 0.06237149238586426, + "learning_rate": 8.943678106154706e-08, + "loss": 2.5004, + "step": 33523 + }, + { + "epoch": 0.9940989828900157, + "grad_norm": 0.06352212280035019, + "learning_rate": 8.854910277172889e-08, + "loss": 2.5261, + "step": 33524 + }, + { + "epoch": 0.9941286362423272, + "grad_norm": 0.06527052074670792, + "learning_rate": 8.766585128355819e-08, + "loss": 2.5151, + "step": 33525 + }, + { + "epoch": 0.9941582895946387, + "grad_norm": 0.06532377004623413, + "learning_rate": 8.678702660491755e-08, + "loss": 2.5819, + "step": 33526 + }, + { + "epoch": 0.9941879429469501, + "grad_norm": 0.06452678889036179, + "learning_rate": 8.591262874363403e-08, + "loss": 2.5596, + "step": 33527 + }, + { + "epoch": 0.9942175962992617, + "grad_norm": 0.06374525278806686, + "learning_rate": 8.504265770736819e-08, + "loss": 2.5043, + "step": 33528 + }, + { + "epoch": 0.9942472496515731, + "grad_norm": 0.0636897087097168, + "learning_rate": 8.417711350383605e-08, + "loss": 2.5315, + "step": 33529 + }, + { + "epoch": 0.9942769030038846, + "grad_norm": 0.06318527460098267, + "learning_rate": 8.331599614075369e-08, + "loss": 2.5272, + "step": 33530 + }, + { + "epoch": 0.994306556356196, + "grad_norm": 0.06296731531620026, + "learning_rate": 8.245930562572613e-08, + "loss": 2.513, + "step": 33531 + }, + { + "epoch": 0.9943362097085076, + "grad_norm": 0.063988097012043, + "learning_rate": 8.160704196630286e-08, + "loss": 2.5467, + "step": 33532 + }, + { + "epoch": 0.994365863060819, + "grad_norm": 0.06280766427516937, + "learning_rate": 8.075920517008895e-08, + "loss": 2.5712, + "step": 33533 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 0.061620865017175674, + "learning_rate": 7.991579524457836e-08, + "loss": 2.5471, + "step": 33534 + }, + { + "epoch": 0.9944251697654419, + "grad_norm": 0.06344550848007202, + "learning_rate": 7.907681219715413e-08, + "loss": 2.5326, + "step": 33535 + }, + { + "epoch": 0.9944548231177535, + "grad_norm": 0.06506466120481491, + "learning_rate": 7.824225603536573e-08, + "loss": 2.5262, + "step": 33536 + }, + { + "epoch": 0.994484476470065, + "grad_norm": 0.06357508897781372, + "learning_rate": 7.741212676654064e-08, + "loss": 2.5621, + "step": 33537 + }, + { + "epoch": 0.9945141298223764, + "grad_norm": 0.06588829308748245, + "learning_rate": 7.658642439806185e-08, + "loss": 2.5484, + "step": 33538 + }, + { + "epoch": 0.994543783174688, + "grad_norm": 0.0638294667005539, + "learning_rate": 7.576514893720132e-08, + "loss": 2.5421, + "step": 33539 + }, + { + "epoch": 0.9945734365269994, + "grad_norm": 0.06478855758905411, + "learning_rate": 7.494830039123101e-08, + "loss": 2.5216, + "step": 33540 + }, + { + "epoch": 0.9946030898793109, + "grad_norm": 0.06330639868974686, + "learning_rate": 7.413587876742289e-08, + "loss": 2.5832, + "step": 33541 + }, + { + "epoch": 0.9946327432316223, + "grad_norm": 0.06283260136842728, + "learning_rate": 7.33278840729934e-08, + "loss": 2.5412, + "step": 33542 + }, + { + "epoch": 0.9946623965839338, + "grad_norm": 0.06259479373693466, + "learning_rate": 7.252431631499246e-08, + "loss": 2.5153, + "step": 33543 + }, + { + "epoch": 0.9946920499362453, + "grad_norm": 0.06468959897756577, + "learning_rate": 7.172517550063651e-08, + "loss": 2.5406, + "step": 33544 + }, + { + "epoch": 0.9947217032885568, + "grad_norm": 0.06371042132377625, + "learning_rate": 7.093046163697548e-08, + "loss": 2.5418, + "step": 33545 + }, + { + "epoch": 0.9947513566408682, + "grad_norm": 0.06214692071080208, + "learning_rate": 7.014017473100375e-08, + "loss": 2.5489, + "step": 33546 + }, + { + "epoch": 0.9947810099931798, + "grad_norm": 0.06536965072154999, + "learning_rate": 6.935431478977128e-08, + "loss": 2.5503, + "step": 33547 + }, + { + "epoch": 0.9948106633454912, + "grad_norm": 0.06484802067279816, + "learning_rate": 6.857288182021692e-08, + "loss": 2.5661, + "step": 33548 + }, + { + "epoch": 0.9948403166978027, + "grad_norm": 0.06391003727912903, + "learning_rate": 6.779587582927959e-08, + "loss": 2.5094, + "step": 33549 + }, + { + "epoch": 0.9948699700501141, + "grad_norm": 0.06326790899038315, + "learning_rate": 6.702329682378717e-08, + "loss": 2.5122, + "step": 33550 + }, + { + "epoch": 0.9948996234024257, + "grad_norm": 0.06692925840616226, + "learning_rate": 6.6255144810623e-08, + "loss": 2.567, + "step": 33551 + }, + { + "epoch": 0.9949292767547371, + "grad_norm": 0.06497519463300705, + "learning_rate": 6.549141979661499e-08, + "loss": 2.5121, + "step": 33552 + }, + { + "epoch": 0.9949589301070486, + "grad_norm": 0.06378629803657532, + "learning_rate": 6.473212178842446e-08, + "loss": 2.5561, + "step": 33553 + }, + { + "epoch": 0.99498858345936, + "grad_norm": 0.06250607967376709, + "learning_rate": 6.397725079287931e-08, + "loss": 2.5445, + "step": 33554 + }, + { + "epoch": 0.9950182368116716, + "grad_norm": 0.06539301574230194, + "learning_rate": 6.322680681664083e-08, + "loss": 2.5321, + "step": 33555 + }, + { + "epoch": 0.995047890163983, + "grad_norm": 0.062230125069618225, + "learning_rate": 6.24807898663704e-08, + "loss": 2.5288, + "step": 33556 + }, + { + "epoch": 0.9950775435162945, + "grad_norm": 0.06400015950202942, + "learning_rate": 6.173919994861833e-08, + "loss": 2.518, + "step": 33557 + }, + { + "epoch": 0.995107196868606, + "grad_norm": 0.06411241739988327, + "learning_rate": 6.10020370699349e-08, + "loss": 2.5212, + "step": 33558 + }, + { + "epoch": 0.9951368502209175, + "grad_norm": 0.06611794978380203, + "learning_rate": 6.026930123692598e-08, + "loss": 2.4984, + "step": 33559 + }, + { + "epoch": 0.995166503573229, + "grad_norm": 0.06533880531787872, + "learning_rate": 5.9540992456086354e-08, + "loss": 2.5327, + "step": 33560 + }, + { + "epoch": 0.9951961569255404, + "grad_norm": 0.062289077788591385, + "learning_rate": 5.881711073379981e-08, + "loss": 2.5206, + "step": 33561 + }, + { + "epoch": 0.995225810277852, + "grad_norm": 0.060906682163476944, + "learning_rate": 5.809765607645012e-08, + "loss": 2.5471, + "step": 33562 + }, + { + "epoch": 0.9952554636301634, + "grad_norm": 0.06303226202726364, + "learning_rate": 5.7382628490532105e-08, + "loss": 2.5511, + "step": 33563 + }, + { + "epoch": 0.9952851169824749, + "grad_norm": 0.06264624744653702, + "learning_rate": 5.6672027982263006e-08, + "loss": 2.5186, + "step": 33564 + }, + { + "epoch": 0.9953147703347863, + "grad_norm": 0.06213441863656044, + "learning_rate": 5.5965854557971095e-08, + "loss": 2.5136, + "step": 33565 + }, + { + "epoch": 0.9953444236870979, + "grad_norm": 0.06425903737545013, + "learning_rate": 5.526410822392913e-08, + "loss": 2.5533, + "step": 33566 + }, + { + "epoch": 0.9953740770394093, + "grad_norm": 0.06506843864917755, + "learning_rate": 5.456678898635436e-08, + "loss": 2.5196, + "step": 33567 + }, + { + "epoch": 0.9954037303917208, + "grad_norm": 0.06517761200666428, + "learning_rate": 5.387389685140853e-08, + "loss": 2.5426, + "step": 33568 + }, + { + "epoch": 0.9954333837440322, + "grad_norm": 0.06322009861469269, + "learning_rate": 5.318543182519786e-08, + "loss": 2.5476, + "step": 33569 + }, + { + "epoch": 0.9954630370963438, + "grad_norm": 0.06612288951873779, + "learning_rate": 5.250139391382858e-08, + "loss": 2.5368, + "step": 33570 + }, + { + "epoch": 0.9954926904486552, + "grad_norm": 0.06440640240907669, + "learning_rate": 5.182178312340691e-08, + "loss": 2.5709, + "step": 33571 + }, + { + "epoch": 0.9955223438009667, + "grad_norm": 0.06385508179664612, + "learning_rate": 5.1146599459928054e-08, + "loss": 2.5111, + "step": 33572 + }, + { + "epoch": 0.9955519971532781, + "grad_norm": 0.06565089523792267, + "learning_rate": 5.047584292933172e-08, + "loss": 2.5097, + "step": 33573 + }, + { + "epoch": 0.9955816505055897, + "grad_norm": 0.060958173125982285, + "learning_rate": 4.9809513537613096e-08, + "loss": 2.5107, + "step": 33574 + }, + { + "epoch": 0.9956113038579011, + "grad_norm": 0.06273804605007172, + "learning_rate": 4.914761129060086e-08, + "loss": 2.5051, + "step": 33575 + }, + { + "epoch": 0.9956409572102126, + "grad_norm": 0.06217050924897194, + "learning_rate": 4.84901361942347e-08, + "loss": 2.5353, + "step": 33576 + }, + { + "epoch": 0.995670610562524, + "grad_norm": 0.06384813040494919, + "learning_rate": 4.7837088254343296e-08, + "loss": 2.5378, + "step": 33577 + }, + { + "epoch": 0.9957002639148356, + "grad_norm": 0.06496546417474747, + "learning_rate": 4.7188467476588784e-08, + "loss": 2.5554, + "step": 33578 + }, + { + "epoch": 0.9957299172671471, + "grad_norm": 0.06512227654457092, + "learning_rate": 4.654427386685534e-08, + "loss": 2.5696, + "step": 33579 + }, + { + "epoch": 0.9957595706194585, + "grad_norm": 0.06660449504852295, + "learning_rate": 4.5904507430749584e-08, + "loss": 2.5493, + "step": 33580 + }, + { + "epoch": 0.99578922397177, + "grad_norm": 0.06524606049060822, + "learning_rate": 4.52691681740447e-08, + "loss": 2.5259, + "step": 33581 + }, + { + "epoch": 0.9958188773240815, + "grad_norm": 0.06331270188093185, + "learning_rate": 4.463825610223626e-08, + "loss": 2.5287, + "step": 33582 + }, + { + "epoch": 0.995848530676393, + "grad_norm": 0.06804952025413513, + "learning_rate": 4.401177122098643e-08, + "loss": 2.5463, + "step": 33583 + }, + { + "epoch": 0.9958781840287044, + "grad_norm": 0.0640607699751854, + "learning_rate": 4.33897135357908e-08, + "loss": 2.5036, + "step": 33584 + }, + { + "epoch": 0.995907837381016, + "grad_norm": 0.06139564886689186, + "learning_rate": 4.2772083052255994e-08, + "loss": 2.5166, + "step": 33585 + }, + { + "epoch": 0.9959374907333274, + "grad_norm": 0.0630023404955864, + "learning_rate": 4.21588797757666e-08, + "loss": 2.5107, + "step": 33586 + }, + { + "epoch": 0.9959671440856389, + "grad_norm": 0.0630323737859726, + "learning_rate": 4.155010371176271e-08, + "loss": 2.5493, + "step": 33587 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 0.06592997908592224, + "learning_rate": 4.094575486568442e-08, + "loss": 2.5384, + "step": 33588 + }, + { + "epoch": 0.9960264507902619, + "grad_norm": 0.06420168280601501, + "learning_rate": 4.0345833242805276e-08, + "loss": 2.539, + "step": 33589 + }, + { + "epoch": 0.9960561041425733, + "grad_norm": 0.06118151918053627, + "learning_rate": 3.975033884850987e-08, + "loss": 2.5659, + "step": 33590 + }, + { + "epoch": 0.9960857574948848, + "grad_norm": 0.06620772182941437, + "learning_rate": 3.915927168801625e-08, + "loss": 2.527, + "step": 33591 + }, + { + "epoch": 0.9961154108471962, + "grad_norm": 0.06343179196119308, + "learning_rate": 3.8572631766653487e-08, + "loss": 2.5286, + "step": 33592 + }, + { + "epoch": 0.9961450641995078, + "grad_norm": 0.0655592679977417, + "learning_rate": 3.799041908947309e-08, + "loss": 2.5317, + "step": 33593 + }, + { + "epoch": 0.9961747175518192, + "grad_norm": 0.06457596272230148, + "learning_rate": 3.741263366174863e-08, + "loss": 2.5698, + "step": 33594 + }, + { + "epoch": 0.9962043709041307, + "grad_norm": 0.06431020051240921, + "learning_rate": 3.683927548853161e-08, + "loss": 2.5646, + "step": 33595 + }, + { + "epoch": 0.9962340242564421, + "grad_norm": 0.06539110094308853, + "learning_rate": 3.627034457492906e-08, + "loss": 2.5449, + "step": 33596 + }, + { + "epoch": 0.9962636776087537, + "grad_norm": 0.06173526495695114, + "learning_rate": 3.570584092593698e-08, + "loss": 2.5091, + "step": 33597 + }, + { + "epoch": 0.9962933309610651, + "grad_norm": 0.06232327222824097, + "learning_rate": 3.51457645466069e-08, + "loss": 2.5541, + "step": 33598 + }, + { + "epoch": 0.9963229843133766, + "grad_norm": 0.06357085704803467, + "learning_rate": 3.459011544187929e-08, + "loss": 2.5972, + "step": 33599 + }, + { + "epoch": 0.9963526376656882, + "grad_norm": 0.06191565468907356, + "learning_rate": 3.403889361669465e-08, + "loss": 2.5434, + "step": 33600 + }, + { + "epoch": 0.9963822910179996, + "grad_norm": 0.06541851907968521, + "learning_rate": 3.349209907588247e-08, + "loss": 2.5598, + "step": 33601 + }, + { + "epoch": 0.9964119443703111, + "grad_norm": 0.06448919326066971, + "learning_rate": 3.294973182438321e-08, + "loss": 2.5339, + "step": 33602 + }, + { + "epoch": 0.9964415977226225, + "grad_norm": 0.06249703839421272, + "learning_rate": 3.241179186685983e-08, + "loss": 2.5658, + "step": 33603 + }, + { + "epoch": 0.9964712510749341, + "grad_norm": 0.06274168938398361, + "learning_rate": 3.187827920814179e-08, + "loss": 2.5505, + "step": 33604 + }, + { + "epoch": 0.9965009044272455, + "grad_norm": 0.0633033737540245, + "learning_rate": 3.134919385300306e-08, + "loss": 2.5165, + "step": 33605 + }, + { + "epoch": 0.996530557779557, + "grad_norm": 0.06615491956472397, + "learning_rate": 3.082453580610656e-08, + "loss": 2.5234, + "step": 33606 + }, + { + "epoch": 0.9965602111318684, + "grad_norm": 0.06347586959600449, + "learning_rate": 3.030430507200421e-08, + "loss": 2.5333, + "step": 33607 + }, + { + "epoch": 0.99658986448418, + "grad_norm": 0.06270819902420044, + "learning_rate": 2.978850165541447e-08, + "loss": 2.5169, + "step": 33608 + }, + { + "epoch": 0.9966195178364914, + "grad_norm": 0.06260942667722702, + "learning_rate": 2.9277125560889238e-08, + "loss": 2.5285, + "step": 33609 + }, + { + "epoch": 0.9966491711888029, + "grad_norm": 0.06290261447429657, + "learning_rate": 2.8770176792924928e-08, + "loss": 2.5285, + "step": 33610 + }, + { + "epoch": 0.9966788245411143, + "grad_norm": 0.06238432228565216, + "learning_rate": 2.826765535596243e-08, + "loss": 2.528, + "step": 33611 + }, + { + "epoch": 0.9967084778934259, + "grad_norm": 0.06078952178359032, + "learning_rate": 2.7769561254553656e-08, + "loss": 2.5484, + "step": 33612 + }, + { + "epoch": 0.9967381312457373, + "grad_norm": 0.06641009449958801, + "learning_rate": 2.7275894493083986e-08, + "loss": 2.5613, + "step": 33613 + }, + { + "epoch": 0.9967677845980488, + "grad_norm": 0.06499429792165756, + "learning_rate": 2.6786655075883292e-08, + "loss": 2.5678, + "step": 33614 + }, + { + "epoch": 0.9967974379503602, + "grad_norm": 0.06306753307580948, + "learning_rate": 2.6301843007281445e-08, + "loss": 2.5207, + "step": 33615 + }, + { + "epoch": 0.9968270913026718, + "grad_norm": 0.06221391633152962, + "learning_rate": 2.5821458291663825e-08, + "loss": 2.5245, + "step": 33616 + }, + { + "epoch": 0.9968567446549832, + "grad_norm": 0.06359943747520447, + "learning_rate": 2.5345500933138254e-08, + "loss": 2.5193, + "step": 33617 + }, + { + "epoch": 0.9968863980072947, + "grad_norm": 0.0637468546628952, + "learning_rate": 2.4873970936034605e-08, + "loss": 2.5573, + "step": 33618 + }, + { + "epoch": 0.9969160513596063, + "grad_norm": 0.06290372461080551, + "learning_rate": 2.4406868304516215e-08, + "loss": 2.5084, + "step": 33619 + }, + { + "epoch": 0.9969457047119177, + "grad_norm": 0.06663282960653305, + "learning_rate": 2.394419304269091e-08, + "loss": 2.5132, + "step": 33620 + }, + { + "epoch": 0.9969753580642292, + "grad_norm": 0.07043551653623581, + "learning_rate": 2.3485945154611e-08, + "loss": 2.5394, + "step": 33621 + }, + { + "epoch": 0.9970050114165406, + "grad_norm": 0.06127272918820381, + "learning_rate": 2.3032124644439823e-08, + "loss": 2.522, + "step": 33622 + }, + { + "epoch": 0.9970346647688522, + "grad_norm": 0.06581244617700577, + "learning_rate": 2.258273151606316e-08, + "loss": 2.5829, + "step": 33623 + }, + { + "epoch": 0.9970643181211636, + "grad_norm": 0.06408220529556274, + "learning_rate": 2.2137765773588835e-08, + "loss": 2.5664, + "step": 33624 + }, + { + "epoch": 0.9970939714734751, + "grad_norm": 0.0636482983827591, + "learning_rate": 2.169722742090263e-08, + "loss": 2.5322, + "step": 33625 + }, + { + "epoch": 0.9971236248257865, + "grad_norm": 0.06392838060855865, + "learning_rate": 2.126111646189033e-08, + "loss": 2.5225, + "step": 33626 + }, + { + "epoch": 0.9971532781780981, + "grad_norm": 0.06628387421369553, + "learning_rate": 2.0829432900493216e-08, + "loss": 2.5391, + "step": 33627 + }, + { + "epoch": 0.9971829315304095, + "grad_norm": 0.06158812716603279, + "learning_rate": 2.0402176740375034e-08, + "loss": 2.5361, + "step": 33628 + }, + { + "epoch": 0.997212584882721, + "grad_norm": 0.0624966025352478, + "learning_rate": 1.997934798547707e-08, + "loss": 2.551, + "step": 33629 + }, + { + "epoch": 0.9972422382350324, + "grad_norm": 0.06330380588769913, + "learning_rate": 1.956094663946306e-08, + "loss": 2.5365, + "step": 33630 + }, + { + "epoch": 0.997271891587344, + "grad_norm": 0.0641169399023056, + "learning_rate": 1.9146972706107768e-08, + "loss": 2.5221, + "step": 33631 + }, + { + "epoch": 0.9973015449396554, + "grad_norm": 0.06527243554592133, + "learning_rate": 1.8737426188963903e-08, + "loss": 2.5513, + "step": 33632 + }, + { + "epoch": 0.9973311982919669, + "grad_norm": 0.0648636594414711, + "learning_rate": 1.833230709175071e-08, + "loss": 2.4902, + "step": 33633 + }, + { + "epoch": 0.9973608516442783, + "grad_norm": 0.06412940472364426, + "learning_rate": 1.793161541802091e-08, + "loss": 2.5221, + "step": 33634 + }, + { + "epoch": 0.9973905049965899, + "grad_norm": 0.06493676453828812, + "learning_rate": 1.7535351171271697e-08, + "loss": 2.553, + "step": 33635 + }, + { + "epoch": 0.9974201583489013, + "grad_norm": 0.06411991268396378, + "learning_rate": 1.7143514355166813e-08, + "loss": 2.5572, + "step": 33636 + }, + { + "epoch": 0.9974498117012128, + "grad_norm": 0.06174083426594734, + "learning_rate": 1.675610497298141e-08, + "loss": 2.5238, + "step": 33637 + }, + { + "epoch": 0.9974794650535243, + "grad_norm": 0.06329652667045593, + "learning_rate": 1.6373123028323723e-08, + "loss": 2.4921, + "step": 33638 + }, + { + "epoch": 0.9975091184058358, + "grad_norm": 0.0700741708278656, + "learning_rate": 1.59945685244689e-08, + "loss": 2.5282, + "step": 33639 + }, + { + "epoch": 0.9975387717581473, + "grad_norm": 0.06300709396600723, + "learning_rate": 1.5620441464803124e-08, + "loss": 2.5338, + "step": 33640 + }, + { + "epoch": 0.9975684251104587, + "grad_norm": 0.06351792812347412, + "learning_rate": 1.5250741852601556e-08, + "loss": 2.5668, + "step": 33641 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 0.06515083461999893, + "learning_rate": 1.4885469691250374e-08, + "loss": 2.577, + "step": 33642 + }, + { + "epoch": 0.9976277318150817, + "grad_norm": 0.06488469243049622, + "learning_rate": 1.4524624983858203e-08, + "loss": 2.5489, + "step": 33643 + }, + { + "epoch": 0.9976573851673932, + "grad_norm": 0.06353543698787689, + "learning_rate": 1.41682077337002e-08, + "loss": 2.5235, + "step": 33644 + }, + { + "epoch": 0.9976870385197046, + "grad_norm": 0.06293594092130661, + "learning_rate": 1.381621794388499e-08, + "loss": 2.5632, + "step": 33645 + }, + { + "epoch": 0.9977166918720162, + "grad_norm": 0.06126526743173599, + "learning_rate": 1.3468655617521197e-08, + "loss": 2.539, + "step": 33646 + }, + { + "epoch": 0.9977463452243276, + "grad_norm": 0.06457383185625076, + "learning_rate": 1.3125520757772958e-08, + "loss": 2.5234, + "step": 33647 + }, + { + "epoch": 0.9977759985766391, + "grad_norm": 0.06316275894641876, + "learning_rate": 1.2786813367582362e-08, + "loss": 2.5609, + "step": 33648 + }, + { + "epoch": 0.9978056519289505, + "grad_norm": 0.06352946907281876, + "learning_rate": 1.2452533450002523e-08, + "loss": 2.5186, + "step": 33649 + }, + { + "epoch": 0.9978353052812621, + "grad_norm": 0.0646430253982544, + "learning_rate": 1.2122681007975533e-08, + "loss": 2.5758, + "step": 33650 + }, + { + "epoch": 0.9978649586335735, + "grad_norm": 0.06500305980443954, + "learning_rate": 1.1797256044387971e-08, + "loss": 2.5551, + "step": 33651 + }, + { + "epoch": 0.997894611985885, + "grad_norm": 0.06164290010929108, + "learning_rate": 1.1476258562181929e-08, + "loss": 2.5264, + "step": 33652 + }, + { + "epoch": 0.9979242653381964, + "grad_norm": 0.06137683987617493, + "learning_rate": 1.1159688564188475e-08, + "loss": 2.5037, + "step": 33653 + }, + { + "epoch": 0.997953918690508, + "grad_norm": 0.062376488000154495, + "learning_rate": 1.0847546053183165e-08, + "loss": 2.5467, + "step": 33654 + }, + { + "epoch": 0.9979835720428194, + "grad_norm": 0.06115621700882912, + "learning_rate": 1.0539831031997071e-08, + "loss": 2.522, + "step": 33655 + }, + { + "epoch": 0.9980132253951309, + "grad_norm": 0.06547699868679047, + "learning_rate": 1.0236543503239215e-08, + "loss": 2.5595, + "step": 33656 + }, + { + "epoch": 0.9980428787474424, + "grad_norm": 0.0661613717675209, + "learning_rate": 9.937683469685155e-09, + "loss": 2.5759, + "step": 33657 + }, + { + "epoch": 0.9980725320997539, + "grad_norm": 0.06436583399772644, + "learning_rate": 9.643250933943915e-09, + "loss": 2.5348, + "step": 33658 + }, + { + "epoch": 0.9981021854520653, + "grad_norm": 0.0642884224653244, + "learning_rate": 9.353245898624518e-09, + "loss": 2.5473, + "step": 33659 + }, + { + "epoch": 0.9981318388043768, + "grad_norm": 0.06497003883123398, + "learning_rate": 9.06766836633599e-09, + "loss": 2.5176, + "step": 33660 + }, + { + "epoch": 0.9981614921566884, + "grad_norm": 0.0659116730093956, + "learning_rate": 8.786518339576332e-09, + "loss": 2.514, + "step": 33661 + }, + { + "epoch": 0.9981911455089998, + "grad_norm": 0.06099027022719383, + "learning_rate": 8.509795820788036e-09, + "loss": 2.5227, + "step": 33662 + }, + { + "epoch": 0.9982207988613113, + "grad_norm": 0.06491775810718536, + "learning_rate": 8.237500812524612e-09, + "loss": 2.5562, + "step": 33663 + }, + { + "epoch": 0.9982504522136227, + "grad_norm": 0.06369484215974808, + "learning_rate": 7.96963331711753e-09, + "loss": 2.5544, + "step": 33664 + }, + { + "epoch": 0.9982801055659343, + "grad_norm": 0.06374317407608032, + "learning_rate": 7.706193336953771e-09, + "loss": 2.5395, + "step": 33665 + }, + { + "epoch": 0.9983097589182457, + "grad_norm": 0.06398865580558777, + "learning_rate": 7.4471808743648e-09, + "loss": 2.5747, + "step": 33666 + }, + { + "epoch": 0.9983394122705572, + "grad_norm": 0.06337317079305649, + "learning_rate": 7.192595931682089e-09, + "loss": 2.5582, + "step": 33667 + }, + { + "epoch": 0.9983690656228686, + "grad_norm": 0.06316834688186646, + "learning_rate": 6.94243851107057e-09, + "loss": 2.5398, + "step": 33668 + }, + { + "epoch": 0.9983987189751802, + "grad_norm": 0.0629812479019165, + "learning_rate": 6.696708614861713e-09, + "loss": 2.5419, + "step": 33669 + }, + { + "epoch": 0.9984283723274916, + "grad_norm": 0.06398367881774902, + "learning_rate": 6.455406245164941e-09, + "loss": 2.5638, + "step": 33670 + }, + { + "epoch": 0.9984580256798031, + "grad_norm": 0.063839852809906, + "learning_rate": 6.218531404145189e-09, + "loss": 2.5214, + "step": 33671 + }, + { + "epoch": 0.9984876790321146, + "grad_norm": 0.06665542721748352, + "learning_rate": 5.98608409385637e-09, + "loss": 2.5025, + "step": 33672 + }, + { + "epoch": 0.9985173323844261, + "grad_norm": 0.06410316377878189, + "learning_rate": 5.7580643164079075e-09, + "loss": 2.5701, + "step": 33673 + }, + { + "epoch": 0.9985469857367375, + "grad_norm": 0.06847985088825226, + "learning_rate": 5.534472073798202e-09, + "loss": 2.5539, + "step": 33674 + }, + { + "epoch": 0.998576639089049, + "grad_norm": 0.06192600354552269, + "learning_rate": 5.315307367970146e-09, + "loss": 2.5627, + "step": 33675 + }, + { + "epoch": 0.9986062924413605, + "grad_norm": 0.06385411322116852, + "learning_rate": 5.100570200922139e-09, + "loss": 2.5465, + "step": 33676 + }, + { + "epoch": 0.998635945793672, + "grad_norm": 0.06609988212585449, + "learning_rate": 4.890260574541561e-09, + "loss": 2.5496, + "step": 33677 + }, + { + "epoch": 0.9986655991459834, + "grad_norm": 0.06206251680850983, + "learning_rate": 4.6843784906602796e-09, + "loss": 2.4951, + "step": 33678 + }, + { + "epoch": 0.9986952524982949, + "grad_norm": 0.0606943815946579, + "learning_rate": 4.4829239511101625e-09, + "loss": 2.5196, + "step": 33679 + }, + { + "epoch": 0.9987249058506064, + "grad_norm": 0.06467120349407196, + "learning_rate": 4.285896957723079e-09, + "loss": 2.5441, + "step": 33680 + }, + { + "epoch": 0.9987545592029179, + "grad_norm": 0.0624069906771183, + "learning_rate": 4.093297512164362e-09, + "loss": 2.5446, + "step": 33681 + }, + { + "epoch": 0.9987842125552294, + "grad_norm": 0.06280529499053955, + "learning_rate": 3.90512561621037e-09, + "loss": 2.5422, + "step": 33682 + }, + { + "epoch": 0.9988138659075408, + "grad_norm": 0.06139891967177391, + "learning_rate": 3.721381271470925e-09, + "loss": 2.5299, + "step": 33683 + }, + { + "epoch": 0.9988435192598524, + "grad_norm": 0.06164174526929855, + "learning_rate": 3.5420644796668734e-09, + "loss": 2.5402, + "step": 33684 + }, + { + "epoch": 0.9988731726121638, + "grad_norm": 0.06214087828993797, + "learning_rate": 3.3671752422415046e-09, + "loss": 2.4973, + "step": 33685 + }, + { + "epoch": 0.9989028259644753, + "grad_norm": 0.06402699649333954, + "learning_rate": 3.196713560860154e-09, + "loss": 2.5127, + "step": 33686 + }, + { + "epoch": 0.9989324793167867, + "grad_norm": 0.06171131134033203, + "learning_rate": 3.0306794370216218e-09, + "loss": 2.5138, + "step": 33687 + }, + { + "epoch": 0.9989621326690983, + "grad_norm": 0.06499127298593521, + "learning_rate": 2.8690728721136872e-09, + "loss": 2.556, + "step": 33688 + }, + { + "epoch": 0.9989917860214097, + "grad_norm": 0.06285776197910309, + "learning_rate": 2.711893867690662e-09, + "loss": 2.5512, + "step": 33689 + }, + { + "epoch": 0.9990214393737212, + "grad_norm": 0.06435828655958176, + "learning_rate": 2.559142425029304e-09, + "loss": 2.5363, + "step": 33690 + }, + { + "epoch": 0.9990510927260327, + "grad_norm": 0.06134343892335892, + "learning_rate": 2.4108185455173905e-09, + "loss": 2.5715, + "step": 33691 + }, + { + "epoch": 0.9990807460783442, + "grad_norm": 0.06374067813158035, + "learning_rate": 2.2669222305427005e-09, + "loss": 2.6074, + "step": 33692 + }, + { + "epoch": 0.9991103994306556, + "grad_norm": 0.0629391074180603, + "learning_rate": 2.127453481270969e-09, + "loss": 2.5101, + "step": 33693 + }, + { + "epoch": 0.9991400527829671, + "grad_norm": 0.06442886590957642, + "learning_rate": 1.992412298978952e-09, + "loss": 2.5639, + "step": 33694 + }, + { + "epoch": 0.9991697061352786, + "grad_norm": 0.06014024466276169, + "learning_rate": 1.8617986848878942e-09, + "loss": 2.542, + "step": 33695 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 0.0657782182097435, + "learning_rate": 1.7356126401080196e-09, + "loss": 2.5237, + "step": 33696 + }, + { + "epoch": 0.9992290128399015, + "grad_norm": 0.06100216135382652, + "learning_rate": 1.613854165805062e-09, + "loss": 2.5342, + "step": 33697 + }, + { + "epoch": 0.999258666192213, + "grad_norm": 0.06185489520430565, + "learning_rate": 1.4965232630337332e-09, + "loss": 2.5163, + "step": 33698 + }, + { + "epoch": 0.9992883195445245, + "grad_norm": 0.06400107592344284, + "learning_rate": 1.3836199328487452e-09, + "loss": 2.5587, + "step": 33699 + }, + { + "epoch": 0.999317972896836, + "grad_norm": 0.06541319191455841, + "learning_rate": 1.2751441762492988e-09, + "loss": 2.5267, + "step": 33700 + }, + { + "epoch": 0.9993476262491474, + "grad_norm": 0.06365782767534256, + "learning_rate": 1.1710959941235722e-09, + "loss": 2.5192, + "step": 33701 + }, + { + "epoch": 0.999377279601459, + "grad_norm": 0.06538225710391998, + "learning_rate": 1.071475387470766e-09, + "loss": 2.5214, + "step": 33702 + }, + { + "epoch": 0.9994069329537705, + "grad_norm": 0.06316939741373062, + "learning_rate": 9.762823571790592e-10, + "loss": 2.5175, + "step": 33703 + }, + { + "epoch": 0.9994365863060819, + "grad_norm": 0.0624065026640892, + "learning_rate": 8.855169040256072e-10, + "loss": 2.5422, + "step": 33704 + }, + { + "epoch": 0.9994662396583934, + "grad_norm": 0.06565607339143753, + "learning_rate": 7.99179028898589e-10, + "loss": 2.5367, + "step": 33705 + }, + { + "epoch": 0.9994958930107049, + "grad_norm": 0.06254401057958603, + "learning_rate": 7.172687325196492e-10, + "loss": 2.556, + "step": 33706 + }, + { + "epoch": 0.9995255463630164, + "grad_norm": 0.06392767280340195, + "learning_rate": 6.397860155549218e-10, + "loss": 2.5378, + "step": 33707 + }, + { + "epoch": 0.9995551997153278, + "grad_norm": 0.06497760862112045, + "learning_rate": 5.66730878837074e-10, + "loss": 2.5768, + "step": 33708 + }, + { + "epoch": 0.9995848530676393, + "grad_norm": 0.06605742126703262, + "learning_rate": 4.981033228657061e-10, + "loss": 2.5733, + "step": 33709 + }, + { + "epoch": 0.9996145064199508, + "grad_norm": 0.06373881548643112, + "learning_rate": 4.339033483069521e-10, + "loss": 2.5299, + "step": 33710 + }, + { + "epoch": 0.9996441597722623, + "grad_norm": 0.06269452720880508, + "learning_rate": 3.7413095571592337e-10, + "loss": 2.5323, + "step": 33711 + }, + { + "epoch": 0.9996738131245737, + "grad_norm": 0.06453508883714676, + "learning_rate": 3.187861457032426e-10, + "loss": 2.5685, + "step": 33712 + }, + { + "epoch": 0.9997034664768852, + "grad_norm": 0.06373970955610275, + "learning_rate": 2.678689186019767e-10, + "loss": 2.5408, + "step": 33713 + }, + { + "epoch": 0.9997331198291967, + "grad_norm": 0.06393010169267654, + "learning_rate": 2.2137927502274836e-10, + "loss": 2.5293, + "step": 33714 + }, + { + "epoch": 0.9997627731815082, + "grad_norm": 0.06305104494094849, + "learning_rate": 1.7931721529862444e-10, + "loss": 2.5602, + "step": 33715 + }, + { + "epoch": 0.9997924265338196, + "grad_norm": 0.06633388996124268, + "learning_rate": 1.4168273976267188e-10, + "loss": 2.5562, + "step": 33716 + }, + { + "epoch": 0.9998220798861311, + "grad_norm": 0.06227077171206474, + "learning_rate": 1.0847584885897987e-10, + "loss": 2.5428, + "step": 33717 + }, + { + "epoch": 0.9998517332384426, + "grad_norm": 0.06384747475385666, + "learning_rate": 7.969654275408189e-11, + "loss": 2.5825, + "step": 33718 + }, + { + "epoch": 0.9998813865907541, + "grad_norm": 0.06287533789873123, + "learning_rate": 5.534482183655598e-11, + "loss": 2.5397, + "step": 33719 + }, + { + "epoch": 0.9999110399430655, + "grad_norm": 0.06515224277973175, + "learning_rate": 3.5420686217424444e-11, + "loss": 2.5766, + "step": 33720 + }, + { + "epoch": 0.999940693295377, + "grad_norm": 0.06667762249708176, + "learning_rate": 1.9924136063220743e-11, + "loss": 2.5452, + "step": 33721 + }, + { + "epoch": 0.9999703466476885, + "grad_norm": 0.06424718350172043, + "learning_rate": 8.855171651500627e-12, + "loss": 2.5413, + "step": 33722 + }, + { + "epoch": 1.0, + "grad_norm": 0.06308750063180923, + "learning_rate": 2.2137929267529444e-12, + "loss": 2.5572, + "step": 33723 + } + ], + "logging_steps": 1.0, + "max_steps": 33723, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 6745, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.05392005996189e+20, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}